Spaces:
Runtime error
Runtime error
Commit
Β·
a0cefd0
1
Parent(s):
9b13367
update to allow enforcing pre-query-template
Browse files- README.md +1 -0
- src/synthetic_dataset_generator/constants.py +13 -2
README.md
CHANGED
|
@@ -87,6 +87,7 @@ Optionally, you can use different models and APIs.
|
|
| 87 |
- `BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api-inference.huggingface.co/v1/`, `https://api.openai.com/v1/`.
|
| 88 |
- `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`.
|
| 89 |
- `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the provided `HF_TOKEN` environment variable.
|
|
|
|
| 90 |
|
| 91 |
Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:
|
| 92 |
|
|
|
|
| 87 |
- `BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api-inference.huggingface.co/v1/`, `https://api.openai.com/v1/`.
|
| 88 |
- `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`.
|
| 89 |
- `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the provided `HF_TOKEN` environment variable.
|
| 90 |
+
- `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie generation to either `llama3` or `qwen2`. Note that this is only used if the model is a Qwen or Llama model. If you want to use other model families for chat data generation, feel free to [implement your own pre-query template](https://github.com/argilla-io/distilabel/pull/778/files).
|
| 91 |
|
| 92 |
Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:
|
| 93 |
|
src/synthetic_dataset_generator/constants.py
CHANGED
|
@@ -33,10 +33,20 @@ if BASE_URL != "https://api-inference.huggingface.co/v1/" and len(API_KEYS) == 0
|
|
| 33 |
raise ValueError(
|
| 34 |
"API_KEY is not set. Ensure you have set the API_KEY environment variable that has access to the Hugging Face Inference Endpoints."
|
| 35 |
)
|
| 36 |
-
|
| 37 |
llama_options = ["llama3", "llama-3", "llama 3"]
|
| 38 |
qwen_options = ["qwen2", "qwen-2", "qwen 2"]
|
| 39 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
SFT_AVAILABLE = True
|
| 41 |
MAGPIE_PRE_QUERY_TEMPLATE = "llama3"
|
| 42 |
elif MODEL.lower() in qwen_options:
|
|
@@ -49,6 +59,7 @@ else:
|
|
| 49 |
)
|
| 50 |
MAGPIE_PRE_QUERY_TEMPLATE = None
|
| 51 |
|
|
|
|
| 52 |
# Embeddings
|
| 53 |
STATIC_EMBEDDING_MODEL = "minishlab/potion-base-8M"
|
| 54 |
|
|
|
|
| 33 |
raise ValueError(
|
| 34 |
"API_KEY is not set. Ensure you have set the API_KEY environment variable that has access to the Hugging Face Inference Endpoints."
|
| 35 |
)
|
|
|
|
| 36 |
llama_options = ["llama3", "llama-3", "llama 3"]
|
| 37 |
qwen_options = ["qwen2", "qwen-2", "qwen 2"]
|
| 38 |
+
if os.getenv("MAGPIE_PRE_QUERY_TEMPLATE"):
|
| 39 |
+
SFT_AVAILABLE = True
|
| 40 |
+
passed_pre_query_template = os.getenv("MAGPIE_PRE_QUERY_TEMPLATE")
|
| 41 |
+
if passed_pre_query_template.lower() in llama_options:
|
| 42 |
+
MAGPIE_PRE_QUERY_TEMPLATE = "llama3"
|
| 43 |
+
elif passed_pre_query_template.lower() in qwen_options:
|
| 44 |
+
MAGPIE_PRE_QUERY_TEMPLATE = "qwen2"
|
| 45 |
+
else:
|
| 46 |
+
raise ValueError(
|
| 47 |
+
f"MAGPIE_PRE_QUERY_TEMPLATE must be either {llama_options} or {qwen_options}."
|
| 48 |
+
)
|
| 49 |
+
elif MODEL.lower() in llama_options:
|
| 50 |
SFT_AVAILABLE = True
|
| 51 |
MAGPIE_PRE_QUERY_TEMPLATE = "llama3"
|
| 52 |
elif MODEL.lower() in qwen_options:
|
|
|
|
| 59 |
)
|
| 60 |
MAGPIE_PRE_QUERY_TEMPLATE = None
|
| 61 |
|
| 62 |
+
|
| 63 |
# Embeddings
|
| 64 |
STATIC_EMBEDDING_MODEL = "minishlab/potion-base-8M"
|
| 65 |
|