Spaces:
Runtime error
Runtime error
Commit
·
e36d40b
1
Parent(s):
ff44e29
fix: mappings
Browse files
feat: add max number of rows
src/distilabel_dataset_generator/sft.py
CHANGED
|
@@ -142,16 +142,10 @@ DEFAULT_DATASET = pd.DataFrame(
|
|
| 142 |
|
| 143 |
|
| 144 |
def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str = None):
|
| 145 |
-
|
| 146 |
-
{
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
}
|
| 150 |
-
if num_turns == 1
|
| 151 |
-
else {
|
| 152 |
-
"conversation": "messages",
|
| 153 |
-
}
|
| 154 |
-
)
|
| 155 |
with Pipeline(name="sft") as pipeline:
|
| 156 |
magpie = MagpieGenerator(
|
| 157 |
llm=InferenceEndpointsLLM(
|
|
@@ -181,7 +175,7 @@ def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str =
|
|
| 181 |
columns=list(output_mappings.values()) + ["model_name"],
|
| 182 |
)
|
| 183 |
magpie.connect(keep_columns)
|
| 184 |
-
distiset: Distiset = pipeline.run()
|
| 185 |
result_queue.put(distiset)
|
| 186 |
|
| 187 |
|
|
@@ -227,6 +221,16 @@ def generate_dataset(
|
|
| 227 |
raise gr.Error(
|
| 228 |
"Please sign in with Hugging Face to be able to push the dataset to the Hub."
|
| 229 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
gr.Info(
|
| 232 |
"Started pipeline execution. This might take a while, depending on the number of rows and turns you have selected. Don't close this page."
|
|
@@ -316,7 +320,7 @@ More information on distilabel and techniques can be found in the "FAQ" tab. The
|
|
| 316 |
num_turns = gr.Number(
|
| 317 |
value=1,
|
| 318 |
label="Number of turns in the conversation",
|
| 319 |
-
|
| 320 |
info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
|
| 321 |
)
|
| 322 |
num_rows = gr.Number(
|
|
|
|
| 142 |
|
| 143 |
|
| 144 |
def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str = None):
|
| 145 |
+
if num_turns == 1:
|
| 146 |
+
output_mappings = {"instruction": "prompt", "response": "completion"}
|
| 147 |
+
else:
|
| 148 |
+
output_mappings = {"conversation": "messages"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
with Pipeline(name="sft") as pipeline:
|
| 150 |
magpie = MagpieGenerator(
|
| 151 |
llm=InferenceEndpointsLLM(
|
|
|
|
| 175 |
columns=list(output_mappings.values()) + ["model_name"],
|
| 176 |
)
|
| 177 |
magpie.connect(keep_columns)
|
| 178 |
+
distiset: Distiset = pipeline.run(use_cache=False)
|
| 179 |
result_queue.put(distiset)
|
| 180 |
|
| 181 |
|
|
|
|
| 221 |
raise gr.Error(
|
| 222 |
"Please sign in with Hugging Face to be able to push the dataset to the Hub."
|
| 223 |
)
|
| 224 |
+
if num_turns > 4:
|
| 225 |
+
raise gr.Info(
|
| 226 |
+
"You can only generate a dataset with 4 or fewer turns. Setting to 4."
|
| 227 |
+
)
|
| 228 |
+
num_turns = 4
|
| 229 |
+
if num_rows > 5000:
|
| 230 |
+
raise gr.Info(
|
| 231 |
+
"You can only generate a dataset with 5000 or fewer rows. Setting to 5000."
|
| 232 |
+
)
|
| 233 |
+
num_rows = 5000
|
| 234 |
|
| 235 |
gr.Info(
|
| 236 |
"Started pipeline execution. This might take a while, depending on the number of rows and turns you have selected. Don't close this page."
|
|
|
|
| 320 |
num_turns = gr.Number(
|
| 321 |
value=1,
|
| 322 |
label="Number of turns in the conversation",
|
| 323 |
+
maximum=4,
|
| 324 |
info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
|
| 325 |
)
|
| 326 |
num_rows = gr.Number(
|