Spaces:
Runtime error
Runtime error
Commit
·
9b4773a
1
Parent(s):
5829740
feat: update batch size
Browse files
src/distilabel_dataset_generator/apps/sft.py
CHANGED
|
@@ -74,6 +74,8 @@ def generate_dataset(
|
|
| 74 |
if repo_name is not None and org_name is not None
|
| 75 |
else None
|
| 76 |
)
|
|
|
|
|
|
|
| 77 |
if repo_id is not None:
|
| 78 |
if not all([repo_id, org_name, repo_name]):
|
| 79 |
raise gr.Error(
|
|
@@ -295,7 +297,7 @@ with gr.Blocks(
|
|
| 295 |
],
|
| 296 |
outputs=[table],
|
| 297 |
show_progress=True,
|
| 298 |
-
).
|
| 299 |
fn=show_success_message,
|
| 300 |
inputs=[org_name, repo_name],
|
| 301 |
outputs=[success_message],
|
|
|
|
| 74 |
if repo_name is not None and org_name is not None
|
| 75 |
else None
|
| 76 |
)
|
| 77 |
+
if oauth_token is None or oauth_token == "":
|
| 78 |
+
print(oauth_token, repo_id)
|
| 79 |
if repo_id is not None:
|
| 80 |
if not all([repo_id, org_name, repo_name]):
|
| 81 |
raise gr.Error(
|
|
|
|
| 297 |
],
|
| 298 |
outputs=[table],
|
| 299 |
show_progress=True,
|
| 300 |
+
).success(
|
| 301 |
fn=show_success_message,
|
| 302 |
inputs=[org_name, repo_name],
|
| 303 |
outputs=[success_message],
|
src/distilabel_dataset_generator/pipelines/sft.py
CHANGED
|
@@ -138,6 +138,7 @@ _STOP_SEQUENCES = [
|
|
| 138 |
"assistant",
|
| 139 |
" \n\n",
|
| 140 |
]
|
|
|
|
| 141 |
|
| 142 |
|
| 143 |
def _get_output_mappings(num_turns):
|
|
@@ -205,33 +206,30 @@ def get_pipeline(num_turns, num_rows, system_prompt):
|
|
| 205 |
"stop_sequences": _STOP_SEQUENCES,
|
| 206 |
},
|
| 207 |
),
|
| 208 |
-
batch_size=
|
| 209 |
n_turns=num_turns,
|
| 210 |
num_rows=num_rows,
|
| 211 |
system_prompt=system_prompt,
|
| 212 |
output_mappings={"instruction": "prompt"},
|
| 213 |
-
only_instruction=True
|
| 214 |
)
|
| 215 |
-
|
| 216 |
generate_response = TextGeneration(
|
| 217 |
llm=InferenceEndpointsLLM(
|
| 218 |
model_id=MODEL,
|
| 219 |
tokenizer_id=MODEL,
|
| 220 |
api_key=os.environ["HF_TOKEN"],
|
| 221 |
-
generation_kwargs={
|
| 222 |
-
"temperature": 0.8,
|
| 223 |
-
"max_new_tokens": 1024
|
| 224 |
-
},
|
| 225 |
),
|
| 226 |
system_prompt=system_prompt,
|
| 227 |
output_mappings={"generation": "completion"},
|
| 228 |
-
input_mappings={"instruction": "prompt"}
|
| 229 |
)
|
| 230 |
-
|
| 231 |
keep_columns = KeepColumns(
|
| 232 |
columns=list(output_mappings.values()) + ["model_name"],
|
| 233 |
)
|
| 234 |
-
|
| 235 |
magpie.connect(generate_response)
|
| 236 |
generate_response.connect(keep_columns)
|
| 237 |
return pipeline
|
|
@@ -250,7 +248,7 @@ def get_pipeline(num_turns, num_rows, system_prompt):
|
|
| 250 |
"stop_sequences": _STOP_SEQUENCES,
|
| 251 |
},
|
| 252 |
),
|
| 253 |
-
batch_size=
|
| 254 |
n_turns=num_turns,
|
| 255 |
num_rows=num_rows,
|
| 256 |
system_prompt=system_prompt,
|
|
|
|
| 138 |
"assistant",
|
| 139 |
" \n\n",
|
| 140 |
]
|
| 141 |
+
DEFAULT_BATCH_SIZE = 1
|
| 142 |
|
| 143 |
|
| 144 |
def _get_output_mappings(num_turns):
|
|
|
|
| 206 |
"stop_sequences": _STOP_SEQUENCES,
|
| 207 |
},
|
| 208 |
),
|
| 209 |
+
batch_size=DEFAULT_BATCH_SIZE,
|
| 210 |
n_turns=num_turns,
|
| 211 |
num_rows=num_rows,
|
| 212 |
system_prompt=system_prompt,
|
| 213 |
output_mappings={"instruction": "prompt"},
|
| 214 |
+
only_instruction=True,
|
| 215 |
)
|
| 216 |
+
|
| 217 |
generate_response = TextGeneration(
|
| 218 |
llm=InferenceEndpointsLLM(
|
| 219 |
model_id=MODEL,
|
| 220 |
tokenizer_id=MODEL,
|
| 221 |
api_key=os.environ["HF_TOKEN"],
|
| 222 |
+
generation_kwargs={"temperature": 0.8, "max_new_tokens": 1024},
|
|
|
|
|
|
|
|
|
|
| 223 |
),
|
| 224 |
system_prompt=system_prompt,
|
| 225 |
output_mappings={"generation": "completion"},
|
| 226 |
+
input_mappings={"instruction": "prompt"},
|
| 227 |
)
|
| 228 |
+
|
| 229 |
keep_columns = KeepColumns(
|
| 230 |
columns=list(output_mappings.values()) + ["model_name"],
|
| 231 |
)
|
| 232 |
+
|
| 233 |
magpie.connect(generate_response)
|
| 234 |
generate_response.connect(keep_columns)
|
| 235 |
return pipeline
|
|
|
|
| 248 |
"stop_sequences": _STOP_SEQUENCES,
|
| 249 |
},
|
| 250 |
),
|
| 251 |
+
batch_size=DEFAULT_BATCH_SIZE,
|
| 252 |
n_turns=num_turns,
|
| 253 |
num_rows=num_rows,
|
| 254 |
system_prompt=system_prompt,
|
src/distilabel_dataset_generator/utils.py
CHANGED
|
@@ -30,7 +30,7 @@ def get_login_button():
|
|
| 30 |
return gr.LoginButton(
|
| 31 |
value="Sign in with Hugging Face!",
|
| 32 |
size="lg",
|
| 33 |
-
)
|
| 34 |
|
| 35 |
|
| 36 |
def get_duplicate_button():
|
|
|
|
| 30 |
return gr.LoginButton(
|
| 31 |
value="Sign in with Hugging Face!",
|
| 32 |
size="lg",
|
| 33 |
+
).activate()
|
| 34 |
|
| 35 |
|
| 36 |
def get_duplicate_button():
|