synthetic-data-generator

Runtime error

App Files Files Community

sdiazlor committed on Dec 3, 2024

Commit

fb096d2

verified ·

1 Parent(s): 7afacd5

update-layout-add-evaluation (#17)

Browse files

- add comment to divide functions/ui (c2fbbc311616f60f3dbc1e546d20517329b41486)
- fix typo (3ef1fed40b702eb482f9017241a2d3641cde3946)
- move sign in button to another column (ea29202ac74e3b567f8bbc83f2dd01edeb7e00b5)
- make sign in button smaller (3b5e775206b9e2276d8d5afd14d5a2ae6eb0f852)
- remove repeated import (9c1769a069aa5cadade2e120ebbfbb3524b4a71c)
- move sign in button to the right (5d91425bc46bbed12e76279e32eed828738f2e78)
- modify column width and typos (2b5c2e3fa953fcd1e7bbc5d50fa1eaa18cf51a7f)
- update successful message and pipeline code (4234ad816ad254da4dc0a2bdf4f6ee901f3ab647)
- update dataframe visualizations (7350fc6b3cdabd3c88562f7ebea772ea936b293b)
- update text and order parameter layout (45693e1d9d5340197d0a5298329a1b176836e5e9)
- typo (2673ebc69f8b3ee53ca0bea400abe8b18dcec6c7)
- add temperature for system prompt (857f1ba71f10ddb10f045923601746daed130b19)
- update textcat (separate prompt and labels) and use input parameters (4e193106207eda3f59650448038a680c25075972)
- update sft and use input parameters (dea11022bc5c78e08481e4e90bbb73b0402cdadc)
- update push dataset (49d5948eb076fc8b3354a9d4acdaac477fc0c398)
- add evaluation task (34371d30aa99cdb709c9def84739ab3b8b7fa611)
- hide pipeline ui each time it generates (c26510fcff621c6a144917e1a56d5f87dd41fd41)
- move order hide pipeline ui (1b00519115b913bff86a6f2ba061f97eb860e78a)
- merge remote tracking branch (1c412e2113c3889b13572af931a3be19fc93df5a)

Files changed (11) hide show

app.py +3 -3
pyproject.toml +1 -1
src/distilabel_dataset_generator/_tabbedinterface.py +4 -2
src/distilabel_dataset_generator/apps/base.py +16 -33
src/distilabel_dataset_generator/apps/eval.py +687 -202
src/distilabel_dataset_generator/apps/sft.py +102 -47
src/distilabel_dataset_generator/apps/textcat.py +171 -140
src/distilabel_dataset_generator/pipelines/eval.py +205 -0
src/distilabel_dataset_generator/pipelines/sft.py +50 -49
src/distilabel_dataset_generator/pipelines/textcat.py +89 -70
src/distilabel_dataset_generator/utils.py +97 -8

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
 from src.distilabel_dataset_generator.apps.faq import app as faq_app
 from src.distilabel_dataset_generator.apps.sft import app as sft_app
 from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
 theme ='argilla/argilla-theme'
@@ -25,12 +26,11 @@ button.hf-login:hover {background: var(--neutral-700); color: white}
 """
 demo = TabbedInterface(
-    [textcat_app, sft_app, faq_app],
-    ["Text Classification", "Supervised Fine-Tuning", "FAQ"],
     css=css,
     title="""
     <h1>Synthetic Data Generator</h1>
-    <h3>Build datasets using natural language</h3>
     """,
     head="Synthetic Data Generator",
     theme=theme,

 from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
 from src.distilabel_dataset_generator.apps.faq import app as faq_app
 from src.distilabel_dataset_generator.apps.sft import app as sft_app
+from src.distilabel_dataset_generator.apps.eval import app as eval_app
 from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
 theme ='argilla/argilla-theme'
 """
 demo = TabbedInterface(
+    [textcat_app, sft_app, eval_app, faq_app],
+    ["Text Classification", "Supervised Fine-Tuning", "Evaluation", "FAQ"],
     css=css,
     title="""
     <h1>Synthetic Data Generator</h1>
     """,
     head="Synthetic Data Generator",
     theme=theme,

pyproject.toml CHANGED Viewed

@@ -6,7 +6,7 @@ authors = [
     {name = "davidberenstein1957", email = "[email protected]"},
 ]
 dependencies = [
-    "distilabel[hf-inference-endpoints,argilla,outlines]>=1.4.1",
     "gradio[oauth]<5.0.0",
     "transformers>=4.44.2",
     "sentence-transformers>=3.2.0",

     {name = "davidberenstein1957", email = "[email protected]"},
 ]
 dependencies = [
+    "distilabel[hf-inference-endpoints,argilla,outlines,instructor]>=1.4.1",
     "gradio[oauth]<5.0.0",
     "transformers>=4.44.2",
     "sentence-transformers>=3.2.0",

src/distilabel_dataset_generator/_tabbedinterface.py CHANGED Viewed

@@ -63,10 +63,12 @@ class TabbedInterface(Blocks):
             if title:
                 HTML(value=title)
                 with gr.Row():
-                    with gr.Column(scale=1):
-                        gr.LoginButton(value="Sign in!", variant="hf-login", size="sm", scale=2)
                     with gr.Column(scale=3):
                         pass
             with Tabs():
                 for interface, tab_name in zip(interface_list, tab_names, strict=False):
                     with Tab(label=tab_name):

             if title:
                 HTML(value=title)
                 with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.Markdown("### Build datasets using natural language")
                     with gr.Column(scale=3):
                         pass
+                    with gr.Column(scale=2):
+                        gr.LoginButton(value="Sign in!", variant="hf-login", size="sm", scale=2)
             with Tabs():
                 for interface, tab_name in zip(interface_list, tab_names, strict=False):
                     with Tab(label=tab_name):

src/distilabel_dataset_generator/apps/base.py CHANGED Viewed

@@ -15,7 +15,7 @@ from src.distilabel_dataset_generator.utils import (
     get_argilla_client,
     get_login_button,
     list_orgs,
-    swap_visibilty,
 )
 TEXTCAT_TASK = "text_classification"
@@ -137,7 +137,7 @@ def get_main_ui(
             show_progress=True,
         )
-        app.load(fn=swap_visibilty, outputs=main_ui)
         app.load(get_org_dropdown, outputs=[org_name])
     return (
@@ -300,25 +300,6 @@ def get_iterate_on_sample_dataset_ui(
     )
-def get_pipeline_code_ui(pipeline_code: str) -> gr.Code:
-    gr.Markdown("## Customize and run with distilabel")
-    gr.HTML("<hr>")
-    with gr.Accordion(
-        "Run this pipeline using distilabel",
-        open=False,
-    ):
-        gr.Markdown(
-            "You can run this pipeline locally with distilabel. For more information, please refer to the [distilabel documentation](https://distilabel.argilla.io/) or go to the FAQ tab at the top of the page for more information."
-        )
-        pipeline_code = gr.Code(
-            value=pipeline_code,
-            language="python",
-            label="Distilabel Pipeline Code",
-        )
-    return pipeline_code
 def get_argilla_tab() -> Tuple[Any]:
     with gr.Tab(label="Argilla"):
         if get_argilla_client() is not None:
@@ -492,7 +473,7 @@ def get_success_message_row() -> gr.Markdown:
     return success_message
-def show_success_message_hub(org_name, repo_name) -> gr.Markdown:
     client = get_argilla_client()
     argilla_api_url = client.api_url
     return gr.Markdown(
@@ -500,25 +481,27 @@ def show_success_message_hub(org_name, repo_name) -> gr.Markdown:
         <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
             <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
             <p style="margin-top: 0.5em;">
-                Your dataset is now available the Hugging Face Hub:
-                <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
-                    https://huggingface.co/datasets/{org_name}/{repo_name}
-                </a>
             </p>
             <p style="margin-top: 0.5em;">
-                Your dataset is now available within Argilla:
-                <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
-                    {argilla_api_url}
                 </a>
-                <br>Unfamiliar with Argilla? Here are some docs to help you get started:
-                <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
-                <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
             </p>
         </div>
         """,
         visible=True,
     )
 def hide_success_message() -> gr.Markdown:
     return gr.Markdown(value="")

     get_argilla_client,
     get_login_button,
     list_orgs,
+    swap_visibility,
 )
 TEXTCAT_TASK = "text_classification"
             show_progress=True,
         )
+        app.load(fn=swap_visibility, outputs=main_ui)
         app.load(get_org_dropdown, outputs=[org_name])
     return (
     )
 def get_argilla_tab() -> Tuple[Any]:
     with gr.Tab(label="Argilla"):
         if get_argilla_client() is not None:
     return success_message
+def show_success_message(org_name, repo_name) -> gr.Markdown:
     client = get_argilla_client()
     argilla_api_url = client.api_url
     return gr.Markdown(
         <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
             <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
             <p style="margin-top: 0.5em;">
+                <strong>
+                    <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                        Open your dataset in the Argilla space
+                    </a>
+                </strong>
             </p>
             <p style="margin-top: 0.5em;">
+                The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks. Your dataset is now available at:
+                <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                    https://huggingface.co/datasets/{org_name}/{repo_name}
                 </a>
             </p>
         </div>
+        <p style="margin-top: 1em; font-size: 0.9em; color: #333;">
+            Unfamiliar with Argilla? Here are some docs to help you get started:
+            <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
+            <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
+        </p>
         """,
         visible=True,
     )
 def hide_success_message() -> gr.Markdown:
     return gr.Markdown(value="")

src/distilabel_dataset_generator/apps/eval.py CHANGED Viewed

@@ -1,70 +1,106 @@
 import json
 import gradio as gr
 import pandas as pd
-from datasets import load_dataset
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
-from src.distilabel_dataset_generator.utils import get_org_dropdown
-def get_iframe(hub_repo_id) -> str:
     if not hub_repo_id:
-        raise gr.Error("Hub repo id is required")
     url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
     iframe = f"""
     <iframe
-  src="{url}"
-  frameborder="0"
-  width="100%"
-  height="600px"
-></iframe>
-"""
     return iframe
-def get_valid_columns(df: pd.DataFrame):
-    valid_columns = []
-    for col in df.columns:
-        sample_val = df[col].iloc[0]
         if isinstance(sample_val, str) or (
-            isinstance(sample_val, list)
-            and all(isinstance(item, dict) for item in sample_val)
         ):
-            valid_columns.append(col)
-    return valid_columns
-def load_dataset_from_hub(hub_repo_id: str, n_rows: int = 10):
-    gr.Info(message="Loading dataset ...")
-    if not hub_repo_id:
         raise gr.Error("Hub repo id is required")
-    ds_dict = load_dataset(hub_repo_id)
-    splits = list(ds_dict.keys())
     ds = ds_dict[splits[0]]
-    if n_rows:
-        ds = ds.select(range(n_rows))
-    df = ds.to_pandas()
-    # Get columns that contain either strings or lists of dictionaries
-    valid_columns = get_valid_columns(df)
     return (
-        df,
-        gr.Dropdown(choices=valid_columns, label="Instruction Column"),
-        gr.Dropdown(choices=valid_columns, label="Instruction Column"),
-        gr.Dropdown(choices=valid_columns, label="Response Column"),
     )
 def define_evaluation_aspects(task_type: str):
-    if task_type == "instruction":
-        return gr.Dropdown(
-            value=["overall-rating"],
-            choices=["complexity", "quality"],
-            label="Evaluation Aspects",
-            multiselect=True,
-            interactive=True,
-        )
-    elif task_type == "instruction-response":
         return gr.Dropdown(
             value=["overall-rating"],
             choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
@@ -76,226 +112,635 @@ def define_evaluation_aspects(task_type: str):
         return gr.Dropdown(interactive=False, visible=False)
-def evaluate_instruction(df: pd.DataFrame, aspects: list[str], instruction_column: str):
-    pass
 def evaluate_instruction_response(
-    df: pd.DataFrame, aspects: list[str], instruction_column: str, response_column: str
 ):
-    pass
 def evaluate_custom(
-    df: pd.DataFrame, aspects: list[str], prompt_template: str, structured_output: dict
 ):
-    pass
-def _apply_to_dataset(
-    df: pd.DataFrame,
     eval_type: str,
-    aspects_instruction: list[str],
-    instruction_column: str,
     aspects_instruction_response: list[str],
-    instruction_column_response: str,
-    response_column_response: str,
-    aspects_custom: list[str],
     prompt_template: str,
     structured_output: dict,
 ):
-    if eval_type == "instruction":
-        df = evaluate_instruction(df, aspects_instruction, instruction_column)
-    elif eval_type == "instruction-response":
-        df = evaluate_instruction_response(
-            df,
-            aspects_instruction_response,
-            instruction_column_response,
-            response_column_response,
         )
-    elif eval_type == "custom":
-        df = evaluate_custom(df, aspects_custom, prompt_template, structured_output)
-    return df
-def apply_to_sample_dataset(
     repo_id: str,
     eval_type: str,
-    aspects_instruction: list[str],
     aspects_instruction_response: list[str],
-    aspects_custom: list[str],
-    instruction_instruction: str,
     instruction_instruction_response: str,
     response_instruction_response: str,
     prompt_template: str,
     structured_output: dict,
 ):
-    df, _, _, _ = load_dataset_from_hub(repo_id, n_rows=10)
-    df = _apply_to_dataset(
-        df,
-        eval_type,
-        aspects_instruction,
-        instruction_instruction,
-        aspects_instruction_response,
-        instruction_instruction_response,
-        response_instruction_response,
-        aspects_custom,
-        prompt_template,
-        structured_output,
     )
-    return df
-def push_to_hub(
     org_name: str,
     repo_name: str,
     private: bool,
-    n_rows: int,
     original_repo_id: str,
     eval_type: str,
-    aspects_instruction: list[str],
     aspects_instruction_response: list[str],
-    aspects_custom: list[str],
-    instruction_instruction: str,
     instruction_instruction_response: str,
     response_instruction_response: str,
     prompt_template: str,
     structured_output: dict,
-):
-    df, _, _, _ = load_dataset_from_hub(original_repo_id, n_rows=n_rows)
-    df = _apply_to_dataset(
-        df,
-        eval_type,
-        aspects_instruction,
-        instruction_instruction,
-        aspects_instruction_response,
-        instruction_instruction_response,
-        response_instruction_response,
-        aspects_custom,
-        prompt_template,
-        structured_output,
     )
-    new_repo_id = f"{org_name}/{repo_name}"
-with gr.Blocks() as app:
-    gr.Markdown("## 1. Select your input dataset")
-    with gr.Row():
-        with gr.Column(scale=1):
-            search_in = HuggingfaceHubSearch(
-                label="Search",
-                placeholder="Search for a Dataset",
-                search_type="dataset",
-                sumbit_on_select=True,
             )
-            load_btn = gr.Button("Load dataset", variant="primary")
-        with gr.Column(scale=3):
-            search_out = gr.HTML(label="Dataset Preview")
-    gr.HTML("<hr>")
-    gr.Markdown("## 2. Configure your task")
-    with gr.Row():
-        with gr.Column(scale=1):
-            eval_type = gr.Dropdown(
-                label="Evaluation Type",
-                choices=["instruction", "instruction-response", "custom-template"],
-                visible=False,
             )
-            with gr.Tab("instruction") as tab_instruction:
-                aspects_instruction = define_evaluation_aspects("instruction")
-                instruction_instruction = gr.Dropdown(
-                    label="Instruction Column", interactive=True
                 )
-                tab_instruction.select(
-                    lambda: "instruction",
-                    inputs=[],
-                    outputs=[eval_type],
                 )
-            with gr.Tab("instruction-response") as tab_instruction_response:
-                aspects_instruction_response = define_evaluation_aspects(
-                    "instruction-response"
                 )
-                instruction_instruction_response = gr.Dropdown(
-                    label="Instruction Column", interactive=True
                 )
-                response_instruction_response = gr.Dropdown(
-                    label="Response Column", interactive=True
                 )
-                tab_instruction_response.select(
-                    lambda: "instruction-response",
-                    inputs=[],
-                    outputs=[eval_type],
                 )
-            with gr.Tab("custom") as tab_custom:
-                aspects_custom = define_evaluation_aspects("custom")
-                prompt_template = gr.Code(
-                    label="Prompt Template",
-                    value="{{column_1}} based on {{column_2}}",
-                    language="markdown",
                     interactive=True,
                 )
-                structured_output = gr.Code(
-                    label="Structured Output",
-                    value=json.dumps({"eval_aspect": "str"}),
-                    language="json",
                     interactive=True,
                 )
-                tab_custom.select(
-                    lambda: "custom-template",
-                    inputs=[],
-                    outputs=[eval_type],
                 )
-            btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
-        with gr.Column(scale=3):
-            dataframe = gr.Dataframe()
-    gr.HTML("<hr>")
-    gr.Markdown("## 3. Generate your dataset")
-    with gr.Row():
-        with gr.Column(scale=1):
-            org_name = get_org_dropdown()
-            repo_name = gr.Textbox(
-                label="Repo name",
-                placeholder="dataset_name",
-                value="my-distiset",
-                interactive=True,
-            )
-            n_rows = gr.Number(
-                label="Number of rows",
-                value=10,
-                interactive=True,
-                scale=1,
-            )
-            private = gr.Checkbox(
-                label="Private dataset",
-                value=False,
-                interactive=True,
-                scale=1,
-            )
-            btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
-        with gr.Column(scale=3):
-            success_message = gr.Markdown(visible=False)
-    search_in.submit(get_iframe, inputs=search_in, outputs=search_out)
     load_btn.click(
-        load_dataset_from_hub,
         inputs=[search_in],
         outputs=[
             dataframe,
-            instruction_instruction,
             instruction_instruction_response,
             response_instruction_response,
         ],
     )
     btn_apply_to_sample_dataset.click(
-        apply_to_sample_dataset,
         inputs=[
             search_in,
             eval_type,
-            aspects_instruction,
             aspects_instruction_response,
-            aspects_custom,
-            instruction_instruction,
             instruction_instruction_response,
             response_instruction_response,
             prompt_template,
@@ -303,24 +748,64 @@ with gr.Blocks() as app:
         ],
         outputs=dataframe,
     )
     btn_push_to_hub.click(
-        push_to_hub,
         inputs=[
             org_name,
             repo_name,
             private,
-            n_rows,
             search_in,
             eval_type,
-            aspects_instruction,
             aspects_instruction_response,
-            aspects_custom,
-            instruction_instruction,
             instruction_instruction_response,
             response_instruction_response,
             prompt_template,
             structured_output,
         ],
-        outputs=success_message,
     )
     app.load(fn=get_org_dropdown, outputs=[org_name])

 import json
+import uuid
+from typing import Union
+import argilla as rg
 import gradio as gr
+import numpy as np
 import pandas as pd
+from datasets import (
+    Dataset,
+    get_dataset_config_names,
+    get_dataset_split_names,
+    load_dataset,
+)
+from distilabel.distiset import Distiset
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from huggingface_hub import HfApi
+from src.distilabel_dataset_generator.apps.base import (
+    hide_success_message,
+    show_success_message,
+    validate_argilla_user_workspace_dataset,
+    validate_push_to_hub,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
+    DEFAULT_BATCH_SIZE,
+)
+from src.distilabel_dataset_generator.pipelines.embeddings import (
+    get_embeddings,
+    get_sentence_embedding_dimensions,
+)
+from src.distilabel_dataset_generator.pipelines.eval import (
+    generate_pipeline_code,
+    get_custom_evaluator,
+    get_ultrafeedback_evaluator,
+)
+from src.distilabel_dataset_generator.utils import (
+    column_to_list,
+    extract_column_names,
+    get_argilla_client,
+    get_org_dropdown,
+    process_columns,
+    swap_visibility,
+    pad_or_truncate_list,
+)
+def get_iframe(hub_repo_id: str) -> str:
     if not hub_repo_id:
+        raise gr.Error("Hub repository ID is required.")
     url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
     iframe = f"""
     <iframe
+        src="{url}"
+        frameborder="0"
+        width="100%"
+        height="600px"
+    ></iframe>
+    """
     return iframe
def get_valid_columns(dataframe: pd.DataFrame) -> tuple[list[str], list[str]]:
    """Pick the columns usable as instruction and as response inputs.

    The decision is made from the first row only:

    * a string, or a list/array whose items are all dicts containing a
      ``"role"`` key, qualifies for both instruction and response;
    * a list/array whose items are all strings qualifies for response only.

    Args:
        dataframe: Frame whose columns are inspected.

    Returns:
        ``(instruction_valid_columns, response_valid_columns)``; both are empty
        when the frame has no rows.
    """
    instruction_valid_columns: list[str] = []
    response_valid_columns: list[str] = []
    # Without rows there is no sample value to inspect (``iloc[0]`` would raise).
    if dataframe.empty:
        return instruction_valid_columns, response_valid_columns

    for col in dataframe.columns:
        sample_val = dataframe[col].iloc[0]
        is_sequence = isinstance(sample_val, (list, np.ndarray))
        if isinstance(sample_val, str) or (
            is_sequence
            and all(isinstance(item, dict) and "role" in item for item in sample_val)
        ):
            instruction_valid_columns.append(col)
            response_valid_columns.append(col)
        # ``elif`` matters for an empty sequence: ``all()`` is vacuously true for
        # both checks, which previously listed the column twice as a response.
        elif is_sequence and all(isinstance(item, str) for item in sample_val):
            response_valid_columns.append(col)
    return instruction_valid_columns, response_valid_columns
def load_dataset_from_hub(repo_id: str, num_rows: int = 10):
    """Load the first split of the first subset of a Hub dataset as a DataFrame.

    Args:
        repo_id: Dataset repository ID on the Hub.
        num_rows: Maximum number of leading rows to keep; a falsy value keeps
            the whole split.

    Returns:
        A tuple ``(dataframe, instruction_dropdown, response_dropdown)`` where the
        dropdowns list the columns accepted by ``get_valid_columns``.

    Raises:
        gr.Error: If ``repo_id`` is empty.
    """
    if not repo_id:
        raise gr.Error("Hub repo id is required")
    subsets = get_dataset_config_names(repo_id)
    ds_dict = load_dataset(repo_id, subsets[0])
    splits = get_dataset_split_names(repo_id, subsets[0])
    ds = ds_dict[splits[0]]
    if num_rows:
        # ``select`` rejects out-of-range indices, so never request more rows than
        # the split holds. ``int()`` guards against a float row count, which
        # ``range`` would not accept.
        ds = ds.select(range(min(int(num_rows), len(ds))))
    dataframe = ds.to_pandas()
    instruction_valid_columns, response_valid_columns = get_valid_columns(dataframe)
    return (
        dataframe,
        gr.Dropdown(choices=instruction_valid_columns, label="Instruction column"),
        gr.Dropdown(choices=response_valid_columns, label="Response column"),
    )
 def define_evaluation_aspects(task_type: str):
+    if task_type == "ultrafeedback":
         return gr.Dropdown(
             value=["overall-rating"],
             choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
         return gr.Dropdown(interactive=False, visible=False)
 def evaluate_instruction_response(
+    dataframe: pd.DataFrame,
+    aspects: list[str],
+    instruction_column: str,
+    response_columns: str,
+    num_rows: int = 10,
+    is_sample: bool = False,
+    progress=gr.Progress(),
 ):
+    progress(0.0, desc="Evaluating instructions and responses")
+    data = process_columns(dataframe, instruction_column, response_columns)
+    num_generations = len(data[0]["generations"])
+    evaluated_results = []
+    for entry in data:
+        result_row = {
+            "instruction": entry["instruction"],
+            "generations": entry["generations"],
+        }
+        for aspect in aspects:
+            result_row[f"ratings_{aspect}"] = None
+            result_row[f"rationale_for_ratings_{aspect}"] = None
+            if aspect in ["truthfulness", "helpfulness"]:
+                result_row[f"type_{aspect}"] = None
+                result_row[f"rationale_for_type_{aspect}"] = None
+        result_row["model_name"] = None
+        evaluated_results.append(result_row)
+    batch_size = DEFAULT_BATCH_SIZE
+    total_steps: int = len(aspects) * num_rows
+    # evaluate instructions and responses
+    for aspect in aspects:
+        ultrafeedback_evaluator = get_ultrafeedback_evaluator(aspect, is_sample)
+        n_processed = 0
+        while n_processed < num_rows:
+            progress(
+                (len(aspects) * n_processed) / total_steps,
+                total=total_steps,
+                desc=f"Evaluating aspect: {aspect}",
+            )
+            remaining_rows = num_rows - n_processed
+            batch_size = min(batch_size, remaining_rows)
+            inputs = data[n_processed : n_processed + batch_size]
+            batch_results = list(ultrafeedback_evaluator.process(inputs=inputs))
+            for j, result in enumerate(batch_results[0]):
+                idx = n_processed + j
+                evaluated_results[idx][f"ratings_{aspect}"] = pad_or_truncate_list(
+                    result.get("ratings"), num_generations
+                )
+                evaluated_results[idx]["model_name"] = result.get("model_name")
+                if aspect in ["truthfulness", "helpfulness"]:
+                    evaluated_results[idx][f"type_{aspect}"] = pad_or_truncate_list(
+                        result.get("types"), num_generations
+                    )
+                    evaluated_results[idx][f"rationale_for_type_{aspect}"] = (
+                        pad_or_truncate_list(result.get("rationales"), num_generations)
+                    )
+                    evaluated_results[idx][f"rationale_for_ratings_{aspect}"] = (
+                        pad_or_truncate_list(
+                            result.get("rationales-for-ratings"), num_generations
+                        )
+                    )
+                else:
+                    evaluated_results[idx][f"rationale_for_ratings_{aspect}"] = (
+                        pad_or_truncate_list(result.get("rationales"), num_generations)
+                    )
+            n_processed += batch_size
+    # create final dataset
+    dataframe = pd.DataFrame(evaluated_results)
+    progress(1.0, desc="Dataset evaluation completed")
+    return dataframe
 def evaluate_custom(
+    dataframe: pd.DataFrame,
+    prompt_template: str,
+    structured_output: dict,
+    num_rows: int = 10,
+    is_sample: bool = False,
+    progress=gr.Progress(),
 ):
+    progress(0.0, desc="Evaluating dataset")
+    columns = extract_column_names(prompt_template)
+    input_columns = {column: column_to_list(dataframe, column) for column in columns}
+    custom_evaluator = get_custom_evaluator(
+        prompt_template, structured_output, columns, is_sample
+    )
+    batch_size = DEFAULT_BATCH_SIZE
+    # evaluate the data
+    n_processed = 0
+    evaluation_results = []
+    while n_processed < num_rows:
+        progress(
+            n_processed / num_rows,
+            desc="Evaluating dataset",
+        )
+        remaining_rows = num_rows - n_processed
+        batch_size = min(batch_size, remaining_rows)
+        inputs = []
+        for idx in range(n_processed, n_processed + batch_size):
+            input = {column: input_columns[column][idx] for column in input_columns}
+            inputs.append(input)
+        batch = list(custom_evaluator.process(inputs=inputs))
+        evaluation_results.extend(batch[0])
+        n_processed += batch_size
+    # create final dataset
+    distiset_results = []
+    for result in evaluation_results:
+        record = {key: result[key] for key in result if key != "distilabel_metadata"}
+        distiset_results.append(record)
+    dataframe = pd.DataFrame(distiset_results)
+    progress(1.0, desc="Dataset evaluation completed")
+    return dataframe
def _evaluate_dataset(
    dataframe: pd.DataFrame,
    eval_type: str,
    aspects_instruction_response: list[str],
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
    num_rows: int = 10,
    is_sample: bool = False,
):
    """Route the frame to the evaluator that matches ``eval_type``.

    ``"ultrafeedback"`` selects the instruction/response evaluation; every other
    value selects the custom prompt-template evaluation. The evaluated DataFrame
    is returned.
    """
    # Anything that is not the UltraFeedback flow is treated as a custom evaluation.
    if eval_type != "ultrafeedback":
        return evaluate_custom(
            dataframe=dataframe,
            prompt_template=prompt_template,
            structured_output=structured_output,
            num_rows=num_rows,
            is_sample=is_sample,
        )
    return evaluate_instruction_response(
        dataframe=dataframe,
        aspects=aspects_instruction_response,
        instruction_column=instruction_instruction_response,
        response_columns=response_instruction_response,
        num_rows=num_rows,
        is_sample=is_sample,
    )
def evaluate_sample_dataset(
    repo_id: str,
    eval_type: str,
    aspects_instruction_response: list[str],
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
):
    """Evaluate a ten-row preview of a Hub dataset and return it as a DataFrame.

    The dataset is loaded with ``load_dataset_from_hub`` and evaluated through
    ``_evaluate_dataset`` with ``is_sample=True``.
    """
    sample_size = 10
    # Only the frame is needed here; the two dropdowns are discarded.
    sample_frame, _instruction_choices, _response_choices = load_dataset_from_hub(
        repo_id, num_rows=sample_size
    )
    return _evaluate_dataset(
        dataframe=sample_frame,
        eval_type=eval_type,
        aspects_instruction_response=aspects_instruction_response,
        instruction_instruction_response=instruction_instruction_response,
        response_instruction_response=response_instruction_response,
        prompt_template=prompt_template,
        structured_output=structured_output,
        num_rows=sample_size,
        is_sample=True,
    )
def push_dataset_to_hub(
    dataframe: pd.DataFrame, org_name: str, repo_name: str, oauth_token, private
):
    """Publish the evaluated frame to the Hub as the ``default`` subset of a Distiset.

    Args:
        dataframe: Evaluated rows to upload.
        org_name: Organisation or user name that owns the target repository.
        repo_name: Name of the target repository.
        oauth_token: Object exposing the Hub access token as ``.token``; ``None``
            is rejected with a user-facing error.
        private: Whether the repository is created as private.

    Raises:
        gr.Error: If no OAuth token is available.
    """
    repo_id = validate_push_to_hub(org_name, repo_name)
    # The caller defaults the token to ``None``; fail with a readable message
    # instead of an AttributeError on ``.token``.
    if oauth_token is None:
        raise gr.Error("Please sign in with Hugging Face to push the dataset to the Hub.")
    distiset = Distiset({"default": Dataset.from_pandas(dataframe)})
    distiset.push_to_hub(
        repo_id=repo_id,
        private=private,
        include_script=False,
        token=oauth_token.token,
        create_pr=False,
    )
+def push_dataset(
     org_name: str,
     repo_name: str,
     private: bool,
+    num_rows: int,
     original_repo_id: str,
     eval_type: str,
     aspects_instruction_response: list[str],
     instruction_instruction_response: str,
     response_instruction_response: str,
     prompt_template: str,
     structured_output: dict,
+    oauth_token: Union[gr.OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    dataframe, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
+    dataframe = _evaluate_dataset(
+        dataframe=dataframe,
+        eval_type=eval_type,
+        aspects_instruction_response=aspects_instruction_response,
+        instruction_instruction_response=instruction_instruction_response,
+        response_instruction_response=response_instruction_response,
+        prompt_template=prompt_template,
+        structured_output=structured_output,
+        num_rows=num_rows,
     )
+    push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
+    try:
+        progress(0.1, desc="Setting up user and workspace")
+        client = get_argilla_client()
+        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        if eval_type == "ultrafeedback":
+            num_generations = len((dataframe["generations"][0]))
+            fields = [
+                rg.ChatField(
+                    name=f"chat_{i}",
+                    title=f"Chat {i+1}",
+                    description=f"User and assistant conversation for generation {i+1}",
+                )
+                for i in range(num_generations)
+            ]
+            questions = []
+            for i in range(num_generations):
+                for aspect in aspects_instruction_response:
+                    questions.append(
+                        rg.RatingQuestion(
+                            name=f"ratings_{aspect}_{i}",
+                            values=list(range(11)),
+                            title=f"Ratings for {aspect} for response {i+1}",
+                            required=True,
+                        )
+                    )
+                    questions.append(
+                        rg.TextQuestion(
+                            name=f"rationale_for_ratings_{aspect}_{i}",
+                            title=f"Rationale for ratings for {aspect} for response {i+1}",
+                            required=False,
+                            use_markdown=True,
+                        )
+                    )
+                    if aspect in ["truthfulness", "helpfulness"]:
+                        questions.append(
+                            rg.RatingQuestion(
+                                name=f"type_{aspect}_{i}",
+                                values=list(range(1, 6)),
+                                title=f"The type of the response {i+1} for {aspect}",
+                                required=True,
+                            )
+                        )
+                        questions.append(
+                            rg.TextQuestion(
+                                name=f"rationale_for_type_{aspect}_{i}",
+                                title=f"Rationale for type of the response {i+1} for {aspect}",
+                                required=False,
+                                use_markdown=True,
+                            )
+                        )
+            metadata = [
+                rg.IntegerMetadataProperty(
+                    name="instruction_length", title="Instruction length"
+                ),
+            ]
+            for i in range(num_generations):
+                metadata.append(
+                    rg.IntegerMetadataProperty(
+                        name=f"response_{i}_length", title=f"Response {i+1} length"
+                    )
+                )
+            vectors = [
+                rg.VectorField(
+                    name="instruction_embeddings",
+                    dimensions=get_sentence_embedding_dimensions(),
+                )
+            ]
+            settings = rg.Settings(
+                fields=fields,
+                questions=questions,
+                metadata=metadata,
+                vectors=vectors,
+                guidelines="Please review the conversation and provide an evaluation.",
             )
+            dataframe["instruction_length"] = dataframe["instruction"].apply(len)
+            for i in range(num_generations):
+                dataframe[f"response_{i}_length"] = dataframe["generations"].apply(
+                    lambda gens: len(gens[i]) if i < len(gens) else 0
+                )
+            dataframe["instruction_embeddings"] = get_embeddings(
+                dataframe["instruction"].to_list()
             )
+            progress(0.5, desc="Creating dataset")
+            rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
+            if rg_dataset is None:
+                rg_dataset = rg.Dataset(
+                    name=repo_name,
+                    workspace=hf_user,
+                    settings=settings,
+                    client=client,
+                )
+                rg_dataset = rg_dataset.create()
+            progress(0.7, desc="Pushing dataset to Argilla")
+            hf_dataset = Dataset.from_pandas(dataframe)
+            records = []
+            for sample in hf_dataset:
+                fields = {}
+                metadata = {"instruction_length": sample.get("instruction_length", 0)}
+                vectors = {
+                    "instruction_embeddings": sample.get("instruction_embeddings", [])
+                }
+                suggestions = []
+                generations = sample.get("generations", [])
+                for i in range(num_generations):
+                    fields[f"chat_{i}"] = [
+                        {"role": "user", "content": sample.get("instruction", "")},
+                        {"role": "assistant", "content": generations[i]},
+                    ]
+                    metadata[f"response_{i}_length"] = sample.get(
+                        f"response_{i}_length", 0
+                    )
+                    for aspect in aspects_instruction_response:
+                        ratings = sample.get(f"ratings_{aspect}", [])
+                        rationales = sample.get(f"rationale_for_ratings__{aspect}", [])
+                        rating_value = (
+                            ratings[i]
+                            if ratings and isinstance(ratings[i], int)
+                            else None
+                        )
+                        rationale_value = (
+                            rationales[i]
+                            if rationales and isinstance(rationales[i], str)
+                            else None
+                        )
+                        if rating_value is not None:
+                            suggestions.append(
+                                rg.Suggestion(
+                                    question_name=f"ratings_{aspect}_{i}",
+                                    value=rating_value,
+                                )
+                            )
+                        if rationale_value is not None:
+                            suggestions.append(
+                                rg.Suggestion(
+                                    question_name=f"rationale_for_ratings_{aspect}_{i}",
+                                    value=rationale_value,
+                                )
+                            )
+                        if aspect in ["truthfulness", "helpfulness"]:
+                            types = sample.get(f"type_{aspect}", [])
+                            rationale_types = sample.get(
+                                f"rationale_for_type_{aspect}", []
+                            )
+                            type_value = (
+                                types[i]
+                                if types and isinstance(types[i], int)
+                                else None
+                            )
+                            rationale_type_value = (
+                                rationale_types[i]
+                                if rationale_types
+                                and isinstance(rationale_types[i], str)
+                                else None
+                            )
+                            if type_value is not None:
+                                suggestions.append(
+                                    rg.Suggestion(
+                                        question_name=f"type_{aspect}_{i}",
+                                        value=type_value,
+                                    )
+                                )
+                            if rationale_type_value is not None:
+                                suggestions.append(
+                                    rg.Suggestion(
+                                        question_name=f"rationale_for_type_{aspect}_{i}",
+                                        value=rationale_type_value,
+                                    )
+                                )
+                records.append(
+                    rg.Record(
+                        fields=fields,
+                        metadata=metadata,
+                        vectors=vectors,
+                        suggestions=suggestions,
+                    )
                 )
+            rg_dataset.records.log(records=records)
+            progress(1.0, desc="Dataset pushed to Argilla")
+        else:
+            columns = extract_column_names(prompt_template)
+            settings = rg.Settings(
+                fields=[
+                    rg.TextField(
+                        name=column,
+                        title=column.capitalize(),
+                        description="The column content",
+                    )
+                    for column in columns
+                ],
+                questions=[
+                    rg.TextQuestion(
+                        name="evaluation",
+                        title="Evaluation",
+                        description="The generated evaluation",
+                        use_markdown=True,
+                    ),
+                ],
+                metadata=[
+                    rg.IntegerMetadataProperty(
+                        name=f"{column}_length", title=f"{column.capitalize()} length"
+                    )
+                    for column in columns
+                ],
+                vectors=[
+                    rg.VectorField(
+                        name=f"{column}_embeddings",
+                        dimensions=get_sentence_embedding_dimensions(),
+                    )
+                    for column in columns
+                ],
+                guidelines="Please review, correct and provide an accurate evaluation.",
+            )
+            for column in columns:
+                dataframe[f"{column}_length"] = dataframe[column].apply(len)
+                dataframe[f"{column}_embeddings"] = get_embeddings(dataframe[column])
+            progress(0.5, desc="Creating dataset")
+            rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
+            if rg_dataset is None:
+                rg_dataset = rg.Dataset(
+                    name=repo_name,
+                    workspace=hf_user,
+                    settings=settings,
+                    client=client,
                 )
+                rg_dataset = rg_dataset.create()
+            progress(0.7, desc="Pushing dataset to Argilla")
+            hf_dataset = Dataset.from_pandas(dataframe)
+            rg_dataset.records.log(
+                records=hf_dataset, mapping={"generation": "evaluation"}
+            )
+            progress(1.0, desc="Dataset pushed to Argilla")
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to Argilla: {e}")
+    return ""
+def show_pipeline_code_visibility():
+    return {pipeline_code_ui: gr.Accordion(visible=True)}
+def hide_pipeline_code_visibility():
+    return {pipeline_code_ui: gr.Accordion(visible=False)}
+######################
+# Gradio UI
+######################
+with gr.Blocks() as app:
+    with gr.Column() as main_ui:
+        gr.Markdown("## 1. Select your input dataset")
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=1):
+                search_in = HuggingfaceHubSearch(
+                    label="Search",
+                    placeholder="Search for a dataset",
+                    search_type="dataset",
+                    sumbit_on_select=True,
                 )
+                load_btn = gr.Button("Load dataset", variant="primary")
+            with gr.Column(scale=3):
+                search_out = gr.HTML(label="Dataset preview")
+        gr.HTML(value="<hr>")
+        gr.Markdown(value="## 2. Configure your task")
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=1):
+                eval_type = gr.Dropdown(
+                    label="Evaluation type",
+                    choices=["ultrafeedback", "custom"],
+                    value="ultrafeedback",
+                    multiselect=False,
+                    visible=False,
                 )
+                with gr.Tab("ultrafeedback") as tab_instruction_response:
+                    aspects_instruction_response = define_evaluation_aspects(
+                        "ultrafeedback"
+                    )
+                    instruction_instruction_response = gr.Dropdown(
+                        label="Instruction Column",
+                        interactive=True,
+                        multiselect=False,
+                        allow_custom_value=False,
+                    )
+                    response_instruction_response = gr.Dropdown(
+                        label="Response Column",
+                        interactive=True,
+                        multiselect=True,
+                        allow_custom_value=False,
+                    )
+                    tab_instruction_response.select(
+                        fn=lambda: "ultrafeedback",
+                        inputs=[],
+                        outputs=[eval_type],
+                    )
+                with gr.Tab("custom") as tab_custom:
+                    aspects_custom = define_evaluation_aspects("custom")
+                    prompt_template = gr.Code(
+                        label="Prompt template",
+                        value="Evaluate {{column_1}} based on {{column_2}}.",
+                        language="markdown",
+                        interactive=True,
+                    )
+                    structured_output = gr.Code(
+                        label="Structured output",
+                        value=json.dumps(
+                            {
+                                "type": "object",
+                                "properties": {
+                                    "quality": {"type": "integer"},
+                                    "clarity": {"type": "integer"},
+                                    "relevance": {"type": "integer"},
+                                },
+                            },
+                            indent=4,
+                        ),
+                        language="json",
+                        interactive=True,
+                    )
+                    tab_custom.select(
+                        fn=lambda: "custom",
+                        inputs=[],
+                        outputs=[eval_type],
+                    )
+                btn_apply_to_sample_dataset = gr.Button(
+                    "Refresh dataset", variant="secondary", size="sm"
                 )
+            with gr.Column(scale=3):
+                dataframe = gr.Dataframe(
+                    headers=["prompt", "completion", "evaluation"],
+                    wrap=False,
+                    height=500,
+                    interactive=False,
                 )
+        gr.HTML(value="<hr>")
+        gr.Markdown(value="## 3. Evaluate your dataset")
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2):
+                org_name = get_org_dropdown()
+                repo_name = gr.Textbox(
+                    label="Repo name",
+                    placeholder="dataset_name",
+                    value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
+                num_rows = gr.Number(
+                    label="Number of rows",
+                    value=10,
                     interactive=True,
+                    scale=1,
                 )
+                private = gr.Checkbox(
+                    label="Private dataset",
+                    value=False,
+                    interactive=True,
+                    scale=1,
                 )
+                btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
+            with gr.Column(scale=3):
+                success_message = gr.Markdown(visible=True)
+                with gr.Accordion(
+                    "Do you want to go further? Customize and run with Distilabel",
+                    open=False,
+                    visible=False,
+                ) as pipeline_code_ui:
+                    code = generate_pipeline_code(
+                            repo_id=search_in.value,
+                            aspects=aspects_instruction_response.value,
+                            instruction_column=instruction_instruction_response,
+                            response_columns=response_instruction_response,
+                            prompt_template=prompt_template.value,
+                            structured_output=structured_output.value,
+                            num_rows=num_rows.value,
+                            eval_type=eval_type.value,
+                        )
+                    pipeline_code = gr.Code(
+                        value=code,
+                        language="python",
+                        label="Distilabel Pipeline Code",
+                    )
+    search_in.submit(fn=get_iframe, inputs=search_in, outputs=search_out)
     load_btn.click(
+        fn=load_dataset_from_hub,
         inputs=[search_in],
         outputs=[
             dataframe,
             instruction_instruction_response,
             response_instruction_response,
         ],
     )
     btn_apply_to_sample_dataset.click(
+        fn=evaluate_sample_dataset,
         inputs=[
             search_in,
             eval_type,
             aspects_instruction_response,
             instruction_instruction_response,
             response_instruction_response,
             prompt_template,
         ],
         outputs=dataframe,
     )
     btn_push_to_hub.click(
+        fn=validate_argilla_user_workspace_dataset,
+        inputs=[repo_name],
+        outputs=[success_message],
+        show_progress=True,
+    ).then(
+        fn=validate_push_to_hub,
+        inputs=[org_name, repo_name],
+        outputs=[success_message],
+        show_progress=True,
+    ).success(
+        fn=hide_success_message,
+        outputs=[success_message],
+        show_progress=True,
+    ).success(
+        fn=hide_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    ).success(
+        fn=push_dataset,
         inputs=[
             org_name,
             repo_name,
             private,
+            num_rows,
             search_in,
             eval_type,
             aspects_instruction_response,
             instruction_instruction_response,
             response_instruction_response,
             prompt_template,
             structured_output,
         ],
+        outputs=[success_message],
+        show_progress=True,
+    ).success(
+        fn=show_success_message,
+        inputs=[org_name, repo_name],
+        outputs=[success_message],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[
+            search_in,
+            aspects_instruction_response,
+            instruction_instruction_response,
+            response_instruction_response,
+            prompt_template,
+            structured_output,
+            num_rows,
+            eval_type,
+        ],
+        outputs=[pipeline_code],
+    ).success(
+        fn=show_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
     )
+    app.load(fn=swap_visibility, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -10,10 +10,8 @@ from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
-    get_argilla_client,
-    get_pipeline_code_ui,
     hide_success_message,
-    show_success_message_hub,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
@@ -26,7 +24,6 @@ from src.distilabel_dataset_generator.pipelines.embeddings import (
 )
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_DATASET_DESCRIPTIONS,
-    PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
     get_magpie_generator,
     get_prompt_generator,
@@ -36,7 +33,7 @@ from src.distilabel_dataset_generator.utils import (
     _LOGGED_OUT_CSS,
     get_argilla_client,
     get_org_dropdown,
-    swap_visibilty,
 )
@@ -55,35 +52,33 @@ def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
     return dataframe
-def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating system prompt")
     progress(0.3, desc="Initializing text generation")
-    generate_description = get_prompt_generator()
     progress(0.7, desc="Generating system prompt")
     result = next(
         generate_description.process(
             [
                 {
-                    "system_prompt": PROMPT_CREATION_PROMPT,
                     "instruction": dataset_description,
                 }
             ]
         )
     )[0]["generation"]
     progress(1.0, desc="System prompt generated")
-    return result, pd.DataFrame()
-def generate_sample_dataset(system_prompt, progress=gr.Progress()):
-    df = generate_dataset(
         system_prompt=system_prompt,
-        num_turns=1,
         num_rows=10,
         progress=progress,
         is_sample=True,
     )
-    return df
 def generate_dataset(
@@ -94,10 +89,8 @@ def generate_dataset(
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating instructions")
-    magpie_generator = get_magpie_generator(
-        num_turns, num_rows, system_prompt, is_sample
-    )
-    response_generator = get_response_generator(num_turns, system_prompt, is_sample)
     total_steps: int = num_rows * 2
     batch_size = DEFAULT_BATCH_SIZE
@@ -209,12 +202,12 @@ def push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private):
     return original_dataframe
-def push_dataset_to_argilla(
     org_name: str,
     repo_name: str,
     system_prompt: str,
     num_turns: int = 1,
-    n_rows: int = 10,
     private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
@@ -222,7 +215,7 @@ def push_dataset_to_argilla(
     dataframe = generate_dataset(
         system_prompt=system_prompt,
         num_turns=num_turns,
-        num_rows=n_rows,
     )
     push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
     try:
@@ -344,29 +337,54 @@ def push_dataset_to_argilla(
     return ""
 with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
     with gr.Column() as main_ui:
         gr.Markdown(value="## 1. Describe the dataset you want")
         with gr.Row():
-            with gr.Column(scale=1):
                 dataset_description = gr.Textbox(
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
-                    label="Example descriptions",
                 )
-                load_btn = gr.Button("Load dataset", variant="primary")
-            with gr.Column(scale=3):
                 pass
         gr.HTML(value="<hr>")
-        gr.Markdown(value="## 2. Configure your task")
-        with gr.Row():
             with gr.Column(scale=1):
                 system_prompt = gr.Textbox(
                     label="System prompt",
@@ -381,14 +399,21 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                     interactive=True,
                     info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
                 )
-                btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
             with gr.Column(scale=3):
-                dataframe = gr.Dataframe()
         gr.HTML(value="<hr>")
         gr.Markdown(value="## 3. Generate your dataset")
-        with gr.Row():
-            with gr.Column(scale=1):
                 org_name = get_org_dropdown()
                 repo_name = gr.Textbox(
                     label="Repo name",
@@ -396,7 +421,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                     value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
-                n_rows = gr.Number(
                     label="Number of rows",
                     value=10,
                     interactive=True,
@@ -410,21 +435,38 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                 )
                 btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
             with gr.Column(scale=3):
-                success_message = gr.Markdown()
-        pipeline_code = get_pipeline_code_ui(
-            generate_pipeline_code(system_prompt.value, num_turns.value, n_rows.value)
-        )
-    gr.on(
-        triggers=[load_btn.click, btn_apply_to_sample_dataset.click],
         fn=generate_system_prompt,
-        inputs=[dataset_description],
-        outputs=[system_prompt, dataframe],
         show_progress=True,
     ).then(
         fn=generate_sample_dataset,
-        inputs=[system_prompt],
         outputs=[dataframe],
         show_progress=True,
     )
@@ -444,21 +486,34 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
         outputs=[success_message],
         show_progress=True,
     ).success(
-        fn=push_dataset_to_argilla,
         inputs=[
             org_name,
             repo_name,
             system_prompt,
             num_turns,
-            n_rows,
             private,
         ],
         outputs=[success_message],
         show_progress=True,
     ).success(
-        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
-    app.load(fn=swap_visibilty, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])

 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
     hide_success_message,
+    show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
 )
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_DATASET_DESCRIPTIONS,
     generate_pipeline_code,
     get_magpie_generator,
     get_prompt_generator,
     _LOGGED_OUT_CSS,
     get_argilla_client,
     get_org_dropdown,
+    swap_visibility,
 )
     return dataframe
+def generate_system_prompt(dataset_description, temperature, progress=gr.Progress()):
     progress(0.0, desc="Generating system prompt")
     progress(0.3, desc="Initializing text generation")
+    generate_description = get_prompt_generator(temperature)
     progress(0.7, desc="Generating system prompt")
     result = next(
         generate_description.process(
             [
                 {
                     "instruction": dataset_description,
                 }
             ]
         )
     )[0]["generation"]
     progress(1.0, desc="System prompt generated")
+    return result
+def generate_sample_dataset(system_prompt, num_turns, progress=gr.Progress()):
+    dataframe = generate_dataset(
         system_prompt=system_prompt,
+        num_turns=num_turns,
         num_rows=10,
         progress=progress,
         is_sample=True,
     )
+    return dataframe
 def generate_dataset(
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating instructions")
+    magpie_generator = get_magpie_generator(system_prompt, num_turns, is_sample)
+    response_generator = get_response_generator(system_prompt, num_turns, is_sample)
     total_steps: int = num_rows * 2
     batch_size = DEFAULT_BATCH_SIZE
     return original_dataframe
+def push_dataset(
     org_name: str,
     repo_name: str,
     system_prompt: str,
     num_turns: int = 1,
+    num_rows: int = 10,
     private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
     dataframe = generate_dataset(
         system_prompt=system_prompt,
         num_turns=num_turns,
+        num_rows=num_rows,
     )
     push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
     try:
     return ""
+def show_pipeline_code_visibility():
+    return {pipeline_code_ui: gr.Accordion(visible=True)}
+def hide_pipeline_code_visibility():
+    return {pipeline_code_ui: gr.Accordion(visible=False)}
+######################
+# Gradio UI
+######################
 with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
     with gr.Column() as main_ui:
         gr.Markdown(value="## 1. Describe the dataset you want")
         with gr.Row():
+            with gr.Column(scale=2):
                 dataset_description = gr.Textbox(
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
+                with gr.Accordion("Temperature", open=False):
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=1,
+                        value=0.8,
+                        step=0.1,
+                        interactive=True,
+                        show_label=False,
+                    )
+                load_btn = gr.Button(
+                    "Create dataset",
+                    variant="primary",
+                )
+            with gr.Column(scale=2):
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
+                    label="Examples",
                 )
+            with gr.Column(scale=1):
                 pass
         gr.HTML(value="<hr>")
+        gr.Markdown(value="## 2. Configure your dataset")
+        with gr.Row(equal_height=False):
             with gr.Column(scale=1):
                 system_prompt = gr.Textbox(
                     label="System prompt",
                     interactive=True,
                     info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
                 )
+                btn_apply_to_sample_dataset = gr.Button(
+                    "Refresh dataset", variant="secondary", size="sm"
+                )
             with gr.Column(scale=3):
+                dataframe = gr.Dataframe(
+                    headers=["prompt", "completion"],
+                    wrap=True,
+                    height=500,
+                    interactive=False,
+                )
         gr.HTML(value="<hr>")
         gr.Markdown(value="## 3. Generate your dataset")
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2):
                 org_name = get_org_dropdown()
                 repo_name = gr.Textbox(
                     label="Repo name",
                     value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
+                num_rows = gr.Number(
                     label="Number of rows",
                     value=10,
                     interactive=True,
                 )
                 btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
             with gr.Column(scale=3):
+                success_message = gr.Markdown(visible=True)
+                with gr.Accordion(
+                    "Do you want to go further? Customize and run with Distilabel",
+                    open=False,
+                    visible=False,
+                ) as pipeline_code_ui:
+                    code = generate_pipeline_code(
+                        system_prompt=system_prompt.value,
+                        num_turns=num_turns.value,
+                        num_rows=num_rows.value,
+                    )
+                    pipeline_code = gr.Code(
+                        value=code,
+                        language="python",
+                        label="Distilabel Pipeline Code",
+                    )
+    load_btn.click(
         fn=generate_system_prompt,
+        inputs=[dataset_description, temperature],
+        outputs=[system_prompt],
         show_progress=True,
     ).then(
         fn=generate_sample_dataset,
+        inputs=[system_prompt, num_turns],
+        outputs=[dataframe],
+        show_progress=True,
+    )
+    btn_apply_to_sample_dataset.click(
+        fn=generate_sample_dataset,
+        inputs=[system_prompt, num_turns],
         outputs=[dataframe],
         show_progress=True,
     )
         outputs=[success_message],
         show_progress=True,
     ).success(
+        fn=hide_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    ).success(
+        fn=push_dataset,
         inputs=[
             org_name,
             repo_name,
             system_prompt,
             num_turns,
+            num_rows,
             private,
         ],
         outputs=[success_message],
         show_progress=True,
     ).success(
+        fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[system_prompt, num_turns, num_rows],
+        outputs=[pipeline_code],
+    ).success(
+        fn=show_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
     )
+    app.load(fn=swap_visibility, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import re
 import uuid
 from typing import List, Union
@@ -10,10 +10,8 @@ from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
-    get_argilla_client,
-    get_pipeline_code_ui,
     hide_success_message,
-    show_success_message_hub,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
@@ -26,7 +24,6 @@ from src.distilabel_dataset_generator.pipelines.embeddings import (
 )
 from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_DATASET_DESCRIPTIONS,
-    PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
     get_labeller_generator,
     get_prompt_generator,
@@ -37,45 +34,42 @@ from src.distilabel_dataset_generator.utils import (
     get_argilla_client,
     get_org_dropdown,
     get_preprocess_labels,
-    swap_visibilty,
 )
-def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating text classification task")
     progress(0.3, desc="Initializing text generation")
-    generate_description = get_prompt_generator()
     progress(0.7, desc="Generating text classification task")
-    system_prompt = next(
         generate_description.process(
             [
                 {
-                    "system_prompt": PROMPT_CREATION_PROMPT,
                     "instruction": dataset_description,
                 }
             ]
         )
     )[0]["generation"]
     progress(1.0, desc="Text classification task generated")
-    return system_prompt, pd.DataFrame()
-def generate_sample_dataset(system_prompt, progress=gr.Progress()):
-    df = generate_dataset(
         system_prompt=system_prompt,
-        difficulty="mixed",
-        clarity="mixed",
-        labels=[],
-        num_labels=1,
         num_rows=10,
         progress=progress,
         is_sample=True,
     )
-    if "label" in df.columns:
-        df = df[["label", "text"]]
-    elif "labels" in df.columns:
-        df = df[["labels", "text"]]
-    return df
 def generate_dataset(
@@ -88,17 +82,13 @@ def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
-    if is_sample:
-        multiplier = 1
-    else:
-        multiplier = 2
     progress(0.0, desc="(1/2) Generating text classification data")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
         difficulty=difficulty, clarity=clarity, is_sample=is_sample
     )
     labeller_generator = get_labeller_generator(
-        system_prompt=system_prompt,
         labels=labels,
         num_labels=num_labels,
     )
@@ -110,13 +100,15 @@ def generate_dataset(
     textcat_results = []
     while n_processed < num_rows:
         progress(
-            multiplier * 0.5 * n_processed / num_rows,
             total=total_steps,
             desc="(1/2) Generating text classification data",
         )
         remaining_rows = num_rows - n_processed
         batch_size = min(batch_size, remaining_rows)
-        inputs = [{"task": system_prompt} for _ in range(batch_size)]
         batch = list(textcat_generator.process(inputs=inputs))
         textcat_results.extend(batch[0])
         n_processed += batch_size
@@ -124,58 +116,41 @@ def generate_dataset(
         result["text"] = result["input_text"]
     # label text classification data
-    progress(multiplier * 0.5, desc="(1/2) Generating text classification data")
-    if not is_sample:
-        n_processed = 0
-        labeller_results = []
-        while n_processed < num_rows:
-            progress(
-                0.5 + 0.5 * n_processed / num_rows,
-                total=total_steps,
-                desc="(1/2) Labeling text classification data",
-            )
-            batch = textcat_results[n_processed : n_processed + batch_size]
-            labels_batch = list(labeller_generator.process(inputs=batch))
-            labeller_results.extend(labels_batch[0])
-            n_processed += batch_size
         progress(
-            1,
             total=total_steps,
-            desc="(2/2) Creating dataset",
         )
     # create final dataset
     distiset_results = []
-    source_results = textcat_results if is_sample else labeller_results
-    for result in source_results:
         record = {
             key: result[key]
-            for key in ["text", "label" if is_sample else "labels"]
             if key in result
         }
         distiset_results.append(record)
     dataframe = pd.DataFrame(distiset_results)
-    if not is_sample:
-        if num_labels == 1:
-            dataframe = dataframe.rename(columns={"labels": "label"})
-            dataframe["label"] = dataframe["label"].apply(
-                lambda x: x.lower().strip() if x.lower().strip() in labels else None
-            )
-        else:
-            dataframe["labels"] = dataframe["labels"].apply(
-                lambda x: (
-                    list(
-                        set(
-                            label.lower().strip()
-                            for label in x
-                            if label.lower().strip() in labels
-                        )
-                    )
-                    if isinstance(x, list)
-                    else None
-                )
-            )
     progress(1.0, desc="Dataset generation completed")
     return dataframe
@@ -213,14 +188,14 @@ def push_dataset_to_hub(
     )
-def push_dataset_to_argilla(
     org_name: str,
     repo_name: str,
     system_prompt: str,
     difficulty: str,
     clarity: str,
     num_labels: int = 1,
-    n_rows: int = 10,
     labels: List[str] = None,
     private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
@@ -232,7 +207,7 @@ def push_dataset_to_argilla(
         clarity=clarity,
         num_labels=num_labels,
         labels=labels,
-        num_rows=n_rows,
     )
     push_dataset_to_hub(
         dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
@@ -283,7 +258,7 @@ def push_dataset_to_argilla(
         )
         dataframe["text_length"] = dataframe["text"].apply(len)
-        dataframe["text_embeddings"] = get_embeddings(dataframe["text"])
         progress(0.5, desc="Creating dataset")
         rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
@@ -332,15 +307,6 @@ def push_dataset_to_argilla(
     return ""
-def update_suggested_labels(system_prompt):
-    new_labels = re.findall(r"'(\b[\w-]+\b)'", system_prompt)
-    if not new_labels:
-        return gr.Warning(
-            "No labels found in the system prompt. Please add labels manually."
-        )
-    return gr.update(choices=new_labels, value=new_labels)
 def validate_input_labels(labels):
     if not labels or len(labels) < 2:
         raise gr.Error(
@@ -353,44 +319,74 @@ def update_max_num_labels(labels):
     return gr.update(maximum=len(labels) if labels else 1)
 with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
     with gr.Column() as main_ui:
         gr.Markdown("## 1. Describe the dataset you want")
         with gr.Row():
-            with gr.Column(scale=1):
                 dataset_description = gr.Textbox(
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
-                    label="Example descriptions",
                 )
-                load_btn = gr.Button("Load dataset", variant="primary")
-            with gr.Column(scale=3):
                 pass
         gr.HTML("<hr>")
-        gr.Markdown("## 2. Configure your task")
-        with gr.Row():
             with gr.Column(scale=1):
                 system_prompt = gr.Textbox(
                     label="System prompt",
                     placeholder="You are a helpful assistant.",
                     visible=True,
                 )
-                difficulty = gr.Dropdown(
-                    choices=[
-                        ("High School", "high school"),
-                        ("College", "college"),
-                        ("PhD", "PhD"),
-                        ("Mixed", "mixed"),
-                    ],
-                    value="mixed",
-                    label="Difficulty",
-                    info="Select the comprehension level for the text. Ensure it matches the task context.",
                     interactive=True,
                 )
                 clarity = gr.Dropdown(
@@ -408,30 +404,30 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                     info="Set how easily the correct label or labels can be identified.",
                     interactive=True,
                 )
-                labels = gr.Dropdown(
-                    choices=[],
-                    allow_custom_value=True,
                     interactive=True,
-                    label="Labels",
-                    multiselect=True,
-                    info="Add the labels to classify the text.",
                 )
-                num_labels = gr.Number(
-                    label="Number of labels per text",
-                    value=1,
-                    minimum=1,
-                    maximum=10,
-                    info="Select 1 for single-label and >1 for multi-label.",
-                    interactive=True,
                 )
-                btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
             with gr.Column(scale=3):
-                dataframe = gr.Dataframe()
         gr.HTML("<hr>")
         gr.Markdown("## 3. Generate your dataset")
-        with gr.Row():
-            with gr.Column(scale=1):
                 org_name = get_org_dropdown()
                 repo_name = gr.Textbox(
                     label="Repo name",
@@ -439,7 +435,7 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                     value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
-                n_rows = gr.Number(
                     label="Number of rows",
                     value=10,
                     interactive=True,
@@ -454,39 +450,54 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
                 btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
             with gr.Column(scale=3):
                 success_message = gr.Markdown(visible=True)
-        pipeline_code = get_pipeline_code_ui(
-            generate_pipeline_code(
-                system_prompt.value,
-                difficulty=difficulty.value,
-                clarity=clarity.value,
-                labels=labels.value,
-                num_labels=num_labels.value,
-                num_rows=n_rows.value,
-            )
-        )
-    gr.on(
-        triggers=[load_btn.click, btn_apply_to_sample_dataset.click],
         fn=generate_system_prompt,
-        inputs=[dataset_description],
-        outputs=[system_prompt, dataframe],
         show_progress=True,
     ).then(
         fn=generate_sample_dataset,
-        inputs=[system_prompt],
         outputs=[dataframe],
         show_progress=True,
-    ).then(
-        fn=update_suggested_labels,
-        inputs=[system_prompt],
-        outputs=labels,
     ).then(
         fn=update_max_num_labels,
         inputs=[labels],
         outputs=[num_labels],
     )
     btn_push_to_hub.click(
         fn=validate_argilla_user_workspace_dataset,
         inputs=[repo_name],
@@ -502,7 +513,11 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
         outputs=[success_message],
         show_progress=True,
     ).success(
-        fn=push_dataset_to_argilla,
         inputs=[
             org_name,
             repo_name,
@@ -510,16 +525,32 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
             difficulty,
             clarity,
             num_labels,
-            n_rows,
             labels,
             private,
         ],
         outputs=[success_message],
         show_progress=True,
     ).success(
-        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
-    app.load(fn=swap_visibilty, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])

+import json
 import uuid
 from typing import List, Union
 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
     hide_success_message,
+    show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
 )
 from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_DATASET_DESCRIPTIONS,
     generate_pipeline_code,
     get_labeller_generator,
     get_prompt_generator,
     get_argilla_client,
     get_org_dropdown,
     get_preprocess_labels,
+    swap_visibility,
 )
+def generate_system_prompt(dataset_description, temperature, progress=gr.Progress()):
+    """Generate a classification task prompt and its labels from a description.
+
+    Builds a prompt generator with the given temperature, feeds it the dataset
+    description as the instruction, and parses the generation as JSON.
+
+    Returns:
+        A `(system_prompt, labels)` tuple read from the JSON keys
+        "classification_task" and "labels".
+    """
     progress(0.0, desc="Generating text classification task")
     progress(0.3, desc="Initializing text generation")
+    generate_description = get_prompt_generator(temperature)
     progress(0.7, desc="Generating text classification task")
+    result = next(
         generate_description.process(
             [
                 {
                     "instruction": dataset_description,
                 }
             ]
         )
     )[0]["generation"]
     progress(1.0, desc="Text classification task generated")
+    # The generation is expected to be a JSON document; json.loads raises if it is not.
+    data = json.loads(result)
+    system_prompt = data["classification_task"]
+    labels = data["labels"]
+    return system_prompt, labels
+def generate_sample_dataset(system_prompt, difficulty, clarity, labels, num_labels, progress=gr.Progress()):
+    """Generate a 10-row preview dataset with the current task configuration.
+
+    Delegates to `generate_dataset` with `num_rows=10` and `is_sample=True`,
+    forwarding the remaining arguments unchanged, and returns its dataframe.
+    """
+    dataframe = generate_dataset(
         system_prompt=system_prompt,
+        difficulty=difficulty,
+        clarity=clarity,
+        labels=labels,
+        num_labels=num_labels,
         num_rows=10,
         progress=progress,
         is_sample=True,
     )
+    return dataframe
 def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating text classification data")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
         difficulty=difficulty, clarity=clarity, is_sample=is_sample
     )
     labeller_generator = get_labeller_generator(
+        system_prompt=f"{system_prompt} {', '.join(labels)}",
         labels=labels,
         num_labels=num_labels,
     )
     textcat_results = []
     while n_processed < num_rows:
         progress(
+            2 * 0.5 * n_processed / num_rows,
             total=total_steps,
             desc="(1/2) Generating text classification data",
         )
         remaining_rows = num_rows - n_processed
         batch_size = min(batch_size, remaining_rows)
+        inputs = [
+            {"task": f"{system_prompt} {', '.join(labels)}"} for _ in range(batch_size)
+        ]
         batch = list(textcat_generator.process(inputs=inputs))
         textcat_results.extend(batch[0])
         n_processed += batch_size
         result["text"] = result["input_text"]
     # label text classification data
+    progress(2 * 0.5, desc="(1/2) Generating text classification data")
+    n_processed = 0
+    labeller_results = []
+    while n_processed < num_rows:
         progress(
+            0.5 + 0.5 * n_processed / num_rows,
             total=total_steps,
+            desc="(1/2) Labeling text classification data",
         )
+        batch = textcat_results[n_processed : n_processed + batch_size]
+        labels_batch = list(labeller_generator.process(inputs=batch))
+        labeller_results.extend(labels_batch[0])
+        n_processed += batch_size
+    progress(
+        1,
+        total=total_steps,
+        desc="(2/2) Creating dataset",
+    )
     # create final dataset
     distiset_results = []
+    for result in labeller_results:
         record = {
             key: result[key]
+            for key in ["labels", "text"]
             if key in result
         }
         distiset_results.append(record)
     dataframe = pd.DataFrame(distiset_results)
+    if num_labels == 1:
+        dataframe = dataframe.rename(columns={"labels": "label"})
+        dataframe["label"] = dataframe["label"].apply(
+            lambda x: x.lower().strip() if x.lower().strip() in labels else None
+        )
     progress(1.0, desc="Dataset generation completed")
     return dataframe
     )
+def push_dataset(
     org_name: str,
     repo_name: str,
     system_prompt: str,
     difficulty: str,
     clarity: str,
     num_labels: int = 1,
+    num_rows: int = 10,
     labels: List[str] = None,
     private: bool = False,
     oauth_token: Union[gr.OAuthToken, None] = None,
         clarity=clarity,
         num_labels=num_labels,
         labels=labels,
+        num_rows=num_rows,
     )
     push_dataset_to_hub(
         dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
         )
         dataframe["text_length"] = dataframe["text"].apply(len)
+        dataframe["text_embeddings"] = get_embeddings(dataframe["text"].to_list())
         progress(0.5, desc="Creating dataset")
         rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
     return ""
 def validate_input_labels(labels):
     if not labels or len(labels) < 2:
         raise gr.Error(
     return gr.update(maximum=len(labels) if labels else 1)
+def show_pipeline_code_visibility():
+    """Return an update that makes the pipeline-code accordion visible."""
+    visible_accordion = gr.Accordion(visible=True)
+    return {pipeline_code_ui: visible_accordion}
+def hide_pipeline_code_visibility():
+    """Return an update that hides the pipeline-code accordion."""
+    hidden_accordion = gr.Accordion(visible=False)
+    return {pipeline_code_ui: hidden_accordion}
+######################
+# Gradio UI
+######################
 with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
     with gr.Column() as main_ui:
         gr.Markdown("## 1. Describe the dataset you want")
         with gr.Row():
+            with gr.Column(scale=2):
                 dataset_description = gr.Textbox(
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
+                with gr.Accordion("Temperature", open=False):
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=1,
+                        value=0.8,
+                        step=0.1,
+                        interactive=True,
+                        show_label=False,
+                    )
+                load_btn = gr.Button(
+                    "Create dataset",
+                    variant="primary",
+                )
+            with gr.Column(scale=2):
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
+                    label="Examples",
                 )
+            with gr.Column(scale=1):
                 pass
         gr.HTML("<hr>")
+        gr.Markdown("## 2. Configure your dataset")
+        with gr.Row(equal_height=False):
             with gr.Column(scale=1):
                 system_prompt = gr.Textbox(
                     label="System prompt",
                     placeholder="You are a helpful assistant.",
                     visible=True,
                 )
+                labels = gr.Dropdown(
+                    choices=[],
+                    allow_custom_value=True,
+                    interactive=True,
+                    label="Labels",
+                    multiselect=True,
+                    info="Add the labels to classify the text.",
+                )
+                num_labels = gr.Number(
+                    label="Number of labels per text",
+                    value=1,
+                    minimum=1,
+                    maximum=10,
+                    info="Select 1 for single-label and >1 for multi-label.",
                     interactive=True,
                 )
                 clarity = gr.Dropdown(
                     info="Set how easily the correct label or labels can be identified.",
                     interactive=True,
                 )
+                difficulty = gr.Dropdown(
+                    choices=[
+                        ("High School", "high school"),
+                        ("College", "college"),
+                        ("PhD", "PhD"),
+                        ("Mixed", "mixed"),
+                    ],
+                    value="mixed",
+                    label="Difficulty",
+                    info="Select the comprehension level for the text. Ensure it matches the task context.",
                     interactive=True,
                 )
+                btn_apply_to_sample_dataset = gr.Button(
+                    "Refresh dataset", variant="secondary", size="sm"
                 )
             with gr.Column(scale=3):
+                dataframe = gr.Dataframe(
+                    headers=["labels", "text"], wrap=True, height=500, interactive=False
+                )
         gr.HTML("<hr>")
         gr.Markdown("## 3. Generate your dataset")
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2):
                 org_name = get_org_dropdown()
                 repo_name = gr.Textbox(
                     label="Repo name",
                     value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
+                num_rows = gr.Number(
                     label="Number of rows",
                     value=10,
                     interactive=True,
                 btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
             with gr.Column(scale=3):
                 success_message = gr.Markdown(visible=True)
+                with gr.Accordion(
+                    "Do you want to go further? Customize and run with Distilabel",
+                    open=False,
+                    visible=False,
+                ) as pipeline_code_ui:
+                    code = generate_pipeline_code(
+                        system_prompt.value,
+                        difficulty=difficulty.value,
+                        clarity=clarity.value,
+                        labels=labels.value,
+                        num_labels=num_labels.value,
+                        num_rows=num_rows.value,
+                    )
+                    pipeline_code = gr.Code(
+                        value=code,
+                        language="python",
+                        label="Distilabel Pipeline Code",
+                    )
+    load_btn.click(
         fn=generate_system_prompt,
+        inputs=[dataset_description, temperature],
+        outputs=[system_prompt, labels],
         show_progress=True,
     ).then(
         fn=generate_sample_dataset,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels],
         outputs=[dataframe],
         show_progress=True,
     ).then(
         fn=update_max_num_labels,
         inputs=[labels],
         outputs=[num_labels],
     )
+    labels.input(
+        fn=update_max_num_labels,
+        inputs=[labels],
+        outputs=[num_labels],
+    )
+    btn_apply_to_sample_dataset.click(
+        fn=generate_sample_dataset,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels],
+        outputs=[dataframe],
+        show_progress=True,
+    )
     btn_push_to_hub.click(
         fn=validate_argilla_user_workspace_dataset,
         inputs=[repo_name],
         outputs=[success_message],
         show_progress=True,
     ).success(
+        fn=hide_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    ).success(
+        fn=push_dataset,
         inputs=[
             org_name,
             repo_name,
             difficulty,
             clarity,
             num_labels,
+            num_rows,
             labels,
             private,
         ],
         outputs=[success_message],
         show_progress=True,
     ).success(
+        fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[
+            system_prompt,
+            difficulty,
+            clarity,
+            labels,
+            num_labels,
+            num_rows,
+        ],
+        outputs=[pipeline_code],
+    ).success(
+        fn=show_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
     )
+    app.load(fn=swap_visibility, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])

src/distilabel_dataset_generator/pipelines/eval.py ADDED Viewed

	@@ -0,0 +1,205 @@

+from typing import List
+from datasets import get_dataset_config_names, get_dataset_split_names
+from distilabel.llms import InferenceEndpointsLLM
+from distilabel.steps.tasks import (
+    UltraFeedback,
+    TextGeneration,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
+    MODEL,
+    _get_next_api_key,
+)
+from src.distilabel_dataset_generator.utils import extract_column_names
+def get_ultrafeedback_evaluator(aspect, is_sample):
+    """Build and load an UltraFeedback task for the given aspect.
+
+    Sample runs cap generation at 256 new tokens; full runs allow 2048.
+    """
+    token_budget = 256 if is_sample else 2048
+    llm = InferenceEndpointsLLM(
+        model_id=MODEL,
+        tokenizer_id=MODEL,
+        api_key=_get_next_api_key(),
+        generation_kwargs={"temperature": 0.7, "max_new_tokens": token_budget},
+    )
+    evaluator = UltraFeedback(llm=llm, aspect=aspect)
+    evaluator.load()
+    return evaluator
+def get_custom_evaluator(prompt_template, structured_output, columns, is_sample):
+    """Build and load a TextGeneration task that evaluates with a custom template.
+
+    The LLM is configured for JSON structured output using `structured_output`
+    as the schema. Sample runs cap generation at 256 new tokens; full runs
+    allow 2048.
+    """
+    token_budget = 256 if is_sample else 2048
+    llm = InferenceEndpointsLLM(
+        model_id=MODEL,
+        tokenizer_id=MODEL,
+        api_key=_get_next_api_key(),
+        structured_output={"format": "json", "schema": structured_output},
+        generation_kwargs={"temperature": 0.7, "max_new_tokens": token_budget},
+    )
+    evaluator = TextGeneration(llm=llm, template=prompt_template, columns=columns)
+    evaluator.load()
+    return evaluator
+def generate_ultrafeedback_pipeline_code(
+    repo_id, subset, split, aspects, instruction_column, response_columns, num_rows
+):
+    """Return a distilabel script that runs UltraFeedback on a Hub dataset.
+
+    A single-task pipeline is rendered when exactly one aspect is requested;
+    otherwise one UltraFeedback task per aspect is rendered and merged with
+    CombineOutputs. The generated script calls a `preprocess_data` helper that
+    the template neither defines nor imports, so the reader must supply it.
+
+    Args:
+        repo_id: Hub dataset id interpolated into `load_dataset`.
+        subset: Dataset config name.
+        split: Dataset split name.
+        aspects: UltraFeedback aspects to evaluate.
+        instruction_column: Column name passed to `preprocess_data`.
+        response_columns: Response column(s) passed to `preprocess_data`.
+        num_rows: Number of rows sliced from the split.
+
+    Returns:
+        The pipeline source code as a string.
+    """
+    if len(aspects) == 1:
+        # The chosen aspect is rendered as a literal so the script is self-contained.
+        code = f"""
+# Requirements: `pip install distilabel[hf-inference-endpoints]`
+import os
+from datasets import load_dataset
+from distilabel.pipeline import Pipeline
+from distilabel.steps import LoadDataFromDicts
+from distilabel.steps.tasks import UltraFeedback
+from distilabel.llms import InferenceEndpointsLLM
+MODEL = "{MODEL}"
+os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
+hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
+data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries
+with Pipeline(name="ultrafeedback") as pipeline:
+    load_the_dataset = LoadDataFromDicts(
+        data = data,
+    )
+    ultrafeedback_evaluator = UltraFeedback(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=os.environ["HF_TOKEN"],
+            generation_kwargs={{
+                "temperature": 0.7,
+                "max_new_tokens": 2048,
+            }},
+        ),
+        aspect="{aspects[0]}",
+    )
+    load_the_dataset >> ultrafeedback_evaluator
+if __name__ == "__main__":
+    distiset = pipeline.run()
+"""
+    else:
+        # The aspect list is rendered into the script so its loop has something
+        # to iterate over, and the split is sliced like the single-aspect case.
+        code = f"""
+# Requirements: `pip install distilabel[hf-inference-endpoints]`
+import os
+from datasets import load_dataset
+from distilabel.pipeline import Pipeline
+from distilabel.steps import LoadDataFromDicts, CombineOutputs
+from distilabel.steps.tasks import UltraFeedback
+from distilabel.llms import InferenceEndpointsLLM
+MODEL = "{MODEL}"
+os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
+aspects = {aspects}
+hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
+data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries
+with Pipeline(name="ultrafeedback") as pipeline:
+    load_the_dataset = LoadDataFromDicts(
+        data = data,
+    )
+    tasks = []
+    for aspect in aspects:
+        evaluate_responses = UltraFeedback(
+            name=f"evaluate-responses-{{aspect}}",
+            aspect=aspect,
+            llm=InferenceEndpointsLLM(
+                model_id=MODEL,
+                tokenizer_id=MODEL,
+                api_key=os.environ["HF_TOKEN"],
+                generation_kwargs={{
+                    "temperature": 0.7,
+                    "max_new_tokens": 2048,
+                }},
+            ),
+            output_mappings={{
+                "ratings": f"ratings_{{aspect}}",
+                "types": f"type_{{aspect}}",
+                "rationales": f"rationales_for_types_{{aspect}}",
+                "rationales-for-ratings": f"rationales_for_ratings_{{aspect}}",
+            }} if aspect in ["truthfulness", "helpfulness"] else {{"rationales": f"rationales_{{aspect}}", "ratings": f"ratings_{{aspect}}"}},
+        )
+        tasks.append(evaluate_responses)
+    combine_outputs = CombineOutputs()
+    load_the_dataset >> tasks >> combine_outputs
+if __name__ == "__main__":
+    distiset = pipeline.run()
+"""
+    return code
+def generate_custom_pipeline_code(
+    repo_id, subset, split, prompt_template, structured_output, num_rows
+):
+    """Return a distilabel script that evaluates a Hub dataset with a custom prompt.
+
+    The generated pipeline loads `num_rows` examples from the dataset and runs
+    a TextGeneration task using the prompt template and a JSON structured
+    output schema.
+
+    Args:
+        repo_id: Hub dataset id.
+        subset: Dataset config name.
+        split: Dataset split name.
+        prompt_template: Template text assigned to CUSTOM_TEMPLATE in the script.
+        structured_output: JSON schema interpolated into the LLM configuration.
+        num_rows: Number of examples to load.
+
+    Returns:
+        The pipeline source code as a string.
+    """
+    # NOTE(review): columns are derived from structured_output rather than from
+    # prompt_template; confirm this is what extract_column_names expects.
+    columns = extract_column_names(structured_output)
+    # The template is rendered with repr() so that newlines and quotes inside it
+    # still produce a valid Python string literal in the generated script.
+    code = f"""
+# Requirements: `pip install distilabel[hf-inference-endpoints,instructor]`
+import os
+from distilabel.pipeline import Pipeline
+from distilabel.steps import LoadDataFromHub
+from distilabel.steps.tasks import TextGeneration
+from distilabel.llms import InferenceEndpointsLLM
+MODEL = "{MODEL}"
+CUSTOM_TEMPLATE = {prompt_template!r}
+os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
+with Pipeline(name="custom-evaluation") as pipeline:
+    load_the_dataset = LoadDataFromHub(
+        repo_id="{repo_id}",
+        config="{subset}",
+        split="{split}",
+        num_examples={num_rows},
+        batch_size=2
+    )
+    custom_evaluator = TextGeneration(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=os.environ["HF_TOKEN"],
+            structured_output={{"format": "json", "schema": {structured_output}}},
+            generation_kwargs={{
+                "temperature": 0.7,
+                "max_new_tokens": 2048,
+            }},
+        ),
+        template=CUSTOM_TEMPLATE,
+        columns={columns}
+    )
+    load_the_dataset >> custom_evaluator
+if __name__ == "__main__":
+    distiset = pipeline.run()
+"""
+    return code
+def generate_pipeline_code(
+    repo_id,
+    aspects,
+    instruction_column,
+    response_columns,
+    prompt_template,
+    structured_output,
+    num_rows,
+    eval_type,
+):
+    """Render the pipeline script for the selected evaluation type.
+
+    With no repo id, the subset and split fall back to "default" and "train";
+    otherwise the first config and its first split are looked up for the repo.
+    """
+    subset, split = "default", "train"
+    if repo_id is not None:
+        subset = get_dataset_config_names(repo_id)[0]
+        split = get_dataset_split_names(repo_id, subset)[0]
+    if eval_type != "ultrafeedback":
+        return generate_custom_pipeline_code(
+            repo_id, subset, split, prompt_template, structured_output, num_rows
+        )
+    return generate_ultrafeedback_pipeline_code(
+        repo_id, subset, split, aspects, instruction_column, response_columns, num_rows
+    )

src/distilabel_dataset_generator/pipelines/sft.py CHANGED Viewed

@@ -138,52 +138,26 @@ def _get_output_mappings(num_turns):
         return {"conversation": "messages"}
-def generate_pipeline_code(system_prompt, num_turns, num_rows):
-    input_mappings = _get_output_mappings(num_turns)
-    code = f"""
-# Requirements: `pip install distilabel[hf-inference-endpoints]`
-import os
-from distilabel.pipeline import Pipeline
-from distilabel.steps import KeepColumns
-from distilabel.steps.tasks import MagpieGenerator
-from distilabel.llms import InferenceEndpointsLLM
-MODEL = "{MODEL}"
-SYSTEM_PROMPT = "{system_prompt}"
-os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
-with Pipeline(name="sft") as pipeline:
-    magpie = MagpieGenerator(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
             tokenizer_id=MODEL,
-            magpie_pre_query_template="llama3",
-            generation_kwargs={{
-                "temperature": 0.9,
-                "do_sample": True,
                 "max_new_tokens": 2048,
-                "stop_sequences": {_STOP_SEQUENCES}
-            }},
-            api_key=os.environ["HF_TOKEN"],
         ),
-        n_turns={num_turns},
-        num_rows={num_rows},
-        batch_size=1,
-        system_prompt=SYSTEM_PROMPT,
-        output_mappings={input_mappings},
-    )
-    keep_columns = KeepColumns(
-        columns={list(input_mappings.values())} + ["model_name"],
     )
-    magpie.connect(keep_columns)
-if __name__ == "__main__":
-    distiset = pipeline.run()
-"""
-    return code
-def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings.copy()
     if num_turns == 1:
@@ -228,7 +202,7 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
     return magpie_generator
-def get_response_generator(num_turns, system_prompt, is_sample):
     if num_turns == 1:
         response_generator = TextGeneration(
             llm=InferenceEndpointsLLM(
@@ -262,19 +236,46 @@ def get_response_generator(num_turns, system_prompt, is_sample):
     return response_generator
-def get_prompt_generator():
-    prompt_generator = TextGeneration(
         llm=InferenceEndpointsLLM(
-            api_key=_get_next_api_key(),
             model_id=MODEL,
             tokenizer_id=MODEL,
-            generation_kwargs={
-                "temperature": 0.8,
-                "max_new_tokens": 2048,
                 "do_sample": True,
-            },
         ),
-        use_system_prompt=True,
     )
-    prompt_generator.load()
-    return prompt_generator

         return {"conversation": "messages"}
def get_prompt_generator(temperature):
    """Build and load a ``TextGeneration`` task configured with ``PROMPT_CREATION_PROMPT``.

    Args:
        temperature: Sampling temperature forwarded in the LLM's
            ``generation_kwargs``.

    Returns:
        The task instance, after ``load()`` has been called on it.
    """
    sampling_options = {
        "temperature": temperature,
        "max_new_tokens": 2048,
        "do_sample": True,
    }
    llm = InferenceEndpointsLLM(
        api_key=_get_next_api_key(),
        model_id=MODEL,
        tokenizer_id=MODEL,
        generation_kwargs=sampling_options,
    )
    prompt_generator = TextGeneration(
        llm=llm,
        system_prompt=PROMPT_CREATION_PROMPT,
        use_system_prompt=True,
    )
    prompt_generator.load()
    return prompt_generator
+def get_magpie_generator(system_prompt, num_turns, is_sample):
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings.copy()
     if num_turns == 1:
     return magpie_generator
+def get_response_generator(system_prompt, num_turns, is_sample):
     if num_turns == 1:
         response_generator = TextGeneration(
             llm=InferenceEndpointsLLM(
     return response_generator
def generate_pipeline_code(system_prompt, num_turns, num_rows):
    """Render a standalone distilabel SFT pipeline script as a string.

    Args:
        system_prompt: System prompt embedded in the script as ``SYSTEM_PROMPT``.
        num_turns: Value written for ``n_turns``; also selects the output
            mappings through ``_get_output_mappings``.
        num_rows: Value written for ``num_rows``.

    Returns:
        The generated Python source code.
    """
    input_mappings = _get_output_mappings(num_turns)
    # `!r` emits a properly escaped Python literal. Wrapping the raw prompt in
    # double quotes produced invalid code whenever the prompt contained a
    # double quote, a backslash or a line break.
    code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns
from distilabel.steps.tasks import MagpieGenerator
from distilabel.llms import InferenceEndpointsLLM
MODEL = "{MODEL}"
SYSTEM_PROMPT = {system_prompt!r}
os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
with Pipeline(name="sft") as pipeline:
    magpie = MagpieGenerator(
        llm=InferenceEndpointsLLM(
            model_id=MODEL,
            tokenizer_id=MODEL,
            magpie_pre_query_template="llama3",
            generation_kwargs={{
                "temperature": 0.9,
                "do_sample": True,
                "max_new_tokens": 2048,
                "stop_sequences": {_STOP_SEQUENCES}
            }},
            api_key=os.environ["HF_TOKEN"],
        ),
        n_turns={num_turns},
        num_rows={num_rows},
        batch_size=1,
        system_prompt=SYSTEM_PROMPT,
        output_mappings={input_mappings},
    )
    keep_columns = KeepColumns(
        columns={list(input_mappings.values())} + ["model_name"],
    )
    magpie.connect(keep_columns)
if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code

src/distilabel_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import random
 from typing import List
 from distilabel.llms import InferenceEndpointsLLM
@@ -22,25 +23,27 @@ The prompt you write should follow the same style and structure as the following
 If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
-Classify the following customer review of a cinema as either 'positive' or 'negative'.
-Classify the following news article into one or more of the following categories: 'politics', 'sports', 'technology', 'entertainment', 'health', 'business', 'environment', 'education', 'science', 'international'.
-Determine the sentiment of the following social media post: 'ambiguous', 'sarcastic', 'informative', 'emotional'.
-Identify the issue category for the following technical support ticket: 'billing', 'technical', 'account', 'shipping', 'returns', 'installation', 'subscription'.
-Classify the following movie review into one of the following categories: 'critical', 'praise', 'disappointed', 'enthusiastic'.
-Determine the level of customer satisfaction from the following customer service transcript: 'satisfied', 'dissatisfied', 'highly-satisfied', 'somewhat-dissatisfied', 'indifferent'.
-Categorize the following product description into one of the following product types: 'smartphone', 'laptop', 'tablet', 'smartwatch', 'e-reader', 'headphones'.
-Classify the following tweet as expressing either 'support' or 'opposition' to the political event discussed.
-Classify the following restaurant review into one of the following categories: 'food-quality', 'service', 'ambiance', or 'price'.
-Classify the following blog post based on its primary fashion trend or style: 'casual', 'formal', 'streetwear', 'vintage' or 'sustainable-fashion'.
 User dataset description:
 """
@@ -51,6 +54,82 @@ DEFAULT_DATASET_DESCRIPTIONS = [
 ]
 def generate_pipeline_code(
     system_prompt: str,
     difficulty: str = None,
@@ -146,63 +225,3 @@ with Pipeline(name="textcat") as pipeline:
         distiset = pipeline.run()
     """
     )
-def get_textcat_generator(difficulty, clarity, is_sample):
-    textcat_generator = GenerateTextClassificationData(
-        llm=InferenceEndpointsLLM(
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            api_key=_get_next_api_key(),
-            generation_kwargs={
-                "temperature": 0.9,
-                "max_new_tokens": 256 if is_sample else 2048,
-                "do_sample": True,
-                "top_k": 50,
-                "top_p": 0.95,
-            },
-        ),
-        difficulty=None if difficulty == "mixed" else difficulty,
-        clarity=None if clarity == "mixed" else clarity,
-        seed=random.randint(0, 2**32 - 1),
-    )
-    textcat_generator.load()
-    return textcat_generator
-def get_labeller_generator(system_prompt, labels, num_labels):
-    labeller_generator = TextClassification(
-        llm=InferenceEndpointsLLM(
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            api_key=_get_next_api_key(),
-            generation_kwargs={
-                "temperature": 0.7,
-                "max_new_tokens": 2048,
-            },
-        ),
-        context=system_prompt,
-        available_labels=labels,
-        n=num_labels,
-        default_label="unknown",
-    )
-    labeller_generator.load()
-    return labeller_generator
-def get_prompt_generator():
-    prompt_generator = TextGeneration(
-        llm=InferenceEndpointsLLM(
-            api_key=_get_next_api_key(),
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            generation_kwargs={
-                "temperature": 0.8,
-                "max_new_tokens": 2048,
-                "do_sample": True,
-            },
-        ),
-        use_system_prompt=True,
-    )
-    prompt_generator.load()
-    return prompt_generator

 import random
+from pydantic import BaseModel, Field
 from typing import List
 from distilabel.llms import InferenceEndpointsLLM
 If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
+{"classification_task": "Classify the following customer review of a cinema as", "labels": ["positive", "negative"]}
+{"classification_task": "Categorize the following news article into one or more of the following categories:", "labels": ["politics", "sports", "technology", "entertainment", "health", "business", "environment", "education", "science", "international"]}
+{"classification_task": "Classify the following news article into one or more of the following categories:", "labels": ['politics', 'sports', 'technology', 'entertainment', 'health', 'business', 'environment', 'education', 'science', 'international']}
+{"classification_task": "Determine the sentiment of the following social media post:", "labels": ['ambiguous', 'sarcastic', 'informative', 'emotional']}
+{"classification_task": "Identify the issue category for the following technical support ticket:", "labels": ['billing', 'technical', 'account', 'shipping', 'returns', 'installation', 'subscription']}
+{"classification_task": "Classify the following movie review into one of the following categories:", "labels": ['critical', 'praise', 'disappointed', 'enthusiastic']}
+{"classification_task": "Categorize the following customer service transcript into one of the following categories:", "labels": ['satisfied', 'dissatisfied', 'highly-satisfied', 'somewhat-dissatisfied', 'indifferent']}
+{"classification_task": "Classify the following product description into one of the following product types:", "labels": ['smartphone', 'laptop', 'tablet', 'smartwatch', 'e-reader', 'headphones']}
+{"classification_task": "Categorize the following tweet expressing the political event discussed as", "labels": ['support', 'opposition']}
+{"classification_task": "Classify the following restaurant review into one of the following categories:", "labels": ['food-quality', 'service', 'ambiance', 'price']}
+{"classification_task": "Categorize the following blog post based on its primary fashion trend or style:", "labels": ['casual', 'formal', 'streetwear', 'vintage', 'sustainable-fashion']}
 User dataset description:
 """
 ]
class TextClassificationTask(BaseModel):
    # Schema passed as `structured_output` to the prompt-generator LLM.
    # Documented with comments rather than a class docstring because pydantic
    # copies a model's docstring into the generated JSON schema.

    # Required: the classification task to perform.
    classification_task: str = Field(
        default=...,
        title="classification_task",
        description="The classification task to be performed.",
    )
    # Required: the labels available for that task.
    labels: List[str] = Field(
        default=...,
        title="Labels",
        description="The possible labels for the classification task.",
    )
def get_prompt_generator(temperature):
    """Build and load a ``TextGeneration`` task configured with ``PROMPT_CREATION_PROMPT``.

    The LLM is asked for JSON structured output using the
    ``TextClassificationTask`` schema.

    Args:
        temperature: Sampling temperature forwarded in the LLM's
            ``generation_kwargs``.

    Returns:
        The task instance, after ``load()`` has been called on it.
    """
    output_format = {"format": "json", "schema": TextClassificationTask}
    sampling_options = {
        "temperature": temperature,
        "max_new_tokens": 2048,
        "do_sample": True,
    }
    llm = InferenceEndpointsLLM(
        api_key=_get_next_api_key(),
        model_id=MODEL,
        tokenizer_id=MODEL,
        structured_output=output_format,
        generation_kwargs=sampling_options,
    )
    prompt_generator = TextGeneration(
        llm=llm,
        system_prompt=PROMPT_CREATION_PROMPT,
        use_system_prompt=True,
    )
    prompt_generator.load()
    return prompt_generator
def get_textcat_generator(difficulty, clarity, is_sample):
    """Build and load a ``GenerateTextClassificationData`` task.

    Args:
        difficulty: Passed through to the task; the value ``"mixed"`` is
            replaced by ``None``.
        clarity: Passed through to the task; the value ``"mixed"`` is
            replaced by ``None``.
        is_sample: When truthy, ``max_new_tokens`` is 256 instead of 2048.

    Returns:
        The task instance, after ``load()`` has been called on it.
    """
    sampling_options = {
        "temperature": 0.9,
        "max_new_tokens": 256 if is_sample else 2048,
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
    }
    llm = InferenceEndpointsLLM(
        model_id=MODEL,
        tokenizer_id=MODEL,
        api_key=_get_next_api_key(),
        generation_kwargs=sampling_options,
    )
    textcat_generator = GenerateTextClassificationData(
        llm=llm,
        difficulty=None if difficulty == "mixed" else difficulty,
        clarity=None if clarity == "mixed" else clarity,
        # A fresh random seed is drawn on every call.
        seed=random.randint(0, 2**32 - 1),
    )
    textcat_generator.load()
    return textcat_generator
def get_labeller_generator(system_prompt, labels, num_labels):
    """Build and load a ``TextClassification`` task.

    Args:
        system_prompt: Passed to the task as ``context``.
        labels: Passed to the task as ``available_labels``.
        num_labels: Passed to the task as ``n``.

    Returns:
        The task instance, after ``load()`` has been called on it. Its
        ``default_label`` is ``"unknown"``.
    """
    llm = InferenceEndpointsLLM(
        model_id=MODEL,
        tokenizer_id=MODEL,
        api_key=_get_next_api_key(),
        generation_kwargs={
            "temperature": 0.7,
            "max_new_tokens": 2048,
        },
    )
    labeller_generator = TextClassification(
        llm=llm,
        context=system_prompt,
        available_labels=labels,
        n=num_labels,
        default_label="unknown",
    )
    labeller_generator.load()
    return labeller_generator
 def generate_pipeline_code(
     system_prompt: str,
     difficulty: str = None,
         distiset = pipeline.run()
     """
     )

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import os
 from typing import List, Optional, Union
 import argilla as rg
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
     OAUTH_CLIENT_SECRET,
@@ -11,6 +14,7 @@ from gradio.oauth import (
     get_space,
 )
 from huggingface_hub import whoami
 _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
@@ -50,22 +54,22 @@ def list_orgs(oauth_token: OAuthToken = None):
             return []
         data = whoami(oauth_token.token)
         if data["auth"]["type"] == "oauth":
-            organisations = [data["name"]] + [org["name"] for org in data["orgs"]]
         elif data["auth"]["type"] == "access_token":
-            organisations = [org["name"] for org in data["orgs"]]
         else:
-            organisations = [
                 entry["entity"]["name"]
                 for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
                 if "repo.write" in entry["permissions"]
             ]
-            organisations = [org for org in organisations if org != data["name"]]
-            organisations = [data["name"]] + organisations
     except Exception as e:
         raise gr.Error(
             f"Failed to get organizations: {e}. See if you are logged and connected: https://huggingface.co/settings/connected-applications."
         )
-    return organisations
 def get_org_dropdown(oauth_token: OAuthToken = None):
@@ -89,7 +93,7 @@ def get_token(oauth_token: OAuthToken = None):
         return ""
-def swap_visibilty(oauth_token: Optional[OAuthToken] = None):
     if oauth_token:
         return gr.update(elem_classes=["main_ui_logged_in"])
     else:
@@ -132,6 +136,91 @@ def get_argilla_client() -> Union[rg.Argilla, None]:
     except Exception:
         return None
 def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
     return list(set([label.lower().strip() for label in labels])) if labels else []

+import json
 import os
 from typing import List, Optional, Union
 import argilla as rg
 import gradio as gr
+import numpy as np
+import pandas as pd
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
     OAUTH_CLIENT_SECRET,
     get_space,
 )
 from huggingface_hub import whoami
+from jinja2 import Environment, meta
 _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
             return []
         data = whoami(oauth_token.token)
         if data["auth"]["type"] == "oauth":
+            organizations = [data["name"]] + [org["name"] for org in data["orgs"]]
         elif data["auth"]["type"] == "access_token":
+            organizations = [org["name"] for org in data["orgs"]]
         else:
+            organizations = [
                 entry["entity"]["name"]
                 for entry in data["auth"]["accessToken"]["fineGrained"]["scoped"]
                 if "repo.write" in entry["permissions"]
             ]
+            organizations = [org for org in organizations if org != data["name"]]
+            organizations = [data["name"]] + organizations
     except Exception as e:
         raise gr.Error(
             f"Failed to get organizations: {e}. See if you are logged and connected: https://huggingface.co/settings/connected-applications."
         )
+    return organizations
 def get_org_dropdown(oauth_token: OAuthToken = None):
         return ""
+def swap_visibility(oauth_token: Optional[OAuthToken] = None):
     if oauth_token:
         return gr.update(elem_classes=["main_ui_logged_in"])
     else:
     except Exception:
         return None
 def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
     return list(set([label.lower().strip() for label in labels])) if labels else []
def column_to_list(dataframe: pd.DataFrame, column_name: str) -> List[str]:
    """Return the values of ``column_name`` as a Python list.

    Raises:
        ValueError: If ``dataframe`` has no column named ``column_name``.
    """
    if column_name not in dataframe.columns:
        raise ValueError(f"Column '{column_name}' does not exist.")
    return dataframe[column_name].tolist()
def _last_content_by_role(messages, role):
    """Return ``[content]`` of the last dict message with ``role``, or ``[]``."""
    contents = [
        message["content"]
        for message in messages
        if isinstance(message, dict)
        and message.get("role") == role
        and "content" in message
    ]
    return contents[-1:]


def _parse_messages(text):
    """Decode ``text`` as a JSON list of dicts; return ``None`` if it is not one."""
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return None
    # Plain text can be valid JSON ("42", "true", '"hi"', '{"a": 1}') and must
    # not be mistaken for a conversation.
    if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
        return parsed
    return None


def _extract_instruction(value):
    """Return the instruction text held in one cell of the instruction column."""
    if isinstance(value, str):
        messages = _parse_messages(value)
        if messages is None:
            return value
    elif isinstance(value, (list, np.ndarray)):
        messages = value
    else:
        return ""
    found = _last_content_by_role(messages, "user")
    return found[0] if found else ""


def _extract_generations(value):
    """Return the list of generations held in one cell of a response column."""
    if isinstance(value, str):
        messages = _parse_messages(value)
        if messages is None:
            return [value]
    elif isinstance(value, (list, np.ndarray)):
        if not all(isinstance(item, dict) and "role" in item for item in value):
            # Not a conversation: every item is a generation of its own.
            return list(value)
        messages = value
    else:
        return []
    return _last_content_by_role(messages, "assistant")


def process_columns(
    dataframe,
    instruction_column: str,
    response_columns: Union[str, List[str]],
) -> List[dict]:
    """Convert dataframe rows into ``{"instruction", "generations"}`` records.

    A cell may hold plain text, a list/array of chat messages (dicts with
    ``role`` and ``content``), or a JSON string encoding such a list. For
    conversations, the instruction is the content of the last ``user``
    message and each response column contributes the content of its last
    ``assistant`` message. A list in a response column that is not a
    conversation contributes all of its items. A string that is not a JSON
    list of dicts is used verbatim, even when it happens to be valid JSON.

    Args:
        dataframe: Source data; rows are read with ``iterrows()``.
        instruction_column: Name of the column holding the instruction.
        response_columns: One column name or a list of column names holding
            the responses.

    Returns:
        One dict per row with keys ``"instruction"`` (``""`` when nothing
        could be extracted) and ``"generations"`` (a list, possibly empty).
    """
    if isinstance(response_columns, str):
        response_columns = [response_columns]

    data = []
    for _, row in dataframe.iterrows():
        generations = []
        for col in response_columns:
            generations.extend(_extract_generations(row[col]))
        data.append(
            {
                "instruction": _extract_instruction(row[instruction_column]),
                "generations": generations,
            }
        )
    return data
def extract_column_names(prompt_template: str) -> List[str]:
    """Return the names of the undeclared variables used in a Jinja2 template.

    Args:
        prompt_template: Jinja2 template source.

    Returns:
        The variable names, sorted alphabetically. The set returned by
        ``meta.find_undeclared_variables`` has no stable iteration order
        across interpreter runs, so it is sorted to keep the result
        reproducible.
    """
    parsed_content = Environment().parse(prompt_template)
    return sorted(meta.find_undeclared_variables(parsed_content))
def pad_or_truncate_list(lst, target_length):
    """Return a list of exactly ``target_length`` items built from ``lst``.

    Args:
        lst: Source sequence; ``None`` is treated as empty.
        target_length: Desired length. Zero or a negative value gives ``[]``.

    Returns:
        A new list. When ``lst`` is too long its last ``target_length`` items
        are kept; when it is too short it is right-padded with ``None``.
    """
    # list() also accepts tuples and numpy arrays, whose truth value is
    # ambiguous and made the previous `lst or []` raise.
    items = [] if lst is None else list(lst)
    surplus = len(items) - target_length
    if surplus >= 0:
        # Slice from an explicit start index: `items[-0:]` would return the
        # whole list when target_length is 0.
        return items[surplus:]
    return items + [None] * (-surplus)