Merge pull request #6 from argilla-io/feat/improve-textcat
update logic for creating samples within the textcat pipeline
src/synthetic_dataset_generator/app.py
CHANGED
@@ -1,4 +1,5 @@
 from synthetic_dataset_generator._tabbedinterface import TabbedInterface
+
 # from synthetic_dataset_generator.apps.eval import app as eval_app
 from synthetic_dataset_generator.apps.readme import app as readme_app
 from synthetic_dataset_generator.apps.sft import app as sft_app
@@ -15,9 +16,6 @@ button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-prima
 #system_prompt_examples { color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
 .container {padding-inline: 0 !important}
 #sign_in_button { flex-grow: 0; width: auto !important; display: flex; align-items: center; justify-content: center; margin: 0 auto; }
-.table-view .table-wrap {
-    max-height: 450px;
-}
 """
 
 image = """<br><img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: clamp(50%, 400px, 100%)"/>"""
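The `.table-view` rule dropped above and the `elem_classes="table-view"` arguments removed from the dataframes in eval.py, sft.py, and textcat.py below were two halves of the same mechanism: the class scoped a `max-height` to the table wrapper. A minimal sketch of that pattern for context (the values here are hypothetical, not code from this repo); with both pieces removed, the table falls back to Gradio's default sizing:

```python
import gradio as gr

# Hypothetical sketch: scoping a CSS rule to one component via elem_classes.
# This is the pattern the PR removes, not code taken from the repository.
css = """
.table-view .table-wrap { max-height: 450px; }
"""

with gr.Blocks(css=css) as demo:
    gr.Dataframe(
        headers=["labels", "text"],
        wrap=True,
        interactive=False,
        elem_classes="table-view",  # before this PR; dropped together with the CSS rule
    )

if __name__ == "__main__":
    demo.launch()
```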
src/synthetic_dataset_generator/apps/eval.py
CHANGED
@@ -750,7 +750,6 @@ with gr.Blocks() as app:
             headers=["prompt", "completion", "evaluation"],
             wrap=True,
             interactive=False,
-            elem_classes="table-view",
         )
 
         gr.HTML(value="<hr>")
src/synthetic_dataset_generator/apps/sft.py
CHANGED
@@ -55,10 +55,10 @@ def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
 
 
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
-    progress(0.0, desc="
-    progress(0.3, desc="Initializing
+    progress(0.0, desc="Starting")
+    progress(0.3, desc="Initializing")
     generate_description = get_prompt_generator()
-    progress(0.7, desc="Generating
+    progress(0.7, desc="Generating")
     result = next(
         generate_description.process(
             [
@@ -68,7 +68,7 @@ def generate_system_prompt(dataset_description, progress=gr.Progress()):
             ]
         )
     )[0]["generation"]
-    progress(1.0, desc="
+    progress(1.0, desc="Prompt generated")
     return result
 
 
@@ -88,7 +88,6 @@ def _get_dataframe():
         headers=["prompt", "completion"],
         wrap=True,
         interactive=False,
-        elem_classes="table-view",
     )
 
 
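The progress updates added above follow Gradio's `gr.Progress` pattern: the tracker arrives as a default argument on the event handler and is called with a fraction and a `desc` at each stage. A runnable, self-contained sketch of that pattern; the handler body is an assumption, with `time.sleep` standing in for `get_prompt_generator()` and the LLM call:

```python
import time

import gradio as gr


# Minimal sketch of the gr.Progress pattern used in generate_system_prompt.
# The sleeps stand in for the prompt generator and the model call.
def generate_system_prompt(dataset_description, progress=gr.Progress()):
    progress(0.0, desc="Starting")
    progress(0.3, desc="Initializing")
    time.sleep(0.5)
    progress(0.7, desc="Generating")
    time.sleep(0.5)
    progress(1.0, desc="Prompt generated")
    return f"You are an assistant for the following task: {dataset_description}"


with gr.Blocks() as demo:
    description = gr.Textbox(label="Dataset description")
    system_prompt = gr.Textbox(label="System prompt")
    btn = gr.Button("Generate")
    btn.click(generate_system_prompt, inputs=description, outputs=system_prompt)

if __name__ == "__main__":
    demo.launch()
```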
src/synthetic_dataset_generator/apps/textcat.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import random
 import uuid
 from typing import List, Union
 
@@ -41,15 +42,14 @@ def _get_dataframe():
         headers=["labels", "text"],
         wrap=True,
         interactive=False,
-        elem_classes="table-view",
     )
 
 
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
-    progress(0.0, desc="
-    progress(0.3, desc="Initializing
+    progress(0.0, desc="Starting")
+    progress(0.3, desc="Initializing")
     generate_description = get_prompt_generator()
-    progress(0.7, desc="Generating
+    progress(0.7, desc="Generating")
     result = next(
         generate_description.process(
             [
@@ -59,7 +59,7 @@ def generate_system_prompt(dataset_description, progress=gr.Progress()):
             ]
         )
     )[0]["generation"]
-    progress(1.0, desc="
+    progress(1.0, desc="Prompt generated")
     data = json.loads(result)
     system_prompt = data["classification_task"]
     labels = data["labels"]
@@ -93,7 +93,7 @@ def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
-    progress(0.0, desc="(1/2) Generating
+    progress(0.0, desc="(1/2) Generating dataset")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
         difficulty=difficulty,
@@ -116,13 +116,21 @@
         progress(
             2 * 0.5 * n_processed / num_rows,
             total=total_steps,
-            desc="(1/2) Generating
+            desc="(1/2) Generating dataset",
         )
         remaining_rows = num_rows - n_processed
         batch_size = min(batch_size, remaining_rows)
-        inputs = [
-…
+        inputs = []
+        for _ in range(batch_size):
+            if num_labels == 1:
+                num_labels = 1
+            else:
+                num_labels = int(random.gammavariate(2, 2) * num_labels)
+            sampled_labels = random.sample(labels, num_labels)
+            random.shuffle(sampled_labels)
+            inputs.append(
+                {"task": f"{system_prompt}. Labels: {', '.join(sampled_labels)}"}
+            )
         batch = list(textcat_generator.process(inputs=inputs))
         textcat_results.extend(batch[0])
         n_processed += batch_size
@@ -130,14 +138,14 @@
         result["text"] = result["input_text"]
 
     # label text classification data
-    progress(2 * 0.5, desc="(
+    progress(2 * 0.5, desc="(2/2) Labeling dataset")
     n_processed = 0
     labeller_results = []
     while n_processed < num_rows:
         progress(
             0.5 + 0.5 * n_processed / num_rows,
             total=total_steps,
-            desc="(
+            desc="(2/2) Labeling dataset",
         )
         batch = textcat_results[n_processed : n_processed + batch_size]
         labels_batch = list(labeller_generator.process(inputs=batch))
@@ -161,7 +169,19 @@
         dataframe["label"] = dataframe["label"].apply(
             lambda x: x.lower().strip() if x.lower().strip() in labels else None
         )
-…
+    else:
+        dataframe["labels"] = dataframe["labels"].apply(
+            lambda x: list(
+                set(
+                    [
+                        label.lower().strip()
+                        for label in x
+                        if label.lower().strip() in labels
+                    ]
+                )
+            )
+        )
+    progress(1.0, desc="Dataset created")
     return dataframe
 
 
@@ -295,7 +315,7 @@ def push_dataset(
             client=client,
         )
         rg_dataset = rg_dataset.create()
-        progress(0.7, desc="Pushing dataset
+        progress(0.7, desc="Pushing dataset")
         hf_dataset = Dataset.from_pandas(dataframe)
         records = [
             rg.Record(
@@ -326,7 +346,7 @@
             for sample in hf_dataset
         ]
         rg_dataset.records.log(records=records)
-        progress(1.0, desc="Dataset pushed
+        progress(1.0, desc="Dataset pushed")
     except Exception as e:
         raise gr.Error(f"Error pushing dataset to Argilla: {e}")
     return ""
@@ -385,61 +405,64 @@ with gr.Blocks() as app:
 
     gr.HTML("<hr>")
     gr.Markdown("## 2. Configure your dataset")
-    with gr.Row(equal_height=
-        with gr.
-…
+    with gr.Row(equal_height=True):
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2):
+                system_prompt = gr.Textbox(
+                    label="System prompt",
+                    placeholder="You are a helpful assistant.",
+                    visible=True,
+                )
+                labels = gr.Dropdown(
+                    choices=[],
+                    allow_custom_value=True,
+                    interactive=True,
+                    label="Labels",
+                    multiselect=True,
+                    info="Add the labels to classify the text.",
+                )
+                num_labels = gr.Number(
+                    label="Number of labels per text",
+                    value=1,
+                    minimum=1,
+                    maximum=10,
+                    info="Select 1 for single-label and >1 for multi-label.",
+                    interactive=True,
+                )
+                clarity = gr.Dropdown(
+                    choices=[
+                        ("Clear", "clear"),
+                        (
+                            "Understandable",
+                            "understandable with some effort",
+                        ),
+                        ("Ambiguous", "ambiguous"),
+                        ("Mixed", "mixed"),
+                    ],
+                    value="understandable with some effort",
+                    label="Clarity",
+                    info="Set how easily the correct label or labels can be identified.",
+                    interactive=True,
+                )
+                difficulty = gr.Dropdown(
+                    choices=[
+                        ("High School", "high school"),
+                        ("College", "college"),
+                        ("PhD", "PhD"),
+                        ("Mixed", "mixed"),
+                    ],
+                    value="high school",
+                    label="Difficulty",
+                    info="Select the comprehension level for the text. Ensure it matches the task context.",
+                    interactive=True,
+                )
+                with gr.Row():
+                    clear_btn_full = gr.Button("Clear", variant="secondary")
+                    btn_apply_to_sample_dataset = gr.Button(
+                        "Save", variant="primary"
+                    )
+            with gr.Column(scale=3):
+                dataframe = _get_dataframe()
 
     gr.HTML("<hr>")
     gr.Markdown("## 3. Generate your dataset")
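The new per-sample label selection above draws a label count from a gamma distribution, then samples and shuffles that many labels into the task string. A standalone sketch of the idea follows; the clamp to `1..len(labels)` is an assumption added here so `random.sample` always receives a valid count, and the helper name is hypothetical:

```python
import random


# Hedged sketch of the label-sampling idea in generate_dataset above.
# The max/min clamp is an assumption for safety and is not part of the diff.
def sample_task_labels(labels: list[str], num_labels: int) -> list[str]:
    if num_labels <= 1:
        k = 1
    else:
        # random.gammavariate(2, 2) is right-skewed (mean 4, mode 2),
        # so the scaled count varies widely around a few times num_labels.
        k = int(random.gammavariate(2, 2) * num_labels)
        k = max(1, min(k, len(labels)))
    sampled = random.sample(labels, k)
    random.shuffle(sampled)
    return sampled


labels = ["politics", "sports", "technology", "health", "business"]
system_prompt = "Categorize the following news article"
inputs = [
    {"task": f"{system_prompt}. Labels: {', '.join(sample_task_labels(labels, 3))}"}
    for _ in range(4)
]
print(inputs)
```

On the labeling side, the new `else:` branch normalizes multi-label predictions by lowercasing, stripping, and de-duplicating each label against the allowed label set before the dataframe is returned.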
src/synthetic_dataset_generator/pipelines/textcat.py
CHANGED
@@ -15,35 +15,29 @@ from synthetic_dataset_generator.utils import get_preprocess_labels
 
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
 
-Your
-
-The prompt
-
-…
-{"classification_task": "Classify the following customer review of a cinema as", "labels": ["positive", "negative"]}
-…
-{"classification_task": "Categorize the following news article into one or more of the following categories:", "labels": ["politics", "sports", "technology", "entertainment", "health", "business", "environment", "education", "science", "international"]}
-…
-{"classification_task": "Classify the following news article into one or more of the following categories:", "labels": ['politics', 'sports', 'technology', 'entertainment', 'health', 'business', 'environment', 'education', 'science', 'international']}
-…
-{"classification_task": "Classify the following movie review into one of the following categories:", "labels": ['critical', 'praise', 'disappointed', 'enthusiastic']}
-…
-{"classification_task": "Categorize the following customer service transcript into one of the following categories:", "labels": ['satisfied', 'dissatisfied', 'highly-satisfied', 'somewhat-dissatisfied', 'indifferent']}
-…
+Your should write a prompt following a the dataset description. Respond with the prompt and nothing else.
+
+The prompt should follow the same style and structure as the following example prompts, clearly specifying the possible classification labels.
+
+Make sure to always include all of the detailed information from the description and the context of the company that is provided.
+
+Don't include the labels in the classification_task but only provide a high level description of the classification task.
+
+If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
+
+Description: DavidMovieHouse is a cinema that has been in business for 10 years.
+Output: {"classification_task": "The company DavidMovieHouse is a cinema that has been in business for 10 years and has had customers reviews. Classify the customer reviews as", "labels": ["positive", "negative"]}
+
+Description: A dataset that focuses on creating neo-ludite discussions about technologies within the AI space.
+Output: {"classification_task": "Neo-ludiite discussions about technologies within the AI space cover. Categorize the discussions into one of the following categories", "labels": ["tech-support", "tech-opposition"]}
+
+Description: A dataset that covers the articles of a niche sports website called TheSportBlogs that focuses on female sports within the ballsport domain for the US market.
+Output: {"classification_task": "TechSportBlogs is a niche sports website that focuses on female sports within the ballsport domain for the US market. Determine the category of based on the article using the following categories", "labels": ["basketball", "volleyball", "tennis", "hockey", "baseball", "soccer"]}
+
+Description: A dataset covering customer reviews for an e-commerce website called Argilla that sells technology datasets within the open source Natural Language Processing space and has review with labels "data-quality", "data-accuracy", "customer-service", "price", "product-availability", "shipping-speed"
+Output: {"classification_task": "A dataset covering customer reviews for an e-commerce website called Argilla that sells technology datasets within the open source Natural Language Processing space and has review with labels", "labels": ["data-quality", "data-accuracy", "customer-service", "price", "product-availability", "shipping-speed"]}
+
+Description:
 """
 
 DEFAULT_DATASET_DESCRIPTIONS = [
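For context, the few-shot examples above are written so the generator returns a single JSON object that the textcat app can parse directly (see `generate_system_prompt` in textcat.py). A small sketch of that downstream parsing, using a made-up model output:

```python
import json

# Hypothetical model output in the format the prompt above asks for.
result = (
    '{"classification_task": "Classify the customer reviews as", '
    '"labels": ["positive", "negative"]}'
)

# Mirrors the parsing in generate_system_prompt: two required keys.
data = json.loads(result)
system_prompt = data["classification_task"]
labels = data["labels"]

print(system_prompt)  # Classify the customer reviews as
print(labels)         # ['positive', 'negative']
```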
|