Commit e044b6a
1 Parent(s): 4983843
add label randomification perf created sample
src/synthetic_dataset_generator/apps/textcat.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import random
 import uuid
 from typing import List, Union
 
@@ -11,6 +12,7 @@ from huggingface_hub import HfApi
 
 from src.synthetic_dataset_generator.apps.base import (
     hide_success_message,
+    push_pipeline_code_to_hub,
     show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
@@ -119,9 +121,17 @@ def generate_dataset(
         )
         remaining_rows = num_rows - n_processed
         batch_size = min(batch_size, remaining_rows)
-        inputs = [
-
-
+        inputs = []
+        for _ in range(batch_size):
+            if num_labels == 1:
+                num_labels = 1
+            else:
+                num_labels = int(random.gammavariate(2, 2) * num_labels)
+            sampled_labels = random.sample(labels, num_labels)
+            random.shuffle(sampled_labels)
+            inputs.append(
+                {"task": f"{system_prompt}. Labels: {', '.join(sampled_labels)}"}
+            )
         batch = list(textcat_generator.process(inputs=inputs))
         textcat_results.extend(batch[0])
         n_processed += batch_size
@@ -160,6 +170,18 @@ def generate_dataset(
         dataframe["label"] = dataframe["label"].apply(
             lambda x: x.lower().strip() if x.lower().strip() in labels else None
         )
+    else:
+        dataframe["labels"] = dataframe["labels"].apply(
+            lambda x: list(
+                set(
+                    [
+                        label.lower().strip()
+                        for label in x
+                        if label.lower().strip() in labels
+                    ]
+                )
+            )
+        )
     progress(1.0, desc="Dataset generation completed")
     return dataframe
 
@@ -172,6 +194,7 @@ def push_dataset_to_hub(
     labels: List[str] = None,
     oauth_token: Union[gr.OAuthToken, None] = None,
     private: bool = False,
+    pipeline_code: str = "",
 ):
     repo_id = validate_push_to_hub(org_name, repo_name)
     labels = get_preprocess_labels(labels)
@@ -195,6 +218,7 @@ def push_dataset_to_hub(
         token=oauth_token.token,
         create_pr=False,
     )
+    push_pipeline_code_to_hub(pipeline_code, org_name, repo_name, oauth_token)
 
 
 def push_dataset(
@@ -208,6 +232,7 @@ def push_dataset(
     labels: List[str] = None,
     private: bool = False,
     temperature: float = 0.8,
+    pipeline_code: str = "",
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
@@ -221,7 +246,14 @@ def push_dataset(
         temperature=temperature,
     )
     push_dataset_to_hub(
-        dataframe,
+        dataframe,
+        org_name,
+        repo_name,
+        num_labels,
+        labels,
+        oauth_token,
+        private,
+        pipeline_code,
     )
 
     dataframe = dataframe[
@@ -407,7 +439,7 @@ with gr.Blocks() as app:
                 ("Ambiguous", "ambiguous"),
                 ("Mixed", "mixed"),
             ],
-            value="
+            value="understandable with some effort",
             label="Clarity",
             info="Set how easily the correct label or labels can be identified.",
             interactive=True,
@@ -419,7 +451,7 @@ with gr.Blocks() as app:
                 ("PhD", "PhD"),
                 ("Mixed", "mixed"),
             ],
-            value="
+            value="high school",
            label="Difficulty",
            info="Select the comprehension level for the text. Ensure it matches the task context.",
            interactive=True,
@@ -544,6 +576,7 @@ with gr.Blocks() as app:
                 labels,
                 private,
                 temperature,
+                pipeline_code,
             ],
             outputs=[success_message],
             show_progress=True,
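The main change in this file is the per-example label randomization. Below is a minimal standalone sketch of that sampling logic, not part of the commit: the label names and the clamp of the sample size to [1, len(labels)] are added assumptions (random.sample raises ValueError if asked for more items than the population holds).

import random

labels = ["politics", "sports", "technology", "entertainment"]  # illustrative labels
num_labels = 2  # requested average number of labels per example

inputs = []
for _ in range(4):
    if num_labels == 1:
        k = 1
    else:
        # draw a per-example label count from a Gamma(2, 2) distribution scaled by num_labels
        k = int(random.gammavariate(2, 2) * num_labels)
    k = max(1, min(k, len(labels)))  # assumed clamp, not present in the committed code
    sampled_labels = random.sample(labels, k)
    random.shuffle(sampled_labels)
    inputs.append({"task": f"Classify the text. Labels: {', '.join(sampled_labels)}"})

print(inputs)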
src/synthetic_dataset_generator/pipelines/textcat.py
CHANGED
@@ -15,35 +15,29 @@ from synthetic_dataset_generator.utils import get_preprocess_labels
 
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
 
-Your 
+Your should write a prompt following a the dataset description. Respond with the prompt and nothing else.
 
-The prompt 
+The prompt should follow the same style and structure as the following example prompts, clearly specifying the possible classification labels.
 
-
-
-{"classification_task": "Classify the following customer review of a cinema as", "labels": ["positive", "negative"]}
-
-{"classification_task": "Categorize the following news article into one or more of the following categories:", "labels": ["politics", "sports", "technology", "entertainment", "health", "business", "environment", "education", "science", "international"]}
-
-{"classification_task": "Classify the following news article into one or more of the following categories:", "labels": ['politics', 'sports', 'technology', 'entertainment', 'health', 'business', 'environment', 'education', 'science', 'international']}
+Make sure to always include all of the detailed information from the description and the context of the company that is provided.
 
-
+Don't include the labels in the classification_task but only provide a high level description of the classification task.
 
-
-
-{"classification_task": "Classify the following movie review into one of the following categories:", "labels": ['critical', 'praise', 'disappointed', 'enthusiastic']}
-
-{"classification_task": "Categorize the following customer service transcript into one of the following categories:", "labels": ['satisfied', 'dissatisfied', 'highly-satisfied', 'somewhat-dissatisfied', 'indifferent']}
+If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
 
-
+Description: DavidMovieHouse is a cinema that has been in business for 10 years.
+Output: {"classification_task": "The company DavidMovieHouse is a cinema that has been in business for 10 years and has had customers reviews. Classify the customer reviews as", "labels": ["positive", "negative"]}
 
-
+Description: A dataset that focuses on creating neo-ludite discussions about technologies within the AI space.
+Output: {"classification_task": "Neo-ludiite discussions about technologies within the AI space cover. Categorize the discussions into one of the following categories", "labels": ["tech-support", "tech-opposition"]}
 
-
+Description: A dataset that covers the articles of a niche sports website called TheSportBlogs that focuses on female sports within the ballsport domain for the US market.
+Output: {"classification_task": "TechSportBlogs is a niche sports website that focuses on female sports within the ballsport domain for the US market. Determine the category of based on the article using the following categories", "labels": ["basketball", "volleyball", "tennis", "hockey", "baseball", "soccer"]}
 
-
+Description: A dataset covering customer reviews for an e-commerce website called Argilla that sells technology datasets within the open source Natural Language Processing space and has review with labels "data-quality", "data-accuracy", "customer-service", "price", "product-availability", "shipping-speed"
+Output: {"classification_task": "A dataset covering customer reviews for an e-commerce website called Argilla that sells technology datasets within the open source Natural Language Processing space and has review with labels", "labels": ["data-quality", "data-accuracy", "customer-service", "price", "product-availability", "shipping-speed"]}
 
-
+Description:
 """
 
 DEFAULT_DATASET_DESCRIPTIONS = [
@@ -66,6 +60,19 @@ class TextClassificationTask(BaseModel):
     )
 
 
+class DatasetDescription(BaseModel):
+    description: str = Field(
+        ...,
+        title="description",
+        description="The description of the dataset.",
+    )
+    labels: list[str] = Field(
+        ...,
+        title="labels",
+        description="The possible labels for the classification task.",
+    )
+
+
 def get_prompt_generator():
     prompt_generator = TextGeneration(
         llm=InferenceEndpointsLLM(
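For reference, a small usage sketch of the DatasetDescription model added above. This is illustrative only: it assumes the pydantic BaseModel/Field imports already used elsewhere in this file, and the field values are invented for the example.

from pydantic import BaseModel, Field

class DatasetDescription(BaseModel):
    description: str = Field(..., title="description", description="The description of the dataset.")
    labels: list[str] = Field(..., title="labels", description="The possible labels for the classification task.")

# illustrative values, not taken from the commit
example = DatasetDescription(
    description="Customer reviews for a cinema that has been in business for 10 years.",
    labels=["positive", "negative"],
)
print(example)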