Spaces:

fdaudens
/

journalism_config_space

Runtime error

App Files Files Community

ignacioct commited on Apr 23, 2024

Commit

8773ff3

1 Parent(s): 7f24bfc

recommiting all files

Browse files

Files changed (17) hide show

.streamlit/config.toml +0 -0
DATASET_README_BASE.md +6 -0
README.md +4 -3
app.py +94 -0
defaults.py +38 -0
domain.py +89 -0
hub.py +129 -0
infer.py +16 -0
pages/2_👩🏼‍🔬 Describe Domain.py +240 -0
pages/3_🌱 Generate Dataset.py +219 -0
pages/4_🔍 Review Generated Data.py +48 -0
pipeline.py +184 -0
pipeline.yaml +546 -0
project_config.json +1 -0
requirements.txt +8 -0
seed_data.json +40 -0
utils.py +32 -0

.streamlit/config.toml ADDED Viewed

File without changes

DATASET_README_BASE.md ADDED Viewed

	@@ -0,0 +1,6 @@

+# Domain Dataset Grower
+This dataset was generated by [distilabel](https://distilabel.argilla.io/latest/) as a domain-specific dataset for the domain of farming. The dataset used this seed data to generate the samples. The seed data was defined by a domain expert and the generated data can be reviewed in this [Argilla](https://argilla.io/) space here: [Argilla](https://huggingface.co/spaces/argilla/farming)
+If you want to define a domain-specific seed dataset for your own domain, you can use the distilabel tool to generate the dataset, and seed your dataset [here](https://huggingface.co/spaces/argilla/domain-specific-seed)

README.md CHANGED Viewed

@@ -1,12 +1,13 @@
 ---
-title: Domain Specific Dataset Template
 emoji: 💻
-colorFrom: indigo
-colorTo: green
 sdk: streamlit
 sdk_version: 1.33.0
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Domain Specific Seed
 emoji: 💻
+colorFrom: purple
+colorTo: red
 sdk: streamlit
 sdk_version: 1.33.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import streamlit as st
+from defaults import (
+    PROJECT_NAME,
+    ARGILLA_SPACE_REPO_ID,
+    DATASET_REPO_ID,
+    ARGILLA_URL,
+    PROJECT_SPACE_REPO_ID,
+    DIBT_PARENT_APP_URL,
+)
+from utils import project_sidebar
+# Landing page: configure the tab title/icon, then call the shared project_sidebar() helper from utils.
+st.set_page_config("Domain Data Grower", page_icon="🧑‍🌾")
+project_sidebar()
+# A PROJECT_NAME that still equals the literal "DEFAULT_DOMAIN" is treated as
+# "not configured yet": warn and halt the page before rendering anything else.
+if PROJECT_NAME == "DEFAULT_DOMAIN":
+    st.warning(
+        "Please set up the project configuration in the parent app before proceeding."
+    )
+    st.stop()
+st.header("🧑‍🌾 Domain Data Grower")
+st.divider()
+# Introductory copy.
+st.markdown(
+    """
+## 🌱 Create a dataset seed for aligning models to a specific domain
+This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
+Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
+"""
+)
+st.markdown(
+    """
+## 🚜 How it works
+You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
+The dataset seed is then used to generate synthetic data for training a language model.
+"""
+)
+# Step 1 is rendered struck through (~~...~~) and links out to the parent app.
+st.markdown(
+    """
+## 🗺️ The process
+### Step 1: ~~Setup the project~~
+~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~
+"""
+)
+st.link_button("🚀 ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)
+# Step 2 links to the "Describe Domain" page of this app.
+st.markdown(
+    """
+### Step 2: Describe the Domain
+Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
+You can collaborate with domain experts to define the domain expertise and perspectives.
+"""
+)
+st.page_link(
+    "pages/2_👩🏼‍🔬 Describe Domain.py",
+    label="Describe Domain",
+    icon="👩🏼‍🔬",
+)
+# Step 3 links to the "Generate Dataset" page of this app.
+st.markdown(
+    """
+### Step 3: Generate Synthetic Data
+Use distilabel to generate synthetic data for your domain-specific dataset.
+You can run the pipeline locally or in this space to generate synthetic data.
+"""
+)
+st.page_link(
+    "pages/3_🌱 Generate Dataset.py",
+    label="Generate Dataset",
+    icon="🌱",
+)
+# Step 4 links out to the Argilla instance whose URL is derived in defaults.py.
+st.markdown(
+    """
+### Step 4: Review the Dataset
+Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
+"""
+)
+st.link_button("🔍 Review the dataset in Argilla", ARGILLA_URL)

defaults.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import json
+SEED_DATA_PATH = "seed_data.json"
+PIPELINE_PATH = "pipeline.yaml"
+REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py"]
+DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
+N_PERSPECTIVES = 5
+N_TOPICS = 5
+N_EXAMPLES = 5
+################################################
+# DEFAULTS ON FARMING
+################################################
+with open(SEED_DATA_PATH) as f:
+    DEFAULT_DATA = json.load(f)
+DEFAULT_DOMAIN = DEFAULT_DATA["domain"]
+DEFAULT_PERSPECTIVES = DEFAULT_DATA["perspectives"]
+DEFAULT_TOPICS = DEFAULT_DATA["topics"]
+DEFAULT_EXAMPLES = DEFAULT_DATA["examples"]
+DEFAULT_SYSTEM_PROMPT = DEFAULT_DATA["domain_expert_prompt"]
+################################################
+# PROJECT CONFIG FROM PARENT APP
+################################################
+with open("project_config.json") as f:
+    PROJECT_CONFIG = json.load(f)
+PROJECT_NAME = PROJECT_CONFIG["project_name"]
+ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
+DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
+ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
+ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
+PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
+DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
+HUB_USERNAME = DATASET_REPO_ID.split("/")[0]

domain.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import json
+from typing import Any, Dict, List
+from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.text_generation import TextGeneration
+from distilabel.steps import StepInput, StepOutput, Step
+from dotenv import load_dotenv
+from defaults import (
+    DEFAULT_DOMAIN,
+    DEFAULT_PERSPECTIVES,
+    DEFAULT_TOPICS,
+    DEFAULT_EXAMPLES,
+    DEFAULT_SYSTEM_PROMPT,
+    N_PERSPECTIVES,
+    N_TOPICS,
+    N_EXAMPLES,
+)
+# Load environment variables via python-dotenv before anything below runs.
+load_dotenv()
+# Application description used for SelfInstruct
+# The domain name is interpolated once, at import time, from DEFAULT_DOMAIN.
+APPLICATION_DESCRIPTION = f"""You are an AI assistant than generates queries around the domain of {DEFAULT_DOMAIN}.
+Your should not expect basic but profound questions from your users.
+The queries should reflect a diversity of vision and economic positions and political positions.
+The queries may know about different methods of {DEFAULT_DOMAIN}.
+The queries can be positioned politically, economically, socially, or practically.
+Also take into account the impact of diverse causes on diverse domains."""
+# Default seed lists truncated to at most N_TOPICS / N_PERSPECTIVES / N_EXAMPLES entries.
+TOPICS = DEFAULT_TOPICS[:N_TOPICS]
+PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
+EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]
+def create_examples_template(examples: List[Dict[str, str]]) -> List[str]:
+    questions = """ Examples of high quality questions:"""
+    answers = """ Examples of high quality answers:"""
+    for example in examples:
+        questions += f"""\n- Question: {example["question"]}\n"""
+        answers += f"""\n- Answer: {example["answer"]}\n"""
+    _template: str = (
+        """{instruction}\nThis is the the instruction.\n Examples: """
+        + questions
+        + answers
+    )
+    return _template
+def create_topics(topics: List[str], positions: List[str]) -> List[str]:
+    return [
+        f"{topic} from a {position} perspective"
+        for topic in topics
+        for position in positions
+    ]
+class DomainExpert(TextGeneration):
+    """A customized task to generate text as a domain expert in the domain of farming and agriculture."""
+    _system_prompt: (str) = DEFAULT_SYSTEM_PROMPT
+    _template: str = """{instruction}\nThis is the the instruction.\n Examples: """
+    def format_input(self, input: Dict[str, Any]) -> "ChatType":
+        return [
+            {
+                "role": "system",
+                "content": self._system_prompt,
+            },
+            {
+                "role": "user",
+                "content": self._template.format(**input),
+            },
+        ]
+class CleanNumberedList(Step):
+    """A step to clean the numbered list of questions."""
+    def process(self, inputs: StepInput) -> StepOutput:
+        import re
+        pattern = r"^\d+\.\s"
+        for input in inputs:
+            input["question"] = re.sub(pattern, "", input["question"])
+        yield inputs

hub.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import json
+from tempfile import mktemp
+import argilla as rg
+from huggingface_hub import HfApi
+from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
+# Module-wide Hub client used by the upload and download helpers below.
+hf_api = HfApi()
+# Boilerplate text that create_readme() places at the top of the generated README.
+with open("DATASET_README_BASE.md") as f:
+    DATASET_README_BASE = f.read()
+def create_readme(domain_seed_data, project_name, domain):
+    # create a readme for the project that shows the domain and project name
+    readme = DATASET_README_BASE
+    readme += f"# {project_name}\n\n## Domain: {domain}"
+    perspectives = domain_seed_data.get("perspectives")
+    topics = domain_seed_data.get("topics")
+    examples = domain_seed_data.get("examples")
+    if perspectives:
+        readme += "\n\n## Perspectives\n\n"
+        for p in perspectives:
+            readme += f"- {p}\n"
+    if topics:
+        readme += "\n\n## Topics\n\n"
+        for t in topics:
+            readme += f"- {t}\n"
+    if examples:
+        readme += "\n\n## Examples\n\n"
+        for example in examples:
+            readme += f"### {example['question']}\n\n{example['answer']}\n\n"
+    temp_file = mktemp()
+    with open(temp_file, "w") as f:
+        f.write(readme)
+    return temp_file
+def setup_dataset_on_hub(repo_id, hub_token):
+    # create an empty dataset repo on the hub
+    hf_api.create_repo(
+        repo_id=repo_id,
+        token=hub_token,
+        repo_type="dataset",
+        exist_ok=True,
+    )
+def push_dataset_to_hub(
+    domain_seed_data_path,
+    project_name,
+    domain,
+    pipeline_path,
+    hub_username,
+    hub_token: str,
+):
+    repo_id = f"{hub_username}/{project_name}"
+    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
+    #  upload the seed data and readme to the hub
+    hf_api.upload_file(
+        path_or_fileobj=domain_seed_data_path,
+        path_in_repo="seed_data.json",
+        token=hub_token,
+        repo_id=repo_id,
+        repo_type="dataset",
+    )
+    # upload the readme to the hub
+    domain_seed_data = json.load(open(domain_seed_data_path))
+    hf_api.upload_file(
+        path_or_fileobj=create_readme(
+            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
+        ),
+        path_in_repo="README.md",
+        token=hub_token,
+        repo_id=repo_id,
+        repo_type="dataset",
+    )
+def push_pipeline_to_hub(
+    pipeline_path,
+    hub_username,
+    hub_token: str,
+    project_name,
+):
+    repo_id = f"{hub_username}/{project_name}"
+    # upload the pipeline to the hub
+    hf_api.upload_file(
+        path_or_fileobj=pipeline_path,
+        path_in_repo="pipeline.yaml",
+        token=hub_token,
+        repo_id=repo_id,
+        repo_type="dataset",
+    )
+    for code_path in REMOTE_CODE_PATHS:
+        hf_api.upload_file(
+            path_or_fileobj=code_path,
+            path_in_repo=code_path,
+            token=hub_token,
+            repo_id=repo_id,
+            repo_type="dataset",
+        )
+    print(f"Dataset uploaded to {repo_id}")
+def pull_seed_data_from_repo(repo_id, hub_token):
+    # pull the dataset repo from the hub
+    hf_api.hf_hub_download(
+        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
+    )
+    return json.load(open(SEED_DATA_PATH))
+def push_argilla_dataset_to_hub(
+    name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
+):
+    rg.init(api_url=url, api_key=api_key)
+    feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
+    local_dataset = feedback_dataset.pull()
+    local_dataset.push_to_huggingface(repo_id=repo_id)

infer.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import os
+import requests
+# API token read from the environment; None when HF_API_KEY is not set.
+HF_API_KEY = os.getenv("HF_API_KEY")
+# Endpoint that query() posts to.
+API_URL = (
+    "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
+)
+# Built once at import time, so an unset HF_API_KEY yields the value "Bearer None".
+headers = {"Authorization": f"Bearer {HF_API_KEY}"}
+def query(question):
+    payload = {
+        "inputs": question,
+    }
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()[0]["generated_text"]

pages/2_👩🏼‍🔬 Describe Domain.py ADDED Viewed

	@@ -0,0 +1,240 @@

+import json
+import streamlit as st
+from hub import push_dataset_to_hub
+from infer import query
+from defaults import (
+    DEFAULT_DOMAIN,
+    DEFAULT_PERSPECTIVES,
+    DEFAULT_TOPICS,
+    DEFAULT_EXAMPLES,
+    DEFAULT_SYSTEM_PROMPT,
+    N_PERSPECTIVES,
+    N_TOPICS,
+    SEED_DATA_PATH,
+    PIPELINE_PATH,
+    PROJECT_NAME,
+    DATASET_REPO_ID,
+)
+from utils import project_sidebar
+# Page setup: tab title/icon, then the shared project_sidebar() helper from utils.
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="🧑‍🌾",
+)
+project_sidebar()
+################################################################################
+# HEADER
+################################################################################
+st.header("🧑‍🌾 Domain Data Grower")
+st.divider()
+st.subheader(
+    "Step 2. Define the specific domain that you want to generate synthetic data for.",
+)
+st.write(
+    "Define the project details, including the project name, domain, and API credentials"
+)
+################################################################################
+# Domain Expert Section
+################################################################################
+# One tab per part of the seed; the later sections of this page fill them in.
+(
+    tab_domain_expert,
+    tab_domain_perspectives,
+    tab_domain_topics,
+    tab_examples,
+) = st.tabs(
+    tabs=[
+        "👩🏼‍🔬 Domain Expert",
+        "🔍 Domain Perspectives",
+        "🕸️ Domain Topics",
+        "📚 Examples",
+    ]
+)
+with tab_domain_expert:
+    st.text("Define the domain expertise that you want to train a language model")
+    st.info(
+        "A domain expert is a person who is an expert in a particular field or area. For example, a domain expert in farming would be someone who has extensive knowledge and experience in farming and agriculture."
+    )
+    # Both inputs are pre-filled from the defaults loaded in defaults.py; their
+    # values are written into the seed data when the seed is pushed.
+    domain = st.text_input("Domain Name", DEFAULT_DOMAIN)
+    domain_expert_prompt = st.text_area(
+        label="Domain Expert Definition",
+        value=DEFAULT_SYSTEM_PROMPT,
+        height=200,
+    )
+################################################################################
+# Domain Perspectives
+################################################################################
+with tab_domain_perspectives:
+    st.text("Define the different perspectives from which the domain can be viewed")
+    st.info(
+        """
+    Perspectives are different viewpoints or angles from which a domain can be viewed.
+    For example, the domain of farming can be viewed from the perspective of a commercial
+    farmer or an independent family farmer."""
+    )
+    perspectives = st.session_state.get(
+        "perspectives",
+        [st.text_input(f"Domain Perspective 0", value=DEFAULT_PERSPECTIVES[0])],
+    )
+    if st.button("Add New Perspective"):
+        n = len(perspectives)
+        value = DEFAULT_PERSPECTIVES[n] if n < N_PERSPECTIVES else ""
+        perspectives.append(st.text_input(f"Domain Perspective {n}", value=""))
+        st.session_state["perspectives"] = perspectives
+################################################################################
+# Domain Topics
+################################################################################
+with tab_domain_topics:
+    st.text("Define the main themes or subjects that are relevant to the domain")
+    st.info(
+        """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
+    )
+    topics = st.session_state.get(
+        "topics", [st.text_input(f"Domain Topic 0", value=DEFAULT_TOPICS[0])]
+    )
+    new_topic = st.button("Add New Topic")
+    if new_topic:
+        n = len(topics)
+        value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
+        topics.append(st.text_input(f"Domain Topic {n}", value=value))
+        st.session_state["topics"] = topics
+################################################################################
+# Examples Section
+################################################################################
+with tab_examples:
+    st.text(
+        "Add high-quality questions and answers that can be used to generate synthetic data"
+    )
+    st.info(
+        """
+    Examples are high-quality questions and answers that can be used to generate
+    synthetic data for the domain. These examples will be used to train the language model
+    to generate questions and answers.
+    """
+    )
+    questions_answers = st.session_state.get(
+        "questions_answers",
+        [
+            (
+                st.text_area(
+                    "Question", key="question_0", value=DEFAULT_EXAMPLES[0]["question"]
+                ),
+                st.text_area(
+                    "Answer", key="answer_0", value=DEFAULT_EXAMPLES[0]["answer"]
+                ),
+            )
+        ],
+    )
+    if st.button("Add New Example"):
+        n = len(questions_answers)
+        default_question, default_answer = DEFAULT_EXAMPLES[n].values()
+        st.subheader(f"Example {n + 1}")
+        if st.button("Generate New Answer", key=f"generate_{n}"):
+            default_answer = query(default_question)
+        _question = st.text_area(
+            "Question", key=f"question_{n}", value=default_question
+        )
+        _answer = st.text_area("Answer", key=f"answer_{n}", value=default_answer)
+        questions_answers.append((_question, _answer))
+        st.session_state["questions_answers"] = questions_answers
+################################################################################
+# Setup Dataset on the Hub
+################################################################################
+st.divider()
+hub_username = DATASET_REPO_ID.split("/")[0]
+project_name = DATASET_REPO_ID.split("/")[1]
+st.write("Define the dataset repo details on the Hub")
+st.session_state["project_name"] = st.text_input("Project Name", project_name)
+st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
+st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)
+if all(
+    (
+        st.session_state.get("project_name"),
+        st.session_state.get("hub_username"),
+        st.session_state.get("hub_token"),
+    )
+):
+    st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
+if st.button("🤗 Push Dataset Seed") and all(
+    (
+        domain,
+        domain_expert_prompt,
+        perspectives,
+        topics,
+        questions_answers,
+    )
+):
+    if all(
+        (
+            st.session_state.get("project_name"),
+            st.session_state.get("hub_username"),
+            st.session_state.get("hub_token"),
+        )
+    ):
+        project_name = st.session_state["project_name"]
+        hub_username = st.session_state["hub_username"]
+        hub_token = st.session_state["hub_token"]
+    else:
+        st.error(
+            "Please create a dataset repo on the Hub before pushing the dataset seed"
+        )
+        st.stop()
+    perspectives = list(filter(None, perspectives))
+    topics = list(filter(None, topics))
+    examples = [{"question": q, "answer": a} for q, a in questions_answers]
+    domain_data = {
+        "domain": domain,
+        "perspectives": perspectives,
+        "topics": topics,
+        "examples": examples,
+        "domain_expert_prompt": domain_expert_prompt,
+    }
+    with open(SEED_DATA_PATH, "w") as f:
+        json.dump(domain_data, f, indent=2)
+    push_dataset_to_hub(
+        domain_seed_data_path=SEED_DATA_PATH,
+        project_name=project_name,
+        domain=domain,
+        hub_username=hub_username,
+        hub_token=hub_token,
+        pipeline_path=PIPELINE_PATH,
+    )
+    st.sidebar.success(
+        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
+    )
+else:
+    st.info(
+        "Please fill in all the required domain fields to push the dataset seed to the Hub"
+    )

pages/3_🌱 Generate Dataset.py ADDED Viewed

	@@ -0,0 +1,219 @@

+import streamlit as st
+from streamlit.errors import EntryNotFoundError
+from hub import pull_seed_data_from_repo, push_pipeline_to_hub
+from defaults import (
+    DEFAULT_SYSTEM_PROMPT,
+    PIPELINE_PATH,
+    PROJECT_NAME,
+    ARGILLA_SPACE_REPO_ID,
+    DATASET_REPO_ID,
+    ARGILLA_SPACE_NAME,
+    ARGILLA_URL,
+    PROJECT_SPACE_REPO_ID,
+    HUB_USERNAME,
+)
+from utils import project_sidebar
+from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
+# Page setup: tab title/icon, then the shared project_sidebar() helper from utils.
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="🧑‍🌾",
+)
+project_sidebar()
+################################################################################
+# HEADER
+################################################################################
+st.header("🧑‍🌾 Domain Data Grower")
+st.divider()
+st.subheader("Step 3. Run the pipeline to generate synthetic data")
+st.write(
+    "Define the project details, including the project name, domain, and API credentials"
+)
+###############################################################
+# CONFIGURATION
+###############################################################
+st.divider()
+st.markdown("### Pipeline Configuration")
+st.write("🤗 Hub details to pull the seed data")
+# Username and project name default to the values derived in defaults.py.
+hub_username = st.text_input("Hub Username", HUB_USERNAME)
+project_name = st.text_input("Project Name", PROJECT_NAME)
+repo_id = f"{hub_username}/{project_name}"
+hub_token = st.text_input("Hub Token", type="password")
+st.write("🤖 Inference configuration")
+st.write(
+    "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
+)
+st.link_button(
+    "🤗 Inference compaptible models on the hub",
+    "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
+)
+# No default: both run buttons below refuse to start while this is empty.
+base_url = st.text_input("Base URL")
+st.write("🔬 Argilla API details to push the generated dataset")
+argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
+argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
+# The Argilla dataset name defaults to whatever project name was entered above.
+argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
+st.divider()
+###############################################################
+# LOCAL
+###############################################################
+st.markdown("### Run the pipeline")
+st.write(
+    "Once you've defined the pipeline configuration, you can run the pipeline locally or on this space."
+)
+st.write(
+    """We recommend running the pipeline locally if you're planning on generating a large dataset. \
+        But running the pipeline on this space is a handy way to get started quickly. Your synthetic
+        samples will be pushed to Argilla and available for review.
+        """
+)
+st.write(
+    """If you're planning on running the pipeline on the space, be aware that it \
+        will take some time to complete and you will need to maintain a \
+        connection to the space."""
+)
+if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
+    if all(
+        [
+            argilla_api_key,
+            argilla_url,
+            base_url,
+            hub_username,
+            project_name,
+            hub_token,
+            argilla_dataset_name,
+        ]
+    ):
+        with st.spinner("Pulling seed data from the Hub..."):
+            seed_data = pull_seed_data_from_repo(
+                repo_id=f"{hub_username}/{project_name}",
+                hub_token=hub_token,
+            )
+            domain = seed_data["domain"]
+            perspectives = seed_data["perspectives"]
+            topics = seed_data["topics"]
+            examples = seed_data["examples"]
+            domain_expert_prompt = seed_data["domain_expert_prompt"]
+        with st.spinner("Serializing the pipeline configuration..."):
+            serialize_pipeline(
+                argilla_api_key=argilla_api_key,
+                argilla_dataset_name=argilla_dataset_name,
+                argilla_api_url=argilla_url,
+                topics=topics,
+                perspectives=perspectives,
+                pipeline_config_path=PIPELINE_PATH,
+                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
+                hub_token=hub_token,
+                endpoint_base_url=base_url,
+                examples=examples,
+            )
+            push_pipeline_to_hub(
+                pipeline_path=PIPELINE_PATH,
+                hub_token=hub_token,
+                hub_username=hub_username,
+                project_name=project_name,
+            )
+        st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")
+        st.info(
+            "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
+        )
+        st.text(
+            "Execute the following command to generate a synthetic dataset from the seed data:"
+        )
+        command_to_run = create_pipelines_run_command(
+            hub_token=hub_token,
+            pipeline_config_path=PIPELINE_PATH,
+            argilla_dataset_name=argilla_dataset_name,
+        )
+        st.code(
+            f"""
+            pip install git+https://github.com/argilla-io/distilabel.git
+            git clone https://huggingface.co/{hub_username}/{project_name}
+            cd {project_name}
+            {' '.join(command_to_run[2:])}
+        """,
+            language="bash",
+        )
+    else:
+        st.error("Please fill all the required fields.")
+###############################################################
+# SPACE
+###############################################################
+if st.button("🔥 Run pipeline right here, right now!"):
+    if all(
+        [
+            argilla_api_key,
+            argilla_url,
+            base_url,
+            hub_username,
+            project_name,
+            hub_token,
+            argilla_dataset_name,
+        ]
+    ):
+        with st.spinner("Pulling seed data from the Hub..."):
+            try:
+                seed_data = pull_seed_data_from_repo(
+                    repo_id=f"{hub_username}/{project_name}",
+                    hub_token=hub_token,
+                )
+            except EntryNotFoundError:
+                st.error(
+                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
+                )
+            domain = seed_data["domain"]
+            perspectives = seed_data["perspectives"]
+            topics = seed_data["topics"]
+            examples = seed_data["examples"]
+            domain_expert_prompt = seed_data["domain_expert_prompt"]
+        with st.spinner("Serializing the pipeline configuration..."):
+            serialize_pipeline(
+                argilla_api_key=argilla_api_key,
+                argilla_dataset_name=argilla_dataset_name,
+                argilla_api_url=argilla_url,
+                topics=topics,
+                perspectives=perspectives,
+                pipeline_config_path=PIPELINE_PATH,
+                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
+                hub_token=hub_token,
+                endpoint_base_url=base_url,
+                examples=examples,
+            )
+        with st.spinner("Starting the pipeline..."):
+            logs = run_pipeline(PIPELINE_PATH)
+        st.success(f"Pipeline started successfully! 🚀")
+        with st.expander(label="View Logs", expanded=True):
+            for out in logs:
+                st.text(out)
+    else:
+        st.error("Please fill all the required fields.")

pages/4_🔍 Review Generated Data.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import streamlit as st
+from defaults import PROJECT_NAME, ARGILLA_URL, DATASET_REPO_ID
+from utils import project_sidebar
+from hub import push_argilla_dataset_to_hub
+# Page setup: tab title/icon, then the shared project_sidebar() helper from utils.
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="🧑‍🌾",
+)
+project_sidebar()
+################################################################################
+# HEADER
+################################################################################
+st.header("🧑‍🌾 Domain Data Grower")
+st.divider()
+st.write(
+    """Once you have reviewed the synthetic data in Argilla, you can publish the
+    generated dataset to the Hub."""
+)
+################################################################################
+# Configuration
+################################################################################
+st.divider()
+st.write("🔬 Argilla API details to push the generated dataset")
+# All four fields are pre-filled: URL, dataset name and repo id from defaults.py,
+# the API key with the literal "owner.apikey".
+argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
+argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
+argilla_dataset_name = st.text_input("Argilla Dataset Name", PROJECT_NAME)
+dataset_repo_id = st.text_input("Dataset Repo ID", DATASET_REPO_ID)
+st.divider()
+# Publishing copies the Argilla dataset to the Hub repo entered above; the
+# workspace is fixed to "admin". The success message is only reached when
+# push_argilla_dataset_to_hub() returns without raising.
+if st.button("🚀 Publish the generated dataset"):
+    with st.spinner("Publishing the generated dataset..."):
+        push_argilla_dataset_to_hub(
+            name=argilla_dataset_name,
+            repo_id=dataset_repo_id,
+            url=argilla_url,
+            api_key=argilla_api_key,
+            workspace="admin",
+        )
+    st.success("The generated dataset has been published to the Hub.")

pipeline.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import os
+import subprocess
+import time
+from typing import List
+from distilabel.steps.generators.data import LoadDataFromDicts
+from distilabel.steps.expand import ExpandColumns
+from distilabel.steps.keep import KeepColumns
+from distilabel.steps.tasks.self_instruct import SelfInstruct
+from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
+from distilabel.llms.huggingface import InferenceEndpointsLLM
+from distilabel.pipeline import Pipeline
+from distilabel.steps import TextGenerationToArgilla
+from dotenv import load_dotenv
+from domain import (
+    DomainExpert,
+    CleanNumberedList,
+    create_topics,
+    create_examples_template,
+    APPLICATION_DESCRIPTION,
+)
+load_dotenv()
+def define_pipeline(
+    argilla_api_key: str,
+    argilla_api_url: str,
+    argilla_dataset_name: str,
+    topics: List[str],
+    perspectives: List[str],
+    domain_expert_prompt: str,
+    examples: List[dict],
+    hub_token: str,
+    endpoint_base_url: str,
+):
+    """Define the pipeline for the specific domain.
+
+    Steps are connected in this order: load_data -> self-instruct ->
+    expand_columns -> clean_numbered_list -> evol_instruction_complexity ->
+    expand_columns_evolved -> domain_expert -> keep_columns ->
+    text_generation_to_argilla.
+
+    Args:
+        argilla_api_key: API key handed to the Argilla export step.
+        argilla_api_url: API URL handed to the Argilla export step.
+        argilla_dataset_name: Dataset name handed to the Argilla export step.
+        topics: Topics, combined with ``perspectives`` by ``create_topics``.
+        perspectives: Perspectives, combined with ``topics``.
+        domain_expert_prompt: Passed to the DomainExpert step as ``_system_prompt``.
+        examples: Passed to ``create_examples_template`` to build the
+            DomainExpert ``_template``.
+        hub_token: Used as the API key of the inference LLM.
+        endpoint_base_url: Used as the base URL of the inference LLM.
+
+    Returns:
+        The constructed ``Pipeline`` object.
+    """
+    # One input row per (topic, perspective) combination.
+    terms = create_topics(topics, perspectives)
+    template = create_examples_template(examples)
+    # The pipeline name is the fixed string "farming", whatever the domain is.
+    with Pipeline("farming") as pipeline:
+        load_data = LoadDataFromDicts(
+            name="load_data",
+            data=[{"input": term} for term in terms],
+            batch_size=64,
+        )
+        # A single LLM client is shared by all three generation steps below.
+        llm = InferenceEndpointsLLM(
+            base_url=endpoint_base_url,
+            api_key=hub_token,
+        )
+        self_instruct = SelfInstruct(
+            name="self-instruct",
+            application_description=APPLICATION_DESCRIPTION,
+            num_instructions=5,
+            input_batch_size=8,
+            llm=llm,
+        )
+        # Reads its "instruction" input from the "question" column.
+        evol_instruction_complexity = EvolInstruct(
+            name="evol_instruction_complexity",
+            llm=llm,
+            num_evolutions=2,
+            store_evolutions=True,
+            input_batch_size=8,
+            include_original_instruction=True,
+            input_mappings={"instruction": "question"},
+        )
+        # Maps the "instructions" column onto "question".
+        expand_instructions = ExpandColumns(
+            name="expand_columns", columns={"instructions": "question"}
+        )
+        cleaner = CleanNumberedList(name="clean_numbered_list")
+        # Maps the "evolved_instructions" column onto "evolved_questions".
+        expand_evolutions = ExpandColumns(
+            name="expand_columns_evolved",
+            columns={"evolved_instructions": "evolved_questions"},
+        )
+        # NOTE(review): `_system_prompt` and `_template` are underscore-prefixed
+        # keyword arguments; confirm the base class actually applies them
+        # instead of keeping the class-level defaults.
+        domain_expert = DomainExpert(
+            name="domain_expert",
+            llm=llm,
+            input_batch_size=8,
+            input_mappings={"instruction": "evolved_questions"},
+            output_mappings={"generation": "domain_expert_answer"},
+            _system_prompt=domain_expert_prompt,
+            _template=template,
+        )
+        keep_columns = KeepColumns(
+            name="keep_columns",
+            columns=["model_name", "evolved_questions", "domain_expert_answer"],
+        )
+        # The Argilla workspace is fixed to "admin".
+        to_argilla = TextGenerationToArgilla(
+            name="text_generation_to_argilla",
+            dataset_name=argilla_dataset_name,
+            dataset_workspace="admin",
+            api_url=argilla_api_url,
+            api_key=argilla_api_key,
+            input_mappings={
+                "instruction": "evolved_questions",
+                "generation": "domain_expert_answer",
+            },
+        )
+        # Wire the steps into a single linear chain.
+        load_data.connect(self_instruct)
+        self_instruct.connect(expand_instructions)
+        expand_instructions.connect(cleaner)
+        cleaner.connect(evol_instruction_complexity)
+        evol_instruction_complexity.connect(expand_evolutions)
+        expand_evolutions.connect(domain_expert)
+        domain_expert.connect(keep_columns)
+        keep_columns.connect(to_argilla)
+    return pipeline
+def serialize_pipeline(
+    argilla_api_key: str,
+    argilla_api_url: str,
+    argilla_dataset_name: str,
+    topics: List[str],
+    perspectives: List[str],
+    domain_expert_prompt: str,
+    hub_token: str,
+    endpoint_base_url: str,
+    pipeline_config_path: str = "pipeline.yaml",
+    examples: List[dict] = [],
+):
+    """Serialize the pipeline to a yaml file."""
+    pipeline = define_pipeline(
+        argilla_api_key=argilla_api_key,
+        argilla_api_url=argilla_api_url,
+        argilla_dataset_name=argilla_dataset_name,
+        topics=topics,
+        perspectives=perspectives,
+        domain_expert_prompt=domain_expert_prompt,
+        hub_token=hub_token,
+        endpoint_base_url=endpoint_base_url,
+        examples=examples,
+    )
+    pipeline.save(path=pipeline_config_path, overwrite=True, format="yaml")
+def create_pipelines_run_command(
+    pipeline_config_path: str = "pipeline.yaml",
+    argilla_dataset_name: str = "domain_specific_datasets",
+):
+    """Create the command to run the pipeline."""
+    command_to_run = [
+        "python",
+        "-m",
+        "distilabel",
+        "pipeline",
+        "run",
+        "--config",
+        pipeline_config_path,
+        "--param",
+        f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
+    ]
+    return command_to_run
+def run_pipeline(
+    pipeline_config_path: str = "pipeline.yaml",
+    argilla_dataset_name: str = "domain_specific_datasets",
+):
+    """Run the pipeline and yield the output as a generator of logs."""
+    command_to_run = create_pipelines_run_command(
+        pipeline_config_path=pipeline_config_path,
+        argilla_dataset_name=argilla_dataset_name,
+    )
+    # Run the script file
+    process = subprocess.Popen(
+        command_to_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+    while process.stdout and process.stdout.readable():
+        time.sleep(0.2)
+        line = process.stdout.readline()
+        if not line:
+            break
+        yield line.decode("utf-8")

pipeline.yaml ADDED Viewed

	@@ -0,0 +1,546 @@

+distilabel:
+  version: 1.0.0
+pipeline:
+  name: farming
+  description: null
+  steps:
+  - step:
+      name: load_data
+      input_mappings: {}
+      output_mappings: {}
+      batch_size: 64
+      data:
+      - input: animal welfare from a Family Farming perspective
+      - input: animal welfare from a Agribusiness perspective
+      - input: animal welfare from a Permaculture perspective
+      - input: animal welfare from a Agroforestery perspective
+      - input: animal welfare from a Conventional Farming perspective
+      - input: economic growth from a Family Farming perspective
+      - input: economic growth from a Agribusiness perspective
+      - input: economic growth from a Permaculture perspective
+      - input: economic growth from a Agroforestery perspective
+      - input: economic growth from a Conventional Farming perspective
+      - input: land from a Family Farming perspective
+      - input: land from a Agribusiness perspective
+      - input: land from a Permaculture perspective
+      - input: land from a Agroforestery perspective
+      - input: land from a Conventional Farming perspective
+      - input: resources from a Family Farming perspective
+      - input: resources from a Agribusiness perspective
+      - input: resources from a Permaculture perspective
+      - input: resources from a Agroforestery perspective
+      - input: resources from a Conventional Farming perspective
+      - input: efficiency from a Family Farming perspective
+      - input: efficiency from a Agribusiness perspective
+      - input: efficiency from a Permaculture perspective
+      - input: efficiency from a Agroforestery perspective
+      - input: efficiency from a Conventional Farming perspective
+      runtime_parameters_info:
+      - name: batch_size
+        optional: true
+        description: The number of rows that will contain the batches generated by
+          the step.
+      type_info:
+        module: distilabel.steps.generators.data
+        name: LoadDataFromDicts
+    name: load_data
+  - step:
+      name: self-instruct
+      input_mappings: {}
+      output_mappings: {}
+      input_batch_size: 8
+      llm:
+        generation_kwargs: {}
+        model_id: null
+        endpoint_name: null
+        endpoint_namespace: null
+        base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
+        tokenizer_id: null
+        model_display_name: null
+        use_openai_client: false
+        type_info:
+          module: distilabel.llms.huggingface.inference_endpoints
+          name: InferenceEndpointsLLM
+      group_generations: false
+      num_generations: 1
+      num_instructions: 5
+      criteria_for_query_generation: 'Incorporate a diverse range of verbs, avoiding
+        repetition.
+        Ensure queries are compatible with AI model''s text generation functions and
+        are limited to 1-2 sentences.
+        Design queries to be self-contained and standalone.
+        Blend interrogative (e.g., "What is the significance of x?") and imperative
+        (e.g., "Detail the process of x.") styles.'
+      application_description: 'You are an AI assistant than generates queries around
+        the domain of farming.
+        Your should not expect basic but profound questions from your users.
+        The queries should reflect a diversity of vision and economic positions and
+        political positions.
+        The queries may know about different methods of farming.
+        The queries can be positioned politically, economically, socially, or practically.
+        Also take into account the impact of diverse causes on diverse domains.'
+      runtime_parameters_info:
+      - name: input_batch_size
+        optional: true
+        description: The number of rows that will contain the batches processed by
+          the step.
+      - name: llm
+        runtime_parameters_info:
+        - name: generation_kwargs
+          description: The kwargs to be propagated to either `generate` or `agenerate`
+            methods within each `LLM`.
+          keys:
+          - name: max_new_tokens
+            optional: true
+            description: the maximum number of new tokens that the model will generate.  Defaults
+              to `128`.
+          - name: frequency_penalty
+            optional: true
+            description: the repetition penalty to use for the generation. Defaults  to
+              `0.0`. Only applies if `use_openai_client=True`.
+          - name: presence_penalty
+            optional: true
+            description: the presence penalty to use for the generation. Defaults
+              to  `0.0`. Only applies if `use_openai_client=True`.
+          - name: repetition_penalty
+            optional: true
+            description: the repetition penalty to use for the generation. Defaults  to
+              `None`. Only applies if `use_openai_client=False`.
+          - name: temperature
+            optional: true
+            description: the temperature to use for the generation. Defaults to `1.0`.
+          - name: do_sample
+            optional: true
+            description: whether to use sampling for the generation. Defaults to `False`.  Only
+              applies if `use_openai_client=False`.
+          - name: top_k
+            optional: true
+            description: the top-k value to use for the generation. Defaults to `0.8`,
+              since neither  `0.0` nor `1.0` are valid values in TGI.
+          - name: top_p
+            optional: true
+            description: the top-p value to use for the generation. Defaults to `1.0`.
+          - name: typical_p
+            optional: true
+            description: the typical-p value to use for the generation. Defaults to
+              `0.5`.
+        - name: endpoint_name
+          optional: true
+          description: The name of the Inference Endpoint to use for the LLM.
+        - name: endpoint_namespace
+          optional: true
+          description: The namespace of the Inference Endpoint to use for the LLM.
+        - name: base_url
+          optional: true
+          description: The base URL to use for the Inference Endpoints API requests.
+        - name: api_key
+          optional: true
+          description: The API key to authenticate the requests to the Inference Endpoints
+            API.
+      - name: num_generations
+        optional: true
+        description: The number of generations to be produced per input.
+      type_info:
+        module: distilabel.steps.tasks.self_instruct
+        name: SelfInstruct
+    name: self-instruct
+  - step:
+      name: evol_instruction_complexity
+      input_mappings:
+        instruction: question
+      output_mappings: {}
+      input_batch_size: 8
+      llm:
+        generation_kwargs: {}
+        model_id: null
+        endpoint_name: null
+        endpoint_namespace: null
+        base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
+        tokenizer_id: null
+        model_display_name: null
+        use_openai_client: false
+        type_info:
+          module: distilabel.llms.huggingface.inference_endpoints
+          name: InferenceEndpointsLLM
+      group_generations: false
+      num_generations: 1
+      num_evolutions: 2
+      store_evolutions: true
+      generate_answers: false
+      include_original_instruction: true
+      mutation_templates:
+        CONSTRAINTS: "I want you act as a Prompt Rewriter.\n\nYour objective is to\
+          \ rewrite a given prompt into a more complex version to make those famous\
+          \ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\
+          \ rewritten prompt must be reasonable and must be understood and responded\
+          \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\
+          \ table and code in #The Given Prompt#:. Also, please do not omit the input\
+          \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\
+          \ the following method: \nPlease add one more constraints/requirements into\
+          \ '#The Given Prompt#'\n\nYou should try your best not to make the #Rewritten\
+          \ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\
+          \ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\
+          \ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\
+          \ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n"
+        DEEPENING: "I want you act as a Prompt Rewriter.\n\nYour objective is to rewrite\
+          \ a given prompt into a more complex version to make those famous AI systems\
+          \ (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the rewritten\
+          \ prompt must be reasonable and must be understood and responded by humans.\n\
+          \nYour rewriting cannot omit the non-text parts such as the table and code\
+          \ in #The Given Prompt#:. Also, please do not omit the input in #The Given\
+          \ Prompt#.\n\nYou SHOULD complicate the given prompt using the following\
+          \ method: \nIf #The Given Prompt# contains inquiries about certain issues,\
+          \ the depth and breadth of the inquiry can be increased.\n\nYou should try\
+          \ your best not to make the #Rewritten Prompt# become verbose, #Rewritten\
+          \ Prompt# can only add 10 to 20 words into #The Given Prompt#.\n\n'#The\
+          \ Given Prompt#', '#Rewritten Prompt#', 'given prompt' and 'rewritten prompt'\
+          \ are not allowed to appear in #Rewritten Prompt#\n\n#The Given Prompt#:\n\
+          <PROMPT>\n#Rewritten Prompt#:\n\n"
+        CONCRETIZING: "I want you act as a Prompt Rewriter.\n\nYour objective is to\
+          \ rewrite a given prompt into a more complex version to make those famous\
+          \ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\
+          \ rewritten prompt must be reasonable and must be understood and responded\
+          \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\
+          \ table and code in #The Given Prompt#:. Also, please do not omit the input\
+          \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\
+          \ the following method: \nPlease replace general concepts with more specific\
+          \ concepts.\n\nYou should try your best not to make the #Rewritten Prompt#\
+          \ become verbose, #Rewritten Prompt# can only add 10 to 20 words into #The\
+          \ Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#', 'given prompt'\
+          \ and 'rewritten prompt' are not allowed to appear in #Rewritten Prompt#\n\
+          \n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n"
+        INCREASED_REASONING_STEPS: "I want you act as a Prompt Rewriter.\n\nYour objective\
+          \ is to rewrite a given prompt into a more complex version to make those\
+          \ famous AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\n\
+          But the rewritten prompt must be reasonable and must be understood and responded\
+          \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\
+          \ table and code in #The Given Prompt#:. Also, please do not omit the input\
+          \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\
+          \ the following method: \nIf #The Given Prompt# can be solved with just\
+          \ a few simple thinking processes, you can rewrite it to explicitly request\
+          \ multiple-step reasoning.\n\nYou should try your best not to make the #Rewritten\
+          \ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\
+          \ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\
+          \ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\
+          \ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n"
+        BREADTH: 'I want you act as a Prompt Creator.
+          Your goal is to draw inspiration from the #Given Prompt# to create a brand
+          new prompt.
+          This new prompt should belong to the same domain as the #Given Prompt# but
+          be even more rare.
+          The LENGTH and complexity of the #Created Prompt# should be similar to that
+          of the #Given Prompt#.
+          The #Created Prompt# must be reasonable and must be understood and responded
+          by humans.
+          ''#Given Prompt#'', ''#Created Prompt#'', ''given prompt'' and ''created
+          prompt'' are not allowed to appear in #Created Prompt#
+          #Given Prompt#:
+          <PROMPT>
+          #Created Prompt#:
+          '
+      seed: 42
+      runtime_parameters_info:
+      - name: input_batch_size
+        optional: true
+        description: The number of rows that will contain the batches processed by
+          the step.
+      - name: llm
+        runtime_parameters_info:
+        - name: generation_kwargs
+          description: The kwargs to be propagated to either `generate` or `agenerate`
+            methods within each `LLM`.
+          keys:
+          - name: max_new_tokens
+            optional: true
+            description: the maximum number of new tokens that the model will generate.  Defaults
+              to `128`.
+          - name: frequency_penalty
+            optional: true
+            description: the repetition penalty to use for the generation. Defaults  to
+              `0.0`. Only applies if `use_openai_client=True`.
+          - name: presence_penalty
+            optional: true
+            description: the presence penalty to use for the generation. Defaults
+              to  `0.0`. Only applies if `use_openai_client=True`.
+          - name: repetition_penalty
+            optional: true
+            description: the repetition penalty to use for the generation. Defaults  to
+              `None`. Only applies if `use_openai_client=False`.
+          - name: temperature
+            optional: true
+            description: the temperature to use for the generation. Defaults to `1.0`.
+          - name: do_sample
+            optional: true
+            description: whether to use sampling for the generation. Defaults to `False`.  Only
+              applies if `use_openai_client=False`.
+          - name: top_k
+            optional: true
+            description: the top-k value to use for the generation. Defaults to `0.8`,
+              since neither  `0.0` nor `1.0` are valid values in TGI.
+          - name: top_p
+            optional: true
+            description: the top-p value to use for the generation. Defaults to `1.0`.
+          - name: typical_p
+            optional: true
+            description: the typical-p value to use for the generation. Defaults to
+              `0.5`.
+        - name: endpoint_name
+          optional: true
+          description: The name of the Inference Endpoint to use for the LLM.
+        - name: endpoint_namespace
+          optional: true
+          description: The namespace of the Inference Endpoint to use for the LLM.
+        - name: base_url
+          optional: true
+          description: The base URL to use for the Inference Endpoints API requests.
+        - name: api_key
+          optional: true
+          description: The API key to authenticate the requests to the Inference Endpoints
+            API.
+      - name: num_generations
+        optional: true
+        description: The number of generations to be produced per input.
+      - name: seed
+        optional: true
+        description: As `numpy` is being used in order to randomly pick a mutation
+          method, then is nice to seed a random seed.
+      type_info:
+        module: distilabel.steps.tasks.evol_instruct.base
+        name: EvolInstruct
+    name: evol_instruction_complexity
+  - step:
+      name: expand_columns
+      input_mappings: {}
+      output_mappings: {}
+      input_batch_size: 50
+      columns:
+        instructions: question
+      runtime_parameters_info:
+      - name: input_batch_size
+        optional: true
+        description: The number of rows that will contain the batches processed by
+          the step.
+      type_info:
+        module: distilabel.steps.expand
+        name: ExpandColumns
+    name: expand_columns
+  - step:
+      name: clean_numbered_list
+      input_mappings: {}
+      output_mappings: {}
+      input_batch_size: 50
+      runtime_parameters_info:
+      - name: input_batch_size
+        optional: true
+        description: The number of rows that will contain the batches processed by
+          the step.
+      type_info:
+        module: domain
+        name: CleanNumberedList
+    name: clean_numbered_list
+  - step:
+      name: expand_columns_evolved
+      input_mappings: {}
+      output_mappings: {}
+      input_batch_size: 50
+      columns:
+        evolved_instructions: evolved_questions
+      runtime_parameters_info:
+      - name: input_batch_size
+        optional: true
+        description: The number of rows that will contain the batches processed by
+          the step.
+      type_info:
+        module: distilabel.steps.expand
+        name: ExpandColumns
+    name: expand_columns_evolved
+  - step:
+      name: domain_expert
+      input_mappings:
+        instruction: evolved_questions
+      output_mappings:
+        generation: domain_expert_answer
+      input_batch_size: 8
+      llm:
+        generation_kwargs: {}
+        model_id: null
+        endpoint_name: null
+        endpoint_namespace: null
+        base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
+        tokenizer_id: null
+        model_display_name: null
+        use_openai_client: false
+        type_info:
+          module: distilabel.llms.huggingface.inference_endpoints
+          name: InferenceEndpointsLLM
+      group_generations: false
+      num_generations: 1
+      runtime_parameters_info:
+      - name: input_batch_size
+        optional: true
+        description: The number of rows that will contain the batches processed by
+          the step.
+      - name: llm
+        runtime_parameters_info:
+        - name: generation_kwargs
+          description: The kwargs to be propagated to either `generate` or `agenerate`
+            methods within each `LLM`.
+          keys:
+          - name: max_new_tokens
+            optional: true
+            description: the maximum number of new tokens that the model will generate.  Defaults
+              to `128`.
+          - name: frequency_penalty
+            optional: true
+            description: the repetition penalty to use for the generation. Defaults  to
+              `0.0`. Only applies if `use_openai_client=True`.
+          - name: presence_penalty
+            optional: true
+            description: the presence penalty to use for the generation. Defaults
+              to  `0.0`. Only applies if `use_openai_client=True`.
+          - name: repetition_penalty
+            optional: true
+            description: the repetition penalty to use for the generation. Defaults  to
+              `None`. Only applies if `use_openai_client=False`.
+          - name: temperature
+            optional: true
+            description: the temperature to use for the generation. Defaults to `1.0`.
+          - name: do_sample
+            optional: true
+            description: whether to use sampling for the generation. Defaults to `False`.  Only
+              applies if `use_openai_client=False`.
+          - name: top_k
+            optional: true
+            description: the top-k value to use for the generation. Defaults to `0.8`,
+              since neither  `0.0` nor `1.0` are valid values in TGI.
+          - name: top_p
+            optional: true
+            description: the top-p value to use for the generation. Defaults to `1.0`.
+          - name: typical_p
+            optional: true
+            description: the typical-p value to use for the generation. Defaults to
+              `0.5`.
+        - name: endpoint_name
+          optional: true
+          description: The name of the Inference Endpoint to use for the LLM.
+        - name: endpoint_namespace
+          optional: true
+          description: The namespace of the Inference Endpoint to use for the LLM.
+        - name: base_url
+          optional: true
+          description: The base URL to use for the Inference Endpoints API requests.
+        - name: api_key
+          optional: true
+          description: The API key to authenticate the requests to the Inference Endpoints
+            API.
+      - name: num_generations
+        optional: true
+        description: The number of generations to be produced per input.
+      type_info:
+        module: domain
+        name: DomainExpert
+    name: domain_expert
+  - step:
+      name: keep_columns
+      input_mappings: {}
+      output_mappings: {}
+      input_batch_size: 50
+      columns:
+      - model_name
+      - evolved_questions
+      - domain_expert_answer
+      runtime_parameters_info:
+      - name: input_batch_size
+        optional: true
+        description: The number of rows that will contain the batches processed by
+          the step.
+      type_info:
+        module: distilabel.steps.keep
+        name: KeepColumns
+    name: keep_columns
+  - step:
+      name: text_generation_to_argilla
+      input_mappings:
+        instruction: evolved_questions
+        generation: domain_expert_answer
+      output_mappings: {}
+      input_batch_size: 50
+      dataset_name: farming
+      dataset_workspace: admin
+      api_url: https://argilla-farming.hf.space
+      runtime_parameters_info:
+      - name: input_batch_size
+        optional: true
+        description: The number of rows that will contain the batches processed by
+          the step.
+      - name: dataset_name
+        optional: false
+        description: The name of the dataset in Argilla.
+      - name: dataset_workspace
+        optional: true
+        description: The workspace where the dataset will be created in Argilla. Defaultsto
+          `None` which means it will be created in the default workspace.
+      - name: api_url
+        optional: true
+        description: The base URL to use for the Argilla API requests.
+      - name: api_key
+        optional: true
+        description: The API key to authenticate the requests to the Argilla API.
+      type_info:
+        module: distilabel.steps.argilla.text_generation
+        name: TextGenerationToArgilla
+    name: text_generation_to_argilla
+  connections:
+  - from: load_data
+    to:
+    - self-instruct
+  - from: self-instruct
+    to:
+    - expand_columns
+  - from: evol_instruction_complexity
+    to:
+    - expand_columns_evolved
+  - from: expand_columns
+    to:
+    - clean_numbered_list
+  - from: clean_numbered_list
+    to:
+    - evol_instruction_complexity
+  - from: expand_columns_evolved
+    to:
+    - domain_expert
+  - from: domain_expert
+    to:
+    - keep_columns
+  - from: keep_columns
+    to:
+    - text_generation_to_argilla
+  - from: text_generation_to_argilla
+    to: []
+  type_info:
+    module: distilabel.pipeline.local
+    name: Pipeline

project_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"project_name": "DEFAULT_DOMAIN", "argilla_space_repo_id": "burtenshaw/domain_test_4_argilla_space", "project_space_repo_id": "burtenshaw/domain_test_4_config_space", "dataset_repo_id": "burtenshaw/domain_test_4"}

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+datasets
+python_dotenv
+sentence_transformers
+streamlit
+huggingface_hub
+mistralai
+argilla
+git+https://github.com/argilla-io/distilabel.git

seed_data.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "domain": "farming",
+  "perspectives": [
+    "Family Farming",
+    "Agribusiness",
+    "Permaculture",
+    "Agroforestery",
+    "Conventional Farming"
+  ],
+  "topics": [
+    "animal welfare",
+    "economic growth",
+    "land",
+    "resources",
+    "efficiency"
+  ],
+  "examples": [
+    {
+      "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
+      "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
+    },
+    {
+      "question": "Compare the environmental footprint of small-scale, local farming versus large-scale, industrial agriculture.",
+      "answer": "Industrial agriculture typically emphasizes high-output, monoculture farming reliant on synthetic fertilizers and pesticides, which, as Horrigan, Lawrence, and Walker (2002) argue, leads to greater greenhouse gas emissions, higher energy use, and more water consumption compared to small-scale farming. In contrast, small-scale farms often employ diverse cropping systems and lower chemical inputs, resulting in a smaller environmental footprint. Pimentel et al. (2005) note that small-scale farms tend to have higher yields per unit area when environmental and sustainability factors are integrated into farming practices."
+    },
+    {
+      "question": "Analyze the economic implications of transitioning from conventional to organic farming.",
+      "answer": "Transitioning from conventional to organic farming involves significant changes in farm management, input use, and market engagement. Crowder and Reganold (2015) present evidence that organic farms often yield smaller outputs initially but achieve higher profitability due to premium prices, lower input costs, and improved soil health over time. However, this transition requires upfront investments in knowledge and infrastructure, which can be economically challenging for some farmers, as noted by Seufert and Ramankutty (2017)."
+    },
+    {
+      "question": "Analyze the social, economic and environmental impacts of land consolidation vs small-scale farmers.",
+      "answer": "Land consolidation has been associated with increased agricultural productivity but also with negative social and environmental impacts. Larger land holdings typically lead to monocultures, which reduce biodiversity and increase vulnerability to pests and diseases, as highlighted by Li et al. (2017). Economically, while consolidation can lead to economies of scale and potential gains in gross margins, it often displaces rural populations, exacerbating poverty and reducing local food diversity (Sutherland et al., 2015)."
+    },
+    {
+      "question": "Investigate the relationship between land ownership patterns, agricultural productivity and environment sustainability. ",
+      "answer": "Land ownership patterns critically influence agricultural productivity and sustainability. Secure land tenure supports investments in long-term improvements such as soil conservation and water management, which are pivotal for sustainable outcomes. Studies by Barrett et al. (2010) demonstrate that fragmented land ownership often results in inefficient resource use and higher transaction costs, which can detract from sustainability goals."
+    }
+  ],
+  "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
+}

utils.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import streamlit as st
+from defaults import (
+    ARGILLA_SPACE_REPO_ID,
+    PROJECT_NAME,
+    ARGILLA_URL,
+    DIBT_PARENT_APP_URL,
+    DATASET_URL,
+    DATASET_REPO_ID,
+    ARGILLA_SPACE_REPO_ID,
+)
+def project_sidebar():
+    if PROJECT_NAME == "DEFAULT_DOMAIN":
+        st.warning(
+            "Please set up the project configuration in the parent app before proceeding."
+        )
+        st.stop()
+    st.sidebar.markdown(
+        """
+        ## 🌱 Domain Data Grower
+        This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
+        """
+    )
+    st.sidebar.subheader(f"Project Details: {PROJECT_NAME}")
+    st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
+    st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
+    st.sidebar.divider()
+    st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)