architojha committed
Commit 4067b64 · 1 Parent(s): 9c6dd31

adding files
.gitignore ADDED
@@ -0,0 +1,5 @@
+ /.env
+ poetry.lock
+ __pycache__
+ storage/
+ storage
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ # requirements.txt assumes Python >=3.11,<3.12, so the base image must match it
+ FROM python:3.11
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]
config.yaml ADDED
@@ -0,0 +1,23 @@
+ userPreference:
+   llmService: 'openai' # 'ollama' or 'openai' or 'groq'
+   ModuleList: [
+     'IngestData',
+     'AugmentData',
+     'GenerateData',
+     'SearchData',
+     'Train',
+     'Evaluate',
+     'TriggerDeployment',
+     'ComparePerformance']
+
+ OpenAIConfig:
+   llm: "gpt-4o-mini"
+   embeddingModel: "text-embedding-3-small"
+
+ OllamaConfig:
+   llm: "llama3.2:latest"
+   baseURL: "http://localhost:11434"
+
+ GroqConfig:
+   llm: 'llama-3.3-70b-versatile'
+   hfEmbedding: "BAAI/bge-small-en-v1.5"
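
Note: a minimal sketch of how this file is consumed; src/utils/settings.py (below) loads it with yaml.safe_load and dereferences exactly these paths:

    import yaml

    with open("config.yaml", "r") as file:
        config = yaml.safe_load(file)

    # paths read by ProjectSettings in src/utils/settings.py
    service = config["userPreference"]["llmService"]   # 'openai', 'ollama', or 'groq'
    modules = config["userPreference"]["ModuleList"]   # module names for graph construction
    llm_name = config["OpenAIConfig"]["llm"]           # model for the selected service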
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ # Python >=3.11,<3.12 is assumed by this project; "python" is not a pip-installable
+ # package, so the interpreter version is enforced by the Docker base image instead.
+ llama-index==0.12.8
+ llama-index-core
+ llama-index-llms-openai==0.3.12
+ llama-index-utils-workflow==0.3.0
+ fastapi==0.115.6
+ uvicorn
+ python-dotenv==1.0.1
+ pydantic==2.10.4
+ pydantic-settings==2.7.0
+ pyyaml==6.0.2
+ llama-index-llms-ollama==0.5.0
+ llama-index-llms-groq==0.3.1
+ llama-index-embeddings-huggingface==0.4.0
+ llama-index-embeddings-ollama==0.5.0
+ phidata==2.7.7
+ groq==0.14.0
+ duckduckgo-search==7.2.1
src/__init__.py ADDED
File without changes
src/main.py ADDED
@@ -0,0 +1,20 @@
+ from fastapi import FastAPI
+ from src.routers.discovery import router as interview_router
+ from src.routers.context import router as context_router
+ from src.routers.graph import router as graph_router
+ from src.routers.analysis import router as analysis_router
+ from src.routers.analyze_generate_graph import router as analyze_generate_graph_router
+ from src.utils.settings import settings
+
+ app = FastAPI(title="Franky Workflows for User Intent Recognition")
+
+ app.include_router(interview_router, prefix="/api/intent")
+ app.include_router(context_router, prefix="/api/intent")
+ app.include_router(graph_router, prefix="/api/graph")
+ app.include_router(analysis_router, prefix="/api/ml-analysis")
+ app.include_router(analyze_generate_graph_router, prefix="/api/analyze-generate-graph")
+
+
+ @app.get("/")
+ async def read_root():
+     return "Franky Workflows are up!"
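
Note: a minimal sketch of serving the app locally; the module path and port mirror the Dockerfile CMD above, and the file name run.py is hypothetical:

    # run.py (hypothetical) -- local equivalent of the Dockerfile CMD
    import uvicorn

    if __name__ == "__main__":
        uvicorn.run("src.main:app", host="0.0.0.0", port=7860)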
src/models/__init__.py ADDED
File without changes
src/models/analysis_models.py ADDED
@@ -0,0 +1,168 @@
+ from typing import Iterator, List, Optional
+ from enum import Enum
+ from pydantic import BaseModel, Field
+
+
+ class InputModel(BaseModel):
+     problem_statement: str = Field(
+         default=None,
+         description="Contains the description of the problem statement or task"
+     )
+
+ class MLTaskType(str, Enum):
+     CLASSIFICATION = "classification"
+     REGRESSION = "regression"
+     CLUSTERING = "clustering"
+     NLP = "natural_language_processing"
+     COMPUTER_VISION = "computer_vision"
+     TIME_SERIES = "time_series"
+     ANOMALY_DETECTION = "anomaly_detection"
+     RECOMMENDATION = "recommendation"
+     OTHER = "other"
+
+
+ class ModelResponseStatus(BaseModel):
+     """Technical specification for ML implementation"""
+     data_source: str = Field(
+         # default="...",
+         description="Required data sources and their characteristics"
+     )
+     data_format: str = Field(
+         # default="...",
+         description="Expected format of input data"
+     )
+     additional_data_requirement: bool = Field(
+         # default=False,
+         description="Whether additional data is needed"
+     )
+     constraints: str = Field(
+         # default="...",
+         description="Business and technical constraints"
+     )
+     task: MLTaskType = Field(
+         # default=MLTaskType.OTHER,
+         description="Type of ML task"
+     )
+     models: List[str] = Field(
+         # default=["..."],
+         description="Suggested ML models"
+     )
+     hyperparameters: List[str] = Field(
+         # default=["..."],
+         description="Key hyperparameters to consider"
+     )
+     eval_metrics: List[str] = Field(
+         # default=["..."],
+         description="Evaluation metrics for the solution"
+     )
+     technical_requirements: str = Field(
+         # default="...",
+         description="Technical implementation requirements"
+     )
+
+
+ class RequirementsAnalysis(BaseModel):
+     """Initial analysis of business requirements"""
+     model_response: ModelResponseStatus
+     unclear_points: List[str] = Field(
+         default_factory=list,
+         description="Points needing clarification"
+     )
+     search_queries: List[str] = Field(
+         default_factory=list,
+         description="Topics to research"
+     )
+     business_understanding: str = Field(
+         description="Summary of business problem understanding"
+     )
+
+
+ class TechnicalResearch(BaseModel):
+     """Results from technical research"""
+     model_response: ModelResponseStatus
+     research_findings: str = Field(
+         description="Key findings from research"
+     )
+     reference_implementations: List[str] = Field(
+         default_factory=list,
+         description="Similar implementation examples found"
+     )
+     sources: List[str] = Field(
+         default_factory=list,
+         description="Sources of information"
+     )
+
+
+ # Implementation Planning Models
+ class ComponentType(str, Enum):
+     DATA_PIPELINE = "data_pipeline"
+     PREPROCESSOR = "preprocessor"
+     MODEL = "model"
+     EVALUATOR = "evaluator"
+     INFERENCE = "inference"
+     MONITORING = "monitoring"
+     UTILITY = "utility"
+
+
+ class ParameterSpec(BaseModel):
+     """Specification for a single parameter"""
+     name: str = Field(description="Name of the parameter")
+     param_type: str = Field(description="Type of the parameter")
+     description: str = Field(description="Description of the parameter")
+     default_value: str = Field(description="Default value if any")
+     required: bool = Field(description="Whether the parameter is required")
+
+
+ class ConfigParam(BaseModel):
+     """Specification for a configuration parameter"""
+     name: str = Field(description="Name of the configuration parameter")
+     value_type: str = Field(description="Type of value expected")
+     description: str = Field(description="Description of the configuration parameter")
+     default: str = Field(description="Default value if any")
+
+
+ class FunctionSpec(BaseModel):
+     """Detailed specification for a single function"""
+     name: str = Field(description="Name of the function")
+     description: str = Field(description="Detailed description of function's purpose")
+     input_params: List[ParameterSpec] = Field(
+         description="List of input parameters and their specifications"
+     )
+     return_type: str = Field(description="Return type and description")
+     dependencies: List[str] = Field(
+         description="Required dependencies/imports"
+     )
+     error_handling: List[str] = Field(
+         description="Expected errors and handling strategies"
+     )
+
+
+ class ComponentSpec(BaseModel):
+     """Specification for a component (module) of the system"""
+     name: str = Field(description="Name of the component")
+     type: ComponentType = Field(description="Type of component")
+     description: str = Field(description="Detailed description of component's purpose")
+     functions: List[FunctionSpec] = Field(description="Functions within this component")
+     dependencies: List[str] = Field(
+         description="External package dependencies"
+     )
+     config_params: List[ConfigParam] = Field(
+         description="Configuration parameters needed"
+     )
+
+
+ class ImplementationPlan(BaseModel):
+     """Complete implementation plan for the ML system"""
+     components: List[ComponentSpec] = Field(description="System components")
+     system_requirements: List[str] = Field(
+         description="System-level requirements and dependencies"
+     )
+     deployment_notes: str = Field(
+         description="Notes on deployment and infrastructure"
+     )
+     testing_strategy: str = Field(
+         description="Strategy for testing components"
+     )
+     implementation_order: List[str] = Field(
+         description="Suggested order of implementation"
+     )
src/models/context_events.py ADDED
File without changes
src/models/discovery_events.py ADDED
@@ -0,0 +1,29 @@
+ from llama_index.core.workflow import Event
+
+
+ class GetModulesEvent(Event):
+     """
+     Event to get modules. Outputs accepted modules for the task.
+     """
+     task: str
+     modules: str
+
+
+ class RefineModulesEvent(Event):
+     """
+     Event to refine modules. Outputs refined and adapted modules.
+     """
+     task: str
+     refined_modules: str
+
+
+ class ReasoningStructureEvent(Event):
+     """
+     Event to create reasoning structure. Outputs final reasoning structure.
+     """
+     task: str
+     reasoning_structure: str
+
+
+ # TODO: Add JudgeEvent(Event) here which analyses context, judges if requirements complete,
+ # and emits either loop or StopEvent.
src/models/schemas.py ADDED
@@ -0,0 +1,66 @@
+ from typing import List
+ from pydantic import BaseModel
+
+
+ class WorkflowSchema(BaseModel):
+     data_source: str
+     data_format: str
+     additional_data_requirement: bool
+     constraints: str
+     available_preprocess_script: bool
+     preprocess_script: str
+     recommended_preprocess_steps: List[str]
+     task: str
+     models: List[str]
+     hyperparameters: List[str]
+     eval_metrics: List[str]
+     deploy_constraints: str
+
+
+ class IntentFilesSchema(BaseModel):
+     context: str
+     open_questions: List[str]
+     data_source: str
+     data_format: str
+     additional_data_requirement: bool
+     constraints: str
+     available_preprocess_script: bool
+     preprocess_script: str
+     recommended_preprocess_steps: List[str]
+     task: str
+     models: List[str]
+     hyperparameters: List[str]
+     eval_metrics: List[str]
+     deploy_constraints: str
+
+
+ class IntentInterviewsSchema(BaseModel):
+     context: str
+     completion_status: bool
+     result: str
+     data_source: str
+     data_format: str
+     additional_data_requirement: bool
+     constraints: str
+     available_preprocess_script: bool
+     preprocess_script: str
+     recommended_preprocess_steps: List[str]
+     task: str
+     models: List[str]
+     hyperparameters: List[str]
+     eval_metrics: List[str]
+     deploy_constraints: str
+
+
+ class IntentRequestData(BaseModel):
+     query: str
+     context: str
+     count: int
+     complete: bool
+
+
+ class IntentResponseData(BaseModel):
+     context: str
+     result: str
+     count: int
+     complete: bool
src/models/workflow_graph.py ADDED
@@ -0,0 +1,36 @@
+ from llama_index.core.workflow import Event
+ from pydantic import BaseModel
+ from typing import List
+
+
+ class Edge(BaseModel):
+     source: str
+     target: str
+     desc: str
+
+
+ class Node(BaseModel):
+     node_id: str
+     name: str
+
+
+ class Graph(BaseModel):
+     nodes: List[Node]
+     edges: List[Edge]
+
+
+ class GetResponseEvent(Event):
+     project_details: str
+     rawResponse: str
+
+
+ class ConstructGraphEvent(Event):
+     workflowGraph: Graph
+
+
+ class GraphInputSchema(BaseModel):
+     desc: str
+
+
+ class GraphOutputSchema(BaseModel):
+     graph: Graph
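
Note: a small illustrative sketch of these schemas in use (the node/edge values are made up; model_dump is the pydantic v2 serializer pinned in requirements.txt):

    from src.models.workflow_graph import Edge, Graph, Node

    graph = Graph(
        nodes=[Node(node_id="p001", name="IngestData"),
               Node(node_id="p002", name="Train")],
        edges=[Edge(source="p001", target="p002", desc="Ingesting Data to Training Model")],
    )
    print(graph.model_dump())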
src/routers/__init__.py ADDED
File without changes
src/routers/analysis.py ADDED
@@ -0,0 +1,54 @@
+ from fastapi import APIRouter, Response
+ from src.models.analysis_models import InputModel
+ from src.workflows.analysis_workflow import MLAnalysisWorkflow, MLImplementationPlanner
+ from datetime import datetime
+ from phi.workflow import RunResponse  # needed for the Iterator[RunResponse] annotations below
+ from phi.storage.workflow.sqlite import SqlWorkflowStorage
+ from phi.utils.pprint import pprint_run_response
+ from phi.utils.log import logger
+ from typing import Iterator
+
+ router = APIRouter()
+
+ @router.get("/")
+ async def read_root():
+     return Response("Ml-Analysis workflow from user problem is Up!")
+
+ @router.post("/analyze-problem")
+ async def analyze_problem(data: InputModel):
+
+     analysis_workflow = MLAnalysisWorkflow(
+         session_id=f"ml-analysis-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+         storage=SqlWorkflowStorage(
+             table_name="ml_analysis_workflows",
+             db_file="storage/workflows.db"
+         )
+     )
+
+     analysis_response: Iterator[RunResponse] = analysis_workflow.run(data.problem_statement)
+
+     pprint_run_response(analysis_response, markdown=True)
+
+     requirements_result = analysis_workflow.requirements_analyst.run_response.content if analysis_workflow.requirements_analyst.run_response else None
+     research_result = analysis_workflow.technical_researcher.run_response.content if analysis_workflow.technical_researcher.run_response else None
+
+     if requirements_result:
+         logger.info("===Planning Phase===")
+         planning_workflow = MLImplementationPlanner(
+             session_id=f"ml-planning-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+             storage=SqlWorkflowStorage(
+                 table_name="ml_planning_workflows",
+                 db_file="storage/workflows.db"
+             )
+         )
+
+         # run and print planning workflow
+         planning_response_stream: Iterator[RunResponse] = planning_workflow.run(requirements_result, research_result)
+
+         pprint_run_response(planning_response_stream, markdown=True)
+
+         return {"Response": planning_workflow.writer.run_response.content}
+
+     else:
+         return {"Error": "Requirements analysis did not complete successfully."}
+
src/routers/analyze_generate_graph.py ADDED
@@ -0,0 +1,65 @@
+ from fastapi import APIRouter, HTTPException
+ from src.models.analysis_models import InputModel
+ from src.workflows.analysis_workflow import MLAnalysisWorkflow, MLImplementationPlanner
+ from datetime import datetime
+ from phi.workflow import RunResponse  # needed for the Iterator[RunResponse] annotations below
+ from phi.storage.workflow.sqlite import SqlWorkflowStorage
+ from phi.utils.pprint import pprint_run_response
+ from phi.utils.log import logger
+ from typing import Iterator
+ from llama_index.core.settings import Settings
+ from src.models.workflow_graph import GraphInputSchema, GraphOutputSchema
+ from src.workflows.graph_workflow import DesignGraphWorkflow
+
+ router = APIRouter()
+
+ async def analyze_problem(problem_statement: str):
+
+     analysis_workflow = MLAnalysisWorkflow(
+         session_id=f"ml-analysis-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+         storage=SqlWorkflowStorage(
+             table_name="ml_analysis_workflows",
+             db_file="storage/workflows.db"
+         )
+     )
+
+     analysis_response: Iterator[RunResponse] = analysis_workflow.run(problem_statement)
+
+     pprint_run_response(analysis_response, markdown=True)
+
+     requirements_result = analysis_workflow.requirements_analyst.run_response.content if analysis_workflow.requirements_analyst.run_response else None
+     research_result = analysis_workflow.technical_researcher.run_response.content if analysis_workflow.technical_researcher.run_response else None
+
+     if requirements_result:
+         logger.info("===Planning Phase===")
+         planning_workflow = MLImplementationPlanner(
+             session_id=f"ml-planning-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+             storage=SqlWorkflowStorage(
+                 table_name="ml_planning_workflows",
+                 db_file="storage/workflows.db"
+             )
+         )
+
+         planning_response_stream: Iterator[RunResponse] = planning_workflow.run(requirements_result, research_result)
+
+         pprint_run_response(planning_response_stream, markdown=True)
+
+         return planning_workflow.writer.run_response.content
+
+     else:
+         return "Requirements analysis did not complete successfully."
+
+ @router.post("/", response_model=GraphOutputSchema)
+ async def analyzer_generate_graph(data: InputModel):
+
+     task_description = await analyze_problem(data.problem_statement)
+
+     try:
+         graph_workflow = DesignGraphWorkflow(timeout=60, verbose=True)
+         graph_result = await graph_workflow.run(_project_description=task_description, llm=Settings.llm)
+
+         return GraphOutputSchema(graph=graph_result)
+
+     except Exception as e:
+         # a plain dict here would fail the GraphOutputSchema response model,
+         # so surface the failure as an HTTP error instead
+         raise HTTPException(status_code=500, detail=f"Error processing {e}")
src/routers/context.py ADDED
@@ -0,0 +1,18 @@
+ from fastapi import APIRouter
+ from llama_index.core.settings import Settings
+
+ # configurations
+ router = APIRouter()
+
+ @router.get("/")
+ async def read_root():
+     return "Script Reader Workflow for User Intent from files is up!"
+
+
+ @router.post("/context/")
+ async def extract_code_context():
+     try:
+         response = ""
+         return response
+     except Exception as e:
+         return {"detail": f"Error processing {e}"}
src/routers/discovery.py ADDED
@@ -0,0 +1,35 @@
+ from fastapi import APIRouter, HTTPException
+ from llama_index.core.settings import Settings
+ from src.models.schemas import IntentRequestData, IntentResponseData
+ from src.workflows.workflow_discovery import SelfDiscoverWorkflow, JudgeWorkflow
+ from src.workflows.reasoning_modules import ML_EXPERT_PROMPT_TEMPLATE
+
+ # configurations
+ router = APIRouter()
+
+
+ @router.get("/")
+ async def read_root():
+     return "Self-Discovery Workflow for User Intent Interview is up!"
+
+
+ @router.post("/interview/", response_model=IntentResponseData)
+ async def interview_user(data: IntentRequestData):
+     try:
+
+         interview_workflow = SelfDiscoverWorkflow()
+         task = ML_EXPERT_PROMPT_TEMPLATE.format(query=data.query)
+         workflow_handler = interview_workflow.run(task=task, llm=Settings.llm)
+         intermediate_result = await workflow_handler
+         context = await workflow_handler.ctx.get("workflow_result")
+
+         judge_workflow = JudgeWorkflow()
+         completion_status = await judge_workflow.run(judging_context=intermediate_result, llm=Settings.llm)
+
+         return IntentResponseData(context=str(context),
+                                   result=intermediate_result,
+                                   count=data.count + 1,
+                                   complete=completion_status)
+     except Exception as e:
+         # a plain dict would fail the IntentResponseData response model
+         raise HTTPException(status_code=500, detail=f"Error processing {e}")
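
Note: a hedged sketch of the request body for POST /api/intent/interview/ (the prefix comes from src/main.py above; the field values are illustrative):

    from src.models.schemas import IntentRequestData

    payload = IntentRequestData(
        query="Detect defective parts from assembly-line images",  # illustrative
        context="",
        count=0,
        complete=False,
    )
    print(payload.model_dump_json())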
src/routers/graph.py ADDED
@@ -0,0 +1,25 @@
+ from fastapi import APIRouter, HTTPException
+ from llama_index.core.settings import Settings
+ from src.models.workflow_graph import GraphInputSchema, GraphOutputSchema
+ from src.workflows.graph_workflow import DesignGraphWorkflow
+
+ # configurations
+ router = APIRouter()
+
+ @router.get("/", response_model=GraphInputSchema)
+ async def read_root():
+     return {'desc': "Graph Creation Workflow on User Intent is up!"}
+
+
+ @router.post("/design/", response_model=GraphOutputSchema)
+ async def design_graph(data: GraphInputSchema):
+     try:
+
+         graph_workflow = DesignGraphWorkflow(timeout=60, verbose=True)
+         graph_result = await graph_workflow.run(_project_description=data.desc, llm=Settings.llm)
+
+         return GraphOutputSchema(graph=graph_result)
+
+     except Exception as e:
+         # a plain dict would fail the GraphOutputSchema response model
+         raise HTTPException(status_code=500, detail=f"Error processing {e}")
src/utils/helper.py ADDED
@@ -0,0 +1,284 @@
+ import re
+ from src.models.workflow_graph import Edge, Node, Graph
+
+ class HelperClass:
+
+     @staticmethod
+     def _build_prompt(project_desc: str, modules: list) -> str:
+         return f'''
+ You are an advanced AI tasked with constructing a directed graph/flow based on a set of available modules and a project description. Each module in the flow represents a node, and each edge defines the task connecting these nodes.
+ Your output should adhere strictly to the following rules: Don't give me any code and don't mention 'json' at the top of the response.
+ There should not be any extra output (even a single word) besides the output required.
+
+ The flow of nodes and tasks must be determined by analyzing the provided project description.
+ The modules chosen must form a complete pipeline suitable for the tasks in the project description.
+
+
+
+ -Steps-
+ 1. Parse the project description to identify the tasks and operations required to form the flow.
+ - For each task, determine which module (from the available list) best fits the task description.
+ - Assign a unique identifier to every instance of a module.
+ - Example: If the "Train" module is used twice, say for training 2 different models, name them both Train and assign a unique id to each of them.
+ - For each identified node:
+ - Node ID: Generate a unique identifier for the module instance (a 5-character random string of lower-case letters and digits).
+ - Module Name: Name of the module from the available list.
+
+ Format each node as:
+ <unique Node ID>Module Name</unique Node ID>
+
+ 2. Construct Edges Between Nodes
+ - Determine the logical sequence of tasks from the project description.
+ - Identify source and target modules for each transition based on the task flow.
+ - The graph begins with a special "Start" node. Edges must connect from <Start> to the first module in the pipeline.
+ - For each connection, output the following information:
+ - Source Node: The unique ID of the starting module. (Use the ids generated for each module in Step 1)
+ - Target Node: The unique ID of the destination module. (Use the ids generated for each module in Step 1)
+ - Task Description: A short description of what is happening during the transition.
+
+ Format each edge as:
+ <Edge index>( sourceNode="<Node ID>" | targetNode="<Node ID>" | task="<Task Description>" )</Edge index>
+
+
+ ######################
+ -Examples-
+ ######################
+ Example 1:
+
+ Input: Project Description:
+
+ This project implements an automated quality control system for manufacturing using a modular machine learning pipeline. Data from high-resolution product images and metadata is ingested and augmented to enhance diversity and balance.
+ Task A trains a CNN for defect detection, while Task B trains a transformer-based model for quality classification. Both models are rigorously evaluated and compared against predefined benchmarks. Successful models are deployed for real-time defect monitoring and automated grading via integration with production and ERP systems.
+
+ Input: Available Modules:
+
+ ['IngestData',
+ 'AugmentData',
+ 'GenerateData',
+ 'SearchData',
+ 'Train',
+ 'Evaluate',
+ 'TriggerDeployment',
+ 'ComparePerformance']
+
+ -------------------------------------
+
+ Flow Generated by LLM: (This will not be Input)
+
+ IngestData -> AugmentData
+ AugmentData -> Train (for Task A)
+ AugmentData -> Train (for Task B)
+ Train (model A) -> Evaluate (test model A)
+ Train (model B) -> Evaluate (test model B)
+ Evaluate (test model A) -> ComparePerformance
+ Evaluate (test model B) -> ComparePerformance
+ ComparePerformance -> TriggerDeployment
+
+
+ ################
+
+ Output:
+
+ <p83fd>IngestData</p83fd>
+ <sb9ba>AugmentData</sb9ba>
+ <bxt2w>Train A</bxt2w>
+ <d1ep3>Train B</d1ep3>
+ <b9lca>Evaluate A</b9lca>
+ <5w01f>Evaluate B</5w01f>
+ <z4bun>ComparePerformance</z4bun>
+ <zj2pb>TriggerDeployment</zj2pb>
+ <Edge 1>( sourceNode="<Start>" | targetNode="<p83fd>" | task="From Start node to Ingesting Data" )</Edge 1>
+ <Edge 2>( sourceNode="<p83fd>" | targetNode="<sb9ba>" | task="Ingesting Data to Augment Data" )</Edge 2>
+ <Edge 3>( sourceNode="<sb9ba>" | targetNode="<bxt2w>" | task="Augmenting Data to Train model A" )</Edge 3>
+ <Edge 4>( sourceNode="<sb9ba>" | targetNode="<d1ep3>" | task="Augmenting Data to Train model B" )</Edge 4>
+ <Edge 5>( sourceNode="<bxt2w>" | targetNode="<b9lca>" | task="Training A to Evaluate model A" )</Edge 5>
+ <Edge 6>( sourceNode="<d1ep3>" | targetNode="<5w01f>" | task="Training B to Evaluate model B" )</Edge 6>
+ <Edge 7>( sourceNode="<b9lca>" | targetNode="<z4bun>" | task="Evaluate model A to Compare Performance" )</Edge 7>
+ <Edge 8>( sourceNode="<5w01f>" | targetNode="<z4bun>" | task="Evaluate model B to Compare Performance" )</Edge 8>
+ <Edge 9>( sourceNode="<z4bun>" | targetNode="<zj2pb>" | task="Compare Performance to Trigger Deployment" )</Edge 9>
+
+ #############################
+
+ Example 2:
+
+ Input: Project Description:
+
+ This project develops an automated crop health monitoring system using a modular machine learning pipeline. Data from satellite and drone imagery is ingested and preprocessed, followed by augmentation techniques to increase diversity and balance.
+ Synthetic data is generated to simulate various crop conditions, enhancing model robustness. The pipeline trains a deep learning model to classify crop health, evaluates its performance on key metrics such as accuracy and recall, and identifies areas for improvement.
+ Once performance benchmarks are met, the system is deployed for real-time crop monitoring, enabling farmers to make informed decisions and optimize agricultural productivity efficiently.
+
+ Input: Available Modules:
+
+ ['IngestData',
+ 'AugmentData',
+ 'GenerateData',
+ 'SearchData',
+ 'Train',
+ 'Evaluate',
+ 'TriggerDeployment',
+ 'ComparePerformance']
+
+ -------------------------------------
+
+ Flow Generated by LLM: (This will not be Input)
+
+ IngestData -> AugmentData
+ AugmentData -> GenerateData
+ GenerateData -> Train
+ Train -> Evaluate
+ Evaluate -> TriggerDeployment
+
+ ################
+
+ Output:
+
+ <p001>IngestData</p001>
+ <p002>AugmentData</p002>
+ <p003>GenerateData</p003>
+ <p004>Train</p004>
+ <p005>Evaluate</p005>
+ <p006>TriggerDeployment</p006>
+ <Edge 1>( sourceNode="<Start>" | targetNode="<p001>" | task="From Start to Ingesting Data" )</Edge 1>
+ <Edge 2>( sourceNode="<p001>" | targetNode="<p002>" | task="Ingesting Data to Augmenting Data" )</Edge 2>
+ <Edge 3>( sourceNode="<p002>" | targetNode="<p003>" | task="Augmenting Data to Generating Synthetic Data" )</Edge 3>
+ <Edge 4>( sourceNode="<p003>" | targetNode="<p004>" | task="Generating Data to Training Model" )</Edge 4>
+ <Edge 5>( sourceNode="<p004>" | targetNode="<p005>" | task="Training Model to Evaluating Performance" )</Edge 5>
+ <Edge 6>( sourceNode="<p005>" | targetNode="<p006>" | task="Evaluating Model to Triggering Deployment" )</Edge 6>
+
+ #############################
+
+ Example 3:
+
+ Input: Project Description:
+
+ This project implements a robust machine learning pipeline for iterative model improvement. Data is ingested and preprocessed, followed by augmentation to enhance diversity and balance.
+ An initial model is trained on the augmented data. The pipeline then applies further data augmentation techniques tailored to improve underperforming areas, followed by retraining the model for enhanced accuracy.
+ The improved model is rigorously evaluated on a test dataset to ensure it meets predefined performance benchmarks. Upon achieving the desired metrics, the best-performing model is deployed to production, ensuring reliable and efficient real-world performance tailored to the project's objectives.
+
+ Input: Available Modules:
+
+ ['IngestData',
+ 'AugmentData',
+ 'GenerateData',
+ 'SearchData',
+ 'Train',
+ 'Evaluate',
+ 'TriggerDeployment',
+ 'ComparePerformance']
+
+ -------------------------------------
+
+ Flow Generated by LLM: (This will not be Input)
+
+ IngestData -> AugmentData (Stage 1)
+ AugmentData (Stage 1) -> Train (Stage 1)
+ Train (Stage 1) -> AugmentData (Stage 2)
+ AugmentData (Stage 2) -> Train (Stage 2)
+ Train (Stage 2) -> Evaluate
+ Evaluate -> TriggerDeployment
+
+ ################
+
+ Output:
+
+ <m001>IngestData</m001>
+ <m002>AugmentData Stage 1</m002>
+ <m003>Train Stage 1</m003>
+ <m004>AugmentData Stage 2</m004>
+ <m005>Train Stage 2</m005>
+ <m006>Evaluate</m006>
+ <m007>TriggerDeployment</m007>
+ <Edge 1>( sourceNode="<Start>" | targetNode="<m001>" | task="From Start to Ingesting Data" )</Edge 1>
+ <Edge 2>( sourceNode="<m001>" | targetNode="<m002>" | task="Ingesting Data to Augmenting Data Stage 1" )</Edge 2>
+ <Edge 3>( sourceNode="<m002>" | targetNode="<m003>" | task="Augmenting Data Stage 1 to Training Stage 1" )</Edge 3>
+ <Edge 4>( sourceNode="<m003>" | targetNode="<m004>" | task="Training Stage 1 to Augmenting Data Stage 2" )</Edge 4>
+ <Edge 5>( sourceNode="<m004>" | targetNode="<m005>" | task="Augmenting Data Stage 2 to Training Stage 2" )</Edge 5>
+ <Edge 6>( sourceNode="<m005>" | targetNode="<m006>" | task="Training Stage 2 to Evaluating Model" )</Edge 6>
+ <Edge 7>( sourceNode="<m006>" | targetNode="<m007>" | task="Evaluating Model to Triggering Deployment" )</Edge 7>
+
+ #############################
+
+
+ When you give output, don't mention anything like 'Here is the list of Nodes and Edges extracted from the text:'. Just give the response straight away.
+
+
+
+ -Real Data-
+ ######################
+
+ Input: Project Description: {project_desc}
+
+ **Instructions**
+
+ 1. A list of modules available for building the pipeline. You must only use these modules to form the flow.
+ 2. Do not generate new names for modules. Only use whatever is available in the list.
+
+ Input: Available Modules:
+
+ {modules}
+
+ ######################
+
+ Output:
+ '''
+
+     @staticmethod
+     def _parse_llm_response(raw_response: str) -> Graph:
+
+         # first alternative captures <id>Name</id> node lines; the second captures
+         # edge lines in the exact task="..." form the prompt specifies
+         pattern = r'<([a-zA-Z0-9]+)>([^<]+)<\/\1>|sourceNode="<([^"]+)>"\s*\|\s*targetNode="<([^"]+)>"\s*\|\s*task="([^"]+)"'
+         nodes, edges = [], []
+
+         list_ = raw_response.split('\n')
+
+         for line in list_:
+             matches = re.findall(pattern, line)
+
+             try:
+                 for match in matches:
+                     if match[0]:
+
+                         nd = Node(node_id=match[0], name=match[1])
+                         nodes.append(nd)
+
+                     elif match[2]:
+
+                         edge = Edge(source=match[2], target=match[3], desc=match[4])
+                         edges.append(edge)
+
+             except Exception as e:
+                 print(f"Error parsing line: {line}, error: {e}")
+
+         return Graph(nodes=nodes, edges=edges)
+
+     @staticmethod
+     def _store_graph(graph_data: Graph):
+
+         nodes, edges = [], []
+         dict_ = {}
+
+         for node in graph_data.nodes:
+             dict_[node.node_id] = node.name
+
+             nodes.append({
+                 'node_id': node.node_id,
+                 'name': node.name
+             })
+
+         dict_['Start'] = 'StartNode'
+
+         for edge in graph_data.edges:
+             # the lookups double as validation: a KeyError here means an edge
+             # references a node id that was never declared
+             source_node = dict_[edge.source]
+             target_node = dict_[edge.target]
+             edges.append({
+                 'source': edge.source,
+                 'target': edge.target,
+                 'desc': edge.desc
+             })
+
+         json_obj = {'Nodes': nodes, 'Edges': edges}
+         # previously assembled but never used; return it so callers can persist it
+         return json_obj
+
+
+ helper = HelperClass()
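
Note: a quick sketch of the parser on a two-line sample in the prompt's output format (illustrative input; the regex only matches task="..." with no spaces around '=', which is why Example 1 above is normalized to that form):

    from src.utils.helper import helper

    sample = "\n".join([
        '<p001>IngestData</p001>',
        '<Edge 1>( sourceNode="<Start>" | targetNode="<p001>" | task="From Start to Ingesting Data" )</Edge 1>',
    ])
    graph = helper._parse_llm_response(sample)
    # graph.nodes -> [Node(node_id='p001', name='IngestData')]
    # graph.edges -> [Edge(source='Start', target='p001', desc='From Start to Ingesting Data')]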
src/utils/settings.py ADDED
@@ -0,0 +1,72 @@
+ import yaml
+ from typing import Optional, List
+ from pydantic_settings import BaseSettings
+ from llama_index.core import Settings
+ from llama_index.llms.openai import OpenAI
+ from llama_index.llms.ollama import Ollama
+ from llama_index.llms.groq import Groq
+ from llama_index.embeddings.openai import OpenAIEmbedding
+ from llama_index.embeddings.ollama import OllamaEmbedding
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from pydantic import Field
+
+
+ class ProjectSettings(BaseSettings):
+
+     # pydantic-settings matches these field names against the .env variables,
+     # so the per-field env= kwarg (a pydantic v1 idiom) is unnecessary
+     OPENAI_API_KEY: Optional[str] = Field(None)
+     GROQ_KEY: Optional[str] = Field(None)
+     config: Optional[dict] = None
+     moduleList: List[str] = Field(default_factory=list)
+
+     class Config:
+         env_file = '.env'
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.config = self._read_yaml_config()
+
+         if not self.config:
+             # pydantic v2's ValidationError cannot be raised from a plain message
+             raise ValueError("Config file could not be loaded")
+
+         self._instantiate_services()
+
+         self.moduleList = self.config.get('userPreference', {})['ModuleList']
+
+     def _instantiate_services(self):
+         llm_service = self.config.get('userPreference', {}).get('llmService', '').lower()
+
+         if llm_service == 'ollama':
+             self._initialize_ollama()
+         elif llm_service == 'openai':
+             self._initialize_openai()
+         elif llm_service == 'groq':
+             self._initialize_groq()
+         else:
+             raise ValueError(f"Invalid LLM service: {llm_service}")
+
+     def _initialize_ollama(self):
+         ollama_config = self.config.get('OllamaConfig', {})
+         Settings.llm = Ollama(base_url=ollama_config['baseURL'], model=ollama_config['llm'])
+         Settings.embed_model = OllamaEmbedding(base_url=ollama_config['baseURL'], model_name=ollama_config['llm'])
+
+     def _initialize_openai(self):
+         openai_config = self.config.get('OpenAIConfig', {})
+         Settings.llm = OpenAI(model=openai_config['llm'], api_key=self.OPENAI_API_KEY)
+         Settings.embed_model = OpenAIEmbedding(model=openai_config['embeddingModel'], api_key=self.OPENAI_API_KEY)
+
+     def _initialize_groq(self):
+         groq_config = self.config.get('GroqConfig', {})
+         Settings.llm = Groq(model=groq_config['llm'], api_key=self.GROQ_KEY)
+         Settings.embed_model = HuggingFaceEmbedding(model_name=groq_config['hfEmbedding'])
+
+     @staticmethod
+     def _read_yaml_config():
+         with open("config.yaml", "r") as file:
+             config = yaml.safe_load(file)
+
+         return config
+
+
+ settings = ProjectSettings()
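
Note: ProjectSettings reads its two keys from the git-ignored .env file referenced at the top of this commit; a hedged sketch of its expected shape (placeholder values, not real keys):

    # .env (placeholders)
    # OPENAI_API_KEY=sk-...
    # GROQ_KEY=gsk_...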
src/workflows/__init__.py ADDED
File without changes
src/workflows/analysis_workflow.py ADDED
@@ -0,0 +1,324 @@
+ from src.models.analysis_models import MLTaskType, ModelResponseStatus, RequirementsAnalysis, TechnicalResearch, ComponentType, ParameterSpec, ConfigParam, FunctionSpec, ComponentSpec, ImplementationPlan
+ from typing import Iterator, List, Optional
+ from phi.workflow import Workflow, RunResponse, RunEvent
+ from phi.agent import Agent
+ from phi.model.openai import OpenAIChat
+ from phi.storage.workflow.sqlite import SqlWorkflowStorage
+ from phi.storage.agent.sqlite import SqlAgentStorage
+ # from phi.memory.db.sqlite import SqliteMemoryDb
+ from phi.tools.duckduckgo import DuckDuckGo
+ from phi.utils.log import logger
+ from dotenv import load_dotenv
+ import json
+ import os
+
+ load_dotenv()
+ api_key = os.getenv("OPENAI_API_KEY")
+
+ class MLAnalysisWorkflow(Workflow):
+     """Workflow for analyzing ML business requirements and creating technical specifications"""
+
+     # Initialize agents
+     requirements_analyst: Agent = Agent(
+         name="ML Requirements Analyst",
+         model=OpenAIChat(id="gpt-4o", api_key=api_key),
+         description="Expert ML Solutions Architect specialized in analyzing business requirements",
+         instructions=[
+             "Analyze business problems and translate them into technical ML specifications.",
+             "1. Understand the core business problem and objectives",
+             "2. Identify the type of ML task required",
+             "3. Determine data requirements and constraints",
+             "4. List unclear points that need clarification",
+             "5. Specify areas that need technical research",
+             "Be precise in identifying what information is missing or needs validation."
+         ],
+         response_model=RequirementsAnalysis,
+         structured_outputs=True,
+         reasoning=True,
+         storage=SqlAgentStorage(
+             table_name="requirements_sessions",
+             db_file="storage/agent_storage.db"
+         ),
+         debug_mode=True,
+         # memory=AgentMemory(memory_db=requirements_db)
+     )
+
+     technical_researcher: Agent = Agent(
+         name="ML Technical Researcher",
+         model=OpenAIChat(id="gpt-4o", api_key=api_key),
+         description="ML Expert specialized in researching technical implementations",
+         tools=[DuckDuckGo(search=True, news=False)],
+         instructions=[
+             "Research and validate technical aspects of ML solutions.",
+             "1. Search for similar ML implementations and best practices",
+             "2. Find recommended models and architectures",
+             "3. Research typical hyperparameters and evaluation metrics",
+             "4. Look for implementation constraints and requirements",
+             "5. Validate technical feasibility",
+             "Provide sources for all technical information.",
+             "Focus on recent and reliable technical sources."
+         ],
+         response_model=TechnicalResearch,
+         structured_outputs=True,
+         prevent_hallucination=True,
+         reasoning=True,
+         storage=SqlAgentStorage(
+             table_name="researcher_sessions",
+             db_file="storage/agent_storage.db"
+         ),
+         debug_mode=True,
+         # memory=AgentMemory(memory_db=researcher_db)
+     )
+
+     writer: Agent = Agent(
+         model=OpenAIChat(id="gpt-4o", api_key=api_key),
+         instructions=[
+             # trailing spaces keep the implicitly concatenated strings readable
+             "You will be provided with lots of structured outputs. Your work is to display this "
+             "in a nicely formatted manner without changing any of the content. Present all the links "
+             "as they are, with explicitly mentioned hyperlinks. Do not change any content."
+         ],
+         markdown=True,
+     )
+
+     def validate_model_response(self, response: ModelResponseStatus) -> List[str]:
+         """Check for missing or incomplete fields in ModelResponseStatus"""
+         logger.info("Checking for missing or incomplete fields in ModelResponseStatus...")
+         missing_fields = []
+         response_dict = response.model_dump()
+
+         for field, value in response_dict.items():
+             if value == "..." or value == ["..."]:
+                 missing_fields.append(field)
+             elif isinstance(value, list) and not value:
+                 missing_fields.append(field)
+
+         return missing_fields
+
+     def analyze_requirements(self, user_query: str) -> Optional[RequirementsAnalysis]:
+         """Run the requirements analysis"""
+         logger.info("Analyzing requirements...")
+         prompt = f"Analyze this business problem and provide initial technical specifications: {user_query}"
+
+         analyse_stream = self.requirements_analyst.run(prompt)
+         return analyse_stream.content
+
+     def conduct_research(self, research_prompt: str) -> Optional[TechnicalResearch]:
+         """Run the technical research"""
+         logger.info("Conducting technical research...")
+
+         conduct_stream = self.technical_researcher.run(research_prompt)
+         return conduct_stream.content
+
+     def finalize_analysis(self, final_prompt: str) -> Optional[RequirementsAnalysis]:
+         """Run the final analysis"""
+         logger.info("Finalizing analysis...")
+
+         finalise_stream = self.requirements_analyst.run(final_prompt)
+         return finalise_stream.content
+
+     def write_requirements_post(self, requirements_results: RequirementsAnalysis) -> Iterator[RunResponse]:
+         """
+         Render the requirements analysis through the writer agent.
+         :param requirements_results: requirements_analyst response
+         :return: iterator for the workflow response
+         """
+         logger.info("Writing requirements analysis...")
+         writer_input = {"model_response": requirements_results.model_response.model_dump(),
+                         "unclear_points": requirements_results.unclear_points,
+                         "search_queries": requirements_results.search_queries,
+                         "business_understanding": requirements_results.business_understanding
+                         }
+         yield from self.writer.run(json.dumps(writer_input, indent=4), stream=True)
+
+     def write_research_post(self, research_results: TechnicalResearch) -> Iterator[RunResponse]:
+         """
+         Render the research findings through the writer agent.
+         :param research_results: research content
+         :return: iterator for the workflow response
+         """
+         logger.info("Writing research findings...")
+         writer_input = {"research_findings": research_results.research_findings,
+                         "reference_implementations": research_results.reference_implementations,
+                         "sources": research_results.sources
+                         }
+         yield from self.writer.run(json.dumps(writer_input, indent=4), stream=True)
+
+     def run(self, user_query: str) -> Iterator[RunResponse]:
+         """
+         Run the ML analysis workflow
+         Args:
+             user_query: Description of the business problem
+         """
+         try:
+             # Initial requirements analysis
+             requirements_result: Optional[RequirementsAnalysis] = self.analyze_requirements(user_query)
+             if not requirements_result:
+                 yield RunResponse(
+                     event=RunEvent.workflow_completed,
+                     content="Error: Requirements analysis failed to produce valid results."
+                 )
+                 return
+             logger.info("Writing initial requirements analysis...")
+             yield from self.write_requirements_post(requirements_result)
+
+             # Check what needs research
+             missing_fields = self.validate_model_response(requirements_result.model_response)
+             logger.info("Collected missing fields.")
+             search_queries = requirements_result.search_queries
+             logger.info("Collected search queries.")
+             unclear_points = requirements_result.unclear_points
+             logger.info("Collected unclear points.")
+             if missing_fields or search_queries:
+                 # Conduct technical research
+                 logger.info("Researching technical specifications...")
+                 research_prompt = (
+                     f"Research the following for this ML problem: {user_query}\n"
+                     f"Missing information needed for: {', '.join(missing_fields)}\n"
+                     f"Specific topics to research: {', '.join(search_queries)}\n"
+                     f"Points needing clarification: {', '.join(unclear_points)}\n"
+                     f"Current understanding: {requirements_result.business_understanding}"
+                 )
+                 logger.info("Conducting research...")
+                 research_result: Optional[TechnicalResearch] = self.conduct_research(research_prompt)
+                 logger.info("Sharing research findings...")
+                 yield from self.write_research_post(research_result)
+
+                 final_prompt = (
+                     f"Original problem: {user_query}\n"
+                     f"Research findings: {research_result.research_findings}\n"
+                     "Please provide final technical specifications incorporating this research."
+                 )
+                 logger.info("Obtaining final requirements")
+                 final_result: Optional[RequirementsAnalysis] = self.finalize_analysis(final_prompt)
+                 logger.info("Writing final requirements...")
+                 yield from self.write_requirements_post(final_result)
+
+         except Exception as e:
+             logger.error(f"Workflow error: {str(e)}")
+             yield RunResponse(
+                 event=RunEvent.workflow_completed,
+                 content=f"Error in analysis workflow: {str(e)}"
+             )
+
+
+ class MLImplementationPlanner(Workflow):
+     """Workflow for creating detailed ML implementation plans"""
+
+     # Initialize architect agent
+     architect: Agent = Agent(
+         name="ML System Architect",
+         model=OpenAIChat(id="gpt-4o", api_key=api_key),
+         description="Expert ML System Architect specialized in detailed implementation planning",
+         instructions=[
+             "Create detailed technical implementation plans for ML systems.",
+             "1. Break down the system into logical components",
+             "2. Define detailed function specifications for each component",
+             "3. Specify clear interfaces between components",
+             "4. Consider error handling and edge cases",
+             "5. Plan testing and deployment strategies",
+             "Be extremely specific about function signatures and component interactions.",
+             "Focus on maintainability and scalability in the design."
+         ],
+         response_model=ImplementationPlan,
+         structured_outputs=True,
+         reasoning=True,
+         storage=SqlAgentStorage(
+             table_name="architect_sessions",
+             db_file="storage/agent_storage.db"
+         ),
+         debug_mode=True,
+         # memory=AgentMemory(memory_db=architect_db)
+     )
+
+     writer: Agent = Agent(
+         model=OpenAIChat(id="gpt-4o", api_key=api_key),
+         instructions=[
+             "You will be provided with lots of structured outputs. Your work is to display this "
+             "in a nicely formatted manner without changing any of the content."
+         ],
+         markdown=True,
+     )
+
+     def create_implementation_plan(self, planning_prompt: str) -> Optional[ImplementationPlan]:
+         """Create the implementation plan"""
+         logger.info("Creating implementation plan...")
+         planning_stream = self.architect.run(planning_prompt)
+         return planning_stream.content
+
+     def validate_interfaces(self, validation_prompt: str) -> Optional[ImplementationPlan]:
+         """Validate the component interfaces"""
+         logger.info("Validating interfaces...")
+         architect_stream = self.architect.run(validation_prompt)
+         return architect_stream.content
+
+     def write_implementation_post(self, implementation_results: ImplementationPlan) -> Iterator[RunResponse]:
+         """
+         Render the implementation plan through the writer agent.
+         :param implementation_results: implementation plan results
+         :return: iterator for the workflow response
+         """
+         logger.info("Writing implementation plan...")
+         writer_input = {"components": [comp.model_dump() for comp in implementation_results.components],
+                         "system_requirements": implementation_results.system_requirements,
+                         "deployment_notes": implementation_results.deployment_notes,
+                         "testing_strategy": implementation_results.testing_strategy,
+                         "implementation_order": implementation_results.implementation_order
+                         }
+         yield from self.writer.run(json.dumps(writer_input, indent=4), stream=True)
+
+     def run(
+         self,
+         requirements_analysis: RequirementsAnalysis,
+         technical_research: Optional[TechnicalResearch] = None
+     ) -> Iterator[RunResponse]:
+         """
+         Create implementation plan based on requirements analysis and research
+
+         Args:
+             requirements_analysis: Results from requirements analysis
+             technical_research: Optional results from technical research
+         """
+         try:
+             logger.info("Starting planning workflow...")
+             # Prepare comprehensive prompt for the architect
+             planning_prompt = (
+                 f"Create a detailed implementation plan for this ML system.\n\n"
+                 f"Business Understanding:\n{requirements_analysis.business_understanding}\n\n"
+                 f"Technical Specifications:\n"
+                 f"- Task Type: {requirements_analysis.model_response.task}\n"
+                 f"- Models: {', '.join(requirements_analysis.model_response.models)}\n"
+                 f"- Data Requirements: {requirements_analysis.model_response.data_source}\n"
+                 f"- Technical Requirements: {requirements_analysis.model_response.technical_requirements}\n"
+             )
+             if technical_research:
+                 logger.info("Technical Research found! Modifying context...")
+                 planning_prompt += (
+                     f"\nResearch Findings:\n{technical_research.research_findings}\n"
+                     f"Reference Implementations:\n"
+                     f"{chr(10).join(technical_research.reference_implementations)}"
+                 )
+
+             # Generate implementation plan
+             logger.info("generating implementation plan...")
+             plan_result: Optional[ImplementationPlan] = self.create_implementation_plan(planning_prompt)
+             logger.info("writing implementation plan...")
+             yield from self.write_implementation_post(plan_result)
+
+             if plan_result:
+                 validation_prompt = (
+                     "Validate the interfaces between these components "
+                     "and ensure all dependencies are properly specified:\n"
+                     f"{plan_result.components}"
+                 )
+                 logger.info("validating results...")
+                 validate_result: Optional[ImplementationPlan] = self.validate_interfaces(validation_prompt)
+                 logger.info("writing validated implementation plan...")
+                 yield from self.write_implementation_post(validate_result)
+
+         except Exception as e:
+             logger.error(f"Error in planning workflow: {e}")
+             # yield RunResponse(
+             #     event=RunEvent.workflow_completed,
+             #     content=f"Error in planning workflow: {str(e)}"
+             # )
+
324
+
src/workflows/graph_workflow.py ADDED
@@ -0,0 +1,50 @@
+ from src.utils.settings import settings
+ from llama_index.core.settings import Settings
+ from src.models.workflow_graph import GetResponseEvent, ConstructGraphEvent
+ from llama_index.core.workflow import (
+     Event,
+     StartEvent,
+     StopEvent,
+     Workflow,
+     Context,
+     step
+ )
+ import asyncio
+ from src.utils.helper import helper
+
+ class DesignGraphWorkflow(Workflow):
+
+     @step
+     async def GetRawWorkflow(self, ctx: Context, ev: StartEvent) -> GetResponseEvent:
+         _project_description = ev._project_description
+         llm = ev.llm
+
+         modules = settings.moduleList
+         prompt = helper._build_prompt(
+             project_desc=_project_description,
+             modules=modules
+         )
+
+         try:
+             response = await llm.acomplete(prompt)
+         except Exception as e:
+             # raising a bare string is a TypeError; wrap it in an exception type
+             raise RuntimeError(f"Error: {e}") from e
+
+         return GetResponseEvent(project_details=_project_description, rawResponse=response.text)
+
+     # workflow steps must be coroutines for the @step decorator
+     @step
+     async def ExtractGraph(self, ctx: Context, ev: GetResponseEvent) -> ConstructGraphEvent:
+         raw_llm_response = ev.rawResponse
+         graph = helper._parse_llm_response(raw_response=raw_llm_response)
+
+         return ConstructGraphEvent(workflowGraph=graph)
+
+     @step
+     async def exportGraph(self, ctx: Context, ev: ConstructGraphEvent) -> StopEvent:
+         graph = ev.workflowGraph
+         helper._store_graph(graph_data=graph)
+
+         return StopEvent(result=graph)
+
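
Note: a minimal sketch of driving this workflow outside FastAPI; it mirrors src/routers/graph.py and assumes importing src.utils.settings has already initialized Settings.llm (the project description is illustrative):

    import asyncio
    from llama_index.core.settings import Settings
    from src.utils.settings import settings  # side effect: configures Settings.llm
    from src.workflows.graph_workflow import DesignGraphWorkflow

    async def main():
        wf = DesignGraphWorkflow(timeout=60, verbose=True)
        graph = await wf.run(_project_description="Ingest, train and deploy a churn model",
                             llm=Settings.llm)
        print(graph)

    asyncio.run(main())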
src/workflows/reasoning_modules.py ADDED
@@ -0,0 +1,155 @@
+ """
+ This file contains all the reasoning modules and checks in place for devising the ML interview.
+ Make changes here to add new or missing questions.
+ """
+
+ from llama_index.core.prompts import PromptTemplate
+
+ _REASONING_MODULES = [
+
+     # I. Identification of the Problem Space
+     "1. What is the specific business problem? What is the problem space owner trying to achieve?",
+     "2. Are there any stakeholders or individuals who are directly affected by the problem? What are their perspectives and needs?", # 22
+     "3. What are the kinds of ML problems which can be used for this problem space?",
+     "4. What are the identified outcomes? What are the expected outcomes from the ML solution? What are the long-term implications of this problem and its solutions? (8)", # 8
+     "5. How does it affect end users or stakeholders? How urgent is the problem?",
+     "6. What is the core issue or problem that needs to be addressed? (16)", # 16
+     "7. What are the underlying causes or factors contributing to the problem? (17)", # 17
+     "8. What are the alternative perspectives or viewpoints on this problem? (7)", # 7
+     "9. What resources (data, compute power, expertise etc.) are needed to tackle the problem effectively? (22)", # 22
+     "10. Do the stakeholders have someone with ML expertise in the team?",
+     "11. Is there anything the stakeholders absolutely do not want?",
+     "12. How can I simplify the problem so that it is easier to solve? (4)", # 4
+     "13. Are there any potential solutions or strategies that have been tried before? If yes, what were the outcomes and lessons learned? (18)", # 18
+     "14. Does the problem involve decision-making or planning, where choices need to be made under uncertainty or with competing objectives? (28)", # 28
+     "15. Is the problem a design challenge that requires creative solutions and innovation? (30)", # 30
+     "16. Is the problem time-sensitive or urgent, requiring immediate attention and action? (32)", # 32
+     "17. What kinds of solutions typically are produced for this kind of problem specification? (33)", # 33
+     "18. Given the problem specification and the current best solution, have a guess about other possible solutions. (34)", # 34
+     "19. What is the best way to modify this current best solution, given what you know about these kinds of problem specifications? (36)", # 36
+     "20. How could I devise an experiment to help figure out the nuances of the problem?",
+
+     # II. Data Assessment
+     "21. Is there any relevant data or information that can provide insights into the problem? If yes, what data sources are available, and how can they be analyzed? (20)", # 20
+     "22. Does the available data meet the quality, quantity, and diversity requirements for the ML solution?",
+     "23. Does the existing data suffer from any biases which can be mitigated to improve performance?",
+     "24. Is there any scope for applying additional data?",
+     "25. How might additional data be applied to this problem?",
+     "26. Adaptation: Are there any privacy or security concerns with the data? How do they align with compliance standards?",
+     "27. What outcomes can come out of this data?",
+     "28. What ML models can be potentially applied on this data?",
+     "29. Adaptation: Are there constraints on data collection, storage, or computation? What preprocessing, modeling, or analysis is needed?",
+     "30. Does the problem involve a physical constraint, such as limited resources, infrastructure, or space? (26)", # 26
+     "31. Is the problem an analytical one that requires data analysis, modeling, or optimization techniques? (29)", # 29
+     "32. Use Risk Analysis: Evaluate potential risks, uncertainties, and tradeoffs associated with different solutions or approaches to a problem. Emphasize assessing the potential consequences and likelihood of success or failure, and making informed decisions based on a balanced analysis of risks and benefits. (14)", # 14
+     "33. Is the data preprocessed or does it need to be processed?",
+     "34. Is there a script for processing this or a particular methodology they follow or does it need to be created?",
+     "35. Is there a preferred framework or output format that should be followed?",
+     "36. How might the data be manipulated according to the identified ML problem?",
+
+     # III. Defining Goals and Metrics
+     "37. What is the acceptable error rate?",
+     "38. How critical is the problem? Who does it affect?",
+     "39. What ML evaluations can be applied to assess performance here?",
+     "40. Are there any benchmarks on which this performance must be measured?",
+     "41. How urgent is the problem? What kind of latency or response time is acceptable?",
+     "42. How efficient should the solution be? What is the expected budget?",
+     "43. What is to be optimised against?",
+     "44. What’s more important - benchmarking metrics or performance in production?",
+     "45. Adaptation: What metrics (e.g., accuracy, precision, recall, business KPIs) best reflect the solution's success? How can they be tracked over time?",
+     "46. How could I measure progress on this problem? (3)", # 3
+     "47. How can progress or success in solving the problem be measured or evaluated? (23)", # 23
+     "48. What indicators or metrics can be used? (24)", # 24
+     "49. How can I break down this problem into smaller, more manageable parts? (9)", # 9
+
+     # IV. Experimentation and Prototyping
+     "50. What experiments can validate feasibility or assumptions? Implement step-by-step approaches to refine the ML model.",
+     "51. What machine learning solutions must be applied on this problem to iteratively improve performance?",
+     "52. How could I devise an experiment to help solve that problem? (1)", # 1
+     "53. Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress can be made. (2)", # 2
+     "54. Let’s think step by step. (38)", # 38
+     "55. Let’s make a step by step plan and implement it with good notation and explanation. (39)", # 39
+     "56. What assumptions about the data, model, or process need testing? What challenges might arise during training or deployment?",
+     "57. What are the key assumptions underlying this problem? (5)", # 5
+     "58. What are the potential risks and drawbacks of each solution? (6)", # 6
+     "59. What are the potential obstacles or challenges that might arise in solving this problem? (19)", # 19
+     "60. Is there a particular model the stakeholders are looking for or they want to experiment across multiple ones?",
+     "61. Do you have previously trained model or logs for this problem?",
+
+     # V. Ideation and Creativity
+     "62. Is there any out-of-the-box idea that can be executed on this data which aligns with the business needs?",
+     "63. Explore novel model architectures, feature engineering techniques, or data augmentation methods.",
+     "64. Try creative thinking, generate innovative and out-of-the-box ideas to solve the problem. Explore unconventional solutions, thinking beyond traditional boundaries, and encouraging imagination and originality. (11)", # 11
+     "65. Ignoring the current best solution, create an entirely new solution to the problem. (37)", # 37
+     "66. Challenge the status quo. Could a non-ML approach or an alternative ML model yield better results?",
+     "67. Let’s imagine the current best solution is totally wrong, what other ways are there to think about the problem specification? (35)", # 35
86
+
87
+ # VI. Common Reasoning Patterns
88
+ "68. How do interconnected components (data pipelines, business logic, ML models) influence each other? How can cross-functional collaboration improve the workflow?",
89
+ "69. Seek input and collaboration from others to solve the problem. Emphasize teamwork, open communication, and leveraging the diverse perspectives and expertise of a group to come up with effective solutions. (12)", # 12
90
+ "70. Use systems thinking: Consider the problem as part of a larger system and understanding the interconnectedness of various elements. Focuses on identifying the underlying causes, feedback loops, and interdependencies that influence the problem, and developing holistic solutions that address the system as a whole. (13)", # 13
91
+ "71. Regularly evaluate the workflow, identifying areas for improvement and applying lessons from previous projects.",
92
+ "72. Use Reflective Thinking: Step back from the problem, take the time for introspection and self-reflection. Examine personal biases, assumptions, and mental models that may influence problem-solving, and being open to learning from past experiences to improve future approaches. (15)", # 15
93
+ "73. Critical Thinking: This style involves analyzing the problem from different perspectives, questioning assumptions, and evaluating the evidence or information available. It focuses on logical reasoning, evidence-based decision-making, and identifying potential biases or flaws in thinking. (10)" , # 10
94
+
95
+ ]
+
+ # Collapse the module list into a single newline-joined string for prompt interpolation.
+ _REASONING_MODULES = "\n".join(_REASONING_MODULES)
+
+ SELECT_PROMPT_TEMPLATE = PromptTemplate(
+     "Given the task: {task}, which of the following reasoning modules are relevant? "
+     "Elaborate on why they are relevant."
+     "\n\n {reasoning_modules}"
+ )
+
+ ADAPT_PROMPT_TEMPLATE = PromptTemplate(
+     "Without working out the full solution, adapt the following reasoning modules to be specific to our task:"
+     "\n{selected_modules} \n\nOur task: \n{task}"
+ )
+
+ IMPLEMENT_PROMPT_TEMPLATE = PromptTemplate(
+     "Without working out the full solution, create an actionable reasoning structure for the task using these "
+     "adapted reasoning modules: \n{adapted_modules} \n\nTask Description: \n{task}"
+ )
+
+ REASONING_PROMPT_TEMPLATE = PromptTemplate(
+     "Using the following reasoning structure: {reasoning_structure}\n\n"
+     "Solve this task, providing your final answer: {task}"
+ )
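Taken together, the four templates form the Select → Adapt → Implement → Reason chain that SelfDiscoverWorkflow (in workflow_discovery.py below) steps through. A minimal standalone sketch of that chain, assuming any llama_index LLM; MockLLM is used here purely as a stand-in, and the task string is illustrative:

```python
from llama_index.core.llms import MockLLM

from src.workflows.reasoning_modules import (
    _REASONING_MODULES, SELECT_PROMPT_TEMPLATE, ADAPT_PROMPT_TEMPLATE,
    IMPLEMENT_PROMPT_TEMPLATE, REASONING_PROMPT_TEMPLATE,
)

llm = MockLLM()  # stand-in; swap for the OpenAI/Ollama/Groq LLM chosen in config.yaml
task = "Predict customer churn from tabular CRM data"  # illustrative task

# Stage 1: select relevant modules, adapt them to the task, build a reasoning structure
selected = str(llm.complete(SELECT_PROMPT_TEMPLATE.format(
    task=task, reasoning_modules=_REASONING_MODULES)))
adapted = str(llm.complete(ADAPT_PROMPT_TEMPLATE.format(
    task=task, selected_modules=selected)))
structure = str(llm.complete(IMPLEMENT_PROMPT_TEMPLATE.format(
    task=task, adapted_modules=adapted)))

# Stage 2: solve the task with the generated structure
answer = str(llm.complete(REASONING_PROMPT_TEMPLATE.format(
    task=task, reasoning_structure=structure)))
print(answer)
```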
+
+
+ # TODO: Add LLM-as-the-judge system prompt here
+
+ JUDGE_REQUIREMENT_PROMPT_TEMPLATE = PromptTemplate(
+     "You receive some data from a conversation with the user and your task is to determine whether or not they "
+     "have provided the following requirements during the conversation. Analyse the conversation to find the "
+     "requirements. Use only the provided context."
+     "\n\nContext for Judgement: \n{judging_context}"
+     "\n\nRequirements to be satisfied: "
+     """
+     class WorkflowSchema(BaseModel):
+         data_source: str
+         data_format: str
+         additional_data_requirement: bool
+         constraints: str
+         available_preprocess_script: bool
+         preprocess_script: str
+         recommended_preprocess_steps: List[str]
+         task: str
+         models: List[str]
+         hyperparameters: List[str]
+         eval_metrics: List[str]
+         deploy_constraints: str
+     """
+     "Reply only with a 0 or 1 value, corresponding to false or true, without providing any explanation."
+ )
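Because the judge prompt constrains the model to a bare 0/1, the caller only needs to format the context in and string-compare the reply. A hedged sketch; the `history` string is a made-up conversation dump, and MockLLM is again only a stand-in:

```python
from llama_index.core.llms import MockLLM

from src.workflows.reasoning_modules import JUDGE_REQUIREMENT_PROMPT_TEMPLATE

llm = MockLLM()  # stand-in LLM

# Hypothetical conversation dump; in the app this comes from the interview context.
history = "user: the data is a CSV of passenger records. user: optimise for recall."

prompt = JUDGE_REQUIREMENT_PROMPT_TEMPLATE.format(judging_context=history)
reply = str(llm.complete(prompt)).strip()
requirements_met = reply == "1"  # the prompt constrains the reply to a bare 0 or 1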
+
+ # TODO: Add initial interaction for user query system prompt here
+
+ ML_EXPERT_PROMPT_TEMPLATE = PromptTemplate(
+     "You're a machine learning expert, skilled at interpreting user needs from a discussion and turning them into an "
+     "end-to-end workflow according to the user requirements. From the provided context, analyse the problem from a "
+     "technical as well as a business point of view, and rephrase it to focus on the aspects requiring additional "
+     "clarification and requirements, so your input can be forwarded to the team asking the follow-up questions. Make "
+     "vague language and requirements more clear. \n\nUser query: {query}."
+ )
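This rephrasing step takes a raw, possibly vague user query and turns it into a sharper statement for the downstream interview. An illustrative usage sketch (the query string is invented):

```python
from llama_index.core.llms import MockLLM

from src.workflows.reasoning_modules import ML_EXPERT_PROMPT_TEMPLATE

llm = MockLLM()  # stand-in LLM

raw_query = "we want to somehow predict which users leave us, data is in spreadsheets"
rephrased = str(llm.complete(ML_EXPERT_PROMPT_TEMPLATE.format(query=raw_query)))
# `rephrased` can then be handed to the workflow that asks the follow-up questions.
```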
src/workflows/workflow_context.py ADDED
File without changes
src/workflows/workflow_discovery.py ADDED
@@ -0,0 +1,143 @@
+ """
+ This module uses the self-discover workflow hand-in-hand with data representation on an index built from the user
+ responses and prompt programming from DSPy. The Self-Discover workflow has two stages for any given task:
+
+ 1. Stage 1:
+     a. Select: selects a subset of reasoning modules.
+     b. Adapt: adapts the selected reasoning modules to the task.
+     c. Implement: produces a reasoning structure for the task.
+ 2. Stage 2:
+     Uses the generated reasoning structure for the task to generate an answer.
+ """
+
+ import asyncio
+
+ from llama_index.core.llms import LLM
+ from llama_index.core.workflow import Workflow, Context, StartEvent, StopEvent, step
+ from llama_index.core.settings import Settings
+ from src.models.discovery_events import GetModulesEvent, RefineModulesEvent, ReasoningStructureEvent
+ from src.workflows.reasoning_modules import _REASONING_MODULES, REASONING_PROMPT_TEMPLATE
+ from src.workflows.reasoning_modules import SELECT_PROMPT_TEMPLATE, ADAPT_PROMPT_TEMPLATE, IMPLEMENT_PROMPT_TEMPLATE
+ from src.workflows.reasoning_modules import JUDGE_REQUIREMENT_PROMPT_TEMPLATE
+
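The three custom events come from src.models.discovery_events, which is not part of this diff. Judging by how they are constructed below, their shape is presumably along these lines; this is an assumption, not the actual file:

```python
from llama_index.core.workflow import Event

class GetModulesEvent(Event):
    task: str
    modules: str

class RefineModulesEvent(Event):
    task: str
    refined_modules: str

class ReasoningStructureEvent(Event):
    task: str
    reasoning_structure: str
```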
+
+ class SelfDiscoverWorkflow(Workflow):
+     """Self-discover workflow."""
+
+     @step
+     async def get_modules(self, context: Context, event: StartEvent) -> GetModulesEvent:
+         """
+         Select the modules required for the task from the defined reasoning modules.
+         :param context: global context maintained for the user until StopEvent is emitted.
+         :param event: trigger event for this step, here the start of the workflow.
+         :return: pydantic GetModulesEvent with "task" and selected "modules".
+         """
+         task = event.get("task")
+         llm: LLM = event.get("llm")
+         await context.set("llm", llm)
+
+         prompt = SELECT_PROMPT_TEMPLATE.format(task=task, reasoning_modules=_REASONING_MODULES)
+         result = llm.complete(prompt)
+
+         return GetModulesEvent(task=task, modules=str(result))
+
+     @step
+     async def refine_modules(self, context: Context, event: GetModulesEvent) -> RefineModulesEvent:
+         """
+         Refines and adapts the selected subset of reasoning modules based on the task.
+         :param context: global context maintained for the user until StopEvent is emitted.
+         :param event: trigger event for the step, here completion of GetModulesEvent.
+         :return: pydantic RefineModulesEvent with "task" and "refined_modules".
+         """
+         task = event.task
+         modules = event.modules
+         llm: LLM = await context.get("llm")
+
+         prompt = ADAPT_PROMPT_TEMPLATE.format(task=task, selected_modules=modules)
+         result = llm.complete(prompt)
+
+         return RefineModulesEvent(task=task, refined_modules=str(result))
+
+     @step
+     async def create_reasoning_structure(self, context: Context, event: RefineModulesEvent) -> ReasoningStructureEvent:
+         """
+         Creates a reasoning structure for the task given the adapted reasoning modules.
+         :param context: global context maintained for the user until StopEvent is emitted.
+         :param event: trigger event for the step, here completion of RefineModulesEvent.
+         :return: pydantic ReasoningStructureEvent with "task" and "reasoning_structure".
+         """
+         task = event.task
+         refined_modules = event.refined_modules
+         llm: LLM = await context.get("llm")
+
+         prompt = IMPLEMENT_PROMPT_TEMPLATE.format(task=task, adapted_modules=refined_modules)
+         result = llm.complete(prompt)
+
+         return ReasoningStructureEvent(task=task, reasoning_structure=str(result))
+
+     @step
+     async def get_final_result(self, context: Context, event: ReasoningStructureEvent) -> StopEvent:
+         """
+         Gets the final result by solving the task with the generated reasoning structure.
+         :param context: global context maintained for the user until StopEvent is emitted.
+         :param event: trigger event for the step, here completion of ReasoningStructureEvent.
+         :return: StopEvent signal, last step of the workflow.
+         """
+         task = event.task
+         reasoning_structure = event.reasoning_structure
+         llm: LLM = await context.get("llm")
+
+         prompt = REASONING_PROMPT_TEMPLATE.format(task=task, reasoning_structure=reasoning_structure)
+         result = llm.complete(prompt)
+         await context.set("workflow_result", result)
+
+         return StopEvent(result=str(result))
+
+
+ class JudgeWorkflow(Workflow):
+     """Judgement workflow to decide whether further questions are necessary."""
+
+     @step
+     async def judge(self, context: Context, event: StartEvent) -> StopEvent:
+         """
+         Judges whether the conversation context satisfies all required workflow fields.
+         :param context: global context maintained for the user until StopEvent is emitted.
+         :param event: trigger event for this step, here the start of the workflow.
+         :return: StopEvent signal, last step of the workflow.
+         """
+         judging_context = event.get("judging_context")
+         llm: LLM = event.get("llm")
+         await context.set("llm", llm)
+
+         prompt = JUDGE_REQUIREMENT_PROMPT_TEMPLATE.format(judging_context=judging_context)
+         result = str(llm.complete(prompt)).strip()
+         # The prompt instructs a bare "0" or "1"; strip whitespace before comparing.
+         result = result != "0"
+
+         return StopEvent(result=result)
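A hedged usage sketch of the judge; the helper name and timeout are illustrative, and Workflow.run forwards keyword arguments into the StartEvent just as main() below does for SelfDiscoverWorkflow:

```python
from llama_index.core.llms import LLM

from src.workflows.workflow_discovery import JudgeWorkflow

async def needs_follow_up(conversation_text: str, llm: LLM) -> bool:
    """Return True when the judge reports that required fields are still missing."""
    judge = JudgeWorkflow(timeout=60)  # illustrative timeout, in seconds
    satisfied = await judge.run(judging_context=conversation_text, llm=llm)
    return not satisfied
```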
+
+
+ # runner for the workflow
+ async def main():
+     workflow = SelfDiscoverWorkflow()
+     # example task
+     predefined_task = (
+         "The user wants a step-by-step workflow for the Titanic survival prediction ML problem. "
+         "They want to understand whether a person had a chance of surviving the Titanic accident "
+         "depending on their background, ticket, gender and other passenger attributes. To perform this, they "
+         "want to design a machine learning workflow and derive conclusions from their data. The final "
+         "model should be able to predict survive/die classes. The data has these features: "
+         "survival, ticket class, sex, age, siblings/spouses, parents/children, ticket, fare, cabin, embarked. "
+         "In case the problem is not clear at any point and you need more input from the user, share the current "
+         "workflow with the user and end with follow-up questions."
+     )
+     intermediate_result = await workflow.run(task=predefined_task, llm=Settings.llm)
+     print(str(intermediate_result))
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
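main() assumes Settings.llm has already been populated. As a sketch, wiring it to the OpenAI model named in config.yaml (assuming OPENAI_API_KEY is set in the environment) might look like:

```python
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI

# Matches OpenAIConfig.llm in config.yaml; requires OPENAI_API_KEY in the environment.
Settings.llm = OpenAI(model="gpt-4o-mini")
```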