sheikhDipta003 committed on
Commit 2097562 · Parent: 464856a

add all files

app.py ADDED
@@ -0,0 +1,112 @@
+import gradio as gr
+import json
+import asyncio
+import time
+from typing import Any, Dict
+from src.enrichment_agent import graph
+# from dotenv import load_dotenv
+# load_dotenv()
+
+import os
+TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+if TAVILY_API_KEY:
+    print("TAVILY_API_KEY found!")
+else:
+    print("TAVILY_API_KEY not found. Please check your Secrets configuration.")
+
+if OPENAI_API_KEY:
+    print("OPENAI_API_KEY found!")
+else:
+    print("OPENAI_API_KEY not found. Please check your Secrets configuration.")
+
+def extract_leaf_nodes(data, parent_key=''):
+    """Extract only the leaf nodes (keys without nested key-value pairs)."""
+    leaf_nodes = {}
+    for key, value in data.items():
+        new_key = f"{parent_key}.{key}" if parent_key else key
+        if isinstance(value, dict):
+            leaf_nodes.update(extract_leaf_nodes(value, new_key))
+        elif isinstance(value, list) and all(isinstance(item, dict) for item in value):
+            for idx, item in enumerate(value):
+                leaf_nodes.update(extract_leaf_nodes(item, f"{new_key}[{idx}]"))
+        else:
+            leaf_nodes[new_key] = value
+    return leaf_nodes
+
+def agent_response(schema_json: str, topic: str):
+    try:
+        # Parse the schema JSON string
+        schema = json.loads(schema_json)
+    except json.JSONDecodeError:
+        return "Invalid JSON schema.", 0.0
+
+    async def fetch_data(schema: Dict[str, Any], topic: str) -> Dict[str, Any]:
+        return await graph.ainvoke({
+            "topic": topic,
+            "extraction_schema": schema,
+        })
+
+    # Measure processing time
+    start_time = time.time()
+    result = asyncio.run(fetch_data(schema, topic))
+    processing_time = time.time() - start_time
+
+    # Extract the 'info' dictionary from the result
+    info = result.get('info', {})
+
+    # Extract only the leaf nodes for display
+    leaf_nodes = extract_leaf_nodes(info)
+
+    # Format the key-value pairs as Markdown with newlines
+    display_data = "\n\n".join(f"**{key}**: {value}" for key, value in leaf_nodes.items())
+
+    return display_data, processing_time
+
+# Define the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        <div style="text-align: center;">
+            <h1 style="color: #4CAF50;">🌟 Enrichment Agent Interface 🌟</h1>
+            <p style="font-size: 1.2em; color: #555;">
+                Dynamically extract and display information in a visually appealing format.
+            </p>
+        </div>
+        """
+    )
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 🛠 Input")
+            schema_input = gr.Textbox(
+                label="Extraction Schema (JSON)",
+                value=json.dumps({
+                    "type": "object",
+                    "properties": {
+                        "founder": {"type": "string", "description": "Name of the founder"},
+                        "websiteUrl": {"type": "string", "description": "Website URL"},
+                        "products_sold": {"type": "array", "items": {"type": "string"}}
+                    },
+                    "required": ["founder", "websiteUrl", "products_sold"]
+                }, indent=2),
+                lines=10,
+                placeholder="Enter the extraction schema in JSON format."
+            )
+            topic_input = gr.Textbox(label="Topic", placeholder="Enter the research topic, e.g., 'Google'")
+            submit_button = gr.Button("Submit 🚀")
+
+        with gr.Column(scale=2):
+            gr.Markdown("### 📊 Output")
+            output_display = gr.Markdown(label="Extracted Information")
+            time_display = gr.Textbox(label="Processing Time (seconds)", interactive=False)
+
+    def on_submit(schema, topic):
+        data, time_taken = agent_response(schema, topic)
+        return data, f"{time_taken:.2f}"
+
+    submit_button.click(on_submit, inputs=[schema_input, topic_input], outputs=[output_display, time_display])
+
+# Launch the interface
+demo.launch(share=True)
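
As a reading aid for extract_leaf_nodes above, here is a minimal sketch of the flattening it performs; the sample data is hypothetical and not part of the commit:

    # Hypothetical input: a flat key plus a list of dicts.
    sample = {
        "founder": "Larry Page",
        "products_sold": [{"name": "Search"}, {"name": "Ads"}],
    }
    print(extract_leaf_nodes(sample))
    # {'founder': 'Larry Page', 'products_sold[0].name': 'Search', 'products_sold[1].name': 'Ads'}
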
requirements.txt ADDED
Binary file (3.5 kB).
 
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (163 Bytes).
 
src/enrichment_agent/__init__.py ADDED
@@ -0,0 +1,5 @@
+"""Enrichment for a pre-defined schema."""
+
+from .graph import graph
+
+__all__ = ["graph"]
src/enrichment_agent/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (304 Bytes).

src/enrichment_agent/__pycache__/configuration.cpython-311.pyc ADDED
Binary file (3.44 kB).

src/enrichment_agent/__pycache__/graph.cpython-311.pyc ADDED
Binary file (11.1 kB).

src/enrichment_agent/__pycache__/prompts.cpython-311.pyc ADDED
Binary file (756 Bytes).

src/enrichment_agent/__pycache__/state.cpython-311.pyc ADDED
Binary file (2.99 kB).

src/enrichment_agent/__pycache__/tools.cpython-311.pyc ADDED
Binary file (4.23 kB).

src/enrichment_agent/__pycache__/utils.cpython-311.pyc ADDED
Binary file (2.21 kB).
 
src/enrichment_agent/configuration.py ADDED
@@ -0,0 +1,62 @@
+"""Define the configurable parameters for the agent."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field, fields
+from typing import Annotated, Optional
+
+from langchain_core.runnables import RunnableConfig, ensure_config
+
+from . import prompts
+
+
+@dataclass(kw_only=True)
+class Configuration:
+    """The configuration for the agent."""
+
+    model: Annotated[str, {"__template_metadata__": {"kind": "llm"}}] = field(
+        default="openai/gpt-3.5-turbo",
+        metadata={
+            "description": "The name of the language model to use for the agent. "
+            "Should be in the form: provider/model-name."
+        },
+    )
+
+    prompt: str = field(
+        default=prompts.MAIN_PROMPT,
+        metadata={
+            "description": "The main prompt template to use for the agent's interactions. "
+            "Expects two f-string arguments: {info} and {topic}."
+        },
+    )
+
+    max_search_results: int = field(
+        default=10,
+        metadata={
+            "description": "The maximum number of search results to return for each search query."
+        },
+    )
+
+    max_info_tool_calls: int = field(
+        default=3,
+        metadata={
+            "description": "The maximum number of times the Info tool can be called during a single interaction."
+        },
+    )
+
+    max_loops: int = field(
+        default=6,
+        metadata={
+            "description": "The maximum number of interaction loops allowed before the agent terminates."
+        },
+    )
+
+    @classmethod
+    def from_runnable_config(
+        cls, config: Optional[RunnableConfig] = None
+    ) -> Configuration:
+        """Load configuration w/ defaults for the given invocation."""
+        config = ensure_config(config)
+        configurable = config.get("configurable") or {}
+        _fields = {f.name for f in fields(cls) if f.init}
+        return cls(**{k: v for k, v in configurable.items() if k in _fields})
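
A quick usage sketch of from_runnable_config (the override values here are illustrative): only keys matching the dataclass fields are applied, so unrelated configurable entries are silently dropped:

    cfg = Configuration.from_runnable_config(
        {"configurable": {"model": "openai/gpt-4o", "max_loops": 3, "other": 1}}
    )
    assert cfg.model == "openai/gpt-4o" and cfg.max_loops == 3  # "other" is ignored
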
src/enrichment_agent/graph.py ADDED
@@ -0,0 +1,229 @@
+"""Define a data enrichment agent.
+
+Works with a chat model with tool calling support.
+"""
+
+import json
+from typing import Any, Dict, List, Literal, Optional, cast
+
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, ToolMessage
+from langchain_core.runnables import RunnableConfig
+from langgraph.graph import StateGraph
+from langgraph.prebuilt import ToolNode
+from pydantic import BaseModel, Field
+
+from . import prompts
+from .configuration import Configuration
+from .state import InputState, OutputState, State
+from .tools import scrape_website, search
+from .utils import init_model
+
+
+async def call_agent_model(
+    state: State, *, config: Optional[RunnableConfig] = None
+) -> Dict[str, Any]:
+    """Call the primary Language Model (LLM) to decide on the next research action.
+
+    This asynchronous function performs the following steps:
+    1. Initializes configuration and sets up the 'Info' tool, which is the user-defined extraction schema.
+    2. Prepares the prompt and message history for the LLM.
+    3. Initializes and configures the LLM with available tools.
+    4. Invokes the LLM and processes its response.
+    5. Handles the LLM's decision to either continue research or submit final info.
+    """
+    # Load configuration from the provided RunnableConfig
+    configuration = Configuration.from_runnable_config(config)
+
+    # Define the 'Info' tool, which is the user-defined extraction schema
+    info_tool = {
+        "name": "Info",
+        "description": "Call this when you have gathered all the relevant info",
+        "parameters": state.extraction_schema,
+    }
+
+    # Format the prompt defined in prompts.py with the extraction schema and topic
+    p = configuration.prompt.format(
+        info=json.dumps(state.extraction_schema, indent=2), topic=state.topic
+    )
+
+    # Create the messages list with the formatted prompt and the previous messages
+    messages = [HumanMessage(content=p)] + state.messages
+
+    # Initialize the raw model with the provided configuration and bind the tools
+    raw_model = init_model(config)
+    model = raw_model.bind_tools([scrape_website, search, info_tool], tool_choice="any")
+    response = cast(AIMessage, await model.ainvoke(messages))
+
+    # Initialize info to None
+    info = None
+
+    # Check if the response has tool calls
+    if response.tool_calls:
+        for tool_call in response.tool_calls:
+            if tool_call["name"] == "Info":
+                info = tool_call["args"]
+                break
+    if info is not None:
+        # The agent is submitting their answer;
+        # ensure it isn't erroneously attempting to simultaneously perform research
+        response.tool_calls = [
+            next(tc for tc in response.tool_calls if tc["name"] == "Info")
+        ]
+    response_messages: List[BaseMessage] = [response]
+    if not response.tool_calls:  # If the LLM didn't respect the tool_choice
+        response_messages.append(
+            HumanMessage(content="Please respond by calling one of the provided tools.")
+        )
+    return {
+        "messages": response_messages,
+        "info": info,
+        # Add 1 to the step count
+        "loop_step": 1,
+    }
+
+
+class InfoIsSatisfactory(BaseModel):
+    """Validate whether the current extracted info is satisfactory and complete."""
+
+    reason: List[str] = Field(
+        description="First, provide reasoning for why this is either good or bad as a final result. Must include at least 3 reasons."
+    )
+    is_satisfactory: bool = Field(
+        description="After providing your reasoning, provide a value indicating whether the result is satisfactory. If not, you will continue researching."
+    )
+    improvement_instructions: Optional[str] = Field(
+        description="If the result is not satisfactory, provide clear and specific instructions on what needs to be improved or added to make the information satisfactory."
+        " This should include details on missing information, areas that need more depth, or specific aspects to focus on in further research.",
+        default=None,
+    )
+
+
+async def reflect(
+    state: State, *, config: Optional[RunnableConfig] = None
+) -> Dict[str, Any]:
+    """Validate the quality of the data enrichment agent's output.
+
+    This asynchronous function performs the following steps:
+    1. Prepares the initial prompt using the main prompt template.
+    2. Constructs a message history for the model.
+    3. Prepares a checker prompt to evaluate the presumed info.
+    4. Initializes and configures a language model with structured output.
+    5. Invokes the model to assess the quality of the gathered information.
+    6. Processes the model's response and determines if the info is satisfactory.
+    """
+    p = prompts.MAIN_PROMPT.format(
+        info=json.dumps(state.extraction_schema, indent=2), topic=state.topic
+    )
+    last_message = state.messages[-1]
+    if not isinstance(last_message, AIMessage):
+        raise ValueError(
+            f"{reflect.__name__} expects the last message in the state to be an AI message with tool calls."
+            f" Got: {type(last_message)}"
+        )
+    messages = [HumanMessage(content=p)] + state.messages[:-1]
+    presumed_info = state.info
+    checker_prompt = """I am thinking of calling the info tool with the info below. \
+Is this good? Give your reasoning as well. \
+You can encourage the Assistant to look at specific URLs if that seems relevant, or do more searches.
+If you don't think it is good, you should be very specific about what could be improved.
+
+{presumed_info}"""
+    p1 = checker_prompt.format(presumed_info=json.dumps(presumed_info or {}, indent=2))
+    messages.append(HumanMessage(content=p1))
+    raw_model = init_model(config)
+    bound_model = raw_model.with_structured_output(InfoIsSatisfactory)
+    response = cast(InfoIsSatisfactory, await bound_model.ainvoke(messages))
+    if response.is_satisfactory and presumed_info:
+        return {
+            "info": presumed_info,
+            "messages": [
+                ToolMessage(
+                    tool_call_id=last_message.tool_calls[0]["id"],
+                    content="\n".join(response.reason),
+                    name="Info",
+                    additional_kwargs={"artifact": response.model_dump()},
+                    status="success",
+                )
+            ],
+        }
+    else:
+        return {
+            "messages": [
+                ToolMessage(
+                    tool_call_id=last_message.tool_calls[0]["id"],
+                    content=f"Unsatisfactory response:\n{response.improvement_instructions}",
+                    name="Info",
+                    additional_kwargs={"artifact": response.model_dump()},
+                    status="error",
+                )
+            ]
+        }
+
+
+def route_after_agent(
+    state: State,
+) -> Literal["reflect", "tools", "call_agent_model", "__end__"]:
+    """Schedule the next node after the agent's action.
+
+    This function determines the next step in the research process based on the
+    last message in the state. It handles three main scenarios:
+
+    1. Error recovery: If the last message is unexpectedly not an AIMessage.
+    2. Info submission: If the agent has called the "Info" tool to submit findings.
+    3. Continued research: If the agent has called any other tool.
+    """
+    last_message = state.messages[-1]
+
+    # If for some reason the last message is not an AIMessage (due to a bug or unexpected behavior elsewhere in the code),
+    # this ensures the system doesn't crash but instead tries to recover by calling the agent model again.
+    if not isinstance(last_message, AIMessage):
+        return "call_agent_model"
+    # If the "Info" tool was called, then the model provided its extraction output. Reflect on the result
+    if last_message.tool_calls and last_message.tool_calls[0]["name"] == "Info":
+        return "reflect"
+    # The last message is a tool call that is not "Info" (extraction output)
+    else:
+        return "tools"
+
+
+def route_after_checker(
+    state: State, config: RunnableConfig
+) -> Literal["__end__", "call_agent_model"]:
+    """Schedule the next node after the checker's evaluation.
+
+    This function determines whether to continue the research process or end it
+    based on the checker's evaluation and the current state of the research.
+    """
+    configurable = Configuration.from_runnable_config(config)
+    last_message = state.messages[-1]
+
+    if state.loop_step < configurable.max_loops:
+        if not state.info:
+            return "call_agent_model"
+        if not isinstance(last_message, ToolMessage):
+            raise ValueError(
+                f"{route_after_checker.__name__} expected a tool message. Received: {type(last_message)}."
+            )
+        if last_message.status == "error":
+            # Research deemed unsatisfactory
+            return "call_agent_model"
+        # It's great!
+        return "__end__"
+    else:
+        return "__end__"
+
+
+# Create the graph
+workflow = StateGraph(
+    State, input=InputState, output=OutputState, config_schema=Configuration
+)
+workflow.add_node(call_agent_model)
+workflow.add_node(reflect)
+workflow.add_node("tools", ToolNode([search, scrape_website]))
+workflow.add_edge("__start__", "call_agent_model")
+workflow.add_conditional_edges("call_agent_model", route_after_agent)
+workflow.add_edge("tools", "call_agent_model")
+workflow.add_conditional_edges("reflect", route_after_checker)
+
+graph = workflow.compile()
+graph.name = "ResearchTopic"
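
For orientation, a minimal invocation sketch of the compiled graph (the schema and topic are illustrative, and TAVILY_API_KEY and OPENAI_API_KEY must be set):

    import asyncio

    result = asyncio.run(
        graph.ainvoke(
            {
                "topic": "LangChain",
                "extraction_schema": {
                    "type": "object",
                    "properties": {"founder": {"type": "string"}},
                },
            }
        )
    )
    print(result["info"])  # filled in once reflect() accepts the extraction
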
src/enrichment_agent/prompts.py ADDED
@@ -0,0 +1,17 @@
+"""Default prompts used in this project."""
+
+MAIN_PROMPT = """You are doing web research on behalf of a user. You are trying to figure out this information:
+
+<info>
+{info}
+</info>
+
+You have access to the following tools:
+
+- `Search`: call a search tool and get back some results
+- `ScrapeWebsite`: scrape a website and get relevant notes about the given request. This will update the notes above.
+- `Info`: call this when you are done and have gathered all the relevant info
+
+Here is the information you have about the topic you are researching:
+
+Topic: {topic}"""
src/enrichment_agent/state.py ADDED
@@ -0,0 +1,88 @@
+"""State definitions.
+
+State is the interface between the graph and end user as well as the
+data model used internally by the graph.
+"""
+
+import operator
+from dataclasses import dataclass, field
+from typing import Annotated, Any, List, Optional
+
+from langchain_core.messages import BaseMessage
+from langgraph.graph import add_messages
+
+
+@dataclass(kw_only=True)
+class InputState:
+    """Input state defines the interface between the graph and the user (external API)."""
+
+    topic: str
+    "The topic for which the agent is tasked to gather information."
+
+    extraction_schema: dict[str, Any]
+    "The json schema defines the information the agent is tasked with filling out."
+
+    info: Optional[dict[str, Any]] = field(default=None)
+    "The info state tracks the current extracted data for the given topic, conforming to the provided schema. This is primarily populated by the agent."
+
+
+@dataclass(kw_only=True)
+class State(InputState):
+    """A graph's State defines three main things.
+
+    1. The structure of the data to be passed between nodes (which "channels" to read from/write to and their types)
+    2. Default values for each field
+    3. Reducers for the state's fields. Reducers are functions that determine how to apply updates to the state.
+    See [Reducers](https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers) for more information.
+    """
+
+    messages: Annotated[List[BaseMessage], add_messages] = field(default_factory=list)
+    """
+    Messages track the primary execution state of the agent.
+
+    Typically accumulates a pattern of:
+
+    1. HumanMessage - user input
+    2. AIMessage with .tool_calls - agent picking tool(s) to use to collect
+        information
+    3. ToolMessage(s) - the responses (or errors) from the executed tools
+
+        (... repeat steps 2 and 3 as needed ...)
+    4. AIMessage without .tool_calls - agent responding in unstructured
+        format to the user.
+
+    5. HumanMessage - user responds with the next conversational turn.
+
+        (... repeat steps 2-5 as needed ... )
+
+    Merges two lists of messages, updating existing messages by ID.
+
+    By default, this ensures the state is "append-only", unless the
+    new message has the same ID as an existing message.
+
+    Returns:
+        A new list of messages with the messages from `right` merged into `left`.
+        If a message in `right` has the same ID as a message in `left`, the
+        message from `right` will replace the message from `left`.
+    """
+
+    loop_step: Annotated[int, operator.add] = field(default=0)
+
+    # Feel free to add additional attributes to your state as needed.
+    # Common examples include retrieved documents, extracted entities, API connections, etc.
+
+
+@dataclass(kw_only=True)
+class OutputState:
+    """The response object for the end user.
+
+    This class defines the structure of the output that will be provided
+    to the user after the graph's execution is complete.
+    """
+
+    info: dict[str, Any]
+    """
+    A dictionary containing the extracted and processed information
+    based on the user's query and the graph's execution.
+    This is the primary output of the enrichment process.
+    """
src/enrichment_agent/tools.py ADDED
@@ -0,0 +1,74 @@
+"""Tools for data enrichment.
+
+This module contains functions that are directly exposed to the LLM as tools.
+These tools can be used for tasks such as web searching and scraping.
+Users can edit and extend these tools as needed.
+"""
+
+import json
+from typing import Any, Optional, cast
+
+import aiohttp
+from langchain_community.tools.tavily_search import TavilySearchResults
+from langchain_core.runnables import RunnableConfig
+from langchain_core.tools import InjectedToolArg
+from langgraph.prebuilt import InjectedState
+from typing_extensions import Annotated
+
+from .configuration import Configuration
+from .state import State
+from .utils import init_model
+
+
+async def search(
+    query: str, *, config: Annotated[RunnableConfig, InjectedToolArg]
+) -> Optional[list[dict[str, Any]]]:
+    """Query a search engine.
+
+    This function queries the web to fetch comprehensive, accurate, and trusted results. It's particularly useful
+    for answering questions about current events. Provide as much context in the query as needed to ensure high recall.
+    """
+    configuration = Configuration.from_runnable_config(config)
+    wrapped = TavilySearchResults(max_results=configuration.max_search_results)
+    result = await wrapped.ainvoke({"query": query})
+    return cast(list[dict[str, Any]], result)
+
+
+_INFO_PROMPT = """You are doing web research on behalf of a user. You are trying to find out this information:
+
+<info>
+{info}
+</info>
+
+You just scraped the following website: {url}
+
+Based on the website content below, jot down some notes about the website.
+
+<Website content>
+{content}
+</Website content>"""
+
+
+async def scrape_website(
+    url: str,
+    *,
+    state: Annotated[State, InjectedState],
+    config: Annotated[RunnableConfig, InjectedToolArg],
+) -> str:
+    """Scrape and summarize content from a given URL.
+
+    Returns:
+        str: A summary of the scraped content, tailored to the extraction schema.
+    """
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            content = await response.text()
+
+    p = _INFO_PROMPT.format(
+        info=json.dumps(state.extraction_schema, indent=2),
+        url=url,
+        content=content[:40_000],
+    )
+    raw_model = init_model(config)
+    result = await raw_model.ainvoke(p)
+    return str(result.content)
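
Because the injected arguments are ordinary keyword parameters, the tools can also be exercised directly outside the graph; a rough sketch (the query is illustrative and TAVILY_API_KEY must be set):

    import asyncio

    async def demo() -> None:
        results = await search(
            "LangGraph data enrichment",
            config={"configurable": {"max_search_results": 3}},
        )
        for r in results or []:
            print(r.get("url"))

    asyncio.run(demo())
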
src/enrichment_agent/utils.py ADDED
@@ -0,0 +1,34 @@
+"""Utility functions used in our graph."""
+
+from typing import Optional
+
+from langchain.chat_models import init_chat_model
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import AnyMessage
+from langchain_core.runnables import RunnableConfig
+
+from .configuration import Configuration
+
+
+def get_message_text(msg: AnyMessage) -> str:
+    """Get the text content of a message."""
+    content = msg.content
+    if isinstance(content, str):
+        return content
+    elif isinstance(content, dict):
+        return content.get("text", "")
+    else:
+        txts = [c if isinstance(c, str) else (c.get("text") or "") for c in content]
+        return "".join(txts).strip()
+
+
+def init_model(config: Optional[RunnableConfig] = None) -> BaseChatModel:
+    """Initialize the configured chat model."""
+    configuration = Configuration.from_runnable_config(config)
+    fully_specified_name = configuration.model
+    if "/" in fully_specified_name:
+        provider, model = fully_specified_name.split("/", maxsplit=1)
+    else:
+        provider = None
+        model = fully_specified_name
+    return init_chat_model(model, model_provider=provider)
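
The split in init_model implements the provider/model-name convention from Configuration; a small sketch of the parsing (model names illustrative):

    # "openai/gpt-3.5-turbo" -> provider "openai", model "gpt-3.5-turbo"
    # "gpt-3.5-turbo"        -> provider None (init_chat_model infers it)
    provider, model = "openai/gpt-3.5-turbo".split("/", maxsplit=1)
    assert (provider, model) == ("openai", "gpt-3.5-turbo")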