ingestor node added
- Dockerfile +2 -0
- app/main.py +98 -5
- params.cfg +3 -0
Dockerfile
CHANGED
@@ -1,3 +1,5 @@
+#CHATFED_ORCHESTRATOR
+
 FROM python:3.10-slim
 
 WORKDIR /app
app/main.py
CHANGED
@@ -1,9 +1,10 @@
-#
+#CHATFED_ORCHESTRATOR
+
 import gradio as gr
-from fastapi import FastAPI
+from fastapi import FastAPI, UploadFile, File, Form
 from langserve import add_routes
 from langgraph.graph import StateGraph, START, END
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, List
 from typing_extensions import TypedDict
 from pydantic import BaseModel
 from gradio_client import Client
@@ -14,12 +15,14 @@ import logging
 from contextlib import asynccontextmanager
 import threading
 from langchain_core.runnables import RunnableLambda
+import tempfile
 
 from utils import getconfig
 
 config = getconfig("params.cfg")
 RETRIEVER = config.get("retriever", "RETRIEVER")
 GENERATOR = config.get("generator", "GENERATOR")
+INGESTOR = config.get("ingestor", "INGESTOR")
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -29,11 +32,15 @@ logger = logging.getLogger(__name__)
 class GraphState(TypedDict):
     query: str
     context: str
+    ingestor_context: str
     result: str
     reports_filter: str
     sources_filter: str
     subtype_filter: str
     year_filter: str
+    file_content: Optional[bytes]
+    filename: Optional[str]
+    doc_id: Optional[str]
     metadata: Optional[Dict[str, Any]]
 
 class ChatFedInput(TypedDict):
@@ -44,6 +51,8 @@ class ChatFedInput(TypedDict):
     year_filter: Optional[str]
     session_id: Optional[str]
     user_id: Optional[str]
+    file_content: Optional[bytes]
+    filename: Optional[str]
 
 class ChatFedOutput(TypedDict):
     result: str
@@ -53,6 +62,76 @@ class ChatUIInput(BaseModel):
     text: str
 
 # Module functions
+def ingest_node(state: GraphState) -> GraphState:
+    """Process file through ingestor if file is provided"""
+    start_time = datetime.now()
+
+    # If no file provided, skip this step
+    if not state.get("file_content") or not state.get("filename"):
+        logger.info("No file provided, skipping ingestion")
+        return {"ingestor_context": "", "metadata": state.get("metadata", {})}
+
+    logger.info(f"Ingesting file: {state['filename']}")
+
+    try:
+        client = Client(INGESTOR)
+
+        # Create a temporary file to upload
+        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(state["filename"])[1]) as tmp_file:
+            tmp_file.write(state["file_content"])
+            tmp_file_path = tmp_file.name
+
+        try:
+            # Call the ingestor's /ingest endpoint
+            ingest_result = client.predict(
+                file=tmp_file_path,
+                api_name="/ingest"
+            )
+
+            # Extract doc_id from result
+            # The ingest endpoint returns an IngestResponse object
+            doc_id = ingest_result.get("doc_id") if isinstance(ingest_result, dict) else ingest_result
+
+            # Get processed context using doc_id
+            context_result = client.predict(
+                doc_id=doc_id,
+                max_chunks=10,  # configurable
+                api_name="/context"
+            )
+
+            ingestor_context = context_result.get("context", "") if isinstance(context_result, dict) else str(context_result)
+
+        finally:
+            # Clean up temporary file
+            os.unlink(tmp_file_path)
+
+        duration = (datetime.now() - start_time).total_seconds()
+        metadata = state.get("metadata", {})
+        metadata.update({
+            "ingestion_duration": duration,
+            "doc_id": doc_id,
+            "ingestor_context_length": len(ingestor_context) if ingestor_context else 0,
+            "ingestion_success": True
+        })
+
+        return {
+            "ingestor_context": ingestor_context,
+            "doc_id": doc_id,
+            "metadata": metadata
+        }
+
+    except Exception as e:
+        duration = (datetime.now() - start_time).total_seconds()
+        logger.error(f"Ingestion failed: {str(e)}")
+
+        metadata = state.get("metadata", {})
+        metadata.update({
+            "ingestion_duration": duration,
+            "ingestion_success": False,
+            "ingestion_error": str(e)
+        })
+        return {"ingestor_context": "", "metadata": metadata}
+
 def retrieve_node(state: GraphState) -> GraphState:
     start_time = datetime.now()
     logger.info(f"Retrieval: {state['query'][:50]}...")
@@ -95,10 +174,22 @@ def generate_node(state: GraphState) -> GraphState:
     logger.info(f"Generation: {state['query'][:50]}...")
 
     try:
+        # Combine retriever context with ingestor context
+        retrieved_context = state.get("context", "")
+        ingestor_context = state.get("ingestor_context", "")
+
+        combined_context = ""
+        if ingestor_context and retrieved_context:
+            combined_context = f"=== UPLOADED DOCUMENT CONTEXT ===\n{ingestor_context}\n\n=== RETRIEVED CONTEXT ===\n{retrieved_context}"
+        elif ingestor_context:
+            combined_context = f"=== UPLOADED DOCUMENT CONTEXT ===\n{ingestor_context}"
+        elif retrieved_context:
+            combined_context = retrieved_context
+
         client = Client(GENERATOR)
         result = client.predict(
             query=state["query"],
-            context=state["context"],
+            context=combined_context,
             api_name="/generate"
         )
 
@@ -126,9 +217,11 @@ def generate_node(state: GraphState) -> GraphState:
 
 # start the graph
 workflow = StateGraph(GraphState)
+workflow.add_node("ingest", ingest_node)
 workflow.add_node("retrieve", retrieve_node)
 workflow.add_node("generate", generate_node)
-workflow.add_edge(START, "retrieve")
+workflow.add_edge(START, "ingest")
+workflow.add_edge("ingest", "retrieve")
 workflow.add_edge("retrieve", "generate")
 workflow.add_edge("generate", END)
 compiled_graph = workflow.compile()
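For reference, a minimal sketch of how the new ingest-aware graph could be invoked from Python. The state keys mirror GraphState above; the file name, query, and the choice to pre-populate every key are illustrative assumptions, not part of this commit.

# Sketch (untested): invoke the compiled graph with an uploaded file.
# "report.pdf" and the query string are placeholder values.
with open("report.pdf", "rb") as f:
    final_state = compiled_graph.invoke({
        "query": "Summarise the key findings.",
        "context": "",
        "ingestor_context": "",
        "result": "",
        "reports_filter": "",
        "sources_filter": "",
        "subtype_filter": "",
        "year_filter": "",
        "file_content": f.read(),   # triggers ingest_node
        "filename": "report.pdf",
        "doc_id": None,
        "metadata": {},
    })
print(final_state["result"])

With file_content left as None, ingest_node returns an empty ingestor_context and the graph degrades to the previous retrieve-then-generate flow.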
params.cfg
CHANGED
@@ -3,3 +3,6 @@ RETRIEVER = giz/chatfed_retriever
 
 [generator]
 GENERATOR = giz/chatfed_generator
+
+[ingestor]
+INGESTOR = mtyrrell/chatfed_ingestor
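utils.getconfig is not part of this commit, but the config.get("section", "OPTION") calls in app/main.py match the standard library's configparser API. A plausible sketch, assuming it is a thin configparser wrapper:

# Hypothetical utils.getconfig (assumption: wraps configparser; actual code not shown in this diff).
import configparser

def getconfig(path: str) -> configparser.ConfigParser:
    config = configparser.ConfigParser()
    config.read(path)
    return config

config = getconfig("params.cfg")
print(config.get("ingestor", "INGESTOR"))  # -> mtyrrell/chatfed_ingestor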