Update app.py

app.py CHANGED
@@ -1,11 +1,12 @@
-# app.py —
+# app.py — ColPali + MCP (search-only) + GPT-5 follow-up responses
+# Images are injected by the app in new calls; no base64 is passed through MCP.
 
 import os
 import base64
 import tempfile
 from io import BytesIO
 from urllib.request import urlretrieve
-from typing import List, Tuple, Dict, Any
+from typing import List, Tuple, Dict, Any, Optional
 
 import gradio as gr
 from gradio_pdf import PDF
@@ -18,7 +19,7 @@ from tqdm import tqdm
 
 from colpali_engine.models import ColQwen2, ColQwen2Processor
 
-#
+# Streaming Responses API
 from openai import OpenAI
 
 
@@ -26,9 +27,10 @@ from openai import OpenAI
 # Globals & Config
 # =============================
 api_key_env = os.getenv("OPENAI_API_KEY", "").strip()
+
 ds: List[torch.Tensor] = []      # page embeddings
 images: List[Image.Image] = []   # PIL images in page order
-current_pdf_path: str
+current_pdf_path: Optional[str] = None
 
 device_map = (
     "cuda:0"
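Note: the `device_map` expression is cut off at this hunk boundary. A guard of this shape is typical (a sketch only — the diff does not show the app's actual fallback chain, and the `mps` branch is an assumption):

import torch

# Sketch: choose a device string for loading ColQwen2.
device_map = (
    "cuda:0"
    if torch.cuda.is_available()
    else ("mps" if torch.backends.mps.is_available() else "cpu")
)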
@@ -125,36 +127,13 @@ def index_from_url(url: str) -> Tuple[str, str]:
     return status, local_path
 
 
-def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
-    """Turn page indices into OpenAI vision content parts."""
-    parts: List[Dict[str, Any]] = []
-    seen = sorted({i for i in indices if 0 <= i < len(images)})
-    for idx in seen:
-        b64 = encode_image_to_base64(images[idx])
-        parts.append({
-            "type": "input_image",
-            "image_url": f"data:image/jpeg;base64,{b64}",
-        })
-    return parts
-
 # =============================
-#
+# Local Search (ColPali)
 # =============================
 
 def search(query: str, k: int = 5) -> List[int]:
     """
     Search within an indexed PDF and return ONLY the indices of the most relevant pages (0-based).
-
-    MCP tool description:
-    - name: mcp_test_search
-    - description: Search within the indexed PDF for the most relevant pages and return their 0-based indices only.
-    - input_schema:
-        type: object
-        properties:
-          query: {type: string, description: "User query in natural language."}
-          k: {type: integer, minimum: 1, maximum: 50, default: 5, description: "Number of top pages to retrieve (before neighbor expansion)."}
-        required: ["query"]
-
     Returns:
         List[int]: Sorted unique 0-based indices of pages to inspect (includes neighbor expansion).
     """
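Note: `_build_image_parts_from_indices` (relocated by this change; see the later hunk) depends on an `encode_image_to_base64` helper that never appears in this diff. Given the `data:image/jpeg;base64,…` URL it is interpolated into, a minimal compatible sketch — the app's real version may differ, e.g. in JPEG quality settings:

import base64
from io import BytesIO
from PIL import Image

def encode_image_to_base64(img: Image.Image) -> str:
    """Serialize a PIL image as a base64-encoded JPEG payload (no data: prefix)."""
    buf = BytesIO()
    img.convert("RGB").save(buf, format="JPEG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")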
@@ -166,16 +145,14 @@ def search(query: str, k: int = 5) -> List[int]:
     k = max(1, min(int(k), len(images)))
     device = _ensure_model_device()
 
-    # Encode query
     with torch.no_grad():
         batch_query = processor.process_queries([query]).to(model.device)
         embeddings_query = model(**batch_query)
         q_vecs = list(torch.unbind(embeddings_query.to("cpu")))
 
-    # Score and select top-k
     scores = processor.score(q_vecs, ds, device=device)
     top_k_indices = scores[0].topk(k).indices.tolist()
-    print(query, top_k_indices)
+    print("[search]", query, top_k_indices)
 
     # Neighbor expansion for context
     base = set(top_k_indices)
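Note: the neighbor expansion below the top-k selection widens each hit to include the adjacent pages, so the agent sees surrounding context. The same logic as a standalone, runnable sketch (function name is ours, not the app's):

def expand_with_neighbors(hits: list, n_pages: int) -> list:
    """Add each hit's previous and next page, clamped to the document bounds."""
    expanded = set(hits)
    for i in hits:
        expanded.add(i - 1)
        expanded.add(i + 1)
    return sorted(i for i in expanded if 0 <= i < n_pages)

# Hits [0, 7] in a 10-page PDF pull in pages 1, 6 and 8 as well:
assert expand_with_neighbors([0, 7], 10) == [0, 1, 6, 7, 8]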
@@ -183,65 +160,91 @@ def search(query: str, k: int = 5) -> List[int]:
     for i in base:
         expanded.add(i - 1)
         expanded.add(i + 1)
-    expanded = {i for i in expanded if 0 <= i < len(images)}
+    expanded = {i for i in expanded if 0 <= i < len(images)}
 
     return sorted(expanded)
 
 
+def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
+    """Turn page indices into OpenAI vision content parts."""
+    parts: List[Dict[str, Any]] = []
+    seen = sorted({i for i in indices if 0 <= i < len(images)})
+    for idx in seen:
+        b64 = encode_image_to_base64(images[idx])
+        parts.append({
+            "type": "input_image",
+            "image_url": f"data:image/jpeg;base64,{b64}",
+        })
+    return parts
+
+
 # =============================
-#
+# Agent System Prompt
 # =============================
 
 SYSTEM = (
     """
-You are a PDF research agent
-
-
-
-
-
-2) For each sub-query, call mcp_test_search to get indices; Once you receive the indices to use, print "Received" and stop generating. Images will be injected in your stream.
-3) Continue reasoning using ONLY the provided images. If info is insufficient, iterate: refine sub-queries and call the tools again. You may make further tool calls later in the conversation as needed.
-
-Grounding & citations:
-• Use ONLY information visible in the provided page images.
-• After any claim, cite as (p.<page>).
+You are a PDF research agent.
+
+Workflow:
+• When you need pages, call the tool: mcp_test_search(query: string, k: int).
+• The app will attach the images for the LAST search result you produced in this turn in a follow-up message.
+• Use ONLY the provided images for grounding and cite as (p.<page>).
 • If an answer is not present, say “Not found in the provided pages.”
 
-
-•
-• Do not refer to “the above” or “previous messages”.
+Deliverable:
+• Return a clear, standalone Markdown answer in the user's language. Include concise tables for lists of dates/items.
     """
 ).strip()
 
+
+# =============================
+# MCP config (search-only)
+# =============================
 DEFAULT_MCP_SERVER_URL = "https://manu-mcp-test.hf.space/gradio_api/mcp/"
 DEFAULT_MCP_SERVER_LABEL = "colpali_rag"
-DEFAULT_ALLOWED_TOOLS = "mcp_test_search
+DEFAULT_ALLOWED_TOOLS = "mcp_test_search"  # search-only; no get_pages
 
 
+# =============================
+# Streaming Agent (multi-round with previous_response_id)
+# =============================
+
 def stream_agent(question: str,
                  api_key: str,
-
+                 model_name: str,
                  server_url: str,
                  server_label: str,
                  require_approval: str,
                  allowed_tools: str):
     """
-
-
-
+    Multi-round streaming:
+      • Seed: optional local ColPali search on the user question to attach initial pages.
+      • Each round: open a GPT-5 stream with *attached images* (if any).
+      • If the model calls mcp_test_search and returns indices, we end the stream and
+        start a NEW API call with previous_response_id + the requested pages attached.
     """
-    final_text = "Answer:"
-    summary_text = "Reasoning:"
-    log_lines = ["Log"]
-
     if not api_key:
         yield "⚠️ **Please provide your OpenAI API key.**", "", ""
         return
 
+    if not images or not ds:
+        yield "⚠️ **Index a PDF first in tab 1.**", "", ""
+        return
+
     client = OpenAI(api_key=api_key)
 
+    # Optional seeding: attach some likely pages on round 1
+    try:
+        seed_indices = search(question, k=5) or []
+    except Exception as e:
+        yield f"❌ Search failed: {e}", "", ""
+        return
+
+    log_lines = ["Log", f"[seed] indices={seed_indices}"]
     prev_response_id: Optional[str] = None
+
+    # MCP tool routing (search-only)
     tools = [{
         "type": "mcp",
         "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
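Note: `DEFAULT_ALLOWED_TOOLS` is deliberately just `mcp_test_search` — the agent never fetches images over MCP. With the defaults above, the `tools` entry being assembled here (it is completed in the next hunk) should come out roughly like this (`allowed_tools` shown as a list; how the code splits the comma-separated textbox value is not visible in this diff):

tools = [{
    "type": "mcp",
    "server_label": "colpali_rag",
    "server_url": "https://manu-mcp-test.hf.space/gradio_api/mcp/",
    "allowed_tools": ["mcp_test_search"],  # search-only; images never cross MCP
    "require_approval": "never",
}]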
@@ -250,20 +253,34 @@ def stream_agent(question: str,
         "require_approval": require_approval or "never",
     }]
 
-    #
-
-
+    # Shared mutable state for each round
+    round_state: Dict[str, Any] = {
+        "last_search_indices": None,
+        "final_text": "",
+        "summary_text": "",
+    }
 
     def run_round(round_idx: int, attached_indices: List[int]):
+        """
+        Stream one round. If tool results (indices) arrive, store them in round_state["last_search_indices"].
+        """
         nonlocal prev_response_id
-        assembled_text = ""
-        assembled_summary = ""
-        # Will hold the most recent indices returned by mcp_test_search in THIS stream
-        last_search_indices: List[int] = []
 
-
-
+        round_state["last_search_indices"] = None
+        round_state["final_text"] = ""
+        round_state["summary_text"] = ""
+
+        # Build the user content for this round
+        parts: List[Dict[str, Any]] = []
+        if round_idx == 1:
+            parts.append({"type": "input_text", "text": question})
+        else:
+            parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages."})
+
         parts += _build_image_parts_from_indices(attached_indices)
+        if attached_indices:
+            pages_str = ", ".join(str(i + 1) for i in sorted(set(attached_indices)))
+            parts.append({"type": "input_text", "text": f"(Attached pages: {pages_str}). Use ONLY these images; cite as (p.X)."})
 
         # First call includes system; follow-ups use previous_response_id
         if prev_response_id:
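Note: given the round logic above, the round-1 user content for `attached_indices=[2, 3]` would look like this (question text illustrative, base64 payloads truncated):

parts = [
    {"type": "input_text", "text": "What does the paper conclude?"},
    {"type": "input_image", "image_url": "data:image/jpeg;base64,/9j/..."},  # page index 2
    {"type": "input_image", "image_url": "data:image/jpeg;base64,/9j/..."},  # page index 3
    {"type": "input_text",
     "text": "(Attached pages: 3, 4). Use ONLY these images; cite as (p.X)."},
]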
@@ -279,21 +296,20 @@ def stream_agent(question: str,
             input=req_input,
             reasoning={"effort": "medium", "summary": "auto"},
             tools=tools,
-            store=True,
+            store=True,  # persist conversation state on server
         )
         if prev_response_id:
             req_kwargs["previous_response_id"] = prev_response_id
 
-        # Helper
+        # Helper: parse a JSON array of ints from tool result text
        def _maybe_parse_indices(chunk: str) -> List[int]:
            import json, re
-            # Find the last bracketed JSON array in the chunk
            arrs = re.findall(r'\[[^\]]*\]', chunk)
            for s in reversed(arrs):
                try:
                    val = json.loads(s)
                    if isinstance(val, list) and all(isinstance(x, int) for x in val):
-                        return sorted({x for x in val if
+                        return sorted({x for x in val if 0 <= x < len(images)})
                except Exception:
                    pass
            return []
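Note: `_maybe_parse_indices` grabs the last JSON int-array it can find in the buffered tool output and clamps it to valid page indices. The same logic as a runnable standalone sketch:

import json
import re

def parse_indices(chunk: str, n_pages: int) -> list:
    """Return the last valid JSON int-array in chunk, filtered to [0, n_pages)."""
    for s in reversed(re.findall(r'\[[^\]]*\]', chunk)):
        try:
            val = json.loads(s)
            if isinstance(val, list) and all(isinstance(x, int) for x in val):
                return sorted({x for x in val if 0 <= x < n_pages})
        except ValueError:
            pass
    return []

# Tool results arrive as text; the indices are buried inside:
assert parse_indices('{"result": "[3, 4, 99]"}', n_pages=10) == [3, 4]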
@@ -306,74 +322,74 @@ def stream_agent(question: str,
                 etype = getattr(event, "type", "")
 
                 if etype == "response.output_text.delta":
-
-                    yield
+                    round_state["final_text"] += event.delta
+                    yield round_state["final_text"] or " ", round_state["summary_text"] or " ", "\n".join(log_lines[-400:])
 
                 elif etype == "response.reasoning_summary_text.delta":
-
-                    yield
+                    round_state["summary_text"] += event.delta
+                    yield round_state["final_text"] or " ", round_state["summary_text"] or " ", "\n".join(log_lines[-400:])
 
-                #
+                # Log tool call argument deltas (optional)
                 elif etype in ("response.function_call_arguments.delta", "response.tool_call_arguments.delta"):
-
+                    delta = getattr(event, "delta", None)
+                    if delta:
+                        log_lines.append(str(delta))
 
-                #
+                # Capture tool RESULT text and try to parse indices
                 elif etype.startswith("response.tool_result"):
-
-
-                    if
-                        tool_result_buffer += str(
-                    # opportunistic parse so UI can progress early
+                    print("here")
+                    delta_text = getattr(event, "delta", "") or getattr(event, "output_text", "")
+                    if delta_text:
+                        tool_result_buffer += str(delta_text)
                     parsed_now = _maybe_parse_indices(tool_result_buffer)
                     if parsed_now:
-
-
-
-                        yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+                        round_state["last_search_indices"] = parsed_now
+                        log_lines.append(f"[tool-result] indices={parsed_now}")
+                        yield round_state["final_text"] or " ", round_state["summary_text"] or " ", "\n".join(log_lines[-400:])
 
-            # Finalize
+            # Finalize this response; remember ID for follow-ups
             _final = stream.get_final_response()
             try:
                 prev_response_id = getattr(_final, "id", None)
             except Exception:
                 prev_response_id = None
 
-            #
-
-                return sorted(set(last_search_indices))
-
-            # Otherwise, just render whatever text we have
-            yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
-            return None
+            # Emit one last update after stream ends
+            yield round_state["final_text"] or " ", round_state["summary_text"] or " ", "\n".join(log_lines[-400:])
 
         except Exception as e:
             log_lines.append(f"[round {round_idx}] stream error: {e}")
-            yield f"❌ {e}",
-            return
+            yield f"❌ {e}", round_state["summary_text"] or "", "\n".join(log_lines[-400:])
+            return
 
-    # Controller: iterate rounds
+    # Controller: iterate rounds; if the model searched, attach those pages next
     max_rounds = 3
     round_idx = 1
+    pending_indices = list(seed_indices)
+
     while round_idx <= max_rounds:
-
-        next_indices = None
+        print(round_idx, pending_indices)
         for final_md, summary_md, log_md in run_round(round_idx, pending_indices):
             yield final_md, summary_md, log_md
 
-        # If the model
-
-        if
+        # If the model returned indices via the tool, use them in a fresh call
+        next_indices = round_state.get("last_search_indices") or []
+        if next_indices:
             pending_indices = next_indices
-            # Attach those pages in a **new** GPT-5 call using previous_response_id
             round_idx += 1
             continue
 
-        # No tool
+        # No further tool-driven retrieval → done
         break
 
     return
 
 
+
+# =============================
+# Gradio UI
+# =============================
+
 CUSTOM_CSS = """
 :root {
   --bg: #0e1117;
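Note: the controller chains rounds through the Responses API's `previous_response_id`, so earlier turns (including already-attached page images) never need to be resent. A minimal sketch of that mechanism, independent of this app (model name illustrative):

from openai import OpenAI

client = OpenAI()

first = client.responses.create(
    model="gpt-5",  # illustrative; the app takes the model name from the UI
    input="Summarize page 3.",
    store=True,     # server keeps the conversation state
)
follow_up = client.responses.create(
    model="gpt-5",
    input="Now compare it with page 4.",
    previous_response_id=first.id,  # continue from the stored response
)
print(follow_up.output_text)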
@@ -433,19 +449,20 @@ body {background: radial-gradient(1200px 600px at 20% -10%, rgba(124,58,237,.25)
 
 def build_ui():
     theme = gr.themes.Soft()
-    with gr.Blocks(title="ColPali PDF RAG +
+    with gr.Blocks(title="ColPali PDF RAG + Follow-up Responses", theme=theme, css=CUSTOM_CSS) as demo:
         gr.HTML(
             """
             <div class="app-header">
               <div class="icon">📚</div>
               <div>
-                <h1>ColPali PDF Search + Streaming Agent</h1>
-                <p>Index PDFs with ColQwen2
+                <h1>ColPali PDF Search + Streaming Agent (Follow-up Responses)</h1>
+                <p>Index PDFs with ColQwen2. The agent attaches images in follow-up GPT-5 calls; MCP is search-only.</p>
               </div>
             </div>
             """
         )
 
+        # ---- Tab 1: Index & Preview
         with gr.Tab("1) Index & Preview"):
             with gr.Row():
                 with gr.Column(scale=1):
@@ -482,19 +499,20 @@ def build_ui():
         index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
         index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
 
+        # ---- Tab 2: Ask (Direct — returns indices)
         with gr.Tab("2) Ask (Direct — returns indices)"):
             with gr.Row():
                 with gr.Column(scale=1):
                     query_box = gr.Textbox(placeholder="Enter your question…", label="Query", lines=4)
                     k_slider = gr.Slider(minimum=1, maximum=10, step=1, label="Number of results (k)", value=5)
                     search_button = gr.Button("🔍 Search", variant="primary")
-
-
+
                 with gr.Column(scale=2):
                     output_text = gr.Textbox(label="Indices (0-based)", lines=12, placeholder="[0, 1, 2, ...]")
 
             search_button.click(search, inputs=[query_box, k_slider], outputs=[output_text])
 
+        # ---- Tab 3: Agent (Streaming)
         with gr.Tab("3) Agent (Streaming)"):
             with gr.Row(equal_height=True):
                 with gr.Column(scale=1):
@@ -522,7 +540,7 @@ def build_ui():
                     )
                     with gr.Row():
                         server_url_box = gr.Textbox(
-                            label="MCP Server URL",
+                            label="MCP Server URL (search-only)",
                             value=DEFAULT_MCP_SERVER_URL,
                         )
                         server_label_box = gr.Textbox(
@@ -550,7 +568,15 @@ def build_ui():
 
         run_btn.click(
             stream_agent,
-            inputs=[
+            inputs=[
+                question,
+                api_key_box,
+                model_box,
+                server_url_box,
+                server_label_box,
+                require_approval_box,
+                allowed_tools_box,
+            ],
             outputs=[final_md, summary_md, log_md],
         )
 
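Note: `stream_agent` is a plain generator, which is all Gradio needs for streaming — each yielded 3-tuple refreshes the three bound output components. A minimal standalone illustration of the same pattern:

import time
import gradio as gr

def slow_count(n):
    # Every yield pushes a fresh value to the bound output component.
    text = ""
    for i in range(int(n)):
        text += f"step {i}\n"
        time.sleep(0.2)
        yield text

with gr.Blocks() as demo:
    n = gr.Number(value=5, label="Steps")
    out = gr.Textbox(label="Progress")
    gr.Button("Run").click(slow_count, inputs=[n], outputs=[out])

demo.launch()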
@@ -560,4 +586,5 @@ def build_ui():
 if __name__ == "__main__":
     demo = build_ui()
     # mcp_server=True exposes this app's MCP endpoint at /gradio_api/mcp/
+    # We keep the MCP server available, but the agent never uses MCP to pass images.
     demo.queue(max_size=5).launch(debug=True, mcp_server=True)
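Note: this is also why `search` keeps a docstring and type hints even after the MCP description block was removed from it — with `mcp_server=True`, Gradio derives the advertised tool schema from the function signature and docstring. Based on the description deleted above, the exposed tool should come out roughly as (a sketch, written as a Python dict):

mcp_test_search_schema = {
    "name": "mcp_test_search",
    "description": "Search within an indexed PDF and return ONLY the indices "
                   "of the most relevant pages (0-based).",
    "input_schema": {
        "type": "object",
        "properties": {
            "query": {"type": "string"},
            "k": {"type": "integer", "default": 5},
        },
        "required": ["query"],
    },
}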