visual-deepsearch

Sleeping

App Files Files Community

manu committed on Aug 20

Commit

82b43ed

verified ·

1 Parent(s): bf520fb

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -81

app.py CHANGED Viewed

@@ -125,6 +125,18 @@ def index_from_url(url: str) -> Tuple[str, str]:
     return status, local_path
 # =============================
 # MCP Tools
 # =============================
@@ -176,46 +188,6 @@ def search(query: str, k: int = 5) -> List[int]:
     return sorted(expanded)
-def get_pages(indices: List[int]) -> Dict[str, Any]:
-    """
-    Return page images (as data URLs) for the given 0-based indices.
-    MCP tool description:
-      - name: mcp_test_get_pages
-      - description: Given 0-based indices from mcp_test_search, return the corresponding page images as data URLs for vision reasoning.
-      - input_schema:
-          type: object
-          properties:
-            indices: {
-              type: array,
-              items: { type: integer, minimum: 0 },
-              description: "0-based page indices to fetch",
-            }
-          required: ["indices"]
-    Returns:
-      {"images": [{"index": int, "page": int, "image_url": str}], "count": int}
-    """
-    global images
-    indices = eval(indices)
-    print("indices to get", indices)
-    if not images:
-        return {"images": [], "count": 0}
-    uniq = sorted({i for i in indices if 0 <= i < len(images)})
-    payload = []
-    for idx in uniq:
-        im = images[idx]
-        b64 = encode_image_to_base64(im)
-        payload.append({
-            "index": idx,
-            "page": idx + 1,
-            "image_url": f"data:image/jpeg;base64,{b64}",
-        })
-    return {"images": payload, "count": len(payload)}
 # =============================
 # Gradio UI — Unified App
 # =============================
@@ -228,7 +200,7 @@ You are a PDF research agent with two tools:
 Policy & procedure:
   1) Break the user task into 1–4 targeted sub-queries (in English).
-  2) For each sub-query, call mcp_test_search to get indices; THEN immediately call mcp_get_pages with those indices to obtain the page images.
   3) Continue reasoning using ONLY the provided images. If info is insufficient, iterate: refine sub-queries and call the tools again. You may make further tool calls later in the conversation as needed.
 Grounding & citations:
@@ -269,6 +241,7 @@ def stream_agent(question: str,
     client = OpenAI(api_key=api_key)
     tools = [{
         "type": "mcp",
         "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
@@ -277,43 +250,128 @@ def stream_agent(question: str,
         "require_approval": require_approval or "never",
     }]
-    req_kwargs = dict(
-        model=model,
-        input=[
-            {"role": "system", "content": SYSTEM},
-            {"role": "user", "content": question},
-        ],
-        reasoning={"effort": "medium", "summary": "auto"},
-        tools=tools,
-    )
-    try:
-        with client.responses.stream(**req_kwargs) as stream:
-            for event in stream:
-                etype = getattr(event, "type", "")
-                if etype == "response.output_text.delta":
-                    final_text += event.delta
-                    yield final_text, summary_text, "\n".join(log_lines[-400:])
-                elif etype == "response.reasoning_summary_text.delta":
-                    summary_text += event.delta
-                    yield final_text, summary_text, "\n".join(log_lines[-400:])
-                elif etype in ("response.function_call_arguments.delta", "response.tool_call_arguments.delta"):
-                    # Show tool call argument deltas in the log for transparency
-                    log_lines.append(str(event.delta))
-                elif etype == "response.error":
-                    log_lines.append(f"[error] {getattr(event, 'error', '')}")
-                    yield final_text, summary_text, "\n".join(log_lines[-400:])
-            # finalize
-            _final = stream.get_final_response()
-            yield final_text, summary_text, "\n".join(log_lines[-400:])
-    except Exception as e:
-        yield f"❌ {e}", summary_text, "\n".join(log_lines[-400:])
 CUSTOM_CSS = """
@@ -434,11 +492,8 @@ def build_ui():
                 with gr.Column(scale=2):
                     output_text = gr.Textbox(label="Indices (0-based)", lines=12, placeholder="[0, 1, 2, ...]")
-                    output_payload = gr.Textbox(label="Indices (0-based)", lines=12, placeholder="[0, 1, 2, ...]")
             search_button.click(search, inputs=[query_box, k_slider], outputs=[output_text])
-            get_pages_button.click(get_pages, inputs=[output_text], outputs=[output_payload])
         with gr.Tab("3) Agent (Streaming)"):
             with gr.Row(equal_height=True):

     return status, local_path
+def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
+    """Convert page indices into OpenAI vision content parts.
+
+    Indices outside ``0 <= i < len(images)`` are dropped, duplicates are
+    collapsed, and the survivors are emitted in ascending order. Each part is
+    an ``input_image`` dict whose ``image_url`` is a ``data:image/jpeg;base64``
+    URL built from ``encode_image_to_base64`` applied to the page image.
+    """
+    # The bounds check stays inside the comprehension so that ``images`` is
+    # only inspected when there is at least one index to validate.
+    valid_pages = sorted({page for page in indices if 0 <= page < len(images)})
+    return [
+        {
+            "type": "input_image",
+            "image_url": f"data:image/jpeg;base64,{encode_image_to_base64(images[page])}",
+        }
+        for page in valid_pages
+    ]
 # =============================
 # MCP Tools
 # =============================
     return sorted(expanded)
 # =============================
 # Gradio UI — Unified App
 # =============================
 Policy & procedure:
   1) Break the user task into 1–4 targeted sub-queries (in English).
+  2) For each sub-query, call mcp_test_search to get indices; Once you receive the indices to use, print "Received" and stop generating. Images will be injected in your stream.
   3) Continue reasoning using ONLY the provided images. If info is insufficient, iterate: refine sub-queries and call the tools again. You may make further tool calls later in the conversation as needed.
 Grounding & citations:
     client = OpenAI(api_key=api_key)
+    prev_response_id: Optional[str] = None
     tools = [{
         "type": "mcp",
         "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
         "require_approval": require_approval or "never",
     }]
+    # seed pages once (optional)
+    seed_indices = search(question, k=5) or []
+    pending_indices = list(seed_indices)
+    def run_round(round_idx: int, attached_indices: List[int]):
+        """Stream one model round and report any page indices a tool returned.
+
+        This is a generator. While the stream is open it yields
+        ``(answer_text, reasoning_summary, log_tail)`` tuples, where
+        ``log_tail`` is the last 400 entries of ``log_lines`` joined by
+        newlines. Empty text fields are yielded as a single space.
+
+        Args:
+            round_idx: round number; round 1 sends ``question`` as the user
+                text, any other round sends "Continue with new pages.".
+            attached_indices: page indices whose images are appended to the
+                user message via ``_build_image_parts_from_indices``.
+
+        Returns (through ``StopIteration.value``):
+            A sorted, de-duplicated list of indices parsed from tool-result
+            events, or ``None`` when none were captured or the stream raised.
+
+        Side effect: rebinds the enclosing ``prev_response_id`` to the ``id``
+        of the final response so a later round can chain onto it.
+
+        NOTE(review): a plain ``for`` loop over this generator discards the
+        return value; the caller needs ``yield from`` (or explicit
+        ``StopIteration`` handling) to receive the indices.
+        """
+        nonlocal prev_response_id
+        assembled_text = ""
+        assembled_summary = ""
+        # Will hold the most recent indices returned by mcp_test_search in THIS stream
+        last_search_indices: List[int] = []
+        # Build user parts (attach any seed pages we already have)
+        parts: List[Dict[str, Any]] = [{"type": "input_text", "text": question if round_idx == 1 else "Continue with new pages."}]
+        parts += _build_image_parts_from_indices(attached_indices)
+        # First call includes system; follow-ups use previous_response_id
+        if prev_response_id:
+            req_input = [{"role": "user", "content": parts}]
+        else:
+            req_input = [
+                {"role": "system", "content": SYSTEM},
+                {"role": "user", "content": parts},
+            ]
+        # NOTE(review): ``model_name`` is not defined in the visible fragment
+        # (the replaced code used ``model``) — confirm it is in scope.
+        req_kwargs = dict(
+            model=model_name,
+            input=req_input,
+            reasoning={"effort": "medium", "summary": "auto"},
+            tools=tools,
+            store=True,
+        )
+        if prev_response_id:
+            req_kwargs["previous_response_id"] = prev_response_id
+        # Helper to try extracting a JSON int array from tool result text
+        def _maybe_parse_indices(chunk: str) -> List[int]:
+            import json, re
+            # Find the last bracketed JSON array in the chunk
+            # (the pattern matches "[...]" spans that contain no inner "]").
+            arrs = re.findall(r'\[[^\]]*\]', chunk)
+            # Walk candidates from last to first and return the first one that
+            # decodes to a list made up only of ints.
+            for s in reversed(arrs):
+                try:
+                    val = json.loads(s)
+                    # NOTE(review): bool is a subclass of int in Python, so a
+                    # JSON array of true/false also passes this check.
+                    if isinstance(val, list) and all(isinstance(x, int) for x in val):
+                        # The inner isinstance filter is redundant after all().
+                        return sorted({x for x in val if isinstance(x, int)})
+                except Exception:
+                    # Not valid JSON (e.g. a partial delta): try the previous candidate.
+                    pass
+            return []
+        tool_result_buffer = ""  # accumulate tool result deltas
+        try:
+            with client.responses.stream(**req_kwargs) as stream:
+                for event in stream:
+                    etype = getattr(event, "type", "")
+                    if etype == "response.output_text.delta":
+                        assembled_text += event.delta
+                        yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+                    elif etype == "response.reasoning_summary_text.delta":
+                        assembled_summary += event.delta
+                        yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+                    # Capture tool *arguments* in the log for transparency (optional)
+                    elif etype in ("response.function_call_arguments.delta", "response.tool_call_arguments.delta"):
+                        log_lines.append(str(event.delta))
+                    # ⬇️ NEW: capture tool *results* (indices JSON) from MCP
+                    # NOTE(review): confirm that the SDK actually emits event
+                    # types starting with "response.tool_result"; if it does
+                    # not, this branch never runs and no indices are captured.
+                    elif etype.startswith("response.tool_result"):
+                        # Different SDKs expose .delta or .output_text; handle both
+                        delta = getattr(event, "delta", "") or getattr(event, "output_text", "")
+                        if delta:
+                            tool_result_buffer += str(delta)
+                            # opportunistic parse so UI can progress early
+                            # (the whole accumulated buffer is re-scanned on every delta)
+                            parsed_now = _maybe_parse_indices(tool_result_buffer)
+                            if parsed_now:
+                                print(parsed_now)
+                                last_search_indices = parsed_now
+                                log_lines.append(f"[tool-result] indices={last_search_indices}")
+                                yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+                # Finalize, remember response id for follow-ups
+                _final = stream.get_final_response()
+                # getattr already falls back to None; the try/except is purely defensive.
+                try:
+                    prev_response_id = getattr(_final, "id", None)
+                except Exception:
+                    prev_response_id = None
+            # If the model produced search results this round, hand them back to the controller
+            # (as the generator's return value, without a final yield).
+            if last_search_indices:
+                return sorted(set(last_search_indices))
+            # Otherwise, just render whatever text we have
+            yield assembled_text or " ", assembled_summary or " ", "\n".join(log_lines[-400:])
+            return None
+        except Exception as e:
+            # Any failure while building or consuming the stream is logged and
+            # shown to the user instead of being propagated; the round then
+            # reports "no indices".
+            log_lines.append(f"[round {round_idx}] stream error: {e}")
+            yield f"❌ {e}", assembled_summary or "", "\n".join(log_lines[-400:])
+            return None
+    # Controller: iterate rounds until the model stops searching.
+    #
+    # run_round is a generator that *returns* the indices produced by the
+    # search tool (delivered through StopIteration.value). A plain ``for``
+    # loop discards that value, which previously left next_indices stuck at
+    # None and ended the conversation after the first round. ``yield from``
+    # re-yields every (final, summary, log) update to the UI and evaluates to
+    # the generator's return value.
+    max_rounds = 3
+    round_idx = 1
+    while round_idx <= max_rounds:
+        # Start a round with any pending images we already have
+        next_indices = yield from run_round(round_idx, pending_indices)
+        # No tool search results this round → we’re done
+        if not (isinstance(next_indices, list) and next_indices):
+            break
+        # The model called mcp_test_search and we got indices back; fetch those pages next.
+        # (We ignore the old pending_indices now—move to the model-chosen ones.)
+        pending_indices = next_indices
+        # Attach those pages in a **new** model call using previous_response_id
+        round_idx += 1
+    return
 CUSTOM_CSS = """
                 with gr.Column(scale=2):
                     output_text = gr.Textbox(label="Indices (0-based)", lines=12, placeholder="[0, 1, 2, ...]")
             search_button.click(search, inputs=[query_box, k_slider], outputs=[output_text])
         with gr.Tab("3) Agent (Streaming)"):
             with gr.Row(equal_height=True):