manu committed on
Commit a1627f5 · verified · 1 Parent(s): 40e26e1

Update app.py

Files changed (1): app.py (+63 -56)
app.py CHANGED
@@ -28,6 +28,10 @@ from openai import OpenAI
 # =============================
 api_key_env = os.getenv("OPENAI_API_KEY", "").strip()
 
+from datasets import load_dataset
+
+
+
 ds: List[torch.Tensor] = [] # page embeddings
 images: List[Image.Image] = [] # PIL images in page order
 current_pdf_path: Optional[str] = None
@@ -174,7 +178,7 @@ def image_search(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and return the page indexes as a list.
     MCP tool description:
-    - name: visual_deepsearch_image_search
+    - name: test_deepsearch_image_search
     - description: Search within a PDF document for the most relevant pages to answer a query.
     - input_schema:
         type: object
@@ -212,7 +216,7 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
     MCP tool description:
-    - name: visual_deepsearch_search_synthetize
+    - name: test_deepsearch_search_synthetize
     - description: Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
     - input_schema:
         type: object
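
Note: the two docstrings above double as the MCP tool metadata exposed by Gradio's MCP server; this hunk only renames the advertised tools from visual_deepsearch_* to test_deepsearch_*. As a rough illustration of what a client would see (the exact JSON Gradio emits is an assumption here, derived from the signature search_synthetize(query: str, k: int = 5)):

    # Hypothetical rendering of the advertised tool schema; field names follow
    # the MCP convention, but the exact output of Gradio's MCP server may differ.
    tool_schema = {
        "name": "test_deepsearch_search_synthetize",
        "description": "Search within a PDF document for the most relevant pages "
                       "to answer a query and synthetizes a short grounded answer.",
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "k": {"type": "integer", "default": 5},
            },
            "required": ["query"],
        },
    }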
@@ -227,13 +231,13 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
         ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
     """
     top_k_indices = image_search(query, k)
-    expanded = set(top_k_indices)
-    for i in top_k_indices:
-        expanded.add(i - 1)
-        expanded.add(i + 1)
-    expanded = {i for i in expanded if 0 <= i < len(images)}
-    expanded = sorted(expanded)
-    expanded = expanded if len(expanded) < 20 else sorted(top_k_indices)
+    # expanded = set(top_k_indices)
+    # for i in top_k_indices:
+    #     expanded.add(i - 1)
+    #     expanded.add(i + 1)
+    # expanded = {i for i in expanded if 0 <= i < len(images)}
+    # expanded = sorted(expanded)
+    expanded = top_k_indices
 
 
     # Build gallery results with 1-based page numbering
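
Note: this hunk replaces the neighbor expansion with the raw hits (expanded = top_k_indices); the same logic is also commented out in stream_agent further down. If it is ever re-enabled, the removed lines amount to this helper (expand_with_neighbors is a hypothetical name, not a function in the app):

    from typing import List

    def expand_with_neighbors(indices: List[int], n_pages: int, cap: int = 20) -> List[int]:
        # Add each hit's previous and next page for context, drop out-of-range
        # pages, and fall back to the raw hits if the set grows past `cap`.
        expanded = set(indices)
        for i in indices:
            expanded.add(i - 1)
            expanded.add(i + 1)
        expanded = {i for i in expanded if 0 <= i < n_pages}
        return sorted(expanded) if len(expanded) < cap else sorted(set(indices))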
@@ -270,12 +274,12 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
 
 SYSTEM1 = (
     """
-You are a PDF research agent with a single tool: visual_deepsearch_image_search(query: string, k: int).
+You are a PDF research agent with a single tool: test_deepsearch_image_search(query: string, k: int).
 Act iteratively:
 1) If you are given images, analyze the images received to find information you were looking for. If you are condident that you have all the information needed for a complete response, provide a final answer. Most often, you should run new search calls using the tool to find additional missing information.
 2) To run new searches, split the query into 1–3 focused sub-queries. You can use the potentially provided page images to help you ask relevant followup queries. Subqueries should be asked as natural language questions, not just keywords.
-3) For each sub-query, call visual_deepsearch_image_search (k=5 by default; increase to up to 10 if you need to go deep).
-4) You will receive the output of visual_deepsearch_image_search as a list of indices corresponding to page numbers. Print the page numbers out and stop generating. An external system will take over and convert the indices into image for you.
+3) For each sub-query, call test_deepsearch_image_search (k=5 by default; increase to up to 10 if you need to go deep).
+4) You will receive the output of test_deepsearch_image_search as a list of indices corresponding to page numbers. Print the page numbers out and stop generating. An external system will take over and convert the indices into image for you.
 5) Back to step 1. Analyze the images received to find information you were looking for. If you are condident that you have all the information needed for a complete response, provide a final answer. Otherwise run new search calls using the tool to find additional missing information.
 
 Workflow:
@@ -290,10 +294,10 @@ Deliverable:
 
 
 SYSTEM2 = """
-You are a PDF research agent with a single tool: visual_deepsearch_search_synthetize(query: string, k: int).
+You are a PDF research agent with a single tool: test_deepsearch_search_synthetize(query: string, k: int).
 Act iteratively:
 1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions, not just keywords.
-2) For each sub-query, call visual_deepsearch_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
+2) For each sub-query, call test_deepsearch_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
 3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 
 Grounding & citations:
@@ -342,7 +346,7 @@ def stream_agent(question: str,
 
     visual_reasoning: bool = True if "Visual Reasoning" in visual_reasoning else False
 
-    allowed_tools = "visual_deepsearch_image_search" if visual_reasoning else "visual_deepsearch_search_synthetize"
+    allowed_tools = "test_deepsearch_image_search" if visual_reasoning else "test_deepsearch_search_synthetize"
     SYSTEM= SYSTEM1 if visual_reasoning else SYSTEM2
 
     if not api_key:
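
Note: allowed_tools holds the single tool name the selected mode may call. One plausible way such a restriction is enforced with the OpenAI client imported at the top of the file is the hosted-MCP tool of the Responses API (a sketch only; the server URL, label, and model are illustrative, and stream_agent's actual wiring is not shown in this diff):

    from openai import OpenAI

    client = OpenAI()
    response = client.responses.create(
        model="gpt-4.1",                       # illustrative model choice
        instructions=SYSTEM,                   # SYSTEM1 or SYSTEM2, chosen above
        input=question,
        tools=[{
            "type": "mcp",
            "server_label": "deepsearch",
            "server_url": "https://<host>/gradio_api/mcp/",  # see launch() comment below
            "allowed_tools": [allowed_tools],  # restrict the model to one tool
            "require_approval": "never",
        }],
    )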
@@ -498,12 +502,12 @@ def stream_agent(question: str,
         if next_indices and visual_reasoning:
             # Neighbor expansion for context
             base = set(next_indices)
-            expanded = set(base)
-            for i in base:
-                expanded.add(i - 1)
-                expanded.add(i + 1)
-            expanded = {i for i in expanded if 0 <= i < len(images)}
-            pending_indices = sorted(expanded) if len(expanded) < 20 else sorted(base)
+            # expanded = set(base)
+            # for i in base:
+            #     expanded.add(i - 1)
+            #     expanded.add(i + 1)
+            # expanded = {i for i in expanded if 0 <= i < len(images)}
+            pending_indices = sorted(base)
             round_idx += 1
             continue
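
Note: pending_indices is later handed to _build_image_parts_from_indices (named in an earlier hunk header), which is what implements step 4 of SYSTEM1: the model prints page indices and stops, and the driver converts them into image inputs for the next round. A plausible sketch of that conversion, assuming Responses-API-style image parts (the real implementation is not shown in this diff):

    import base64
    import io
    from typing import Any, Dict, List

    def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
        # Encode each selected PIL page (from the module-level `images` list)
        # as a base64 data URL the model can consume as an image input.
        parts: List[Dict[str, Any]] = []
        for i in indices:
            buf = io.BytesIO()
            images[i].save(buf, format="PNG")
            b64 = base64.b64encode(buf.getvalue()).decode("ascii")
            parts.append({"type": "input_image",
                          "image_url": f"data:image/png;base64,{b64}"})
        return parts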
 
@@ -592,41 +596,41 @@ def build_ui():
     )
 
     # ---- Tab 1: Index & Preview
-    with gr.Tab("1) Index & Preview"):
-        with gr.Row():
-            with gr.Column(scale=1):
-                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-                index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
-                url_box = gr.Textbox(
-                    label="Or index from URL",
-                    placeholder="https://example.com/file.pdf",
-                    value="",
-                )
-                index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
-                status_box = gr.Textbox(label="Status", interactive=False)
-            with gr.Column(scale=2):
-                pdf_view = PDF(label="PDF Preview")
-
-    # wiring
-    def handle_upload(file):
-        global current_pdf_path
-        if file is None:
-            return "Please upload a PDF.", None
-        path = getattr(file, "name", file)
-        status = index_from_path(path)
-        current_pdf_path = path
-        return status, path
-
-    def handle_url(url: str):
-        global current_pdf_path
-        if not url or not url.lower().endswith(".pdf"):
-            return "Please provide a direct PDF URL ending in .pdf", None
-        status, path = index_from_url(url)
-        current_pdf_path = path
-        return status, path
-
-    index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
-    index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
+    # with gr.Tab("1) Index & Preview"):
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+    #             index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
+    #             url_box = gr.Textbox(
+    #                 label="Or index from URL",
+    #                 placeholder="https://example.com/file.pdf",
+    #                 value="",
+    #             )
+    #             index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
+    #             status_box = gr.Textbox(label="Status", interactive=False)
+    #         with gr.Column(scale=2):
+    #             pdf_view = PDF(label="PDF Preview")
+
+    # # wiring
+    # def handle_upload(file):
+    #     global current_pdf_path
+    #     if file is None:
+    #         return "Please upload a PDF.", None
+    #     path = getattr(file, "name", file)
+    #     status = index_from_path(path)
+    #     current_pdf_path = path
+    #     return status, path
+
+    # def handle_url(url: str):
+    #     global current_pdf_path
+    #     if not url or not url.lower().endswith(".pdf"):
+    #         return "Please provide a direct PDF URL ending in .pdf", None
+    #     status, path = index_from_url(url)
+    #     current_pdf_path = path
+    #     return status, path
+
+    # index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
+    # index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
 
     # ---- Tab 2: Ask (Direct — returns indices)
     with gr.Tab("2) Direct Search"):
@@ -712,6 +716,9 @@ def build_ui():
 
 if __name__ == "__main__":
     demo = build_ui()
+    images = load_dataset("vidore/esg_reports_human_labeled_v2", "corpus", split="test")["image"]
+    print("Indexing")
+    print(index_gpu(images))
     # mcp_server=True exposes this app's MCP endpoint at /gradio_api/mcp/
     # We keep the MCP server available, but the agent never uses MCP to pass images.
     demo.queue(max_size=5).launch(debug=True, mcp_server=True)
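
Note: with Tab 1 commented out, the corpus is now fixed: the __main__ block rebinds the module-level images list to the page images of a Hub dataset and indexes them once before serving. Standalone, the new startup path looks like this (index_gpu is the app's own embedding routine, not shown in this diff; it presumably populates the module-level ds embeddings):

    from datasets import load_dataset

    # The "corpus" config's test split carries one PIL image per PDF page.
    pages = load_dataset("vidore/esg_reports_human_labeled_v2", "corpus", split="test")["image"]
    print(f"Indexing {len(pages)} pages")
    print(index_gpu(pages))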
 