Update app.py
app.py
CHANGED
@@ -28,6 +28,10 @@ from openai import OpenAI
 # =============================
 api_key_env = os.getenv("OPENAI_API_KEY", "").strip()
 
+from datasets import load_dataset
+
+
+
 ds: List[torch.Tensor] = []  # page embeddings
 images: List[Image.Image] = []  # PIL images in page order
 current_pdf_path: Optional[str] = None
@@ -174,7 +178,7 @@ def image_search(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and return the page indexes as a list.
     MCP tool description:
-    - name:
+    - name: test_deepsearch_image_search
     - description: Search within a PDF document for the most relevant pages to answer a query.
     - input_schema:
         type: object
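The hunk is cut off below `type: object`, so the rest of the `input_schema` in the docstring is not visible here. For orientation only, a schema matching the signature `image_search(query: str, k: int = 5)` would plausibly continue along these lines (a sketch with inferred property names, not the file's actual text):

    - input_schema:
        type: object
        properties:
          query:
            type: string
            description: Natural-language question to search the PDF pages for.
          k:
            type: integer
            description: Number of top-ranked pages to return (default 5).
        required: [query]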
@@ -212,7 +216,7 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and synthesize a short grounded answer using only those pages.
     MCP tool description:
-    - name:
+    - name: test_deepsearch_search_synthetize
     - description: Search within a PDF document for the most relevant pages to answer a query and synthesize a short grounded answer using only those pages.
     - input_schema:
         type: object
@@ -227,13 +231,13 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
         ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
     """
     top_k_indices = image_search(query, k)
-    expanded = set(top_k_indices)
-    for i in top_k_indices:
-        expanded.add(i - 1)
-        expanded.add(i + 1)
-    expanded = {i for i in expanded if 0 <= i < len(images)}
-    expanded = sorted(expanded)
-    expanded =
+    # expanded = set(top_k_indices)
+    # for i in top_k_indices:
+    #     expanded.add(i - 1)
+    #     expanded.add(i + 1)
+    # expanded = {i for i in expanded if 0 <= i < len(images)}
+    # expanded = sorted(expanded)
+    expanded = top_k_indices
 
 
     # Build gallery results with 1-based page numbering
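The block disabled in this hunk (and again inside stream_agent below) is plain neighbor expansion: each hit also pulls in the page before and after it, clamped to the document bounds, so the answer is grounded in surrounding context. As a standalone sketch (the helper name is hypothetical, not part of app.py):

from typing import List

def expand_neighbors(indices: List[int], n_pages: int) -> List[int]:
    """Add the page before and after each hit, clamped to [0, n_pages)."""
    expanded = set(indices)
    for i in indices:
        expanded.add(i - 1)
        expanded.add(i + 1)
    return sorted(i for i in expanded if 0 <= i < n_pages)

expand_neighbors([3, 7], n_pages=10)  # -> [2, 3, 4, 6, 7, 8]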
@@ -270,12 +274,12 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
 
 SYSTEM1 = (
     """
-You are a PDF research agent with a single tool:
+You are a PDF research agent with a single tool: test_deepsearch_image_search(query: string, k: int).
 Act iteratively:
 1) If you are given images, analyze the images received to find the information you were looking for. If you are confident that you have all the information needed for a complete response, provide a final answer. Most often, you should run new search calls using the tool to find additional missing information.
 2) To run new searches, split the query into 1–3 focused sub-queries. You can use the potentially provided page images to help you ask relevant follow-up queries. Sub-queries should be asked as natural language questions, not just keywords.
-3) For each sub-query, call
-4) You will receive the output of
+3) For each sub-query, call test_deepsearch_image_search (k=5 by default; increase to up to 10 if you need to go deep).
+4) You will receive the output of test_deepsearch_image_search as a list of indices corresponding to page numbers. Print the page numbers out and stop generating. An external system will take over and convert the indices into images for you.
 5) Back to step 1. Analyze the images received to find the information you were looking for. If you are confident that you have all the information needed for a complete response, provide a final answer. Otherwise run new search calls using the tool to find additional missing information.
 
 Workflow:
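Step 4 of SYSTEM1 describes a stop-and-resume protocol: the model only names page indices and halts, and the host loop turns those indices into image inputs for the next turn. A minimal sketch of that driver pattern; parse_indices is a hypothetical helper, while _build_image_parts_from_indices is the real function from the hunk header:

import re

def parse_indices(text: str) -> list:
    """Pull page indices like '[3, 7, 12]' out of the model's text (hypothetical helper)."""
    m = re.search(r"\[([\d,\s]+)\]", text)
    return [int(x) for x in m.group(1).split(",")] if m else []

# Sketch of one loop turn:
# text = ...model output for this round...
# indices = parse_indices(text)
# if indices:
#     parts = _build_image_parts_from_indices(indices)  # page images as message parts
#     messages.append({"role": "user", "content": parts})
#     # next round: the model analyzes the returned pages (step 1)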
@@ -290,10 +294,10 @@ Deliverable:
 
 
 SYSTEM2 = """
-You are a PDF research agent with a single tool:
+You are a PDF research agent with a single tool: test_deepsearch_search_synthetize(query: string, k: int).
 Act iteratively:
 1) Split the user question into 1–4 focused sub-queries. Sub-queries should be asked as natural language questions, not just keywords.
-2) For each sub-query, call
+2) For each sub-query, call test_deepsearch_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
 3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 
 Grounding & citations:
@@ -342,7 +346,7 @@ def stream_agent(question: str,
 
     visual_reasoning: bool = True if "Visual Reasoning" in visual_reasoning else False
 
-    allowed_tools = "
+    allowed_tools = "test_deepsearch_image_search" if visual_reasoning else "test_deepsearch_search_synthetize"
     SYSTEM = SYSTEM1 if visual_reasoning else SYSTEM2
 
     if not api_key:
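Here allowed_tools is a plain string naming the one MCP tool the active mode may use; how it is consumed is outside this diff. For context, the OpenAI Responses API accepts this kind of filter on a remote MCP tool entry, so the wiring could look roughly like the sketch below (model name and server URL are placeholders, and the whole call is an assumption, not shown in app.py):

# Sketch: expose only the mode's single MCP tool to the model.
response = client.responses.create(
    model="gpt-4.1",  # placeholder
    input=conversation,
    tools=[{
        "type": "mcp",
        "server_label": "pdf_search",
        "server_url": "https://<space>/gradio_api/mcp/",  # endpoint path from app.py's own comment
        "allowed_tools": [allowed_tools],
        "require_approval": "never",
    }],
)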
@@ -498,12 +502,12 @@ def stream_agent(question: str,
             if next_indices and visual_reasoning:
                 # Neighbor expansion for context
                 base = set(next_indices)
-                expanded = set(base)
-                for i in base:
-                    expanded.add(i - 1)
-                    expanded.add(i + 1)
-                expanded = {i for i in expanded if 0 <= i < len(images)}
-                pending_indices = sorted(expanded)
+                # expanded = set(base)
+                # for i in base:
+                #     expanded.add(i - 1)
+                #     expanded.add(i + 1)
+                # expanded = {i for i in expanded if 0 <= i < len(images)}
+                pending_indices = sorted(base)
                 round_idx += 1
                 continue
 
@@ -592,41 +596,41 @@ def build_ui():
         )
 
         # ---- Tab 1: Index & Preview
-        with gr.Tab("1) Index & Preview"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-                    index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
-                    url_box = gr.Textbox(
-                        label="Or index from URL",
-                        placeholder="https://example.com/file.pdf",
-                        value="",
-                    )
-                    index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
-                    status_box = gr.Textbox(label="Status", interactive=False)
-                with gr.Column(scale=2):
-                    pdf_view = PDF(label="PDF Preview")
-
-            # wiring
-            def handle_upload(file):
-                global current_pdf_path
-                if file is None:
-                    return "Please upload a PDF.", None
-                path = getattr(file, "name", file)
-                status = index_from_path(path)
-                current_pdf_path = path
-                return status, path
-
-            def handle_url(url: str):
-                global current_pdf_path
-                if not url or not url.lower().endswith(".pdf"):
-                    return "Please provide a direct PDF URL ending in .pdf", None
-                status, path = index_from_url(url)
-                current_pdf_path = path
-                return status, path
-
-            index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
-            index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
+        # with gr.Tab("1) Index & Preview"):
+        #     with gr.Row():
+        #         with gr.Column(scale=1):
+        #             pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        #             index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
+        #             url_box = gr.Textbox(
+        #                 label="Or index from URL",
+        #                 placeholder="https://example.com/file.pdf",
+        #                 value="",
+        #             )
+        #             index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
+        #             status_box = gr.Textbox(label="Status", interactive=False)
+        #         with gr.Column(scale=2):
+        #             pdf_view = PDF(label="PDF Preview")
+
+        #     # wiring
+        #     def handle_upload(file):
+        #         global current_pdf_path
+        #         if file is None:
+        #             return "Please upload a PDF.", None
+        #         path = getattr(file, "name", file)
+        #         status = index_from_path(path)
+        #         current_pdf_path = path
+        #         return status, path
+
+        #     def handle_url(url: str):
+        #         global current_pdf_path
+        #         if not url or not url.lower().endswith(".pdf"):
+        #             return "Please provide a direct PDF URL ending in .pdf", None
+        #         status, path = index_from_url(url)
+        #         current_pdf_path = path
+        #         return status, path
+
+        #     index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
+        #     index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
 
         # ---- Tab 2: Ask (Direct → returns indices)
         with gr.Tab("2) Direct Search"):
@@ -712,6 +716,9 @@ def build_ui():
 
 if __name__ == "__main__":
     demo = build_ui()
+    images = load_dataset("vidore/esg_reports_human_labeled_v2", "corpus", split="test")["image"]
+    print("Indexing")
+    print(index_gpu(images))
     # mcp_server=True exposes this app's MCP endpoint at /gradio_api/mcp/
     # We keep the MCP server available, but the agent never uses MCP to pass images.
     demo.queue(max_size=5).launch(debug=True, mcp_server=True)
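With the upload tab commented out, this startup block becomes the only indexing path: the ViDoRe ESG corpus is pulled from the Hub and embedded once at launch. A minimal standalone sketch of the same pattern (index_gpu is defined elsewhere in app.py; the dataset id is the one used above):

from datasets import load_dataset

corpus = load_dataset("vidore/esg_reports_human_labeled_v2", "corpus", split="test")
pages = corpus["image"]   # datasets decodes the Image feature to PIL.Image objects
print(f"Loaded {len(pages)} pages")
# index_gpu(pages)        # embed all pages before serving queries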