manu committed
Commit b78d4d3 · verified · 1 Parent(s): 1a2f284

Update app.py

Files changed (1)
  1. app.py +387 -175
app.py CHANGED
@@ -1,8 +1,11 @@
 
 
1
  import os
2
  import base64
3
  import tempfile
4
  from io import BytesIO
5
  from urllib.request import urlretrieve
 
6
 
7
  import gradio as gr
8
  from gradio_pdf import PDF
@@ -15,91 +18,65 @@ from tqdm import tqdm
15
 
16
  from colpali_engine.models import ColQwen2, ColQwen2Processor
17
 
18
- # -----------------------------
19
- # Globals
20
- # -----------------------------
21
- api_key = os.getenv("OPENAI_API_KEY", "") # <- use env var
22
- ds = [] # list of document embeddings (torch tensors)
23
- images = [] # list of PIL images (page-order)
24
- current_pdf_path = None # last (indexed) pdf path for preview
 
25
 
26
- # -----------------------------
27
- # Model & processor
28
- # -----------------------------
29
- device_map = "cuda:0" if torch.cuda.is_available() else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
30
 
31
  model = ColQwen2.from_pretrained(
32
  "vidore/colqwen2-v1.0",
33
  torch_dtype=torch.bfloat16,
34
  device_map=device_map,
35
- attn_implementation="flash_attention_2"
36
  ).eval()
 
37
  processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v1.0")
38
 
39
 
40
- # -----------------------------
41
  # Utilities
42
- # -----------------------------
 
43
  def encode_image_to_base64(image: Image.Image) -> str:
44
- """Encodes a PIL image to a base64 string."""
45
  buffered = BytesIO()
46
  image.save(buffered, format="JPEG")
47
  return base64.b64encode(buffered.getvalue()).decode("utf-8")
48
 
49
 
50
- def query_gpt(query: str, retrieved_images: list[tuple[Image.Image, str]]) -> str:
51
- """Calls OpenAI's GPT model with the query and image data."""
52
- if api_key and api_key.startswith("sk"):
53
- try:
54
- from openai import OpenAI
55
-
56
- base64_images = [encode_image_to_base64(im_caption[0]) for im_caption in retrieved_images]
57
- client = OpenAI(api_key=api_key.strip())
58
- PROMPT = """
59
- You are a smart assistant designed to answer questions about a PDF document.
60
- You are given relevant information in the form of PDF pages. Use them to construct a short response to the question, and cite your sources (page numbers, etc).
61
- If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
62
- Give detailed and extensive answers, only containing info in the pages you are given.
63
- You can answer using information contained in plots and figures if necessary.
64
- Answer in the same language as the query.
65
-
66
- Query: {query}
67
- PDF pages:
68
- """.strip()
69
-
70
- response = client.responses.create(
71
- model="gpt-5-mini",
72
- input=[
73
- {
74
- "role": "user",
75
- "content": (
76
- [{"type": "input_text", "text": PROMPT.format(query=query)}] +
77
- [{"type": "input_image",
78
- "image_url": f"data:image/jpeg;base64,{im}"}
79
- for im in base64_images]
80
- )
81
- }
82
- ],
83
- # max_tokens=500,
84
- )
85
- return response.output_text
86
- except Exception as e:
87
- print(e)
88
- return "OpenAI API connection failure. Verify that OPENAI_API_KEY is set and valid (sk-***)."
89
- return "Set OPENAI_API_KEY in your environment to get a custom response."
90
-
91
-
92
- def _ensure_model_device():
93
- dev = "cuda:0" if torch.cuda.is_available() else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
94
- if str(model.device) != dev:
95
- model.to(dev)
96
- return dev
97
 
98
-
99
- # -----------------------------
100
- # Indexing helpers
101
- # -----------------------------
102
- def convert_files(pdf_path: str) -> list[Image.Image]:
103
  """Convert a single PDF path into a list of PIL Images (pages)."""
104
  imgs = convert_from_path(pdf_path, thread_count=4)
105
  if len(imgs) >= 800:
@@ -107,8 +84,8 @@ def convert_files(pdf_path: str) -> list[Image.Image]:
107
  return imgs
108
 
109
 
110
- def index_gpu(imgs: list[Image.Image]) -> str:
111
- """Embed a list of images (pages) with ColPali and store in globals."""
112
  global ds, images
113
  device = _ensure_model_device()
114
 
@@ -132,17 +109,14 @@ def index_gpu(imgs: list[Image.Image]) -> str:
132
 
133
 
134
  def index_from_path(pdf_path: str) -> str:
135
- """Public: index a local PDF file path."""
136
  imgs = convert_files(pdf_path)
137
  return index_gpu(imgs)
138
 
139
 
140
- def index_from_url(url: str) -> tuple[str, str]:
141
  """
142
  Download a PDF from URL and index it.
143
-
144
- Returns:
145
- status message, saved pdf path
146
  """
147
  tmp_dir = tempfile.mkdtemp(prefix="colpali_")
148
  local_path = os.path.join(tmp_dir, "document.pdf")
@@ -151,142 +125,380 @@ def index_from_url(url: str) -> tuple[str, str]:
151
  return status, local_path
152
 
153
 
154
- # -----------------------------
155
- # Search (MCP tool-friendly)
156
- # -----------------------------
157
- def search(query: str, k: int = 5):
 
158
  """
159
- Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
160
 
161
  MCP tool description:
162
  - name: mcp_test_search
163
- - description: Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
164
  - input_schema:
165
  type: object
166
  properties:
167
  query: {type: string, description: "User query in natural language."}
168
- k: {type: integer, minimum: 1, maximum: 10, default: 5. description: "Number of top pages to retrieve."}
169
  required: ["query"]
170
 
171
- Args:
172
- query (str): Natural-language question to search for.
173
- k (int): Number of top results to return (1–10).
174
-
175
  Returns:
176
- ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
177
  """
178
  global ds, images
179
 
180
  if not images or not ds:
181
- return [], "No document indexed yet. Upload a PDF or load the sample, then run Search."
182
 
183
  k = max(1, min(int(k), len(images)))
184
  device = _ensure_model_device()
185
 
186
- print(query)
187
-
188
  # Encode query
189
- qs = []
190
  with torch.no_grad():
191
  batch_query = processor.process_queries([query]).to(model.device)
192
  embeddings_query = model(**batch_query)
193
- qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
194
 
195
  # Score and select top-k
196
- scores = processor.score(qs, ds, device=device)
197
  top_k_indices = scores[0].topk(k).indices.tolist()
198
 
199
- # Base set & neighbor expansion
200
  base = set(top_k_indices)
201
  expanded = set(base)
202
  for i in base:
203
  expanded.add(i - 1)
204
  expanded.add(i + 1)
 
205
 
206
- expanded = {i for i in expanded if i >= 0 and i<=len(images)}
207
-
208
- expanded_indices = sorted(expanded)
209
- print(top_k_indices, expanded_indices)
210
-
211
- # Build gallery results with 1-based page numbering
212
- results = []
213
- for idx in expanded_indices:
214
- page_num = idx + 1
215
- results.append((images[idx], f"Page {page_num}"))
216
-
217
- # Generate grounded response
218
- ai_response = query_gpt(query, results)
219
- print(ai_response)
220
- return ai_response
221
-
222
-
223
- # -----------------------------
224
- # Gradio UI callbacks
225
- # -----------------------------
226
- def handle_upload(file) -> tuple[str, str | None]:
227
- """Index a user-uploaded PDF file."""
228
- global current_pdf_path
229
- if file is None:
230
- return "Please upload a PDF.", None
231
- path = getattr(file, "name", file)
232
- status = index_from_path(path)
233
- current_pdf_path = path
234
- return status, path
235
-
236
-
237
- def handle_url(url: str) -> tuple[str, str | None]:
238
- """Index a PDF from URL (e.g., a sample)."""
239
- global current_pdf_path
240
- if not url or not url.lower().endswith(".pdf"):
241
- return "Please provide a direct PDF URL.", None
242
- status, path = index_from_url(url)
243
- current_pdf_path = path
244
- return status, path
245
-
246
-
247
- print("Uploading")
248
- print(handle_url("https://ecss.nl/wp-content/uploads/2025/05/ECSS-E-ST-40C-Rev.1(30April2025).pdf"))
249
-
250
- # -----------------------------
251
- # Gradio App
252
- # -----------------------------
253
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
254
- gr.Markdown("# ColPali: Efficient Document Retrieval with Vision Language Models (ColQwen2) 📚")
255
- gr.Markdown(
256
- """Demo to test ColQwen2 (ColPali) on PDF documents.
257
- ColPali is implemented from the [ColPali paper](https://arxiv.org/abs/2407.01449)."""
 
258
  )
259
 
260
- with gr.Row():
261
- # with gr.Column(scale=2):
262
- # gr.Markdown("## 1️⃣ Load a PDF")
263
- # pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
264
- # index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
265
- # url_box = gr.Textbox(
266
- # label="Or index from URL",
267
- # placeholder="https://example.com/file.pdf",
268
- # value="https://sist.sathyabama.ac.in/sist_coursematerial/uploads/SAR1614.pdf",
269
- # )
270
- # index_url_btn = gr.Button("🌐 Load Sample / From URL", variant="secondary")
271
- # status_box = gr.Textbox(label="Status", interactive=False)
272
- # pdf_view = PDF(label="PDF Preview")
273
-
274
- with gr.Column(scale=3):
275
- gr.Markdown("## 2️⃣ Search")
276
- query = gr.Textbox(placeholder="Enter your query here", label="Query")
277
- k_slider = gr.Slider(minimum=1, maximum=10, step=1, label="Number of results", value=5)
278
- search_button = gr.Button("🔍 Search", variant="primary")
279
- output_text = gr.Textbox(label="AI Response", placeholder="Generated response based on retrieved documents")
280
-
281
- # Wiring
282
- # index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
283
- # index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
284
- search_button.click(search, inputs=[query, k_slider], outputs=[output_text])
 
285
 
286
- if __name__ == "__main__":
287
- # Optional: pre-load the default sample at startup.
288
- # Comment these two lines if you prefer a "cold" start.
289
- # msg, path = index_from_url("https://sist.sathyabama.ac.in/sist_coursematerial/uploads/SAR1614.pdf")
290
- # print(msg, "->", path)
291
 
 
292
  demo.queue(max_size=5).launch(debug=True, mcp_server=True)
 
1
+ # app.py — Unified ColPali + MCP Agent (indices-only search, agent receives images)
2
+
3
  import os
4
  import base64
5
  import tempfile
6
  from io import BytesIO
7
  from urllib.request import urlretrieve
8
+ from typing import List, Tuple, Dict, Any
9
 
10
  import gradio as gr
11
  from gradio_pdf import PDF
 
18
 
19
  from colpali_engine.models import ColQwen2, ColQwen2Processor
20
 
21
+ # Optional (used by the streaming agent)
22
+ from openai import OpenAI
23
+
24
+
25
+ # =============================
26
+ # Globals & Config
27
+ # =============================
28
+ api_key_env = os.getenv("OPENAI_API_KEY", "").strip()
29
+ ds: List[torch.Tensor] = [] # page embeddings
30
+ images: List[Image.Image] = [] # PIL images in page order
31
+ current_pdf_path: str | None = None
32
+
33
+ device_map = (
34
+ "cuda:0"
35
+ if torch.cuda.is_available()
36
+ else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
37
+ )
38
 
39
 
40
+ # =============================
41
+ # Load Model & Processor
42
+ # =============================
43
  model = ColQwen2.from_pretrained(
44
  "vidore/colqwen2-v1.0",
45
  torch_dtype=torch.bfloat16,
46
  device_map=device_map,
47
+ attn_implementation="flash_attention_2",
48
  ).eval()
49
+
50
  processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v1.0")
51
 
52
 
53
+ # =============================
54
  # Utilities
55
+ # =============================
56
+
57
+ def _ensure_model_device() -> str:
58
+ dev = (
59
+ "cuda:0"
60
+ if torch.cuda.is_available()
61
+ else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
62
+ )
63
+ if str(model.device) != dev:
64
+ model.to(dev)
65
+ return dev
66
+
67
+
68
  def encode_image_to_base64(image: Image.Image) -> str:
69
+ """Encodes a PIL image to base64 (JPEG)."""
70
  buffered = BytesIO()
71
  image.save(buffered, format="JPEG")
72
  return base64.b64encode(buffered.getvalue()).decode("utf-8")
73
 
74
 
75
+ # =============================
76
+ # Indexing Helpers
77
+ # =============================
 
78
 
79
+ def convert_files(pdf_path: str) -> List[Image.Image]:
 
80
  """Convert a single PDF path into a list of PIL Images (pages)."""
81
  imgs = convert_from_path(pdf_path, thread_count=4)
82
  if len(imgs) >= 800:
 
84
  return imgs
85
 
86
 
87
+ def index_gpu(imgs: List[Image.Image]) -> str:
88
+ """Embed a list of images (pages) with ColQwen2 (ColPali) and store in globals."""
89
  global ds, images
90
  device = _ensure_model_device()
91
 
 
109
 
110
 
111
  def index_from_path(pdf_path: str) -> str:
 
112
  imgs = convert_files(pdf_path)
113
  return index_gpu(imgs)
114
 
115
 
116
+ def index_from_url(url: str) -> Tuple[str, str]:
117
  """
118
  Download a PDF from URL and index it.
119
+ Returns: (status_message, saved_pdf_path)
 
 
120
  """
121
  tmp_dir = tempfile.mkdtemp(prefix="colpali_")
122
  local_path = os.path.join(tmp_dir, "document.pdf")
 
125
  return status, local_path
126
 
127
 
128
+ # =============================
129
+ # MCP Tools
130
+ # =============================
131
+
132
+ def mcp_test_search(query: str, k: int = 5) -> List[int]:
133
  """
134
+ Search within an indexed PDF and return ONLY the indices of the most relevant pages (0-based).
135
 
136
  MCP tool description:
137
  - name: mcp_test_search
138
+ - description: Search within the indexed PDF for the most relevant pages and return their 0-based indices only.
139
  - input_schema:
140
  type: object
141
  properties:
142
  query: {type: string, description: "User query in natural language."}
143
+ k: {type: integer, minimum: 1, maximum: 50, default: 5, description: "Number of top pages to retrieve (before neighbor expansion)."}
144
  required: ["query"]
145
 
 
146
  Returns:
147
+ List[int]: Sorted unique 0-based indices of pages to inspect (includes neighbor expansion).
148
  """
149
  global ds, images
150
 
151
  if not images or not ds:
152
+ return []
153
 
154
  k = max(1, min(int(k), len(images)))
155
  device = _ensure_model_device()
156
 
 
 
157
  # Encode query
 
158
  with torch.no_grad():
159
  batch_query = processor.process_queries([query]).to(model.device)
160
  embeddings_query = model(**batch_query)
161
+ q_vecs = list(torch.unbind(embeddings_query.to("cpu")))
162
 
163
  # Score and select top-k
164
+ scores = processor.score(q_vecs, ds, device=device)
165
  top_k_indices = scores[0].topk(k).indices.tolist()
166
 
167
+ # Neighbor expansion for context
168
  base = set(top_k_indices)
169
  expanded = set(base)
170
  for i in base:
171
  expanded.add(i - 1)
172
  expanded.add(i + 1)
173
+ expanded = {i for i in expanded if 0 <= i < len(images)} # strict bounds
174
 
175
+ return sorted(expanded)
176
+
177
+
178
+ def mcp_get_pages(indices: List[int]) -> Dict[str, Any]:
179
+ """
180
+ Return page images (as data URLs) for the given 0-based indices.
181
+
182
+ MCP tool description:
183
+ - name: mcp_get_pages
184
+ - description: Given 0-based indices from mcp_test_search, return the corresponding page images as data URLs for vision reasoning.
185
+ - input_schema:
186
+ type: object
187
+ properties:
188
+ indices: {
189
+ type: array,
190
+ items: { type: integer, minimum: 0 },
191
+ description: "0-based page indices to fetch",
192
+ }
193
+ required: ["indices"]
194
+
195
+ Returns:
196
+ {"images": [{"index": int, "page": int, "image_url": str}], "count": int}
197
+ """
198
+ global images
199
+
200
+ if not images:
201
+ return {"images": [], "count": 0}
202
+
203
+ uniq = sorted({i for i in indices if 0 <= i < len(images)})
204
+ payload = []
205
+ for idx in uniq:
206
+ im = images[idx]
207
+ b64 = encode_image_to_base64(im)
208
+ payload.append({
209
+ "index": idx,
210
+ "page": idx + 1,
211
+ "image_url": f"data:image/jpeg;base64,{b64}",
212
+ })
213
+ return {"images": payload, "count": len(payload)}
214
+
215
+
216
+ # =============================
217
+ # Gradio UI — Unified App
218
+ # =============================
219
+
220
+ SYSTEM = (
221
+ """
222
+ You are a PDF research agent with two tools:
223
+ • mcp_test_search(query: string, k: int) returns ONLY 0-based page indices.
224
+ • mcp_get_pages(indices: int[]) → returns the actual page images (as data URLs) for vision.
225
+
226
+ Policy & procedure:
227
+ 1) Break the user task into 1–4 targeted sub-queries (in English).
228
+ 2) For each sub-query, call mcp_test_search to get indices; THEN immediately call mcp_get_pages with those indices to obtain the page images.
229
+ 3) Continue reasoning using ONLY the provided images. If info is insufficient, iterate: refine sub-queries and call the tools again. You may make further tool calls later in the conversation as needed.
230
+
231
+ Grounding & citations:
232
+ • Use ONLY information visible in the provided page images.
233
+ • After any claim, cite as (p.<page>).
234
+ • If an answer is not present, say “Not found in the provided pages.”
235
+
236
+ Final deliverable:
237
+ • Write a clear, standalone Markdown answer in the user's language. For lists of dates/items, include a concise table.
238
+ • Do not refer to “the above” or “previous messages”.
239
+ """
240
+ ).strip()
241
+
242
+ DEFAULT_MCP_SERVER_URL = "https://manu-mcp-test.hf.space/gradio_api/mcp/"
243
+ DEFAULT_MCP_SERVER_LABEL = "colpali_rag"
244
+ DEFAULT_ALLOWED_TOOLS = "mcp_test_search,mcp_get_pages"
245
+
246
+
247
+ def stream_agent(question: str,
248
+ api_key: str,
249
+ model: str,
250
+ server_url: str,
251
+ server_label: str,
252
+ require_approval: str,
253
+ allowed_tools: str):
254
+ """
255
+ Streaming generator for the agent.
256
+ NOTE: We rely on OpenAI's MCP tool routing. The mcp_test_search tool returns indices only;
257
+ the agent is instructed to call mcp_get_pages next to receive images and continue reasoning.
258
+ """
259
+ final_text = "Answer:"
260
+ summary_text = "Reasoning:"
261
+ log_lines = ["Log"]
262
+
263
+ if not api_key:
264
+ yield "⚠️ **Please provide your OpenAI API key.**", "", ""
265
+ return
266
+
267
+ client = OpenAI(api_key=api_key)
268
+
269
+ tools = [{
270
+ "type": "mcp",
271
+ "server_label": server_label or DEFAULT_MCP_SERVER_LABEL,
272
+ "server_url": server_url or DEFAULT_MCP_SERVER_URL,
273
+ "allowed_tools": [t.strip() for t in (allowed_tools or DEFAULT_ALLOWED_TOOLS).split(",") if t.strip()],
274
+ "require_approval": require_approval or "never",
275
+ }]
276
+
277
+ req_kwargs = dict(
278
+ model=model,
279
+ input=[
280
+ {"role": "system", "content": SYSTEM},
281
+ {"role": "user", "content": question},
282
+ ],
283
+ reasoning={"effort": "medium", "summary": "auto"},
284
+ tools=tools,
285
  )
286
 
287
+ try:
288
+ with client.responses.stream(**req_kwargs) as stream:
289
+ for event in stream:
290
+ etype = getattr(event, "type", "")
291
+
292
+ if etype == "response.output_text.delta":
293
+ final_text += event.delta
294
+ yield final_text, summary_text, "\n".join(log_lines[-400:])
295
+
296
+ elif etype == "response.reasoning_summary_text.delta":
297
+ summary_text += event.delta
298
+ yield final_text, summary_text, "\n".join(log_lines[-400:])
299
+
300
+ elif etype in ("response.function_call_arguments.delta", "response.tool_call_arguments.delta"):
301
+ # Show tool call argument deltas in the log for transparency
302
+ log_lines.append(str(event.delta))
303
+
304
+ elif etype == "response.error":
305
+ log_lines.append(f"[error] {getattr(event, 'error', '')}")
306
+ yield final_text, summary_text, "\n".join(log_lines[-400:])
307
+
308
+ # finalize
309
+ _final = stream.get_final_response()
310
+ yield final_text, summary_text, "\n".join(log_lines[-400:])
311
+
312
+ except Exception as e:
313
+ yield f"❌ {e}", summary_text, "\n".join(log_lines[-400:])
314
+
315
+
316
+ CUSTOM_CSS = """
317
+ :root {
318
+ --bg: #0e1117;
319
+ --panel: #111827;
320
+ --accent: #7c3aed;
321
+ --accent-2: #06b6d4;
322
+ --text: #e5e7eb;
323
+ --muted: #9ca3af;
324
+ --border: #1f2937;
325
+ }
326
+ .gradio-container {max-width: 1180px !important; margin: 0 auto !important;}
327
+
328
+ body {background: radial-gradient(1200px 600px at 20% -10%, rgba(124,58,237,.25), transparent 60%),
329
+ radial-gradient(1000px 500px at 120% 10%, rgba(6,182,212,.2), transparent 60%),
330
+ var(--bg) !important;}
331
+
332
+ .app-header {
333
+ display:flex; gap:16px; align-items:center; padding:20px 18px; margin:8px 0 12px;
334
+ border:1px solid var(--border); border-radius:20px;
335
+ background: linear-gradient(180deg, rgba(255,255,255,.02), rgba(255,255,255,.01));
336
+ box-shadow: 0 10px 30px rgba(0,0,0,.25), inset 0 1px 0 rgba(255,255,255,.05);
337
+ }
338
+ .app-header .icon {
339
+ width:48px; height:48px; display:grid; place-items:center; border-radius:14px;
340
+ background: linear-gradient(135deg, var(--accent), var(--accent-2));
341
+ color:white; font-size:26px;
342
+ }
343
+ .app-header h1 {font-size:22px; margin:0; color:var(--text); letter-spacing:.2px;}
344
+ .app-header p {margin:2px 0 0; color:var(--muted); font-size:14px;}
345
+
346
+ .card {
347
+ border:1px solid var(--border); border-radius:18px; padding:14px 16px;
348
+ background: linear-gradient(180deg, rgba(255,255,255,.02), rgba(255,255,255,.01));
349
+ box-shadow: 0 12px 28px rgba(0,0,0,.18), inset 0 1px 0 rgba(255,255,255,.04);
350
+ }
351
+
352
+ .gr-button-primary {border-radius:12px !important; font-weight:600;}
353
+ .gradio-container .tabs {border-radius:16px; overflow:hidden; border:1px solid var(--border);}
354
+
355
+ .markdown-wrap {min-height: 260px;}
356
+ .summary-wrap {min-height: 180px;}
357
+
358
+ .gr-markdown, .gr-prose { color: var(--text) !important; }
359
+ .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {color: #f3f4f6;}
360
+ .gr-markdown a {color: var(--accent-2); text-decoration: none;}
361
+ .gr-markdown a:hover {text-decoration: underline;}
362
+ .gr-markdown table {width: 100%; border-collapse: collapse; margin: 10px 0 16px;}
363
+ .gr-markdown th, .gr-markdown td {border: 1px solid var(--border); padding: 8px 10px;}
364
+ .gr-markdown th {background: rgba(255,255,255,.03);}
365
+ .gr-markdown pre, .gr-markdown code { background: #0b1220; color: #eaeaf0; border-radius: 12px; border: 1px solid #172036; }
366
+ .gr-markdown pre {padding: 12px 14px; overflow:auto;}
367
+ .gr-markdown blockquote { border-left: 4px solid var(--accent); padding: 6px 12px; margin: 8px 0; color: #d1d5db; background: rgba(124,58,237,.06); border-radius: 8px; }
368
+
369
+ .log-box { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; white-space: pre-wrap; color: #d1d5db; background:#0b1220; border:1px solid #172036; border-radius:14px; padding:12px; max-height:280px; overflow:auto; }
370
+ """
371
+
372
+
373
+ def build_ui():
374
+ theme = gr.themes.Soft()
375
+ with gr.Blocks(title="ColPali PDF RAG + MCP Agent (Indices-only)", theme=theme, css=CUSTOM_CSS) as demo:
376
+ gr.HTML(
377
+ """
378
+ <div class="app-header">
379
+ <div class="icon">📚</div>
380
+ <div>
381
+ <h1>ColPali PDF Search + Streaming Agent</h1>
382
+ <p>Index PDFs with ColQwen2 (ColPali). The search tool returns page indices only; the agent fetches images and reasons visually.</p>
383
+ </div>
384
+ </div>
385
+ """
386
+ )
387
+
388
+ with gr.Tab("1) Index & Preview"):
389
+ with gr.Row():
390
+ with gr.Column(scale=1):
391
+ pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
392
+ index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
393
+ url_box = gr.Textbox(
394
+ label="Or index from URL",
395
+ placeholder="https://example.com/file.pdf",
396
+ value="",
397
+ )
398
+ index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
399
+ status_box = gr.Textbox(label="Status", interactive=False)
400
+ with gr.Column(scale=2):
401
+ pdf_view = PDF(label="PDF Preview")
402
+
403
+ # wiring
404
+ def handle_upload(file):
405
+ global current_pdf_path
406
+ if file is None:
407
+ return "Please upload a PDF.", None
408
+ path = getattr(file, "name", file)
409
+ status = index_from_path(path)
410
+ current_pdf_path = path
411
+ return status, path
412
+
413
+ def handle_url(url: str):
414
+ global current_pdf_path
415
+ if not url or not url.lower().endswith(".pdf"):
416
+ return "Please provide a direct PDF URL ending in .pdf", None
417
+ status, path = index_from_url(url)
418
+ current_pdf_path = path
419
+ return status, path
420
+
421
+ index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
422
+ index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
423
+
424
+ with gr.Tab("2) Ask (Direct — returns indices)"):
425
+ with gr.Row():
426
+ with gr.Column(scale=1):
427
+ query_box = gr.Textbox(placeholder="Enter your question…", label="Query", lines=4)
428
+ k_slider = gr.Slider(minimum=1, maximum=50, step=1, label="Number of results (k)", value=5)
429
+ search_button = gr.Button("🔍 Search", variant="primary")
430
+ with gr.Column(scale=2):
431
+ output_text = gr.Textbox(label="Indices (0-based)", lines=12, placeholder="[0, 1, 2, ...]")
432
+
433
+ def run_direct_indices(query: str, k: int) -> str:
434
+ idxs = mcp_test_search(query=query, k=k)
435
+ return str(idxs)
436
+
437
+ search_button.click(run_direct_indices, inputs=[query_box, k_slider], outputs=[output_text])
438
+
439
+ with gr.Tab("3) Agent (Streaming)"):
440
+ with gr.Row(equal_height=True):
441
+ with gr.Column(scale=1):
442
+ with gr.Group():
443
+ question = gr.Textbox(
444
+ label="Your question",
445
+ placeholder="Enter your question…",
446
+ lines=8,
447
+ elem_classes=["card"],
448
+ )
449
+ run_btn = gr.Button("Run", variant="primary")
450
+
451
+ with gr.Accordion("Connection & Model", open=False, elem_classes=["card"]):
452
+ with gr.Row():
453
+ api_key_box = gr.Textbox(
454
+ label="OpenAI API Key",
455
+ placeholder="sk-...",
456
+ type="password",
457
+ value=api_key_env,
458
+ )
459
+ model_box = gr.Dropdown(
460
+ label="Model",
461
+ choices=["gpt-5", "gpt-4.1", "gpt-4o"],
462
+ value="gpt-5",
463
+ )
464
+ with gr.Row():
465
+ server_url_box = gr.Textbox(
466
+ label="MCP Server URL",
467
+ value=DEFAULT_MCP_SERVER_URL,
468
+ )
469
+ server_label_box = gr.Textbox(
470
+ label="MCP Server Label",
471
+ value=DEFAULT_MCP_SERVER_LABEL,
472
+ )
473
+ with gr.Row():
474
+ allowed_tools_box = gr.Textbox(
475
+ label="Allowed Tools (comma-separated)",
476
+ value=DEFAULT_ALLOWED_TOOLS,
477
+ )
478
+ require_approval_box = gr.Dropdown(
479
+ label="Require Approval",
480
+ choices=["never", "auto", "always"],
481
+ value="never",
482
+ )
483
+
484
+ with gr.Column(scale=3):
485
+ with gr.Tab("Answer (Markdown)"):
486
+ final_md = gr.Markdown(value="", elem_classes=["card", "markdown-wrap"])
487
+ with gr.Tab("Live Summary (Markdown)"):
488
+ summary_md = gr.Markdown(value="", elem_classes=["card", "summary-wrap"])
489
+ with gr.Tab("Event Log"):
490
+ log_md = gr.Markdown(value="", elem_classes=["card", "log-box"])
491
+
492
+ run_btn.click(
493
+ stream_agent,
494
+ inputs=[question, api_key_box, model_box, server_url_box, server_label_box, require_approval_box, allowed_tools_box],
495
+ outputs=[final_md, summary_md, log_md],
496
+ )
497
+
498
+ return demo
499
 
500
 
501
+ if __name__ == "__main__":
502
+ demo = build_ui()
503
+ # mcp_server=True exposes this app's MCP endpoint at /gradio_api/mcp/
504
  demo.queue(max_size=5).launch(debug=True, mcp_server=True)
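
For reference, a minimal sketch of the two-step flow the new tools enable: mcp_test_search returns 0-based page indices, and mcp_get_pages turns those indices into page images. The module name (app), the sample URL, and the query below are placeholders for illustration; a PDF must be indexed before searching.

# Illustrative sketch only. Assumes this file is importable as `app`;
# the URL and the query string are placeholders, not values from the commit.
from app import index_from_url, mcp_test_search, mcp_get_pages

status, pdf_path = index_from_url("https://example.com/file.pdf")  # download + embed pages
print(status)

indices = mcp_test_search("What does the document say about verification?", k=3)  # 0-based indices, neighbor-expanded
pages = mcp_get_pages(indices)  # {"images": [{"index", "page", "image_url"}, ...], "count": int}
print(indices, pages["count"])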