Abid Ali Awan committed
Commit 22b0228 · 1 Parent(s): f15d60c

deploying the app

Files changed (3):
  1. README.md +7 -3
  2. app.py +526 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,7 +1,7 @@
  ---
  title: Gemini 2 Pro Chat
- emoji: 🐠
- colorFrom: pink
+ emoji: ♊💬
+ colorFrom: Green
  colorTo: pink
  sdk: gradio
  sdk_version: 5.15.0
@@ -11,4 +11,8 @@ license: mit
  short_description: 'Image, Audio, and Document understanding + Code Execution. '
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ## Gemini 2.0 Pro Multi-modal Chatbot
+ This module sets up a Gradio interface for a multi-modal chatbot powered by the Gemini 2.0 Pro model.
+ It supports text, image, audio, and document inputs and uses the google.genai library to generate responses.
+ All response-generation operations now use the streaming endpoint (generate_content_stream) so that the UI
+ receives incremental updates.
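
For reference, the streaming pattern the new README describes reduces to a single client call. A minimal sketch (assuming a valid `GOOGLE_API_KEY` in the environment; the model name is the one used by app.py below):

```python
import os

from google import genai

# Reusable client, as in app.py.
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))

# generate_content_stream yields partial responses as they arrive,
# so the UI can update incrementally instead of waiting for the full reply.
for chunk in client.models.generate_content_stream(
    model="gemini-2.0-pro-exp-02-05",
    contents=["Explain in one sentence why streaming improves chat UX."],
):
    print(chunk.text or "", end="", flush=True)
```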
app.py ADDED
@@ -0,0 +1,526 @@
````python
import base64
import io
import os
import time
from typing import Dict, List, Optional, Union

import gradio as gr
from google import genai
from google.genai import types  # New types module from google-genai
from PIL import Image

# Retrieve API key for Google GenAI from the environment variables.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Initialize the client so that it can be reused across functions.
CLIENT = genai.Client(api_key=GOOGLE_API_KEY)

# General constants for the UI
TITLE = """<h1 align="center">Gemini 2.0 Pro Multi-modal Chatbot</h1>"""
AVATAR_IMAGES = (None, "https://media.roboflow.com/spaces/gemini-icon.png")
IMAGE_WIDTH = 512


def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
    """
    Convert a comma-separated string of stop sequences into a list.

    Parameters:
        stop_sequences (str): A string containing stop sequences separated by commas.

    Returns:
        Optional[List[str]]: A list of trimmed stop sequences if provided; otherwise, None.
    """
    if not stop_sequences:
        return None
    return [sequence.strip() for sequence in stop_sequences.split(",")]


def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Resize an image to a fixed width while maintaining the aspect ratio.

    Parameters:
        image (Image.Image): The original image.

    Returns:
        Image.Image: The resized image with width fixed at IMAGE_WIDTH.
    """
    image_height = int(image.height * IMAGE_WIDTH / image.width)
    return image.resize((IMAGE_WIDTH, image_height))


def image_to_base64_html_from_pil(image: Image.Image, max_width: int = 150) -> str:
    """
    Convert a PIL Image to an HTML <img> tag with base64-encoded image data.

    Parameters:
        image (Image.Image): The image to encode.
        max_width (int): Maximum width (in pixels) for the displayed image.

    Returns:
        str: An HTML string with the embedded image.
    """
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    b64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return (
        f'<img src="data:image/jpeg;base64,{b64_data}" alt="Uploaded Image" '
        f'style="max-width:{max_width}px;">'
    )


def preprocess_chat_history_messages(
    chat_history: List[Union[dict, gr.ChatMessage]],
) -> List[Dict[str, Union[str, List[str]]]]:
    """
    Normalize chat history messages into a consistent list of dictionaries.

    Each message (whether a dict or a gr.ChatMessage) is converted into a dictionary
    containing a role and a list of parts (message content).

    Parameters:
        chat_history (List[Union[dict, gr.ChatMessage]]): The conversation history.

    Returns:
        List[Dict[str, Union[str, List[str]]]]: A normalized list of messages.
    """
    messages = []
    for msg in chat_history:
        if isinstance(msg, dict):
            content = msg.get("content")
            role = msg.get("role")
        else:
            content = msg.content
            role = msg.role

        if content is not None:
            # Convert "assistant" role to "model" if needed.
            role = "model" if role == "assistant" else role
            messages.append({"role": role, "parts": [content]})
    return messages


def chat_history_to_prompt(chat_history: List[Union[dict, gr.ChatMessage]]) -> str:
    """
    Convert the entire chat conversation into a single text prompt.

    Each message is prefixed with "User:" or "Assistant:" to form a full conversation.

    Parameters:
        chat_history (List[Union[dict, gr.ChatMessage]]): The conversation history.

    Returns:
        str: A string that concatenates the conversation history.
    """
    conversation = ""
    for msg in chat_history:
        content = get_message_content(msg)
        role = msg.get("role") if isinstance(msg, dict) else msg.role
        if role in ["assistant", "model"]:
            conversation += f"Assistant: {content}\n"
        else:
            conversation += f"User: {content}\n"
    return conversation


def upload(files: Optional[List[str]], chatbot: List[Union[dict, gr.ChatMessage]]):
    """
    Process uploaded image files: resize them, convert each to an HTML <img> tag
    (with base64 data), and append it as a new user message to the chatbot history.

    Parameters:
        files (Optional[List[str]]): List of image file paths.
        chatbot (List[Union[dict, gr.ChatMessage]]): The current conversation history.

    Returns:
        List[Union[dict, gr.ChatMessage]]: Updated conversation history.
    """
    for file in files:
        image = Image.open(file).convert("RGB")
        image = preprocess_image(image)
        image_html = image_to_base64_html_from_pil(image)
        chatbot.append(gr.ChatMessage(role="user", content=image_html))
    return chatbot


def upload_audio(
    files: Optional[List[str]], chatbot: List[Union[dict, gr.ChatMessage]]
):
    """
    Process uploaded audio files: read and base64-encode them, wrap the data in an
    HTML audio player, and append it as a new user message.

    Parameters:
        files (Optional[List[str]]): List of audio file paths.
        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.

    Returns:
        List[Union[dict, gr.ChatMessage]]: The updated chatbot history.
    """
    for file in files:
        with open(file, "rb") as f:
            audio_bytes = f.read()
        b64_data = base64.b64encode(audio_bytes).decode("utf-8")
        audio_html = f"""<audio controls style="max-width:150px;">
            <source src="data:audio/mp3;base64,{b64_data}" type="audio/mp3">
            Your browser does not support the audio element.
        </audio>"""
        chatbot.append(gr.ChatMessage(role="user", content=audio_html))
    return chatbot


def upload_document(
    files: Optional[List[str]], chatbot: List[Union[dict, gr.ChatMessage]]
):
    """
    Process uploaded document files (assumed to be PDFs) and add a notification
    message (with an HTML snippet) indicating that the document has been uploaded.

    Parameters:
        files (Optional[List[str]]): List of document file paths.
        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.

    Returns:
        List[Union[dict, gr.ChatMessage]]: The updated chatbot history.
    """
    for file in files:
        filename = os.path.basename(file)
        doc_html = f"<p>📄 Document uploaded: {filename}</p>"
        chatbot.append(gr.ChatMessage(role="user", content=doc_html))
    return chatbot


def user(text_prompt: str, chatbot: List[gr.ChatMessage]):
    """
    Append a new user text message to the chat history.

    Parameters:
        text_prompt (str): The input text provided by the user.
        chatbot (List[gr.ChatMessage]): The existing conversation history.

    Returns:
        Tuple[str, List[gr.ChatMessage]]: A tuple of an empty string (clearing the
        prompt) and the updated conversation history.
    """
    if text_prompt:
        chatbot.append(gr.ChatMessage(role="user", content=text_prompt))
    return "", chatbot


def get_message_content(msg):
    """
    Retrieve the content of a message that can be either a dictionary or a gr.ChatMessage.

    Parameters:
        msg (Union[dict, gr.ChatMessage]): The message object.

    Returns:
        str: The textual content of the message.
    """
    if isinstance(msg, dict):
        return msg.get("content", "")
    return msg.content


def bot(
    image_files: Optional[List[str]],
    audio_files: Optional[List[str]],
    doc_files: Optional[List[str]],
    chatbot: List[Union[dict, gr.ChatMessage]],
):
    """
    Generate a chatbot response from Gemini 2.0 based on the provided inputs.

    This function supports three branches:
        1. Document branch: when doc_files are provided.
        2. Multi-modal branch: when image and/or audio files are provided.
        3. Text-only conversation branch.
    All branches now use generate_content_stream to yield incremental responses.

    Parameters:
        image_files (Optional[List[str]]): List of image file paths.
        audio_files (Optional[List[str]]): List of audio file paths.
        doc_files (Optional[List[str]]): List of document file paths.
        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.

    Yields:
        List[Union[dict, gr.ChatMessage]]: The updated conversation history with streamed responses.
    """
    if len(chatbot) == 0:
        # Bare return: in a generator the returned value would be discarded anyway.
        return

    # Append a placeholder for the assistant's response.
    chatbot.append(gr.ChatMessage(role="assistant", content=""))

    generation_config = types.GenerateContentConfig(
        temperature=0.4,
        max_output_tokens=4096,
        top_k=32,
        top_p=1,
    )

    # Branch 1: Document uploads.
    if doc_files and len(doc_files) > 0:
        prev_msg_content = get_message_content(chatbot[-2]) if len(chatbot) >= 2 else ""
        prompt = [prev_msg_content] if prev_msg_content else []
        doc_parts = []
        for file in doc_files:
            with open(file, "rb") as f:
                doc_bytes = f.read()
            doc_parts.append(
                types.Part.from_bytes(
                    data=doc_bytes,
                    mime_type="application/pdf",
                )
            )
        # Combine document parts and previous text.
        contents = doc_parts + prompt
        # Use the streaming endpoint.
        response = CLIENT.models.generate_content_stream(
            model="gemini-2.0-pro-exp-02-05",
            contents=contents,
            config=generation_config,
        )
        for chunk in response:
            chunk_text = chunk.text or ""  # Guard: some chunks carry no text.
            for i in range(0, len(chunk_text), 10):
                section = chunk_text[i : i + 10]
                if isinstance(chatbot[-1], dict):
                    chatbot[-1]["content"] += section
                else:
                    chatbot[-1].content += section
                time.sleep(0.01)
                yield chatbot
        return

    # Branch 2: Image or audio uploads.
    elif (image_files and len(image_files) > 0) or (
        audio_files and len(audio_files) > 0
    ):
        prev_msg_content = get_message_content(chatbot[-2]) if len(chatbot) >= 2 else ""
        text_prompt = [prev_msg_content] if prev_msg_content else []
        image_prompt = (
            [Image.open(file).convert("RGB") for file in image_files]
            if image_files
            else []
        )
        audio_prompt = []
        if audio_files:
            for file in audio_files:
                with open(file, "rb") as f:
                    audio_bytes = f.read()
                audio_prompt.append(
                    types.Part.from_bytes(
                        data=audio_bytes,
                        mime_type="audio/mp3",
                    )
                )
        # Combine all inputs into a multi-modal prompt.
        contents = text_prompt + image_prompt + audio_prompt
        response = CLIENT.models.generate_content_stream(
            model="gemini-2.0-pro-exp-02-05",
            contents=contents,
            config=generation_config,
        )
        for chunk in response:
            chunk_text = chunk.text or ""  # Guard: some chunks carry no text.
            for i in range(0, len(chunk_text), 10):
                section = chunk_text[i : i + 10]
                if isinstance(chatbot[-1], dict):
                    chatbot[-1]["content"] += section
                else:
                    chatbot[-1].content += section
                time.sleep(0.01)
                yield chatbot
        return

    # Branch 3: Text-only conversation.
    else:
        conversation_text = chat_history_to_prompt(chatbot)
        response = CLIENT.models.generate_content_stream(
            model="gemini-2.0-pro-exp-02-05",
            contents=[conversation_text],
            config=generation_config,
        )
        for chunk in response:
            chunk_text = chunk.text or ""  # Guard: some chunks carry no text.
            for i in range(0, len(chunk_text), 10):
                section = chunk_text[i : i + 10]
                if isinstance(chatbot[-1], dict):
                    chatbot[-1]["content"] += section
                else:
                    chatbot[-1].content += section
                time.sleep(0.01)
                yield chatbot
        return


def run_code_execution(code_prompt: str, chatbot: List[Union[dict, gr.ChatMessage]]):
    """
    Append the user's code execution query to the chat history, then call Gemini
    with code execution enabled using the user's input. The results (including any
    generated code and execution output) are appended as a new assistant message.
    """
    # Only add a user message if there is content.
    if code_prompt.strip():
        chatbot.append(gr.ChatMessage(role="user", content=code_prompt))
    # Append an empty assistant message to update with the code execution response.
    chatbot.append(gr.ChatMessage(role="assistant", content=""))

    generation_config = types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution())]
    )
    response = CLIENT.models.generate_content(
        model="gemini-2.0-pro-exp-02-05",
        contents=code_prompt,
        config=generation_config,
    )

    output_text = ""
    for part in response.candidates[0].content.parts:
        if part.text is not None:
            output_text += f"{part.text}\n"
        if part.executable_code is not None:
            # Display the executable code in a code block (using markdown formatting).
            output_text += (
                f"\n**Generated Code:**\n```python\n{part.executable_code.code}\n```\n"
            )
        if part.code_execution_result is not None:
            output_text += (
                f"\n**Output:**\n```\n{part.code_execution_result.output}\n```\n"
            )
        if part.inline_data is not None:
            image_data = base64.b64decode(part.inline_data.data)
            image = Image.open(io.BytesIO(image_data))
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            b64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
            output_text += f'\n<img src="data:image/png;base64,{b64_data}" alt="Inline Image" style="max-width:300px;"/>\n'
        output_text += "\n---\n"

    # Update the last assistant message with the code execution result.
    if isinstance(chatbot[-1], dict):
        chatbot[-1]["content"] = output_text
    else:
        chatbot[-1].content = output_text

    # Clear the text prompt after processing.
    return "", chatbot


# Define the Gradio UI components.
chatbot_component = gr.Chatbot(
    label="Gemini 2.0 Pro",
    type="messages",  # Use message objects.
    bubble_full_width=False,
    avatar_images=AVATAR_IMAGES,
    scale=2,
    height=400,
)
text_prompt_component = gr.Textbox(
    placeholder="Enter your message or code query here...",
    show_label=False,
    autofocus=True,
    scale=19,
)
upload_button_component = gr.UploadButton(
    label="Upload Images",
    file_count="multiple",
    file_types=["image"],
    scale=1,
)
upload_audio_button_component = gr.UploadButton(
    label="Upload Audio",
    file_count="multiple",
    file_types=["audio"],
    scale=1,
)
upload_doc_button_component = gr.UploadButton(
    label="Upload Documents",
    file_count="multiple",
    file_types=[".pdf"],
    scale=1,
)
run_button_component = gr.Button(value="Run", variant="primary", scale=1, min_width=60)
run_code_execution_button = gr.Button(
    value="Run Code Execution", variant="secondary", scale=1
)

# Define input lists for button chaining.
user_inputs = [text_prompt_component, chatbot_component]
bot_inputs = [
    upload_button_component,
    upload_audio_button_component,
    upload_doc_button_component,
    chatbot_component,
]

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(TITLE)
    with gr.Column():
        chatbot_component.render()
        with gr.Row(equal_height=True):
            text_prompt_component.render()
            run_button_component.render()
        with gr.Row():
            # Render file-upload buttons and the code execution button in a single row.
            upload_button_component.render()
            upload_audio_button_component.render()
            upload_doc_button_component.render()
            run_code_execution_button.render()

    # When the Run button is clicked, first process the user text, then stream a response.
    run_button_component.click(
        fn=user,
        inputs=user_inputs,
        outputs=[text_prompt_component, chatbot_component],
        queue=False,
    ).then(
        fn=bot,
        inputs=bot_inputs,
        outputs=[chatbot_component],
    )

    # Allow submission using the Enter key.
    text_prompt_component.submit(
        fn=user,
        inputs=user_inputs,
        outputs=[text_prompt_component, chatbot_component],
        queue=False,
    ).then(
        fn=bot,
        inputs=bot_inputs,
        outputs=[chatbot_component],
    )

    # Handle image uploads.
    upload_button_component.upload(
        fn=upload,
        inputs=[upload_button_component, chatbot_component],
        outputs=[chatbot_component],
        queue=False,
    )

    # Handle audio uploads.
    upload_audio_button_component.upload(
        fn=upload_audio,
        inputs=[upload_audio_button_component, chatbot_component],
        outputs=[chatbot_component],
        queue=False,
    )

    # Handle document uploads.
    upload_doc_button_component.upload(
        fn=upload_document,
        inputs=[upload_doc_button_component, chatbot_component],
        outputs=[chatbot_component],
        queue=False,
    )

    # When the Code Execution button is clicked, process the code prompt and display the output
    # (this handler is not streamed; the result is rendered in one update).
    run_code_execution_button.click(
        fn=run_code_execution,
        inputs=[text_prompt_component, chatbot_component],
        outputs=[text_prompt_component, chatbot_component],
        queue=False,
    )

# Launch the demo interface with queuing enabled.
demo.queue(max_size=99, api_open=False).launch(debug=False, pwa=True, show_error=True)
````
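
All three branches of `bot` share the same UI-pacing trick: each streamed chunk is sliced into 10-character sections, and the history is re-yielded after a 10 ms pause so the chat window repaints smoothly. A standalone sketch of that pattern, with a hypothetical `fake_chunks` generator standing in for the real response iterator:

```python
import time
from typing import Iterator


def fake_chunks() -> Iterator[str]:
    # Hypothetical stand-in for the text of streamed response chunks.
    yield "Hello, world! "
    yield "This text arrives in chunks and is re-sliced for smoother display."


message = ""
for chunk_text in fake_chunks():
    for i in range(0, len(chunk_text), 10):
        message += chunk_text[i : i + 10]  # append a 10-character section
        time.sleep(0.01)  # brief pause; in app.py, `yield chatbot` follows here
print(message)
```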
requirements.txt ADDED
@@ -0,0 +1 @@

```
google-genai==1.0.0
```