玙珲 committed on
Commit e3eced0 · 1 Parent(s): c3e9c46
Files changed (3)
  1. .gitattributes +1 -0
  2. app.py +174 -93
  3. examples/video_demo.mp4 +3 -0
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -3,25 +3,74 @@ subprocess.run('pip install flash-attn==2.7.0.post2 --no-build-isolation', env={
 
 import spaces
 
-
 import argparse
 import os
 import re
-from typing import List, Optional, Tuple
+import logging
+from typing import List, Optional, Tuple, Generator
+from threading import Thread
 
 import gradio as gr
 import PIL.Image
 import torch
 import numpy as np
 from moviepy.editor import VideoFileClip
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, TextIteratorStreamer
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-# --- Global Model Variable ---
-# model = None
+# --- Global Model Variables ---
+model = None
+streamer = None
 # This should point to the directory containing your SVG file.
 CUR_DIR = os.path.dirname(os.path.abspath(__file__))
 
+def submit_chat(chatbot, text_input):
+    response = ''
+    chatbot.append([text_input, response])
+    return chatbot, ''
+
+
 # --- Helper Functions ---
+latex_delimiters_set = [
+    {
+        "left": "\\(",
+        "right": "\\)",
+        "display": False
+    },
+    {
+        "left": "\\begin{equation}",
+        "right": "\\end{equation}",
+        "display": True
+    },
+    {
+        "left": "\\begin{align}",
+        "right": "\\end{align}",
+        "display": True
+    },
+    {
+        "left": "\\begin{alignat}",
+        "right": "\\end{alignat}",
+        "display": True
+    },
+    {
+        "left": "\\begin{gather}",
+        "right": "\\end{gather}",
+        "display": True
+    },
+    {
+        "left": "\\begin{CD}",
+        "right": "\\end{CD}",
+        "display": True
+    },
+    {
+        "left": "\\[",
+        "right": "\\]",
+        "display": True
+    }
+]
 
 def load_video_frames(video_path: Optional[str], n_frames: int = 8) -> Optional[List[PIL.Image.Image]]:
     """Extracts a specified number of frames from a video file."""
@@ -42,44 +91,62 @@ def load_video_frames(video_path: Optional[str], n_frames: int = 8) -> Optional[
 def parse_model_output(response_text: str, enable_thinking: bool) -> str:
     """Formats the model output, separating 'thinking' and 'response' parts if enabled."""
     if enable_thinking:
+        # Use a more robust regex to handle nested content and variations
         think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
         if think_match:
             thinking_content = think_match.group(1).strip()
+            # Remove the think block from the original text to get the response
             response_content = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()
             return f"**Thinking:**\n```\n{thinking_content}\n```\n\n**Response:**\n{response_content}"
         else:
-            return response_text
+            return response_text  # No think tag found, return as is
     else:
-        return response_text
+        # If thinking is disabled, strip the tags just in case the model still generates them
+        return re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()
+
 
-# --- Core Inference Logic ---
+# --- MODIFIED Core Inference Logic (Now with Streaming) ---
 @spaces.GPU
 def run_inference(
+    chatbot: List,
     image_input: Optional[PIL.Image.Image],
     video_input: Optional[str],
-    prompt: str,
     do_sample: bool,
     max_new_tokens: int,
    enable_thinking: bool,
-) -> List[List[str]]:
-    """Runs a single turn of inference and formats the output for a gr.Chatbot."""
+):
+    """
+    Runs a single turn of inference and yields the output stream for a gr.Chatbot.
+    This function is now a generator.
+    """
+    prompt = chatbot[-1][0]
     if (not image_input and not video_input and not prompt) or not prompt:
         gr.Warning("A text prompt is required for generation.")
-        return []
+        # MODIFICATION: Yield the current state and return to avoid errors
+        yield chatbot
+        return
+
+    # MODIFICATION: Append the new prompt to the existing history
+    # chatbot.append([prompt, ""])
+    # yield chatbot, ""  # Yield the updated chat to show the user's prompt immediately
 
     content = []
     if image_input:
         content.append({"type": "image", "image": image_input})
     if video_input:
         frames = load_video_frames(video_input)
-        if frames: content.append({"type": "video", "video": frames})
+        if frames:
+            content.append({"type": "video", "video": frames})
         else:
             gr.Warning("Failed to process the video file.")
-            return [[prompt, "Error: Could not process the video file."]]
-
+            chatbot[-1][1] = "Error: Could not process the video file."
+            yield chatbot
+            return
+
     content.append({"type": "text", "text": prompt})
 
     messages = [{"role": "user", "content": content}]
+    logger.info(messages)
 
     try:
         if video_input:
@@ -87,7 +154,9 @@ def run_inference(
         else:
             input_ids, pixel_values, grid_thws = model.preprocess_inputs(messages=messages, add_generation_prompt=True, enable_thinking=enable_thinking)
     except Exception as e:
-        return [[prompt, f"Error during input preprocessing: {e}"]]
+        chatbot[-1][1] = f"Error during input preprocessing: {e}"
+        yield chatbot
+        return
 
     input_ids = input_ids.to(model.device)
     if pixel_values is not None:
@@ -96,28 +165,54 @@ def run_inference(
         grid_thws = grid_thws.to(model.device)
 
     gen_kwargs = {
-        "max_new_tokens": max_new_tokens, "do_sample": do_sample,
-        "eos_token_id": model.text_tokenizer.eos_token_id, "pad_token_id": model.text_tokenizer.pad_token_id
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "eos_token_id": model.text_tokenizer.eos_token_id,
+        "pad_token_id": model.text_tokenizer.pad_token_id,
+        "streamer": streamer,
+        "use_cache": True
     }
-
-    with torch.inference_mode():
-        try:
-            outputs = model.generate(inputs=input_ids, pixel_values=pixel_values, grid_thws=grid_thws, **gen_kwargs)
-        except Exception as e:
-            return [[prompt, f"Error during model generation: {e}"]]
-
-    response_text = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    formatted_response = parse_model_output(response_text, enable_thinking)
-
-    return [[prompt, formatted_response]]
 
+    with torch.inference_mode():
+        thread = Thread(target=model.generate, kwargs={
+            "inputs": input_ids,
+            "pixel_values": pixel_values,
+            "grid_thws": grid_thws,
+            **gen_kwargs
+        })
+        thread.start()
+
+        # MODIFICATION: Stream output token by token
+        response_text = ""
+        for new_text in streamer:
+            response_text += new_text
+            # Append only the new text chunk to the last response
+            chatbot[-1][1] = response_text
+            yield chatbot  # Yield the updated history
+
+        thread.join()
+
+    # MODIFICATION: Format the final response once generation is complete
+    formatted_response = parse_model_output(response_text, enable_thinking)
+    chatbot[-1][1] = formatted_response
+    yield chatbot  # Yield the final, formatted response
+
+    logger.info("[OVIS_CONV_START]")
+    [print(f'Q{i}:\n {request}\nA{i}:\n {answer}') for i, (request, answer) in enumerate(chatbot, 1)]
+    # print('New_Q:\n', text_input)
+    # print('New_A:\n', response)
+    logger.info("[OVIS_CONV_END]")
+
+
+def clear_chat():
+    return [], None, ""
 
 # --- UI Helper Functions ---
 def toggle_media_input(choice: str) -> Tuple:
     """Switches visibility between Image/Video inputs and their corresponding examples."""
     if choice == "Image":
         return gr.update(visible=True, value=None), gr.update(visible=False, value=None), gr.update(visible=True), gr.update(visible=False)
-    else: # Video
+    else: # Video
         return gr.update(visible=False, value=None), gr.update(visible=True, value=None), gr.update(visible=False), gr.update(visible=True)
 
 
@@ -125,19 +220,23 @@ def toggle_media_input(choice: str) -> Tuple:
 # @spaces.GPU
 def build_demo(model_path: str):
     """Builds the Gradio user interface for the model."""
-    global model
-    device = f"cuda"
+    global model, streamer
+    device = "cuda"
     print(f"Loading model {model_path} onto device {device}...")
-
+
     model = AutoModelForCausalLM.from_pretrained(
-        model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+        model_path,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True
     ).to(device).eval()
 
+    text_tokenizer = model.text_tokenizer
+    streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
+
     print("Model loaded successfully.")
 
     model_name_display = model_path.split('/')[-1]
-
-    # --- Logo & Header ---
+
     logo_html = ""
     logo_svg_path = os.path.join(CUR_DIR, "resource", "logo.svg")
     if os.path.exists(logo_svg_path):
@@ -147,7 +246,6 @@ def build_demo(model_path: str):
         svg_content_styled = re.sub(r'(<svg[^>]*)(>)', rf'\1 height="{font_size}" style="vertical-align: middle; display: inline-block;"\2', svg_content)
         logo_html = f'<span style="display: inline-block; vertical-align: middle;">{svg_content_styled}</span>'
     else:
-        # Fallback if SVG is not found
         logo_html = '<span style="font-weight: bold; font-size: 2.5em; display: inline-block; vertical-align: middle;">Ovis</span>'
         print(f"Warning: Logo file not found at {logo_svg_path}. Using text fallback.")
 
@@ -159,26 +257,23 @@ def build_demo(model_path: str):
     <center><font size=3><b>Ovis</b> has been open-sourced on <a href='https://huggingface.co/{model_path}'>😊 Huggingface</a> and <a href='https://github.com/AIDC-AI/Ovis'>🌟 GitHub</a>. If you find Ovis useful, a like❤️ or a star🌟 would be appreciated.</font></center>
     """
 
+    prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your text here and press ENTER", lines=1, container=False)
     with gr.Blocks(theme=gr.themes.Ocean()) as demo:
         gr.HTML(html_header)
-        gr.Markdown(f"This interface is served by a single model. Each submission starts a new, independent conversation.")
-
+        gr.Markdown("Note: you might have to increase \"Max New Tokens\" and wait longer to obtain answer when Deep Thinking is enabled.")
+
         with gr.Row():
-            # --- Left Column (Media Inputs, Settings, Prompt & Actions) ---
             with gr.Column(scale=4):
-                input_type_radio = gr.Radio(choices=["Image"], value="Image", label="Select Input Type")
+                input_type_radio = gr.Radio(choices=["Image", "Video"], value="Image", label="Select Input Type")
                 image_input = gr.Image(label="Image Input", type="pil", visible=True)
                 video_input = gr.Video(label="Video Input", visible=False)
-
+
                 with gr.Accordion("Generation Settings", open=True):
-                    do_sample = gr.Checkbox(label="Enable Sampling (Do Sample)", value=False)
-                    max_new_tokens = gr.Slider(minimum=32, maximum=4096, value=1024, step=32, label="Max New Tokens")
-                    enable_thinking = gr.Checkbox(label="Enable Deep Thinking", value=True)
+                    do_sample = gr.Checkbox(label="Enable Sampling (Do Sample)", value=True)
+                    max_new_tokens = gr.Slider(minimum=32, maximum=4096, value=2048, step=32, label="Max New Tokens")
+                    enable_thinking = gr.Checkbox(label="Enable Deep Thinking", value=False)
 
-                prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your text here and press ENTER", lines=3)
-                with gr.Row():
-                    generate_btn = gr.Button("Send", variant="primary")
-                    clear_btn = gr.Button("Clear", variant="secondary")
+
 
                 with gr.Column(visible=True) as image_examples_col:
                     gr.Examples(
@@ -186,74 +281,60 @@ def build_demo(model_path: str):
                            [os.path.join(CUR_DIR, "examples", "ovis2_math0.jpg"), "Each face of the polyhedron shown is either a triangle or a square. Each square borders 4 triangles, and each triangle borders 3 squares. The polyhedron has 6 squares. How many triangles does it have?\n\nEnd your response with 'Final answer: '."],
                            [os.path.join(CUR_DIR, "examples", "ovis2_math1.jpg"), "A large square touches another two squares, as shown in the picture. The numbers inside the smaller squares indicate their areas. What is the area of the largest square?\n\nEnd your response with 'Final answer: '."],
                            [os.path.join(CUR_DIR, "examples", "ovis2_figure0.png"), "Explain this model."],
-                            [os.path.join(CUR_DIR, "examples", "ovis2_figure1.png"), "Organize the notes about GRPO in the figure."],
+                            # [os.path.join(CUR_DIR, "examples", "ovis2_figure1.png"), "Organize the notes about GRPO in the figure."],
                            [os.path.join(CUR_DIR, "examples", "ovis2_multi0.jpg"), "Posso avere un frappuccino e un caffè americano di taglia M? Quanto costa in totale?"],
                        ],
                        inputs=[image_input, prompt_input]
                    )
-                # with gr.Column(visible=False) as video_examples_col:
-                #     gr.Examples(examples=[[os.path.join(CUR_DIR, "examples", "video_demo_1.mp4"), "Describe the video."]],
-                #                 inputs=[video_input, prompt_input])
-
-            # --- Right Column (Chat Display) ---
-            with gr.Column(scale=6):
-                chatbot = gr.Chatbot(label="Ovis", height=750, show_copy_button=True, layout="panel")
-
-        # --- Event Handlers ---
+                with gr.Column(visible=False) as video_examples_col:
+                    gr.Examples(examples=[[os.path.join(CUR_DIR, "examples", "video_demo.mp4"), "Describe the video."]],
+                                inputs=[video_input, prompt_input])
+
+            with gr.Column(scale=7):
+                chatbot = gr.Chatbot(label="Ovis", height=600, show_copy_button=True, layout="panel", latex_delimiters=latex_delimiters_set)
+                prompt_input.render()
+                with gr.Row():
+                    generate_btn = gr.Button("Send", variant="primary")
+                    clear_btn = gr.Button("Clear", variant="secondary")
+
         input_type_radio.change(
            fn=toggle_media_input,
            inputs=input_type_radio,
-            outputs=[image_input, video_input, image_examples_col]
+            outputs=[image_input, video_input, image_examples_col, video_examples_col]
        )
-
-        run_inputs = [image_input, video_input, prompt_input, do_sample, max_new_tokens, enable_thinking]
 
-        generate_btn.click(fn=run_inference, inputs=run_inputs, outputs=chatbot)
-        prompt_input.submit(fn=run_inference, inputs=run_inputs, outputs=chatbot)
-
+        # MODIFICATION: Update event handlers to use the new function and manage state
+        run_inputs = [chatbot, image_input, video_input, do_sample, max_new_tokens, enable_thinking]
+        # run_outputs = [image_input, prompt_input]
+
+        generat_click_event = generate_btn.click(submit_chat, [chatbot, prompt_input], [chatbot, prompt_input]).then(run_inference, run_inputs, chatbot)
+        submit_event = prompt_input.submit(submit_chat, [chatbot, prompt_input], [chatbot, prompt_input]).then(run_inference, run_inputs, chatbot)
+
        clear_btn.click(
-            fn=lambda: ([], None, None, "", "Image", False, 1024, True),
+            fn=lambda: ([], None, None, "", "Image", True, 1024, False),
            outputs=[chatbot, image_input, video_input, prompt_input, input_type_radio, do_sample, max_new_tokens, enable_thinking]
        ).then(
            fn=toggle_media_input,
            inputs=input_type_radio,
-            outputs=[image_input, video_input, image_examples_col]
+            outputs=[image_input, video_input, image_examples_col, video_examples_col]
        )
+
    return demo
 
 # --- Main Execution Block ---
 # def parse_args():
 #     parser = argparse.ArgumentParser(description="Gradio interface for a single Multimodal Large Language Model.")
-#     parser.add_argument("--model-path", type=str, default='AIDC-AI/Ovis2.5-2B', help="Path to the model checkpoint on Hugging Face Hub or local directory.")
+#     parser.add_argument("--model-path", type=str, default='AIDC-AI/Ovis2.5-9B', help="Path to the model checkpoint on Hugging Face Hub or local directory.")
 #     parser.add_argument("--gpu", type=int, default=0, help="GPU index to run the model on.")
 #     parser.add_argument("--port", type=int, default=7860, help="Port to run the Gradio server on.")
 #     parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Server name for the Gradio app.")
 #     return parser.parse_args()
 
+
 # if __name__ == "__main__":
-#     if not os.path.exists("examples"): os.makedirs("examples")
-#     if not os.path.exists("resource"): os.makedirs("resource")
-#     print("Note: For the logo to display correctly, place 'logo.svg' inside the 'resource' directory.")
-
-#     example_files = [
-#         "ovis2_math0.jpg",
-#         "ovis2_math1.jpg",
-#         "ovis2_figure0.png",
-#         "ovis2_figure1.png",
-#         "ovis2_multi0.jpg",
-#         "video_demo_1.mp4",
-#     ]
-#     for fname in example_files:
-#         fpath = os.path.join("examples", fname)
-#         if not os.path.exists(fpath):
-#             if fname.endswith(".mp4"):
-#                 os.system(f'ffmpeg -y -f lavfi -i "smptebars=size=128x72:rate=10" -t 3 -pix_fmt yuv420p "{fpath}" >/dev/null 2>&1')
-#             else:
-#                 PIL.Image.new('RGB', (224, 224), color = 'grey').save(fpath)
-
-
-model_path = 'AIDC-AI/Ovis2.5-2B'
+#     args = parse_args()
+model_path = 'AIDC-AI/Ovis2.5-9B'
 demo = build_demo(model_path=model_path)
-# print(f"Launching Gradio app on http://{args.server_name}:{args.port}")
-# demo.queue().launch(server_name=args.server_name, server_port=args.port, share=False, ssl_verify=False)
-demo.launch()
+# demo = build_demo(model_path=args.model_path)
+# demo.launch(server_name=args.server_name, server_port=args.port, share=False, ssl_verify=False, show_error=True)
+demo.queue().launch()
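The core change in `run_inference` above is the thread-plus-`TextIteratorStreamer` pattern. Below is a minimal, self-contained sketch of that pattern for readers skimming the diff; it is an illustration under stated assumptions, not the Space's actual code: it uses a small placeholder checkpoint (`gpt2`) instead of the Ovis model, and all variable names here are hypothetical.

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint purely for illustration; the Space loads AIDC-AI/Ovis2.5-9B instead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# skip_prompt drops the echoed input; extra kwargs are forwarded to tokenizer.decode().
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("The quick brown fox", return_tensors="pt")

# generate() blocks until completion, so it runs in a worker thread
# while the caller consumes the streamer incrementally.
thread = Thread(target=model.generate,
                kwargs={**inputs, "max_new_tokens": 32, "streamer": streamer})
thread.start()

response_text = ""
for new_text in streamer:  # yields decoded text chunks as they are produced
    response_text += new_text
    print(new_text, end="", flush=True)  # a Gradio generator would yield the partial text here

thread.join()
```

This split is what lets the new `run_inference` be a generator: Gradio re-renders the chatbot on every `yield`, so partial text appears while `model.generate` is still running in the background thread.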
examples/video_demo.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4476e4fd82da4fc37b4c167ec6a4f56fa270c0ad3f2724fd47c0ff92b87d6c6
+size 103118