玙珲 committed
Commit 939e0e4 · 1 Parent(s): eb0a0f3

support multi-turn, video

Files changed (3)
  1. .gitattributes +1 -0
  2. app.py +183 -90
  3. examples/video_demo.mp4 +3 -0
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
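For context, the added rule is the line that `git lfs track "*.mp4"` appends to `.gitattributes`, so the new demo video is stored as an LFS pointer rather than a full binary blob. A minimal sketch of reproducing the change locally (assuming `git-lfs` is installed; wrapped in `subprocess` to match the style of app.py's own setup call):

```python
import subprocess

# Appends '*.mp4 filter=lfs diff=lfs merge=lfs -text' to .gitattributes.
subprocess.run('git lfs track "*.mp4"', shell=True, check=True)

# Stage the updated attributes plus the new example video.
subprocess.run('git add .gitattributes examples/video_demo.mp4', shell=True, check=True)
```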
app.py CHANGED
@@ -3,25 +3,74 @@ subprocess.run('pip install flash-attn==2.7.0.post2 --no-build-isolation', env={
 
 import spaces
 
-
 import argparse
 import os
 import re
-from typing import List, Optional, Tuple
+import logging
+from typing import List, Optional, Tuple, Generator
+from threading import Thread
 
 import gradio as gr
 import PIL.Image
 import torch
 import numpy as np
 from moviepy.editor import VideoFileClip
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, TextIteratorStreamer
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-# --- Global Model Variable ---
-# model = None
+# --- Global Model Variables ---
+model = None
+streamer = None
 # This should point to the directory containing your SVG file.
 CUR_DIR = os.path.dirname(os.path.abspath(__file__))
 
+def submit_chat(chatbot, text_input):
+    response = ''
+    chatbot.append([text_input, response])
+    return chatbot, ''
+
+
 # --- Helper Functions ---
+latex_delimiters_set = [
+    {
+        "left": "\\(",
+        "right": "\\)",
+        "display": False
+    },
+    {
+        "left": "\\begin{equation}",
+        "right": "\\end{equation}",
+        "display": True
+    },
+    {
+        "left": "\\begin{align}",
+        "right": "\\end{align}",
+        "display": True
+    },
+    {
+        "left": "\\begin{alignat}",
+        "right": "\\end{alignat}",
+        "display": True
+    },
+    {
+        "left": "\\begin{gather}",
+        "right": "\\end{gather}",
+        "display": True
+    },
+    {
+        "left": "\\begin{CD}",
+        "right": "\\end{CD}",
+        "display": True
+    },
+    {
+        "left": "\\[",
+        "right": "\\]",
+        "display": True
+    }
+]
 
 def load_video_frames(video_path: Optional[str], n_frames: int = 8) -> Optional[List[PIL.Image.Image]]:
     """Extracts a specified number of frames from a video file."""
@@ -42,44 +91,62 @@ def run_inference(
 def parse_model_output(response_text: str, enable_thinking: bool) -> str:
     """Formats the model output, separating 'thinking' and 'response' parts if enabled."""
     if enable_thinking:
+        # Use a more robust regex to handle nested content and variations
        think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
         if think_match:
             thinking_content = think_match.group(1).strip()
+            # Remove the think block from the original text to get the response
             response_content = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()
             return f"**Thinking:**\n```\n{thinking_content}\n```\n\n**Response:**\n{response_content}"
         else:
-            return response_text
+            return response_text # No think tag found, return as is
     else:
-        return response_text
+        # If thinking is disabled, strip the tags just in case the model still generates them
+        return re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()
 
-# --- Core Inference Logic ---
-@spaces.GPU
+
+# --- MODIFIED Core Inference Logic (Now with Streaming) ---
+# @spaces.GPU
 def run_inference(
+    chatbot: List,
     image_input: Optional[PIL.Image.Image],
     video_input: Optional[str],
-    prompt: str,
     do_sample: bool,
     max_new_tokens: int,
     enable_thinking: bool,
-) -> List[List[str]]:
-    """Runs a single turn of inference and formats the output for a gr.Chatbot."""
+):
+    """
+    Runs a single turn of inference and yields the output stream for a gr.Chatbot.
+    This function is now a generator.
+    """
+    prompt = chatbot[-1][0]
     if (not image_input and not video_input and not prompt) or not prompt:
         gr.Warning("A text prompt is required for generation.")
-        return []
+        # MODIFICATION: Yield the current state and return to avoid errors
+        yield chatbot
+        return
+
+    # MODIFICATION: Append the new prompt to the existing history
+    # chatbot.append([prompt, ""])
+    # yield chatbot, "" # Yield the updated chat to show the user's prompt immediately
 
     content = []
     if image_input:
         content.append({"type": "image", "image": image_input})
     if video_input:
         frames = load_video_frames(video_input)
-        if frames: content.append({"type": "video", "video": frames})
+        if frames:
+            content.append({"type": "video", "video": frames})
         else:
             gr.Warning("Failed to process the video file.")
-            return [[prompt, "Error: Could not process the video file."]]
-
+            chatbot[-1][1] = "Error: Could not process the video file."
+            yield chatbot
+            return
+
     content.append({"type": "text", "text": prompt})
 
     messages = [{"role": "user", "content": content}]
+    logger.info(messages)
 
     try:
         if video_input:
@@ -87,7 +154,9 @@ def run_inference(
         else:
             input_ids, pixel_values, grid_thws = model.preprocess_inputs(messages=messages, add_generation_prompt=True, enable_thinking=enable_thinking)
     except Exception as e:
-        return [[prompt, f"Error during input preprocessing: {e}"]]
+        chatbot[-1][1] = f"Error during input preprocessing: {e}"
+        yield chatbot
+        return
 
     input_ids = input_ids.to(model.device)
     if pixel_values is not None:
@@ -96,48 +165,90 @@ def run_inference(
         grid_thws = grid_thws.to(model.device)
 
     gen_kwargs = {
-        "max_new_tokens": max_new_tokens, "do_sample": do_sample,
-        "eos_token_id": model.text_tokenizer.eos_token_id, "pad_token_id": model.text_tokenizer.pad_token_id
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "eos_token_id": model.text_tokenizer.eos_token_id,
+        "pad_token_id": model.text_tokenizer.pad_token_id,
+        "streamer": streamer,
+        "use_cache": True
     }
-
-    with torch.inference_mode():
-        try:
-            outputs = model.generate(inputs=input_ids, pixel_values=pixel_values, grid_thws=grid_thws, **gen_kwargs)
-        except Exception as e:
-            return [[prompt, f"Error during model generation: {e}"]]
-
-    response_text = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    formatted_response = parse_model_output(response_text, enable_thinking)
-
-    return [[prompt, formatted_response]]
 
+    with torch.inference_mode():
+        thread = Thread(target=model.generate, kwargs={
+            "inputs": input_ids,
+            "pixel_values": pixel_values,
+            "grid_thws": grid_thws,
+            **gen_kwargs
+        })
+        thread.start()
+
+        # MODIFICATION: Stream output token by token
+        response_text = ""
+        for new_text in streamer:
+            response_text += new_text
+            # Append only the new text chunk to the last response
+            chatbot[-1][1] = response_text
+            yield chatbot # Yield the updated history
+
+        thread.join()
+
+    # MODIFICATION: Format the final response once generation is complete
+    formatted_response = parse_model_output(response_text, enable_thinking)
+    chatbot[-1][1] = formatted_response
+    yield chatbot # Yield the final, formatted response
+
+    logger.info("[OVIS_CONV_START]")
+    [print(f'Q{i}:\n {request}\nA{i}:\n {answer}') for i, (request, answer) in enumerate(chatbot, 1)]
+    # print('New_Q:\n', text_input)
+    # print('New_A:\n', response)
+    logger.info("[OVIS_CONV_END]")
+
+
+def clear_chat():
+    return [], None, ""
 
 # --- UI Helper Functions ---
 def toggle_media_input(choice: str) -> Tuple:
     """Switches visibility between Image/Video inputs and their corresponding examples."""
     if choice == "Image":
         return gr.update(visible=True, value=None), gr.update(visible=False, value=None), gr.update(visible=True), gr.update(visible=False)
-    else: # Video
+    else: # Video
         return gr.update(visible=False, value=None), gr.update(visible=True, value=None), gr.update(visible=False), gr.update(visible=True)
 
+# # --- MODIFIED: New function to handle chat state and input clearing ---
+# def process_and_clear(chatbot: List, image_input: PIL.Image.Image, video_input: str, prompt: str, do_sample: bool, max_new_tokens: int, enable_thinking: bool):
+#     """
+#     This function now takes the chatbot state as input to maintain conversation history
+#     and clears the prompt box after submission.
+#     """
+#     # Create a generator by calling the main run_inference function
+#     generator = run_inference(chatbot, image_input, video_input, prompt, do_sample, max_new_tokens, enable_thinking)
+#     # Yield from the generator
+#     for chatbot_state, _ in generator:
+#         yield chatbot_state, "" # Clear prompt after first yield
+
 
 # --- Build Gradio Application ---
 # @spaces.GPU
 def build_demo(model_path: str):
     """Builds the Gradio user interface for the model."""
-    global model
-    device = f"cuda"
+    global model, streamer
+    device = "cuda"
     print(f"Loading model {model_path} onto device {device}...")
-
+
     model = AutoModelForCausalLM.from_pretrained(
-        model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+        model_path,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True
    ).to(device).eval()
 
+    text_tokenizer = model.text_tokenizer
+    streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
+
     print("Model loaded successfully.")
 
     model_name_display = model_path.split('/')[-1]
-
-    # --- Logo & Header ---
+
     logo_html = ""
     logo_svg_path = os.path.join(CUR_DIR, "resource", "logo.svg")
     if os.path.exists(logo_svg_path):
@@ -147,7 +258,6 @@ def build_demo(model_path: str):
         svg_content_styled = re.sub(r'(<svg[^>]*)(>)', rf'\1 height="{font_size}" style="vertical-align: middle; display: inline-block;"\2', svg_content)
         logo_html = f'<span style="display: inline-block; vertical-align: middle;">{svg_content_styled}</span>'
     else:
-        # Fallback if SVG is not found
         logo_html = '<span style="font-weight: bold; font-size: 2.5em; display: inline-block; vertical-align: middle;">Ovis</span>'
         print(f"Warning: Logo file not found at {logo_svg_path}. Using text fallback.")
 
@@ -159,26 +269,23 @@ def build_demo(model_path: str):
     <center><font size=3><b>Ovis</b> has been open-sourced on <a href='https://huggingface.co/{model_path}'>😊 Huggingface</a> and <a href='https://github.com/AIDC-AI/Ovis'>🌟 GitHub</a>. If you find Ovis useful, a like❤️ or a star🌟 would be appreciated.</font></center>
     """
 
+    prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your text here and press ENTER", lines=3, container=False)
     with gr.Blocks(theme=gr.themes.Ocean()) as demo:
         gr.HTML(html_header)
-        gr.Markdown(f"This interface is served by a single model. Each submission starts a new, independent conversation.")
-
+        gr.Markdown("Note: you might have to increase the \"Max New Tokens\" and wait longer to obtain answer when Deep Thinking is enabled.")
+
         with gr.Row():
-            # --- Left Column (Media Inputs, Settings, Prompt & Actions) ---
             with gr.Column(scale=4):
-                input_type_radio = gr.Radio(choices=["Image"], value="Image", label="Select Input Type")
+                input_type_radio = gr.Radio(choices=["Image", "Video"], value="Image", label="Select Input Type")
                 image_input = gr.Image(label="Image Input", type="pil", visible=True)
                 video_input = gr.Video(label="Video Input", visible=False)
-
+
                 with gr.Accordion("Generation Settings", open=True):
-                    do_sample = gr.Checkbox(label="Enable Sampling (Do Sample)", value=False)
+                    do_sample = gr.Checkbox(label="Enable Sampling (Do Sample)", value=True)
                     max_new_tokens = gr.Slider(minimum=32, maximum=4096, value=1024, step=32, label="Max New Tokens")
-                    enable_thinking = gr.Checkbox(label="Enable Deep Thinking", value=True)
 
-                prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your text here and press ENTER", lines=3)
-                with gr.Row():
-                    generate_btn = gr.Button("Send", variant="primary")
-                    clear_btn = gr.Button("Clear", variant="secondary")
+                    enable_thinking = gr.Checkbox(label="Enable Deep Thinking", value=False)
 
+
 
                 with gr.Column(visible=True) as image_examples_col:
                     gr.Examples(
@@ -191,34 +298,39 @@ def build_demo(model_path: str):
                         ],
                         inputs=[image_input, prompt_input]
                     )
-                # with gr.Column(visible=False) as video_examples_col:
-                #     gr.Examples(examples=[[os.path.join(CUR_DIR, "examples", "video_demo_1.mp4"), "Describe the video."]],
-                #                 inputs=[video_input, prompt_input])
-
-            # --- Right Column (Chat Display) ---
-            with gr.Column(scale=6):
-                chatbot = gr.Chatbot(label="Ovis", height=750, show_copy_button=True, layout="panel")
-
-        # --- Event Handlers ---
+                with gr.Column(visible=False) as video_examples_col:
+                    gr.Examples(examples=[[os.path.join(CUR_DIR, "examples", "video_demo.mp4"), "Describe the video."]],
+                                inputs=[video_input, prompt_input])
+
+            with gr.Column(scale=7):
+                chatbot = gr.Chatbot(label="Ovis", height=750, show_copy_button=True, layout="panel", latex_delimiters=latex_delimiters_set)
+                prompt_input.render()
+                with gr.Row():
+                    generate_btn = gr.Button("Send", variant="primary")
+                    clear_btn = gr.Button("Clear", variant="secondary")
+
         input_type_radio.change(
             fn=toggle_media_input,
             inputs=input_type_radio,
-            outputs=[image_input, video_input, image_examples_col]
+            outputs=[image_input, video_input, image_examples_col, video_examples_col]
         )
-
-        run_inputs = [image_input, video_input, prompt_input, do_sample, max_new_tokens, enable_thinking]
 
-        generate_btn.click(fn=run_inference, inputs=run_inputs, outputs=chatbot)
-        prompt_input.submit(fn=run_inference, inputs=run_inputs, outputs=chatbot)
-
+        # MODIFICATION: Update event handlers to use the new function and manage state
+        run_inputs = [chatbot, image_input, video_input, do_sample, max_new_tokens, enable_thinking]
+        # run_outputs = [image_input, prompt_input]
+
+        generat_click_event = generate_btn.click(submit_chat, [chatbot, prompt_input], [chatbot, prompt_input]).then(run_inference, run_inputs, chatbot)
+        submit_event = prompt_input.submit(submit_chat, [chatbot, prompt_input], [chatbot, prompt_input]).then(run_inference, run_inputs, chatbot)
+
         clear_btn.click(
-            fn=lambda: ([], None, None, "", "Image", False, 1024, True),
+            fn=lambda: ([], None, None, "", "Image", True, 1024, False),
             outputs=[chatbot, image_input, video_input, prompt_input, input_type_radio, do_sample, max_new_tokens, enable_thinking]
         ).then(
            fn=toggle_media_input,
            inputs=input_type_radio,
-            outputs=[image_input, video_input, image_examples_col]
+            outputs=[image_input, video_input, image_examples_col, video_examples_col]
        )
+
     return demo
 
 # --- Main Execution Block ---
@@ -230,30 +342,11 @@ def build_demo(model_path: str):
 # parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Server name for the Gradio app.")
 # return parser.parse_args()
 
-# if __name__ == "__main__":
-#     if not os.path.exists("examples"): os.makedirs("examples")
-#     if not os.path.exists("resource"): os.makedirs("resource")
-#     print("Note: For the logo to display correctly, place 'logo.svg' inside the 'resource' directory.")
-
-#     example_files = [
-#         "ovis2_math0.jpg",
-#         "ovis2_math1.jpg",
-#         "ovis2_figure0.png",
-#         "ovis2_figure1.png",
-#         "ovis2_multi0.jpg",
-#         "video_demo_1.mp4",
-#     ]
-#     for fname in example_files:
-#         fpath = os.path.join("examples", fname)
-#         if not os.path.exists(fpath):
-#             if fname.endswith(".mp4"):
-#                 os.system(f'ffmpeg -y -f lavfi -i "smptebars=size=128x72:rate=10" -t 3 -pix_fmt yuv420p "{fpath}" >/dev/null 2>&1')
-#             else:
-#                 PIL.Image.new('RGB', (224, 224), color = 'grey').save(fpath)
-
 
+# if __name__ == "__main__":
+#     args = parse_args()
 model_path = 'AIDC-AI/Ovis2.5-9B'
 demo = build_demo(model_path=model_path)
-# print(f"Launching Gradio app on http://{args.server_name}:{args.port}")
-# demo.queue().launch(server_name=args.server_name, server_port=args.port, share=False, ssl_verify=False)
-demo.launch()
+# demo = build_demo(model_path=args.model_path)
+# demo.launch(server_name=args.server_name, server_port=args.port, share=False, ssl_verify=False, show_error=True)
+demo.queue().launch()
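The core of this change is the switch from a single blocking `model.generate()` call to token streaming: `run_inference` is now a generator that launches `generate()` on a background `Thread` and drains a `TextIteratorStreamer`, writing the growing reply into `chatbot[-1][1]` on every chunk. A minimal, self-contained sketch of that pattern with a plain text-only Hugging Face model (the model name and prompt below are placeholders, not part of this commit):

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# skip_prompt drops the echoed input; skip_special_tokens drops EOS/PAD markers.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

inputs = tokenizer("Describe the video in one sentence.", return_tensors="pt")

# generate() blocks until completion, so it runs in a background thread while
# the foreground loop consumes decoded chunks as they become available.
thread = Thread(target=model.generate, kwargs={**inputs, "max_new_tokens": 64, "streamer": streamer})
thread.start()

response_text = ""
for new_text in streamer:           # each item is a freshly decoded text chunk
    response_text += new_text
    print(response_text, end="\r")  # app.py instead does: chatbot[-1][1] = response_text; yield chatbot

thread.join()
print()
```

Multi-turn support then comes from the Gradio wiring: `submit_chat` appends `[prompt, ""]` to the existing `chatbot` history, and `generate_btn.click(...).then(run_inference, ...)` streams the reply into that last entry instead of starting a fresh conversation on every submission.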
examples/video_demo.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4476e4fd82da4fc37b4c167ec6a4f56fa270c0ad3f2724fd47c0ff92b87d6c6
+size 103118
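The new `examples/video_demo.mp4` is committed as a Git LFS pointer (the three lines above), and the demo feeds videos to the model as a list of PIL frames via `load_video_frames`. That function's body is not part of this diff, so the following is only an illustrative sketch of even frame sampling with the same libraries (`sample_frames` is a hypothetical name, not the app's exact implementation):

```python
from typing import List, Optional

import numpy as np
import PIL.Image
from moviepy.editor import VideoFileClip


def sample_frames(video_path: str, n_frames: int = 8) -> Optional[List[PIL.Image.Image]]:
    """Evenly sample n_frames from a video and return them as PIL images."""
    try:
        clip = VideoFileClip(video_path)
        # Evenly spaced timestamps, kept strictly inside the clip's duration.
        timestamps = np.linspace(0, max(clip.duration - 1e-3, 0), n_frames)
        frames = [PIL.Image.fromarray(clip.get_frame(float(t))) for t in timestamps]
        clip.close()
        return frames
    except Exception:
        return None


# frames = sample_frames("examples/video_demo.mp4")  # used like load_video_frames in app.py
```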