---
pipeline_tag: image-text-to-text
license: apache-2.0
base_model:
- Qwen/Qwen2.5-7B-Instruct
language:
- zho
- eng
- fra
- spa
- por
- deu
- ita
- rus
- jpn
- kor
- vie
- tha
- ara
datasets:
- HuggingFaceFV/finevideo
---

# Ola-7B

## Model Summary

The Ola-7B model is developed by researchers from Tencent, Tsinghua University, and Nanyang Technological University.
Based on the Qwen2.5 language model, it is trained on text, image, video, and audio data with a context window of 32K tokens. It takes image/video, text, and audio as input and outputs text.

Ola offers an on-demand solution to seamlessly and efficiently process visual inputs with arbitrary spatial sizes and temporal lengths.

- **Repository:** https://github.com/Ola-Omni/Ola
- **Languages:** English, Chinese
- **Paper:** https://huggingface.co/papers/2502.04328

## Use

1. Download the speech encoders from https://huggingface.co/THUdyh/Ola_speech_encoders.
2. Replace the speech-encoder path in `config.json` with the local path to the downloaded encoders (see the sketch below).
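
As a rough illustration of these two steps (a sketch only: the config key name and checkpoint filename below are assumptions, so match them to whatever you actually find in `config.json` and in the encoder repository):

```python
# Hypothetical setup sketch for steps 1-2 using huggingface_hub.
import json
from huggingface_hub import snapshot_download

# Step 1: download the speech encoders.
encoder_dir = snapshot_download("THUdyh/Ola_speech_encoders")

# Step 2: point the speech-encoder entry of your local Ola-7B config.json at the
# downloaded checkpoint. "speech_encoder" and "large-v3.pt" are placeholders;
# edit whichever field(s) in your config.json reference the encoder weights.
config_path = "/path/to/Ola-7B/config.json"
with open(config_path) as f:
    config = json.load(f)

config["speech_encoder"] = f"{encoder_dir}/large-v3.pt"

with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
```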

We provide a simple generation process for using our model. For more details, please refer to our [GitHub repo](https://github.com/Ola-Omni/Ola).

```python
import os
# Resolution / preprocessing settings expected by the Ola code; set them before
# importing the ola package.
os.environ['LOWRES_RESIZE'] = '384x32'
os.environ['HIGHRES_BASE'] = '0x32'
os.environ['VIDEO_RESIZE'] = "0x64"
os.environ['VIDEO_MAXRES'] = "480"
os.environ['VIDEO_MINRES'] = "288"
os.environ['MAXRES'] = '1536'
os.environ['MINRES'] = '0'
os.environ['REGIONAL_POOL'] = '2x'
os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
os.environ['LOAD_VISION_EARLY'] = '1'
os.environ['SKIP_LOAD_VIT'] = '1'


import gradio as gr
import torch
import re
from decord import VideoReader, cpu
from PIL import Image
import numpy as np
import transformers
import moviepy.editor as mp
from typing import Dict, Optional, Sequence, List
import librosa
import whisper
from ola.conversation import conv_templates, SeparatorStyle
from ola.model.builder import load_pretrained_model
from ola.utils import disable_torch_init
from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token
from ola.mm_utils import get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image_genli
from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN

# Load Ola-7B; model_path should point to the local checkpoint whose config.json
# references the downloaded speech encoders.
model_path = ""
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
model = model.to('cuda').eval()
model = model.bfloat16()

USE_SPEECH = False
cur_dir = os.path.dirname(os.path.abspath(__file__))


def load_audio(audio_file_name):
    # Load audio as 16 kHz mono and convert it into (at most 25) 30-second
    # Whisper-style chunks of 128-bin log-mel spectrograms.
    speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
    if len(speech_wav.shape) > 1:
        speech_wav = speech_wav[:, 0]
    speech_wav = speech_wav.astype(np.float32)
    CHUNK_LIM = 480000  # 30 s at 16 kHz
    SAMPLE_RATE = 16000
    speechs = []
    speech_wavs = []

    if len(speech_wav) <= CHUNK_LIM:
        speech = whisper.pad_or_trim(speech_wav)
        speech_wav = whisper.pad_or_trim(speech_wav)
        speechs.append(speech)
        speech_wavs.append(torch.from_numpy(speech_wav).unsqueeze(0))
    else:
        for i in range(0, len(speech_wav), CHUNK_LIM):
            chunk = speech_wav[i : i + CHUNK_LIM]
            if len(chunk) < CHUNK_LIM:
                chunk = whisper.pad_or_trim(chunk)
            speechs.append(chunk)
            speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
    mels = []
    for chunk in speechs:
        chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
        mels.append(chunk)

    mels = torch.cat(mels, dim=0)
    speech_wavs = torch.cat(speech_wavs, dim=0)
    if mels.shape[0] > 25:
        mels = mels[:25]
        speech_wavs = speech_wavs[:25]

    speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
    speech_chunks = torch.LongTensor([mels.shape[0]])
    return mels, speech_length, speech_chunks, speech_wavs

def extract_audio(videos_file_path):
    # Return the audio track of a video file as a moviepy audio clip.
    my_clip = mp.VideoFileClip(videos_file_path)
    return my_clip.audio

def ola_inference(multimodal, audio_path):
    # Run Ola on an image or video (multimodal = {"files": [...], "text": ...}),
    # optionally with a separate audio file carrying the spoken question.
    visual, text = multimodal["files"][0], multimodal["text"]
    # Demo shortcut: a specific placeholder image maps to a bundled example video.
    if visual.endswith("image2.png"):
        modality = "video"
        visual = f"{cur_dir}/case/case1.mp4"
    if visual.endswith(".mp4"):
        modality = "video"
    else:
        modality = "image"

    # If an audio file is provided, use it as the spoken question; otherwise,
    # for video inputs, extract and use the video's own audio track.
    if audio_path:
        USE_SPEECH = True
    elif modality == "video":
        USE_SPEECH = True
    else:
        USE_SPEECH = False

    speechs = []
    speech_lengths = []
    speech_wavs = []
    speech_chunks = []
    if modality == "video":
        # Uniformly sample 64 frames from the video.
        vr = VideoReader(visual, ctx=cpu(0))
        total_frame_num = len(vr)
        fps = round(vr.get_avg_fps())
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, 64, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        spare_frames = vr.get_batch(frame_idx).asnumpy()
        video = [Image.fromarray(frame) for frame in spare_frames]
    else:
        image = [Image.open(visual)]
        image_sizes = [image[0].size]

    if USE_SPEECH and audio_path:
        audio_path = audio_path
        speech, speech_length, speech_chunk, speech_wav = load_audio(audio_path)
        speechs.append(speech.bfloat16().to('cuda'))
        speech_lengths.append(speech_length.to('cuda'))
        speech_chunks.append(speech_chunk.to('cuda'))
        speech_wavs.append(speech_wav.to('cuda'))
        print('load audio')
    elif USE_SPEECH and not audio_path:
        # Parse the audio track of the video.
        audio = extract_audio(visual)
        audio.write_audiofile("./video_audio.wav")
        video_audio_path = './video_audio.wav'
        speech, speech_length, speech_chunk, speech_wav = load_audio(video_audio_path)
        speechs.append(speech.bfloat16().to('cuda'))
        speech_lengths.append(speech_length.to('cuda'))
        speech_chunks.append(speech_chunk.to('cuda'))
        speech_wavs.append(speech_wav.to('cuda'))
    else:
        # No speech input: feed zero-filled placeholders.
        speechs = [torch.zeros(1, 3000, 128).bfloat16().to('cuda')]
        speech_lengths = [torch.LongTensor([3000]).to('cuda')]
        speech_wavs = [torch.zeros([1, 480000]).to('cuda')]
        speech_chunks = [torch.LongTensor([1]).to('cuda')]

    # Build the prompt from the conversation template, inserting image/speech tokens.
    conv_mode = "qwen_1_5"
    if text:
        qs = text
    else:
        qs = ''
    if USE_SPEECH and audio_path:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
    elif USE_SPEECH:
        qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    if USE_SPEECH and audio_path:
        input_ids = tokenizer_speech_question_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
    elif USE_SPEECH:
        input_ids = tokenizer_speech_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
    else:
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')

    # Preprocess visual inputs with the any-resolution pipeline.
    if modality == "video":
        video_processed = []
        for idx, frame in enumerate(video):
            image_processor.do_resize = False
            image_processor.do_center_crop = False
            frame = process_anyres_video(frame, image_processor)

            if frame_idx is not None and idx in frame_idx:
                video_processed.append(frame.unsqueeze(0))
            elif frame_idx is None:
                video_processed.append(frame.unsqueeze(0))

        if frame_idx is None:
            frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()

        video_processed = torch.cat(video_processed, dim=0).bfloat16().to("cuda")
        video_processed = (video_processed, video_processed)

        video_data = (video_processed, (384, 384), "video")
    else:
        image_processor.do_resize = False
        image_processor.do_center_crop = False
        image_tensor, image_highres_tensor = [], []
        for visual in image:
            image_tensor_, image_highres_tensor_ = process_anyres_highres_image_genli(visual, image_processor)
            image_tensor.append(image_tensor_)
            image_highres_tensor.append(image_highres_tensor_)
        if all(x.shape == image_tensor[0].shape for x in image_tensor):
            image_tensor = torch.stack(image_tensor, dim=0)
        if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
            image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
        if type(image_tensor) is list:
            image_tensor = [_image.bfloat16().to("cuda") for _image in image_tensor]
        else:
            image_tensor = image_tensor.bfloat16().to("cuda")
        if type(image_highres_tensor) is list:
            image_highres_tensor = [_image.bfloat16().to("cuda") for _image in image_highres_tensor]
        else:
            image_highres_tensor = image_highres_tensor.bfloat16().to("cuda")

    pad_token_ids = 151643  # Qwen2.5 <|endoftext|>, used here as the pad token

    attention_masks = input_ids.ne(pad_token_ids).long().to('cuda')
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    # Default generation settings.
    gen_kwargs = {}

    if "max_new_tokens" not in gen_kwargs:
        gen_kwargs["max_new_tokens"] = 1024
    if "temperature" not in gen_kwargs:
        gen_kwargs["temperature"] = 0.2
    if "top_p" not in gen_kwargs:
        gen_kwargs["top_p"] = None
    if "num_beams" not in gen_kwargs:
        gen_kwargs["num_beams"] = 1

    with torch.inference_mode():
        if modality == "video":
            output_ids = model.generate(
                inputs=input_ids,
                images=video_data[0][0],
                images_highres=video_data[0][1],
                modalities=video_data[2],
                speech=speechs,
                speech_lengths=speech_lengths,
                speech_chunks=speech_chunks,
                speech_wav=speech_wavs,
                attention_mask=attention_masks,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                do_sample=True if gen_kwargs["temperature"] > 0 else False,
                temperature=gen_kwargs["temperature"],
                top_p=gen_kwargs["top_p"],
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
            )
        else:
            output_ids = model.generate(
                inputs=input_ids,
                images=image_tensor,
                images_highres=image_highres_tensor,
                image_sizes=image_sizes,
                modalities=['image'],
                speech=speechs,
                speech_lengths=speech_lengths,
                speech_chunks=speech_chunks,
                speech_wav=speech_wavs,
                attention_mask=attention_masks,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                do_sample=True if gen_kwargs["temperature"] > 0 else False,
                temperature=gen_kwargs["temperature"],
                top_p=gen_kwargs["top_p"],
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
            )

    # Decode the generated tokens and strip the stop string.
    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[:-len(stop_str)]
    outputs = outputs.strip()
    return outputs, None
```
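
For reference, here is a minimal, hypothetical call to the `ola_inference` helper defined above, using a local image and a typed question (the file path is a placeholder):

```python
# Hypothetical usage of ola_inference; replace the path with a real image file.
response, _ = ola_inference(
    {"files": ["./example.jpg"], "text": "Describe this image in detail."},
    audio_path=None,  # pass an audio file path instead to ask the question via speech
)
print(response)
```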

### Model Architecture

- **Architecture:** Pre-trained [Oryx-ViT](https://huggingface.co/THUdyh/Oryx-ViT) + Qwen2.5-7B

- **Data:** a mixture of more than 5M image, video, and audio samples, trained in three stages

- **Precision:** BFloat16

#### Hardware & Software

- **Hardware:** 64 \* NVIDIA Tesla A100

- **Orchestration:** Hugging Face Trainer

- **Code:** PyTorch

## Citation

```bibtex
@article{liu2025ola,
  title={Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment},
  author={Liu, Zuyan and Dong, Yuhao and Wang, Jiahui and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
  journal={arXiv preprint arXiv:2502.04328},
  year={2025}
}
```