Update README.md
README.md CHANGED
@@ -7,12 +7,320 @@ language:
base_model:
- Qwen/Qwen2-VL-7B
tags:
- qwen_vl
- video
- real-time
- multimodal
- LLM
---
# LiveCC-7B-Instruct

## Introduction

We introduce LiveCC, the first video LLM capable of real-time commentary. It is trained with a novel video-ASR streaming method and achieves SOTA results on both streaming and offline benchmarks.

- Project Page: https://showlab.github.io/livecc

> [!Important]
> This is the instruct model. The base model is at [LiveCC-7B-Base](https://huggingface.co/chenjoya/LiveCC-7B-Base).

## Training with Streaming Frame-Words Paradigm



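The figure above illustrates the paradigm: sampled video frames are interleaved with the ASR words whose timestamps fall inside each frame interval, so the model learns to produce words in sync with incoming frames. Below is a minimal, illustrative sketch of this interleaving; the helper function and data layout are hypothetical, not the actual training pipeline:

```python
# Illustrative sketch only: group ASR words with the frame interval they fall into.
# The helper below is hypothetical; see the LiveCC repo for the real pipeline.
from typing import List, Tuple

def interleave_frames_and_words(
    frame_timestamps: List[float],       # one timestamp per sampled frame (e.g., at 2 FPS)
    asr_words: List[Tuple[str, float]],  # (word, word_start_time) pairs from streaming ASR
    frame_interval: float = 0.5,         # seconds covered by one frame at 2 FPS
):
    """Return [(frame_index, words_spoken_during_that_frame), ...] in temporal order."""
    sequence = []
    for i, t in enumerate(frame_timestamps):
        words = [w for w, ts in asr_words if t <= ts < t + frame_interval]
        sequence.append((i, words))  # frame token(s) would be followed by these words
    return sequence

frames = [0.0, 0.5, 1.0]
words = [("the", 0.1), ("rocket", 0.4), ("lifts", 0.7), ("off", 1.1)]
print(interleave_frames_and_words(frames, words))
# [(0, ['the', 'rocket']), (1, ['lifts']), (2, ['off'])]
```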
## Quickstart

Similar to qwen-vl-utils, we offer a toolkit to help you handle various types of visual input more conveniently, **especially video streaming inputs**. You can install it with the following command:

```bash
pip install qwen-vl-utils livecc-utils
```

The following code snippet shows how to do **real-time video commentary** with `transformers` and the utils above:

```python
import functools, torch, os, tqdm
from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
apply_liger_kernel_to_qwen2_vl() # important. our model is trained with this. keep consistency
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, LogitsProcessor, logging
from livecc_utils import prepare_multiturn_multimodal_inputs_for_generation, get_smart_resized_clip, get_smart_resized_video_reader
from qwen_vl_utils import process_vision_info

logger = logging.get_logger(__name__)  # used below to warn when falling back to the default query
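
# NOTE: `ThresholdLogitsProcessor` is used below but is not part of `transformers`.
# This is a minimal stand-in with assumed behavior (the reference implementation lives in
# the LiveCC repo): suppress the streaming EOS token (' ...') whenever its probability is
# below a threshold that changes by `step` after every generated token.
class ThresholdLogitsProcessor(LogitsProcessor):
    def __init__(self, token_id: int, base_threshold: float, step: float):
        self.token_id = token_id
        self.base_threshold = base_threshold
        self.step = step
        self.count = 0

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        threshold = self.base_threshold + self.step * self.count
        probs = torch.softmax(scores, dim=-1)
        low_confidence = probs[:, self.token_id] <= threshold
        scores[low_confidence, self.token_id] = float('-inf')  # block ' ...' unless confident enough
        self.count += 1
        return scores
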
class LiveCCDemoInfer:
    fps = 2
    initial_fps_frames = 6
    streaming_fps_frames = 2
    initial_time_interval = initial_fps_frames / fps
    streaming_time_interval = streaming_fps_frames / fps
    frame_time_interval = 1 / fps

    def __init__(self, model_path: str = None, device_id: int = 0):
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_path, torch_dtype="auto",
            device_map=f'cuda:{device_id}',
            attn_implementation='flash_attention_2'
        )
        self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
        self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
        self.model.prepare_inputs_for_generation = functools.partial(prepare_multiturn_multimodal_inputs_for_generation, self.model)
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": 'livecc'},
            ]
        }
        texts = self.processor.apply_chat_template([message], tokenize=False)
        self.system_prompt_offset = texts.index('<|im_start|>user')
        self._cached_video_readers_with_hw = {}

    @torch.inference_mode()
    def live_cc(
        self,
        query: str,
        state: dict,
        max_pixels: int = 384 * 28 * 28,
        default_query: str = 'Please describe the video.',
        do_sample: bool = False,
        repetition_penalty: float = 1.05,
        streaming_eos_base_threshold: float = None,
        streaming_eos_threshold_step: float = None,
        **kwargs,
    ):
        """
        state: dict, (maybe) with keys:
            video_path: str, video path
            video_timestamp: float, current video timestamp
            last_timestamp: float, last processed video timestamp
            last_video_pts_index: int, last processed video frame index
            video_pts: np.ndarray, video pts
            last_history: list, last processed history
        """
        # 1. preparation: video_reader, and last processing info
        video_timestamp, last_timestamp = state.get('video_timestamp', 0), state.get('last_timestamp', -1 / self.fps)
        video_path = state['video_path']
        if video_path not in self._cached_video_readers_with_hw:
            self._cached_video_readers_with_hw[video_path] = get_smart_resized_video_reader(video_path, max_pixels)
            video_reader = self._cached_video_readers_with_hw[video_path][0]
            video_reader.get_frame_timestamp(0)
            state['video_pts'] = torch.from_numpy(video_reader._frame_pts[:, 1])
            state['last_video_pts_index'] = -1
        video_pts = state['video_pts']
        if last_timestamp + self.frame_time_interval > video_pts[-1]:
            state['video_end'] = True
            return
        video_reader, resized_height, resized_width = self._cached_video_readers_with_hw[video_path]
        last_video_pts_index = state['last_video_pts_index']

        # 2. which frames will be processed
        initialized = last_timestamp >= 0
        if not initialized:
            video_timestamp = max(video_timestamp, self.initial_time_interval)
        if video_timestamp <= last_timestamp + self.frame_time_interval:
            return
        timestamps = torch.arange(last_timestamp + self.frame_time_interval, video_timestamp, self.frame_time_interval) # add compensation

        # 3. fetch frames in required timestamps
        clip, clip_timestamps, clip_idxs = get_smart_resized_clip(video_reader, resized_height, resized_width, timestamps, video_pts, video_pts_index_from=last_video_pts_index+1)
        state['last_video_pts_index'] = clip_idxs[-1]
        state['last_timestamp'] = clip_timestamps[-1]

        # 4. organize to interleave frames
        interleave_clips, interleave_timestamps = [], []
        if not initialized:
            interleave_clips.append(clip[:self.initial_fps_frames])
            interleave_timestamps.append(clip_timestamps[:self.initial_fps_frames])
            clip = clip[self.initial_fps_frames:]
            clip_timestamps = clip_timestamps[self.initial_fps_frames:]
        if len(clip) > 0:
            interleave_clips.extend(list(clip.split(self.streaming_fps_frames)))
            interleave_timestamps.extend(list(clip_timestamps.split(self.streaming_fps_frames)))

        # 5. make conversation and send to model
        for clip, timestamps in zip(interleave_clips, interleave_timestamps):
            start_timestamp, stop_timestamp = timestamps[0].item(), timestamps[-1].item() + self.frame_time_interval
            message = {
                "role": "user",
                "content": [
                    {"type": "text", "text": f'Time={start_timestamp:.1f}-{stop_timestamp:.1f}s'},
                    {"type": "video", "video": clip}
                ]
            }
            if not query and not state.get('query', None):
                query = default_query
                logger.warning(f'No query provided, use default_query={default_query}')
            if query and state.get('query', None) != query:
                message['content'].append({"type": "text", "text": query})
                state['query'] = query
            texts = self.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True, return_tensors='pt')
            past_ids = state.get('past_ids', None)
            if past_ids is not None:
                texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
            inputs = self.processor(
                text=texts,
                images=None,
                videos=[clip],
                return_tensors="pt",
                return_attention_mask=False
            )
            inputs.to('cuda')
            if past_ids is not None:
                inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
            if streaming_eos_base_threshold is not None:
                logits_processor = [ThresholdLogitsProcessor(self.streaming_eos_token_id, streaming_eos_base_threshold, streaming_eos_threshold_step)]
            else:
                logits_processor = None
            outputs = self.model.generate(
                **inputs, past_key_values=state.get('past_key_values', None),
                return_dict_in_generate=True, do_sample=do_sample,
                repetition_penalty=repetition_penalty,
                logits_processor=logits_processor,
            )
            state['past_key_values'] = outputs.past_key_values
            state['past_ids'] = outputs.sequences[:, :-1]
            yield (start_timestamp, stop_timestamp), self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True), state
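
# Note: with fps=2, the first interleaved step covers initial_fps_frames=6 frames (the first
# 3 seconds of video); each later step covers streaming_fps_frames=2 frames (1 second).
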
model_path = 'chenjoya/LiveCC-7B-Instruct'
video_path = "spacex_falcon9.mp4"
query = """Let's wait together!"""

infer = LiveCCDemoInfer(model_path=model_path)
state = {'video_path': video_path}
commentaries = []
for t in range(31):
    state['video_timestamp'] = t
    for (start_t, stop_t), response, state in infer.live_cc(
        query=query, state=state,
        max_pixels = 512 * 28 * 28, repetition_penalty=1.05,
        streaming_eos_base_threshold=0.0, streaming_eos_threshold_step=0
    ):
        print(f'{start_t}s-{stop_t}s: {response}')
        commentaries.append([start_t, stop_t, response])
    if state.get('video_end', False):
        break
```
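In this example, the outer loop advances `state['video_timestamp']` one second at a time to simulate frames arriving in real time; in a live setting you would update it from the actual playback clock instead.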

The following code snippet shows how to do **common video (multi-turn) QA** with `transformers` and the utils above:
```python
import functools, torch
from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
apply_liger_kernel_to_qwen2_vl() # important. our model is trained with this. keep consistency
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, LogitsProcessor, logging
from livecc_utils import prepare_multiturn_multimodal_inputs_for_generation, get_smart_resized_clip, get_smart_resized_video_reader
from qwen_vl_utils import process_vision_info

class LiveCCDemoInfer:
    fps = 2
    initial_fps_frames = 6
    streaming_fps_frames = 2
    initial_time_interval = initial_fps_frames / fps
    streaming_time_interval = streaming_fps_frames / fps
    frame_time_interval = 1 / fps

    def __init__(self, model_path: str = None, device: str = 'cuda'):
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_path, torch_dtype="auto",
            device_map=device,
            attn_implementation='sdpa'
        )
        self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
        self.streaming_eos_token_id = self.processor.tokenizer(' ...').input_ids[-1]
        self.model.prepare_inputs_for_generation = functools.partial(prepare_multiturn_multimodal_inputs_for_generation, self.model)
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": 'livecc'},
            ]
        }
        texts = self.processor.apply_chat_template([message], tokenize=False)
        self.system_prompt_offset = texts.index('<|im_start|>user')
        self._cached_video_readers_with_hw = {}

    @torch.inference_mode()
    def video_qa(
        self,
        message: str,
        state: dict,
        history: list = [],
        do_sample: bool = False,
        repetition_penalty: float = 1.05,
        hf_spaces: bool = False,
        **kwargs,
    ):
        """
        state: dict, (maybe) with keys:
            video_path: str, video path
            video_timestamp: float, current video timestamp
            last_timestamp: float, last processed video timestamp
            last_video_pts_index: int, last processed video frame index
            video_pts: np.ndarray, video pts
            last_history: list, last processed history
        """
        video_path = state.get('video_path', None)
        conversation = []
        if hf_spaces:
            for past_message in history:
                content = [{"type": "text", "text": past_message['content']}]
                if video_path: # only use once
                    content.insert(0, {"type": "video", "video": video_path})
                    video_path = None
                conversation.append({"role": past_message["role"], "content": content})
        else:
            pass # use past_key_values
        past_ids = state.get('past_ids', None)
        content = [{"type": "text", "text": message}]
        if past_ids is None and video_path: # only use once
            content.insert(0, {"type": "video", "video": video_path})
        conversation.append({"role": "user", "content": content})
        image_inputs, video_inputs = process_vision_info(conversation)
        texts = self.processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True, return_tensors='pt')
        if past_ids is not None:
            texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
        inputs = self.processor(
            text=texts,
            images=image_inputs,
            videos=video_inputs,
            return_tensors="pt",
            return_attention_mask=False
        )
        inputs.to(self.model.device)
        if past_ids is not None:
            inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
        outputs = self.model.generate(
            **inputs, past_key_values=state.get('past_key_values', None),
            return_dict_in_generate=True, do_sample=do_sample,
            repetition_penalty=repetition_penalty,
            max_new_tokens=512,
        )
        state['past_key_values'] = outputs.past_key_values if not hf_spaces else None
        state['past_ids'] = outputs.sequences[:, :-1] if not hf_spaces else None
        response = self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
        return response, state

model_path = 'chenjoya/LiveCC-7B-Instruct'
video_path = "spacex_falcon9.mp4"

infer = LiveCCDemoInfer(model_path=model_path)
state = {'video_path': video_path}
# first round
response, state = infer.video_qa(message='What is the video?', state=state)
# second round
response, state = infer.video_qa(message='What? Say again.', state=state)
```
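Across rounds, `state['past_ids']` and `state['past_key_values']` carry the earlier turns, so the video is encoded only once and follow-up questions reuse the cached context.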

## Limitations

- This model has only undergone video-ASR streaming pre-training, so it may not perform well on common video QA.
- When performing real-time video commentary, the output may collapse, e.g., fall into repeated patterns. If you encounter this, try adjusting `repetition_penalty`, `streaming_eos_base_threshold`, and `streaming_eos_threshold_step` (see the sketch after this list).
- The model has a context window of only 32768 tokens. Using more visual tokens per frame (e.g., 768 * 28 * 28) gives better performance but shortens the maximum working duration.
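For reference, a hedged sketch of retrying the streaming loop from the Quickstart with adjusted decoding settings; the values below are arbitrary examples, not tuned recommendations:

```python
# Illustrative only: if the commentary collapses into repetition, rerun the streaming loop
# with different decoding settings and compare the outputs (values are untuned examples).
state = {'video_path': video_path}  # fresh state for a new run
for t in range(31):
    state['video_timestamp'] = t
    for (start_t, stop_t), response, state in infer.live_cc(
        query=query, state=state,
        max_pixels=512 * 28 * 28,
        repetition_penalty=1.15,            # stronger penalty against repeated patterns
        streaming_eos_base_threshold=0.1,   # example value for the ' ...' stopping threshold
        streaming_eos_threshold_step=0.05,  # example per-token adjustment of that threshold
    ):
        print(f'{start_t}s-{stop_t}s: {response}')
    if state.get('video_end', False):
        break
```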

These limitations serve as ongoing directions for model optimization and improvement, and we are committed to continually enhancing the model's performance and scope of application.

## Citation

If you find our work helpful, feel free to cite us.

```
@article{livecc,
  author = {Joya Chen and Ziyun Zeng and Yiqi Lin and Wei Li and Zejun Ma and Mike Zheng Shou},
  title = {LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale},