Upload folder using huggingface_hub
Browse files- .ipynb_checkpoints/README-checkpoint.md +101 -0
- README.md +4 -2
.ipynb_checkpoints/README-checkpoint.md
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Model Card
|
2 |
+
|
3 |
+
<p align="center">
|
4 |
+
<img src="./assets/logo.jpg" alt="Video-XL-Pro Logo" width="350">
|
5 |
+
</p>
|
6 |
+
|
7 |
+
[Paper](https://arxiv.org/pdf/2503.18478) | [Model](https://huggingface.co/lxr2003/Video-XL-Pro-3B)
|
8 |
+
|
9 |
+
**Video-XL-Pro 3B** is a powerful multimodal large model designed for **extremely long video understanding**, supporting up to **10,000-frame input**. Leveraging a novel **Reconstructive Token Compression** mechanism, it enables efficient and effective long-range temporal reasoning.
|
10 |
+
|
11 |
+
### ✨ Highlights
|
12 |
+
|
13 |
+
- 🚀 **SOTA Performance** among 3B-scale models on:
|
14 |
+
- MLVU
|
15 |
+
- VideoMME
|
16 |
+
- VNBench
|
17 |
+
- LongVideoBench
|
18 |
+
|
19 |
+
- 🧠 **Efficient Long Video Processing**:
|
20 |
+
- Handles up to **10,000 frames on a single 80G A100 GPU**
|
21 |
+
- Achieves **~98% accuracy** on the Needle-in-a-Haystack benchmark
|
22 |
+
|
23 |
+
---
|
24 |
+
|
25 |
+
## Quickstart
|
26 |
+
|
27 |
+
Before running the following code snippet, ensure you have installed the necessary dependencies by following the installation guide in [our official GitHub repo](https://github.com/VectorSpaceLab/Video-XL/tree/main/Video-XL-Pro#installation). The installation process includes setting up the conda environment and installing PyTorch and the other required packages.
|
28 |
+
|
29 |
+
## Running Scripts
|
30 |
+
|
31 |
+
```python
|
32 |
+
import torch
|
33 |
+
import transformers
|
34 |
+
import gc
|
35 |
+
from videoxlpro.videoxlpro.demo_utils import process_video, load_image_processor, generate_response
|
36 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
37 |
+
import warnings
|
38 |
+
|
39 |
+
# 禁用一些警告
|
40 |
+
transformers.logging.set_verbosity_error()
|
41 |
+
warnings.filterwarnings('ignore')
|
42 |
+
|
43 |
+
# 设置设备
|
44 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
45 |
+
|
46 |
+
# 模型路径
|
47 |
+
model_path = "lxr2003/Video-XL-Pro-3B"
|
48 |
+
video_path = "/path/to/your/example_video.mp4"
|
49 |
+
|
50 |
+
# 使用 Auto 类加载模型
|
51 |
+
# 使用 Auto 类加载模型
|
52 |
+
model = AutoModelForCausalLM.from_pretrained(
|
53 |
+
model_path,
|
54 |
+
low_cpu_mem_usage=True,
|
55 |
+
torch_dtype=torch.float16,
|
56 |
+
attn_implementation="flash_attention_2",
|
57 |
+
device_map=device,
|
58 |
+
trust_remote_code=True
|
59 |
+
)
|
60 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
61 |
+
model_path,
|
62 |
+
trust_remote_code=True
|
63 |
+
)
|
64 |
+
|
65 |
+
image_processor = load_image_processor(model, tokenizer)
|
66 |
+
|
67 |
+
max_frames_num = 128
|
68 |
+
|
69 |
+
# 处理视频
|
70 |
+
video_tensor,time_embed = process_video(video_path,tokenizer, image_processor, model.device, max_frames_num)
|
71 |
+
|
72 |
+
# 生成参数
|
73 |
+
gen_kwargs = {
|
74 |
+
"do_sample": True,
|
75 |
+
"temperature": 0.01,
|
76 |
+
"top_p": 0.001,
|
77 |
+
"num_beams": 1,
|
78 |
+
"use_cache": True,
|
79 |
+
"max_new_tokens": 256
|
80 |
+
}
|
81 |
+
|
82 |
+
# 文本提示
|
83 |
+
prompt = "Describe this video."
|
84 |
+
|
85 |
+
text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
|
86 |
+
|
87 |
+
response = generate_response(model, tokenizer, text, video_tensor,time_embed, gen_kwargs)
|
88 |
+
|
89 |
+
# 4. 输出结果
|
90 |
+
print("\n===== 生成的回答 =====")
|
91 |
+
print(response)
|
92 |
+
|
93 |
+
```
|
94 |
+
|
95 |
+
*Note: Replace `/path/to/your/example_video.mp4` with the path to your actual video file.*
|
96 |
+
|
97 |
+
---
|
98 |
+
|
99 |
+
## License
|
100 |
+
|
101 |
+
This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the [Apache License 2.0](./LICENSE).
|
README.md
CHANGED
@@ -47,6 +47,7 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
47 |
model_path = "lxr2003/Video-XL-Pro-3B"
|
48 |
video_path = "/path/to/your/example_video.mp4"
|
49 |
|
|
|
50 |
# 使用 Auto 类加载模型
|
51 |
model = AutoModelForCausalLM.from_pretrained(
|
52 |
model_path,
|
@@ -66,7 +67,7 @@ image_processor = load_image_processor(model, tokenizer)
|
|
66 |
max_frames_num = 128
|
67 |
|
68 |
# 处理视频
|
69 |
-
video_tensor = process_video(video_path, image_processor, model.device, max_frames_num)
|
70 |
|
71 |
# 生成参数
|
72 |
gen_kwargs = {
|
@@ -83,11 +84,12 @@ prompt = "Describe this video."
|
|
83 |
|
84 |
text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
|
85 |
|
86 |
-
response = generate_response(model, tokenizer, text, video_tensor, gen_kwargs)
|
87 |
|
88 |
# 4. 输出结果
|
89 |
print("\n===== 生成的回答 =====")
|
90 |
print(response)
|
|
|
91 |
```
|
92 |
|
93 |
*Note: Replace `/path/to/your/example_video.mp4` with the path to your actual video file.*
|
|
|
47 |
model_path = "lxr2003/Video-XL-Pro-3B"
|
48 |
video_path = "/path/to/your/example_video.mp4"
|
49 |
|
50 |
+
# 使用 Auto 类加载模型
|
51 |
# 使用 Auto 类加载模型
|
52 |
model = AutoModelForCausalLM.from_pretrained(
|
53 |
model_path,
|
|
|
67 |
max_frames_num = 128
|
68 |
|
69 |
# 处理视频
|
70 |
+
video_tensor,time_embed = process_video(video_path,tokenizer, image_processor, model.device, max_frames_num)
|
71 |
|
72 |
# 生成参数
|
73 |
gen_kwargs = {
|
|
|
84 |
|
85 |
text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
|
86 |
|
87 |
+
response = generate_response(model, tokenizer, text, video_tensor,time_embed, gen_kwargs)
|
88 |
|
89 |
# 4. 输出结果
|
90 |
print("\n===== 生成的回答 =====")
|
91 |
print(response)
|
92 |
+
|
93 |
```
|
94 |
|
95 |
*Note: Replace `/path/to/your/example_video.mp4` with the path to your actual video file.*
|