Commit 81fd843 (verified)

Duplicate from LanguageBind/Video-LLaVA-7B-hf

Co-authored-by: linbin <[email protected]>
- .gitattributes +35 -0
- README.md +132 -0
- added_tokens.json +5 -0
- config.json +38 -0
- generation_config.json +7 -0
- model-00001-of-00003.safetensors +3 -0
- model-00002-of-00003.safetensors +3 -0
- model-00003-of-00003.safetensors +3 -0
- model.safetensors.index.json +0 -0
- preprocessor_config.json +46 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +68 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
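These patterns route the large binaries in this commit (the safetensors shards, tokenizer.model, and so on) through Git LFS, so a plain `git clone` without LFS installed only yields small pointer files. A minimal sketch of fetching the fully resolved files with `huggingface_hub` instead (the repo id is the source repo named in the commit header):

```python
from huggingface_hub import snapshot_download

# Downloads every file in the repo, resolving LFS pointers to the real binaries.
local_dir = snapshot_download(repo_id="LanguageBind/Video-LLaVA-7B-hf")
print(local_dir)
```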
README.md
ADDED
@@ -0,0 +1,132 @@
---
library_name: transformers
tags: []
---

# Model Card for Video-LLaVA


## Model Details


**Model type:**
Video-LLaVA is an open-source multimodal model trained by fine-tuning an LLM on multimodal instruction-following data. It is an auto-regressive language model based on the transformer architecture.
Base LLM: [lmsys/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5)

**Model Description:**
The model can handle interleaved image and video inputs, despite the absence of image-video pairs in the training dataset. Video-LLaVA uses an encoder trained for a unified visual representation through alignment before projection.
Extensive experiments demonstrate the complementarity of the two modalities, showing a significant advantage over models designed specifically for either images or videos.

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/videollava_example.png"
alt="drawing" width="600"/>

<small> Video-LLaVA example. Taken from the <a href="https://arxiv.org/abs/2311.10122">original paper.</a> </small>

**Paper or resources for more information:**
https://github.com/PKU-YuanGroup/Video-LLaVA


## 🗝️ Training Dataset
- The image pretraining dataset is from [LLaVA](https://github.com/haotian-liu/LLaVA).
- The image tuning dataset is from [LLaVA](https://github.com/haotian-liu/LLaVA).
- The video pretraining dataset is from [Valley](https://github.com/RupertLuo/Valley).
- The video tuning dataset is from [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT).


## How to Get Started with the Model

Use the code below to get started with the model.

```python
from PIL import Image
import requests
import numpy as np
import av
from huggingface_hub import hf_hub_download
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration

def read_video_pyav(container, indices):
    '''
    Decode the video with the PyAV decoder.

    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (List[int]): List of frame indices to decode.

    Returns:
        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")

prompt = "USER: <video>Why is this video funny? ASSISTANT:"
video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
container = av.open(video_path)

# sample 8 frames uniformly from the video
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
clip = read_video_pyav(container, indices)

inputs = processor(text=prompt, videos=clip, return_tensors="pt")

# Generate
generate_ids = model.generate(**inputs, max_length=80)
print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
>>> 'USER: Why is this video funny? ASSISTANT: The video is funny because the baby is sitting on the bed and reading a book, which is an unusual and amusing sight.Ъ'

# Generate from a mix of images and videos
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompt = [
    "USER: <image> How many cats are there in the image? ASSISTANT:",
    "USER: <video>Why is this video funny? ASSISTANT:"
]
inputs = processor(text=prompt, images=image, videos=clip, padding=True, return_tensors="pt")

# Generate
generate_ids = model.generate(**inputs, max_length=50)
print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True))
>>> ['USER: How many cats are there in the image? ASSISTANT: There are two cats in the image.\nHow many cats are sleeping on the couch?\nThere are', 'USER: Why is this video funny? ASSISTANT: The video is funny because the baby is sitting on the bed and reading a book, which is an unusual and amusing']
```
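
For single-GPU use it can also help to load the checkpoint in half precision and let the weights be dispatched automatically. A minimal sketch of that variant (it reuses the `clip` frames prepared above; `device_map="auto"` assumes the `accelerate` package is installed):

```python
import torch
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor

# Same checkpoint as above, but loaded in fp16 and dispatched to the available GPU(s).
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")

prompt = "USER: <video>Why is this video funny? ASSISTANT:"
inputs = processor(text=prompt, videos=clip, return_tensors="pt").to(model.device)

generate_ids = model.generate(**inputs, max_length=80)
print(processor.batch_decode(generate_ids, skip_special_tokens=True)[0])
```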

## 👍 Acknowledgement
* [LLaVA](https://github.com/haotian-liu/LLaVA) The codebase we built upon, an efficient large language and vision assistant.
* [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT) Thanks for contributing the evaluation code and dataset.

## 🔒 License
* The majority of this project is released under the Apache 2.0 license as found in the [LICENSE](https://github.com/PKU-YuanGroup/Video-LLaVA/blob/main/LICENSE) file.
* The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.

## ✏️ Citation
If you find our paper and code useful in your research, please consider giving it a star :star: and a citation :pencil:.

```BibTeX
@article{lin2023video,
  title={Video-LLaVA: Learning United Visual Representation by Alignment Before Projection},
  author={Lin, Bin and Zhu, Bin and Ye, Yang and Ning, Munan and Jin, Peng and Yuan, Li},
  journal={arXiv preprint arXiv:2311.10122},
  year={2023}
}
```

```BibTeX
@article{zhu2023languagebind,
  title={LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment},
  author={Zhu, Bin and Lin, Bin and Ning, Munan and Yan, Yang and Cui, Jiaxi and Wang, HongFa and Pang, Yatian and Jiang, Wenhao and Zhang, Junwu and Li, Zongwei and others},
  journal={arXiv preprint arXiv:2310.01852},
  year={2023}
}
```
added_tokens.json
ADDED
@@ -0,0 +1,5 @@
{
  "<image>": 32000,
  "<pad>": 32002,
  "<video>": 32001
}
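These three entries extend the base Llama vocabulary with the multimodal placeholders used in the prompts above. A quick way to confirm the mapping, assuming the standard `transformers` tokenizer API:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
# The added tokens should resolve to the IDs listed in added_tokens.json.
for token in ("<image>", "<video>", "<pad>"):
    print(token, tokenizer.convert_tokens_to_ids(token))
# Expected: <image> 32000, <video> 32001, <pad> 32002
```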
config.json
ADDED
@@ -0,0 +1,38 @@
{
  "architectures": [
    "VideoLlavaForConditionalGeneration"
  ],
  "ignore_index": -100,
  "image_token_index": 32000,
  "model_type": "video_llava",
  "pad_token_id": 32002,
  "projector_hidden_act": "gelu",
  "text_config": {
    "_name_or_path": "lmsys/vicuna-7b-v1.5",
    "architectures": [
      "LlamaForCausalLM"
    ],
    "max_position_embeddings": 4096,
    "model_type": "llama",
    "pad_token_id": 0,
    "rms_norm_eps": 1e-05,
    "torch_dtype": "float16",
    "vocab_size": 32064
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.0.dev0",
  "video_token_index": 32001,
  "vision_config": {
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "model_type": "clip_vision_model",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768,
    "vocab_size": 32000
  },
  "vision_feature_layer": -2,
  "vision_feature_select_strategy": "default",
  "vocab_size": 32064
}
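The config nests a Llama text backbone (vocabulary extended to 32064 to cover the added tokens) and a CLIP-style vision tower, with multimodal features taken from the second-to-last vision layer (`vision_feature_layer: -2`). A minimal sketch for inspecting it programmatically, assuming a transformers version that registers the `video_llava` model type:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
print(config.model_type)                 # video_llava
print(config.text_config.model_type)     # llama
print(config.vision_config.hidden_size)  # 1024
print(config.image_token_index, config.video_token_index)  # 32000 32001
```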
generation_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 32002,
  "transformers_version": "4.41.0.dev0"
}
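These defaults (the Llama BOS/EOS ids plus `<pad>` as the padding id) are picked up automatically by `generate`; they can also be loaded and overridden explicitly, assuming the standard `GenerationConfig` API:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
gen_config.max_new_tokens = 60  # illustrative override, not part of the shipped config
# outputs = model.generate(**inputs, generation_config=gen_config)
```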
model-00001-of-00003.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44e04e21c28f1201d1cd5e36cb155da59106fb7cf8ed1b39107840b77c30a8c7
size 4979992712
model-00002-of-00003.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2e770a59eb7e8db746bfa8cf414b90c998660c4cd8dd68423433552604db30a0
size 4947392504
model-00003-of-00003.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2885519fb8795d283abd7c80ac17efc9e480f779e889f9260710c1b36fd38c84
size 4805317976
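Each pointer records only the SHA-256 and byte size of the shard; Git LFS resolves the real file on checkout. A small sketch for verifying a downloaded shard against its pointer (the filename is one added in this commit; the chunked read is just to keep memory low):

```python
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download("LanguageBind/Video-LLaVA-7B-hf", "model-00003-of-00003.safetensors")
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        digest.update(chunk)
print(digest.hexdigest())  # should match the oid in the LFS pointer above
```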
model.safetensors.index.json
ADDED
The diff for this file is too large to render.
preprocessor_config.json
ADDED
@@ -0,0 +1,46 @@
{
  "_valid_processor_keys": [
    "images",
    "videos",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "VideoLlavaImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "VideoLlavaProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
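The preprocessing mirrors CLIP: resize the shortest edge to 224, center-crop to 224×224, rescale by 1/255 (`rescale_factor`), then normalize per channel with the CLIP mean and std. A numpy sketch of what the rescale and normalize steps do to a single cropped frame (the frame contents are illustrative):

```python
import numpy as np

frame = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)  # stand-in for a cropped frame

image_mean = np.array([0.48145466, 0.4578275, 0.40821073])
image_std = np.array([0.26862954, 0.26130258, 0.27577711])

rescaled = frame * 0.00392156862745098            # same as dividing by 255
normalized = (rescaled - image_mean) / image_std  # channel-wise normalization
print(normalized.shape, normalized.dtype)
```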
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
size 499723
tokenizer_config.json
ADDED
@@ -0,0 +1,68 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "add_prefix_space": true,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "32000": {
      "content": "<image>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "32001": {
      "content": "<video>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "32002": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": false,
  "model_max_length": 4096,
  "pad_token": "<pad>",
  "padding_side": "left",
  "processor_class": "VideoLlavaProcessor",
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
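`padding_side` is set to `left` because decoder-only generation continues from the end of the prompt, so shorter prompts in a batch must be padded on the left (with `<pad>`, id 32002). A short sketch of the effect, assuming the standard `AutoTokenizer` API:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
batch = tokenizer(
    ["short prompt", "a somewhat longer prompt for the same batch"],
    padding=True, return_tensors="pt",
)
# With padding_side="left", pad tokens appear at the start of the shorter row.
print(batch["input_ids"][0][:5])
```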