yongqiang committed on
Commit
6fb90cb
·
1 Parent(s): 9426f83

init this repo

Browse files
Files changed (50) hide show
  1. assets/bee.jpg +3 -0
  2. infer_axmodel.py +316 -0
  3. smolvlm2_axmodel/llama_p1024_l0_together.axmodel +3 -0
  4. smolvlm2_axmodel/llama_p1024_l10_together.axmodel +3 -0
  5. smolvlm2_axmodel/llama_p1024_l11_together.axmodel +3 -0
  6. smolvlm2_axmodel/llama_p1024_l12_together.axmodel +3 -0
  7. smolvlm2_axmodel/llama_p1024_l13_together.axmodel +3 -0
  8. smolvlm2_axmodel/llama_p1024_l14_together.axmodel +3 -0
  9. smolvlm2_axmodel/llama_p1024_l15_together.axmodel +3 -0
  10. smolvlm2_axmodel/llama_p1024_l16_together.axmodel +3 -0
  11. smolvlm2_axmodel/llama_p1024_l17_together.axmodel +3 -0
  12. smolvlm2_axmodel/llama_p1024_l18_together.axmodel +3 -0
  13. smolvlm2_axmodel/llama_p1024_l19_together.axmodel +3 -0
  14. smolvlm2_axmodel/llama_p1024_l1_together.axmodel +3 -0
  15. smolvlm2_axmodel/llama_p1024_l20_together.axmodel +3 -0
  16. smolvlm2_axmodel/llama_p1024_l21_together.axmodel +3 -0
  17. smolvlm2_axmodel/llama_p1024_l22_together.axmodel +3 -0
  18. smolvlm2_axmodel/llama_p1024_l23_together.axmodel +3 -0
  19. smolvlm2_axmodel/llama_p1024_l24_together.axmodel +3 -0
  20. smolvlm2_axmodel/llama_p1024_l25_together.axmodel +3 -0
  21. smolvlm2_axmodel/llama_p1024_l26_together.axmodel +3 -0
  22. smolvlm2_axmodel/llama_p1024_l27_together.axmodel +3 -0
  23. smolvlm2_axmodel/llama_p1024_l28_together.axmodel +3 -0
  24. smolvlm2_axmodel/llama_p1024_l29_together.axmodel +3 -0
  25. smolvlm2_axmodel/llama_p1024_l2_together.axmodel +3 -0
  26. smolvlm2_axmodel/llama_p1024_l30_together.axmodel +3 -0
  27. smolvlm2_axmodel/llama_p1024_l31_together.axmodel +3 -0
  28. smolvlm2_axmodel/llama_p1024_l3_together.axmodel +3 -0
  29. smolvlm2_axmodel/llama_p1024_l4_together.axmodel +3 -0
  30. smolvlm2_axmodel/llama_p1024_l5_together.axmodel +3 -0
  31. smolvlm2_axmodel/llama_p1024_l6_together.axmodel +3 -0
  32. smolvlm2_axmodel/llama_p1024_l7_together.axmodel +3 -0
  33. smolvlm2_axmodel/llama_p1024_l8_together.axmodel +3 -0
  34. smolvlm2_axmodel/llama_p1024_l9_together.axmodel +3 -0
  35. smolvlm2_axmodel/llama_post.axmodel +3 -0
  36. smolvlm2_axmodel/model.embed_tokens.weight.npy +3 -0
  37. smolvlm2_tokenizer/.gitattributes +35 -0
  38. smolvlm2_tokenizer/README.md +270 -0
  39. smolvlm2_tokenizer/added_tokens.json +130 -0
  40. smolvlm2_tokenizer/chat_template.json +3 -0
  41. smolvlm2_tokenizer/config.json +141 -0
  42. smolvlm2_tokenizer/generation_config.json +7 -0
  43. smolvlm2_tokenizer/merges.txt +0 -0
  44. smolvlm2_tokenizer/preprocessor_config.json +35 -0
  45. smolvlm2_tokenizer/processor_config.json +4 -0
  46. smolvlm2_tokenizer/special_tokens_map.json +39 -0
  47. smolvlm2_tokenizer/tokenizer.json +0 -0
  48. smolvlm2_tokenizer/tokenizer_config.json +1192 -0
  49. smolvlm2_tokenizer/vocab.json +0 -0
  50. vit_mdoel/vision_model.onnx +3 -0
assets/bee.jpg ADDED

Git LFS Details

  • SHA256: 8b21ba78250f852ca5990063866b1ace6432521d0251bde7f8de783b22c99a6d
  • Pointer size: 132 Bytes
  • Size of remote file: 5.37 MB
infer_axmodel.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoProcessor, AutoModelForImageTextToText
2
+ import torch
3
+ import onnx
4
+ import onnxruntime as ort
5
+ import numpy as np
6
+ import os
7
+ from tqdm import tqdm
8
+ from transformers import AutoConfig
9
+ from typing import List, Tuple
10
+ from axengine import InferenceSession
11
+ from ml_dtypes import bfloat16
12
+
13
+
14
# Run the PyTorch-side pieces on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pickled vision patch-embedding module; it is called in run_vision_model().
# NOTE(security): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so this file must only ever come from a trusted source.
embeddings = torch.load("SmolVLMVisionEmbeddings.pkl", map_location=device, weights_only=False)

# Token-embedding table, indexed by token id to build the text embeddings.
embeds = np.load(os.path.join("./SmolVLM2-500M-Video-Instruct_1024_AXMODEL", "model.embed_tokens.weight.npy"))

# No separate connector checkpoint is loaded: the connector is fused into the
# exported ONNX vision model.
encoder = ort.InferenceSession('./export_onnx_model/vision_model.onnx', providers=["CPUExecutionProvider"])
19
+
20
+
21
def run_vision_model(
    pixel_values,
    patch_attention_mask=None,
):
    """Embed image pixels and run them through the ONNX vision encoder.

    Args:
        pixel_values: Image tensor. Dimension 0 is the batch and dimensions
            2 and 3 are divided by the patch size (16) to build the default
            patch mask.
        patch_attention_mask: Optional boolean mask over patches that is
            forwarded to the embedding module. When ``None`` an all-True
            mask is created.

    Returns:
        torch.Tensor: The encoder output, moved to ``device`` and cast to the
        dtype produced by the embedding module.
    """
    import warnings

    batch_size = pixel_values.size(0)
    if patch_attention_mask is None:
        patch_size = 16
        patch_attention_mask = torch.ones(
            (
                batch_size,
                pixel_values.size(2) // patch_size,
                pixel_values.size(3) // patch_size,
            ),
            dtype=torch.bool,
            device=pixel_values.device,
        )

    hidden_states = embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)

    # The exported ONNX encoder only takes "input", so a mask with padded
    # patches cannot be applied here. The original code tried to build a 4-D
    # mask through names that do not exist in this script (`self`,
    # `_prepare_4d_attention_mask`) and crashed with NameError; make the
    # limitation explicit instead.
    if not bool(patch_attention_mask.all()):
        warnings.warn(
            "patch_attention_mask contains padded patches, but the ONNX vision "
            "encoder has no mask input; the mask is ignored by the encoder.",
            RuntimeWarning,
            stacklevel=2,
        )

    # onnxruntime needs a float32 numpy array on the host.
    encoder_outputs = encoder.run(None, {"input": hidden_states.detach().cpu().to(dtype=torch.float32).numpy()})[0]
    encoder_outputs = torch.from_numpy(encoder_outputs).to(device, dtype=hidden_states.dtype)

    return encoder_outputs
52
+
53
+
54
def get_image_features(pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
    """
    Encodes images into continuous embeddings that can be forwarded to the language model.

    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
            The tensors corresponding to the input images. The first two dimensions are
            flattened together before the images are encoded.
        pixel_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask indicating padded regions in the image. When omitted, an
            all-True mask is used.

    Returns:
        The output of `run_vision_model` for the non-padding images.
    """
    # Unpacking five names enforces the 5-D layout; only the first two are used.
    batch_size, num_images, _channels, _height, _width = pixel_values.shape
    pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

    # Remove padding images - padding images are full 0.
    nb_values_per_image = pixel_values.shape[1:].numel()
    real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image

    if not bool(real_images_inds.any()):
        # No images at all: keep one empty image so the encoder still gets a batch.
        real_images_inds[0] = True

    pixel_values = pixel_values[real_images_inds].contiguous()

    # Handle the vision attention mask
    if pixel_attention_mask is None:
        pixel_attention_mask = torch.ones(
            size=[pixel_values.shape[i] for i in (0, 2, 3)],
            dtype=torch.bool,
            device=pixel_values.device,
        )
    else:
        # Remove padding images from the mask
        pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
        pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()

    # Reduce the pixel mask to one flag per 16x16 patch: a patch is attended
    # to when at least one of its pixels is unmasked.
    patch_size = 16
    patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
    patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
    patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

    # Get sequence from the vision encoder
    image_hidden_states = run_vision_model(pixel_values, patch_attention_mask)

    # Modality projection & resampling are not applied here: the connector is
    # already fused into the ONNX model.
    return image_hidden_states
98
+
99
+
100
def inputs_merger(
    input_ids: torch.LongTensor,
    inputs_embeds: torch.Tensor,
    image_hidden_states: torch.Tensor,
    image_token_id: int = 49190,
):
    """
    Merge the token embeddings with the image hidden states into one single sequence of vectors that is fed to the transformer LM.

    The merging happens as follows:
    - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
    - The image hidden states come from the vision encoder as blocks of shape `(num_blocks, patch_size, hidden_dim)`.
    - Every `<image>` position is replaced, in order, by the next image hidden state, giving:
      `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {image hidden states} vector_fake_tok_around_image vector_tok_4`.

    Args:
        input_ids: Token ids of shape `(batch, seq_len)`.
        inputs_embeds: Text embeddings of shape `(batch, seq_len, hidden_dim)`.
        image_hidden_states: Image features of shape `(num_blocks, patch_size, hidden_dim)`.
        image_token_id: Id of the `<image>` placeholder token. Defaults to 49190,
            the value this script previously hard-coded.

    Returns:
        A tensor shaped like `inputs_embeds` where every `<image>` position holds
        the matching image hidden state.

    Raises:
        ValueError: If a sample's number of `<image>` tokens is not a multiple of `patch_size`.
    """
    _, patch_size, _ = image_hidden_states.shape

    image_mask = input_ids == image_token_id
    num_image_tokens = image_mask.sum(dim=1)
    if not torch.all(num_image_tokens % patch_size == 0):
        raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")

    blocks_per_sample = num_image_tokens // patch_size

    # Index of the first image block belonging to each sample in the batch.
    offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
    block_offset = offsets[:-1]

    # For every <image> position: which block of the sample it falls in and
    # where inside that block. Values at non-image positions are never read.
    row_cum = image_mask.cumsum(dim=-1)
    chunk_idx = (row_cum - 1) // patch_size
    local_idx = (row_cum - 1) % patch_size
    block_idx = block_offset.unsqueeze(1) + chunk_idx

    image_embeds = torch.zeros_like(inputs_embeds)
    image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]

    merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
    return merged_embeds
133
+
134
+
135
def post_process(data, topk=1, topp=0.9, temperature=0.6):
    """Sample the next token id from a logits array.

    The logits are flattened, the ``topk`` largest are kept, scaled by
    ``temperature``, turned into probabilities with a softmax, truncated with
    nucleus (top-p) filtering and finally sampled from.

    Args:
        data: Array of logits (any shape; it is flattened).
        topk: Number of highest-scoring candidates to keep. Values larger
            than the number of logits are clamped to that number.
        topp: Cumulative-probability threshold for nucleus filtering.
        temperature: Divisor applied to the candidate logits before softmax.

    Returns:
        tuple: ``(next_token, candidate_index, candidate_soft)`` - the sampled
        token id, the candidate token ids, and their sampling probabilities.

    Raises:
        ValueError: If ``data`` contains no elements.
    """

    def top_p(probs: np.ndarray, p: float) -> np.ndarray:
        # Walk candidates from most to least likely; once the kept mass has
        # reached p, zero out everything that follows, then renormalize.
        order = np.argsort(probs)
        res = probs.copy()
        sum_p = 0
        for i in order[::-1]:
            if sum_p >= p:
                res[i] = 0
            sum_p += res[i]
        return res / sum_p

    def softmax(logits: np.ndarray) -> np.ndarray:
        # Subtract the max first for numerical stability.
        shifted = logits - logits.max()
        exp = np.exp(shifted)
        res = exp / np.sum(exp)
        return res.astype(np.float64)

    r = data.astype(np.float32)
    r = r.flatten()
    if r.size == 0:
        raise ValueError("post_process received an empty logits array.")
    # np.argpartition raises when kth is out of bounds, so never ask for more
    # candidates than there are logits.
    topk = min(topk, r.size)

    candidate_index = np.argpartition(r, -topk)[-topk:]
    candidate_value = r[candidate_index]
    candidate_value /= temperature
    candidate_soft = softmax(candidate_value)
    candidate_soft = top_p(candidate_soft, topp)
    # Renormalize in float64 so the probabilities sum to 1 for multinomial.
    candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
    pos = np.random.multinomial(1, candidate_soft).argmax()
    next_token = candidate_index[pos]
    return next_token, candidate_index, candidate_soft
163
+
164
+
165
if __name__ == "__main__":

    hf_model_path = "./SmolVLM2-500M-Video-Instruct/"
    axmodel_path = "./SmolVLM2-500M-Video-Instruct_1024_AXMODEL"
    prompt = 'Can you describe this image?'

    processor = AutoProcessor.from_pretrained(hf_model_path)
    config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
    tokenizer = processor.tokenizer

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": "./bee.jpg"},
                {"type": "text", "text": prompt},
            ]
        },
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device, dtype=torch.bfloat16)

    pixel_values = inputs["pixel_values"]
    pixel_attention_mask = inputs["pixel_attention_mask"]
    input_ids = inputs["input_ids"]

    # Look up the text embeddings for the prompt tokens (batch of one).
    inputs_embeds = np.take(embeds, input_ids[0].cpu().numpy().tolist(), axis=0)[None, ...]
    inputs_embeds = torch.from_numpy(inputs_embeds).to(device, dtype=torch.bfloat16)

    # Counterpart of get_image_features() in transformers' modeling_smolvlm.py.
    image_hidden_states = get_image_features(pixel_values, pixel_attention_mask)

    # Replace the <image> placeholder embeddings with the image features.
    inputs_embeds = inputs_merger(
        input_ids=input_ids,
        inputs_embeds=inputs_embeds,
        image_hidden_states=image_hidden_states,
    ).to(dtype=torch.float32).cpu().numpy()

    prefill_data = inputs_embeds.astype(bfloat16)
    token_ids = input_ids[0].cpu().numpy().tolist()
    token_len = len(token_ids)

    lastN = 2048  # number of positions held by the K/V caches
    prefill_len = 1024  # fixed window the prompt is processed in
    cfg = config.text_config

    # Fail early with a clear message instead of a broadcasting error when
    # the prompt is copied into the fixed-size prefill buffer below.
    if token_len > prefill_len:
        raise ValueError(
            f"Prompt has {token_len} tokens but the prefill window only holds {prefill_len}."
        )

    kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
    k_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]
    v_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]

    # One session per decoder layer; each is run with shape_group=1 for
    # prefill and shape_group=0 for single-token decode below.
    decoder_sessions = []
    for i in tqdm(range(cfg.num_hidden_layers), desc="Init InferenceSession"):
        session = InferenceSession(
            f"{axmodel_path}/llama_p1024_l{i}_together.axmodel"
        )
        decoder_sessions.append(session)
    post_process_session = InferenceSession(
        f"{axmodel_path}/llama_post.axmodel"
    )
    print("model load done!")

    # ---- prefill: run the whole prompt through every layer at once ----
    indices = np.array(list(range(prefill_len)), np.uint32).reshape(
        (1, prefill_len)
    )
    indices[:, token_len:] = 0
    mask = np.zeros((1, prefill_len, prefill_len)) - 65536
    data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)
    data[:, 0:token_len] = prefill_data
    # Causal mask: position i may attend to positions 0..i.
    for i in range(token_len):
        mask[:, i, : i + 1] = 0
    mask = mask.astype(bfloat16)
    for i in range(cfg.num_hidden_layers):
        input_feed = {
            "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
            "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
            "indices": indices,
            "input": data,
            "mask": mask,
        }
        outputs = decoder_sessions[i].run(None, input_feed, shape_group=1)
        k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
        v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
        data[:, :token_len] = outputs[2][:, :token_len, :]

    # Only the hidden state of the last prompt token is needed for the first
    # generated token.
    post_out = post_process_session.run(None, {"input": data[:, token_len - 1, :][None, ...]})[0]
    next_token, _, _ = post_process(post_out, topk=1)
    token_ids.append(next_token)

    print(f"input prompt: {prompt}\n")
    print("answer >>", tokenizer.decode(token_ids[token_len], skip_special_tokens=True), end='', flush=True)

    # ---- decode: generate one token at a time using the K/V caches ----
    # The mask has lastN cache slots plus one trailing slot that is left at 0.
    mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
    mask[:, :, :lastN] -= 65536
    mask[:, :, :token_len] = 0
    # Stop at lastN: the caches have exactly lastN positions, so writing at
    # index lastN (as range(lastN + 1) did) would raise IndexError.
    for start_indice in range(token_len, lastN):
        next_token = token_ids[start_indice]
        indices = np.array([start_indice], np.uint32).reshape((1, 1))
        data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)

        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": k_caches[i],
                "V_cache": v_caches[i],
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = decoder_sessions[i].run(None, input_feed, shape_group=0)
            k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
            v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
            data = outputs[2]

        # The position just written becomes visible to later steps.
        mask[..., start_indice] = 0

        post_out = post_process_session.run(None, {"input": data})[0]
        next_token, _, _ = post_process(post_out)
        token_ids.append(next_token)
        print(tokenizer.decode(next_token, skip_special_tokens=True), end='', flush=True)

        if next_token == tokenizer.eos_token_id:
            break
    print("\n")
smolvlm2_axmodel/llama_p1024_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:125ac7e80a94dbd3920fb0e0077ccad612abe8fabc2040dda09b19813ce96f68
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12f5aa82a4dcc3a66aaad951b1ea87c50e618c93adade3a2d1a7b5614169f5a1
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba247ba036a831b6201b53a03bf9847e16be239b386846cf22980da6695cc0d6
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:011aea9b7e4fcadec5d1b2c386ff4a12e2f3f0e0e31eca634afc8acc9f0d343b
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9420f15bb5b591f258212242bc5fa5566ba45f4d697d0599999114961152d1fd
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:397511107011f700388029e604c2f5ec6d092f9cb6e09ab890a198932173193c
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:689d9286ad7cf81345352f85bfbb8387934fe7ccb76d3f56563ded5f1d7cdb7b
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b91fecc232c92c9faa5fca4ca1bff0802abc8351457f9b34ef55327ccdcbc85a
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9404c81f4a02fe332ae1f4ed5361d2f68eea66a9550233cc4c1d4455afc95797
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa8d959498bd479d2bbb2c42e883a21bb173fbcb73f5d1bbdebe6c8365e8e21
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66265cbf7cd8571f949c23ca6a5918f8c95fb3413e4349cb9c9f3ac18231ca21
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9addcae5bad93adaf9f8df49d4cbfa82024be2d2e0b2e815537121a7417ecb88
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69430a836a9eb0d46242419a999e761d61a0c4cc4d17eafbe373641551ac0a8b
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a19009fd1a1d28c9414cb9421af4c66473088a0b3caea9157bde6aac071e1ce
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec30ac9fd2a52f281b76a037d0aa146b8144277aed3408a6c281e5a7df8ba62a
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1093d36fa84d6248b1a4728d8ae2aadb1143894eaf3d960e12fd3753d3ab4da2
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l24_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff63d4efb6dd75433205ce87e4d69d7850dad86555b2919864f04c5df3a8a844
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l25_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83d8b772f3aef6356234912a371baebcb6c0897faf3d524091b7ea2fc56f77bc
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l26_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:033f9deb6fe2288347d1af507d7a31deb0633614dfb0efe9a3a9c962afbe44eb
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l27_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0c8c035eb371dd31d53844534c4d321efc933e1097ad3e9d87afd52dba74214
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l28_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d33cae03279cab06a856cfacc3e84414c615082a4a358bd09c4a5996c17c575
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l29_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84583f5ef60b629b34d47c7deeb3200c096d6d6bf3de3f6bec4da6ae005b5a1e
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4514475633a7317118fe4486200bbed73929bd4210c6da4041591797ad93fb3a
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l30_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e1612aac9b1604146b61b4fc37eaada2299f62078260689bf03812c256c75b
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l31_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f2f54bcb7d01ea69a3177b72d49e3bdab2d0e0403e86085903389cc6839b5fd
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a991d67e4c1dc4bf58689ce4a58362f6bcc73a87257bcb2982774a0b056ca720
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a43e6886989c31dfffeae70177fc9464322bded5bb69515e31aaade31b431b5
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed59bef655c1eae8eb7af4566ef21fd874cfac72b67bbfd1a7279e1a1cffd2c8
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:848640700c17925475ef9f9edeaa0fccf235e90a5ad159430682ac389910d86b
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46e4bce8f94d80d12e3b1a5ceae7ba62cbaa06f0ddf11f13999b1936a98bc0a1
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3ba57d8f2cd4d932445600d161a04b0a1160f452425c5abd08f94bece56f23f
3
+ size 12002005
smolvlm2_axmodel/llama_p1024_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0464cccfdfb0566069bad977d98f70b9e15e8e0b642a6e01ca2b16b5f7eb170a
3
+ size 12002005
smolvlm2_axmodel/llama_post.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89e16c32d05a23b3449b298d8df16bc80edba5c719812c2567e074bdccafbd50
3
+ size 51580706
smolvlm2_axmodel/model.embed_tokens.weight.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:226adbf93820671559d70330a69e69f02641a41b8284dd26b51576545ab3eb10
3
+ size 189235328
smolvlm2_tokenizer/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
smolvlm2_tokenizer/README.md ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ datasets:
5
+ - HuggingFaceM4/the_cauldron
6
+ - HuggingFaceM4/Docmatix
7
+ - lmms-lab/LLaVA-OneVision-Data
8
+ - lmms-lab/M4-Instruct-Data
9
+ - HuggingFaceFV/finevideo
10
+ - MAmmoTH-VL/MAmmoTH-VL-Instruct-12M
11
+ - lmms-lab/LLaVA-Video-178K
12
+ - orrzohar/Video-STaR
13
+ - Mutonix/Vript
14
+ - TIGER-Lab/VISTA-400K
15
+ - Enxin/MovieChat-1K_train
16
+ - ShareGPT4Video/ShareGPT4Video
17
+ pipeline_tag: image-text-to-text
18
+ language:
19
+ - en
20
+ base_model:
21
+ - HuggingFaceTB/SmolVLM-500M-Instruct
22
+ ---
23
+
24
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM2_banner.png" width="800" height="auto" alt="Image description">
25
+
26
+ # SmolVLM2-500M-Video
27
+
28
+ SmolVLM2-500M-Video is a lightweight multimodal model designed to analyze video content. The model processes videos, images, and text inputs to generate text outputs - whether answering questions about media files, comparing visual content, or transcribing text from images. Despite its compact size, requiring only 1.8GB of GPU RAM for video inference, it delivers robust performance on complex multimodal tasks. This efficiency makes it particularly well-suited for on-device applications where computational resources may be limited.
29
+ ## Model Summary
30
+
31
+ - **Developed by:** Hugging Face 🤗
32
+ - **Model type:** Multi-modal model (image/multi-image/video/text)
33
+ - **Language(s) (NLP):** English
34
+ - **License:** Apache 2.0
35
+ - **Architecture:** Based on [Idefics3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (see technical summary)
36
+
37
+ ## Resources
38
+
39
+ - **Demo:** [Video Highlight Generator](https://huggingface.co/spaces/HuggingFaceTB/SmolVLM2-HighlightGenerator)
40
+ - **Blog:** [Blog post](https://huggingface.co/blog/smolvlm2)
41
+
42
+ ## Uses
43
+
44
+ SmolVLM2 can be used for inference on multimodal (video / image / text) tasks where the input consists of text queries along with video or one or more images. Text and media files can be interleaved arbitrarily, enabling tasks like captioning, visual question answering, and storytelling based on visual content. The model does not support image or video generation.
45
+
46
+ To fine-tune SmolVLM2 on a specific task, you can follow [the fine-tuning tutorial](https://github.com/huggingface/smollm/blob/main/vision/finetuning/Smol_VLM_FT.ipynb).
47
+
48
+ ## Evaluation
49
+
50
+ We evaluated the performance of the SmolVLM2 family on the following scientific benchmarks:
51
+
52
+ | Size | Video-MME | MLVU | MVBench |
53
+ |----------|-----------------|----------|---------------|
54
+ | 2.2B | 52.1 | 55.2 | 46.27 |
55
+ | 500M | 42.2 | 47.3 | 39.73 |
56
+ | 256M | 33.7 | 40.6 | 32.7 |
57
+
58
+
59
+ ### How to get started
60
+
61
+ You can use transformers to load, infer and fine-tune SmolVLM. Make sure you have num2words, flash-attn and latest transformers installed.
62
+ You can load the model as follows.
63
+
64
+ ```python
65
+ from transformers import AutoProcessor, AutoModelForImageTextToText
66
+ import torch
67
+
68
+ model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
69
+ processor = AutoProcessor.from_pretrained(model_path)
70
+ model = AutoModelForImageTextToText.from_pretrained(
71
+ model_path,
72
+ torch_dtype=torch.bfloat16,
73
+ _attn_implementation="flash_attention_2"
74
+ ).to("cuda")
75
+ ```
76
+
77
+ #### Simple Inference
78
+
79
+ You can preprocess your inputs with the chat template and pass them directly to the model:
80
+
81
+ ```python
82
+ messages = [
83
+ {
84
+ "role": "user",
85
+ "content": [
86
+ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
87
+ {"type": "text", "text": "Can you describe this image?"},
88
+ ]
89
+ },
90
+ ]
91
+
92
+ inputs = processor.apply_chat_template(
93
+ messages,
94
+ add_generation_prompt=True,
95
+ tokenize=True,
96
+ return_dict=True,
97
+ return_tensors="pt",
98
+ ).to(model.device, dtype=torch.bfloat16)
99
+
100
+ generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
101
+ generated_texts = processor.batch_decode(
102
+ generated_ids,
103
+ skip_special_tokens=True,
104
+ )
105
+ print(generated_texts[0])
106
+ ```
107
+
108
+ #### Video Inference
109
+
110
+ To use SmolVLM2 for video inference, make sure you have decord installed.
111
+
112
+ ```python
113
+ messages = [
114
+ {
115
+ "role": "user",
116
+ "content": [
117
+ {"type": "video", "path": "path_to_video.mp4"},
118
+ {"type": "text", "text": "Describe this video in detail"}
119
+ ]
120
+ },
121
+ ]
122
+
123
+ inputs = processor.apply_chat_template(
124
+ messages,
125
+ add_generation_prompt=True,
126
+ tokenize=True,
127
+ return_dict=True,
128
+ return_tensors="pt",
129
+ ).to(model.device, dtype=torch.bfloat16)
130
+
131
+ generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
132
+ generated_texts = processor.batch_decode(
133
+ generated_ids,
134
+ skip_special_tokens=True,
135
+ )
136
+
137
+ print(generated_texts[0])
138
+ ```
139
+ #### Multi-image Interleaved Inference
140
+
141
+ You can interleave multiple media with text using chat templates.
142
+
143
+ ```python
144
+ import torch
145
+
146
+
147
+ messages = [
148
+ {
149
+ "role": "user",
150
+ "content": [
151
+ {"type": "text", "text": "What is the similarity between these two images?"},
152
+ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
153
+ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"},
154
+ ]
155
+ },
156
+ ]
157
+
158
+ inputs = processor.apply_chat_template(
159
+ messages,
160
+ add_generation_prompt=True,
161
+ tokenize=True,
162
+ return_dict=True,
163
+ return_tensors="pt",
164
+ ).to(model.device, dtype=torch.bfloat16)
165
+
166
+ generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
167
+ generated_texts = processor.batch_decode(
168
+ generated_ids,
169
+ skip_special_tokens=True,
170
+ )
171
+ print(generated_texts[0])
172
+ ```
173
+
174
+
175
+ ### Model optimizations
176
+
177
+ ## Misuse and Out-of-scope Use
178
+
179
+ SmolVLM is not intended for high-stakes scenarios or critical decision-making processes that affect an individual's well-being or livelihood. The model may produce content that appears factual but may not be accurate. Misuse includes, but is not limited to:
180
+
181
+ - Prohibited Uses:
182
+ - Evaluating or scoring individuals (e.g., in employment, education, credit)
183
+ - Critical automated decision-making
184
+ - Generating unreliable factual content
185
+ - Malicious Activities:
186
+ - Spam generation
187
+ - Disinformation campaigns
188
+ - Harassment or abuse
189
+ - Unauthorized surveillance
190
+
191
+ ### License
192
+
193
+ SmolVLM2 is built upon [SigLIP](https://huggingface.co/google/siglip-base-patch16-512) as the image encoder and [SmolLM2](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) as the text decoder.
194
+
195
+ We release the SmolVLM2 checkpoints under the Apache 2.0 license.
196
+
197
+ ## Citation information
198
+ You can cite us in the following way:
199
+ ```bibtex
200
+ @article{marafioti2025smolvlm,
201
+ title={SmolVLM: Redefining small and efficient multimodal models},
202
+ author={Andrés Marafioti and Orr Zohar and Miquel Farré and Merve Noyan and Elie Bakouch and Pedro Cuenca and Cyril Zakka and Loubna Ben Allal and Anton Lozhkov and Nouamane Tazi and Vaibhav Srivastav and Joshua Lochner and Hugo Larcher and Mathieu Morlon and Lewis Tunstall and Leandro von Werra and Thomas Wolf},
203
+ journal={arXiv preprint arXiv:2504.05299},
204
+ year={2025}
205
+ }
206
+ ```
207
+
208
+ ## Training Data
209
+ SmolVLM2 was trained on 3.3M samples drawn from ten different datasets: [LLaVA-OneVision](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data), [M4-Instruct](https://huggingface.co/datasets/lmms-lab/M4-Instruct-Data), [MAmmoTH-VL](https://huggingface.co/datasets/MAmmoTH-VL/MAmmoTH-VL-Instruct-12M), [LLaVA-Video-178K](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K), [FineVideo](https://huggingface.co/datasets/HuggingFaceFV/finevideo), [Video-STaR](https://huggingface.co/datasets/orrzohar/Video-STaR), [Vript](https://huggingface.co/datasets/Mutonix/Vript), [VISTA-400K](https://huggingface.co/datasets/TIGER-Lab/VISTA-400K), [MovieChat](https://huggingface.co/datasets/Enxin/MovieChat-1K_train) and [ShareGPT4Video](https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video).
210
+ In the following plots we give a general overview of the samples across modalities and the source of those samples.
211
+ <!--
212
+ <center><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolvlm2_data_split.png" width="auto" height="auto" alt="Image description">
213
+ </center>
214
+
215
+ ### Details
216
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolvlm2_datadetails.png" width="auto" height="auto" alt="Image description"> -->
217
+
218
+ ## Data Split per modality
219
+
220
+ | Data Type | Percentage |
221
+ |--------------|------------|
222
+ | Image | 34.4% |
223
+ | Text | 20.2% |
224
+ | Video | 33.0% |
225
+ | Multi-image | 12.3% |
226
+
227
+
228
+ ## Granular dataset slices per modality
229
+
230
+ ### Text Datasets
231
+ | Dataset | Percentage |
232
+ |--------------------------------------------|------------|
233
+ | llava-onevision/magpie_pro_ft3_80b_mt | 6.8% |
234
+ | llava-onevision/magpie_pro_ft3_80b_tt | 6.8% |
235
+ | llava-onevision/magpie_pro_qwen2_72b_tt | 5.8% |
236
+ | llava-onevision/mathqa | 0.9% |
237
+
238
+ ### Multi-image Datasets
239
+ | Dataset | Percentage |
240
+ |--------------------------------------------|------------|
241
+ | m4-instruct-data/m4_instruct_multiimage | 10.4% |
242
+ | mammoth/multiimage-cap6 | 1.9% |
243
+
244
+ ### Image Datasets
245
+ | Dataset | Percentage |
246
+ |--------------------------------------------|------------|
247
+ | llava-onevision/other | 17.4% |
248
+ | llava-onevision/vision_flan | 3.9% |
249
+ | llava-onevision/mavis_math_metagen | 2.6% |
250
+ | llava-onevision/mavis_math_rule_geo | 2.5% |
251
+ | llava-onevision/sharegpt4o | 1.7% |
252
+ | llava-onevision/sharegpt4v_coco | 1.5% |
253
+ | llava-onevision/image_textualization | 1.3% |
254
+ | llava-onevision/sharegpt4v_llava | 0.9% |
255
+ | llava-onevision/mapqa | 0.9% |
256
+ | llava-onevision/qa | 0.8% |
257
+ | llava-onevision/textocr | 0.8% |
258
+
259
+ ### Video Datasets
260
+ | Dataset | Percentage |
261
+ |--------------------------------------------|------------|
262
+ | llava-video-178k/1-2m | 7.3% |
263
+ | llava-video-178k/2-3m | 7.0% |
264
+ | other-video/combined | 5.7% |
265
+ | llava-video-178k/hound | 4.4% |
266
+ | llava-video-178k/0-30s | 2.4% |
267
+ | video-star/starb | 2.2% |
268
+ | vista-400k/combined | 2.2% |
269
+ | vript/long | 1.0% |
270
+ | ShareGPT4Video/all | 0.8% |
smolvlm2_tokenizer/added_tokens.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<end_of_utterance>": 49279,
3
+ "<fake_token_around_image>": 49189,
4
+ "<global-img>": 49152,
5
+ "<image>": 49190,
6
+ "<row_1_col_1>": 49153,
7
+ "<row_1_col_2>": 49154,
8
+ "<row_1_col_3>": 49155,
9
+ "<row_1_col_4>": 49156,
10
+ "<row_1_col_5>": 49157,
11
+ "<row_1_col_6>": 49158,
12
+ "<row_2_col_1>": 49159,
13
+ "<row_2_col_2>": 49160,
14
+ "<row_2_col_3>": 49161,
15
+ "<row_2_col_4>": 49162,
16
+ "<row_2_col_5>": 49163,
17
+ "<row_2_col_6>": 49164,
18
+ "<row_3_col_1>": 49165,
19
+ "<row_3_col_2>": 49166,
20
+ "<row_3_col_3>": 49167,
21
+ "<row_3_col_4>": 49168,
22
+ "<row_3_col_5>": 49169,
23
+ "<row_3_col_6>": 49170,
24
+ "<row_4_col_1>": 49171,
25
+ "<row_4_col_2>": 49172,
26
+ "<row_4_col_3>": 49173,
27
+ "<row_4_col_4>": 49174,
28
+ "<row_4_col_5>": 49175,
29
+ "<row_4_col_6>": 49176,
30
+ "<row_5_col_1>": 49177,
31
+ "<row_5_col_2>": 49178,
32
+ "<row_5_col_3>": 49179,
33
+ "<row_5_col_4>": 49180,
34
+ "<row_5_col_5>": 49181,
35
+ "<row_5_col_6>": 49182,
36
+ "<row_6_col_1>": 49183,
37
+ "<row_6_col_2>": 49184,
38
+ "<row_6_col_3>": 49185,
39
+ "<row_6_col_4>": 49186,
40
+ "<row_6_col_5>": 49187,
41
+ "<row_6_col_6>": 49188,
42
+ "<|reserved_special_token_0|>": 49191,
43
+ "<|reserved_special_token_10|>": 49201,
44
+ "<|reserved_special_token_11|>": 49202,
45
+ "<|reserved_special_token_12|>": 49203,
46
+ "<|reserved_special_token_13|>": 49204,
47
+ "<|reserved_special_token_14|>": 49205,
48
+ "<|reserved_special_token_15|>": 49206,
49
+ "<|reserved_special_token_16|>": 49207,
50
+ "<|reserved_special_token_17|>": 49208,
51
+ "<|reserved_special_token_18|>": 49209,
52
+ "<|reserved_special_token_19|>": 49210,
53
+ "<|reserved_special_token_1|>": 49192,
54
+ "<|reserved_special_token_20|>": 49211,
55
+ "<|reserved_special_token_21|>": 49212,
56
+ "<|reserved_special_token_22|>": 49213,
57
+ "<|reserved_special_token_23|>": 49214,
58
+ "<|reserved_special_token_24|>": 49215,
59
+ "<|reserved_special_token_25|>": 49216,
60
+ "<|reserved_special_token_26|>": 49217,
61
+ "<|reserved_special_token_27|>": 49218,
62
+ "<|reserved_special_token_28|>": 49219,
63
+ "<|reserved_special_token_29|>": 49220,
64
+ "<|reserved_special_token_2|>": 49193,
65
+ "<|reserved_special_token_30|>": 49221,
66
+ "<|reserved_special_token_31|>": 49222,
67
+ "<|reserved_special_token_32|>": 49223,
68
+ "<|reserved_special_token_33|>": 49224,
69
+ "<|reserved_special_token_34|>": 49225,
70
+ "<|reserved_special_token_35|>": 49226,
71
+ "<|reserved_special_token_36|>": 49227,
72
+ "<|reserved_special_token_37|>": 49228,
73
+ "<|reserved_special_token_38|>": 49229,
74
+ "<|reserved_special_token_39|>": 49230,
75
+ "<|reserved_special_token_3|>": 49194,
76
+ "<|reserved_special_token_40|>": 49231,
77
+ "<|reserved_special_token_41|>": 49232,
78
+ "<|reserved_special_token_42|>": 49233,
79
+ "<|reserved_special_token_43|>": 49234,
80
+ "<|reserved_special_token_44|>": 49235,
81
+ "<|reserved_special_token_45|>": 49236,
82
+ "<|reserved_special_token_46|>": 49237,
83
+ "<|reserved_special_token_47|>": 49238,
84
+ "<|reserved_special_token_48|>": 49239,
85
+ "<|reserved_special_token_49|>": 49240,
86
+ "<|reserved_special_token_4|>": 49195,
87
+ "<|reserved_special_token_50|>": 49241,
88
+ "<|reserved_special_token_51|>": 49242,
89
+ "<|reserved_special_token_52|>": 49243,
90
+ "<|reserved_special_token_53|>": 49244,
91
+ "<|reserved_special_token_54|>": 49245,
92
+ "<|reserved_special_token_55|>": 49246,
93
+ "<|reserved_special_token_56|>": 49247,
94
+ "<|reserved_special_token_57|>": 49248,
95
+ "<|reserved_special_token_58|>": 49249,
96
+ "<|reserved_special_token_59|>": 49250,
97
+ "<|reserved_special_token_5|>": 49196,
98
+ "<|reserved_special_token_60|>": 49251,
99
+ "<|reserved_special_token_61|>": 49252,
100
+ "<|reserved_special_token_62|>": 49253,
101
+ "<|reserved_special_token_63|>": 49254,
102
+ "<|reserved_special_token_64|>": 49255,
103
+ "<|reserved_special_token_65|>": 49256,
104
+ "<|reserved_special_token_66|>": 49257,
105
+ "<|reserved_special_token_67|>": 49258,
106
+ "<|reserved_special_token_68|>": 49259,
107
+ "<|reserved_special_token_69|>": 49260,
108
+ "<|reserved_special_token_6|>": 49197,
109
+ "<|reserved_special_token_70|>": 49261,
110
+ "<|reserved_special_token_71|>": 49262,
111
+ "<|reserved_special_token_72|>": 49263,
112
+ "<|reserved_special_token_73|>": 49264,
113
+ "<|reserved_special_token_74|>": 49265,
114
+ "<|reserved_special_token_75|>": 49266,
115
+ "<|reserved_special_token_76|>": 49267,
116
+ "<|reserved_special_token_77|>": 49268,
117
+ "<|reserved_special_token_78|>": 49269,
118
+ "<|reserved_special_token_79|>": 49270,
119
+ "<|reserved_special_token_7|>": 49198,
120
+ "<|reserved_special_token_80|>": 49271,
121
+ "<|reserved_special_token_81|>": 49272,
122
+ "<|reserved_special_token_82|>": 49273,
123
+ "<|reserved_special_token_83|>": 49274,
124
+ "<|reserved_special_token_84|>": 49275,
125
+ "<|reserved_special_token_85|>": 49276,
126
+ "<|reserved_special_token_86|>": 49277,
127
+ "<|reserved_special_token_87|>": 49278,
128
+ "<|reserved_special_token_8|>": 49199,
129
+ "<|reserved_special_token_9|>": 49200
130
+ }
smolvlm2_tokenizer/chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
3
+ }
smolvlm2_tokenizer/config.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SmolVLMForConditionalGeneration"
4
+ ],
5
+ "image_token_id": 49190,
6
+ "model_type": "smolvlm",
7
+ "pad_token_id": 128002,
8
+ "scale_factor": 4,
9
+ "text_config": {
10
+ "_flash_attn_2_enabled": true,
11
+ "_name_or_path": "None",
12
+ "architectures": [
13
+ "VLlama3ForCausalLM"
14
+ ],
15
+ "head_dim": 64,
16
+ "hidden_size": 960,
17
+ "intermediate_size": 2560,
18
+ "is_llama_config": true,
19
+ "max_position_embeddings": 8192,
20
+ "model_type": "llama",
21
+ "neftune_noise_alpha": 0.0,
22
+ "num_attention_heads": 15,
23
+ "num_hidden_layers": 32,
24
+ "num_key_value_heads": 5,
25
+ "pad_token_id": 2,
26
+ "perceiver_config": {
27
+ "_attn_implementation_autoset": false,
28
+ "_name_or_path": "",
29
+ "add_cross_attention": false,
30
+ "architectures": null,
31
+ "attention_dropout": 0.0,
32
+ "bad_words_ids": null,
33
+ "begin_suppress_tokens": null,
34
+ "bos_token_id": null,
35
+ "chunk_size_feed_forward": 0,
36
+ "cross_attention_hidden_size": null,
37
+ "decoder_start_token_id": null,
38
+ "diversity_penalty": 0.0,
39
+ "do_sample": false,
40
+ "early_stopping": false,
41
+ "encoder_no_repeat_ngram_size": 0,
42
+ "eos_token_id": null,
43
+ "exponential_decay_length_penalty": null,
44
+ "finetuning_task": null,
45
+ "forced_bos_token_id": null,
46
+ "forced_eos_token_id": null,
47
+ "hidden_act": "silu",
48
+ "id2label": {
49
+ "0": "LABEL_0",
50
+ "1": "LABEL_1"
51
+ },
52
+ "is_decoder": false,
53
+ "is_encoder_decoder": false,
54
+ "label2id": {
55
+ "LABEL_0": 0,
56
+ "LABEL_1": 1
57
+ },
58
+ "length_penalty": 1.0,
59
+ "max_length": 20,
60
+ "min_length": 0,
61
+ "model_type": "vllama3",
62
+ "no_repeat_ngram_size": 0,
63
+ "num_beam_groups": 1,
64
+ "num_beams": 1,
65
+ "num_key_value_heads": 1,
66
+ "num_return_sequences": 1,
67
+ "output_attentions": false,
68
+ "output_hidden_states": false,
69
+ "output_scores": false,
70
+ "pad_token_id": null,
71
+ "prefix": null,
72
+ "problem_type": null,
73
+ "pruned_heads": {},
74
+ "qk_layer_norms_perceiver": false,
75
+ "remove_invalid_values": false,
76
+ "repetition_penalty": 1.0,
77
+ "resampler_depth": 6,
78
+ "resampler_head_dim": 96,
79
+ "resampler_n_heads": 16,
80
+ "resampler_n_latents": 64,
81
+ "return_dict": true,
82
+ "return_dict_in_generate": false,
83
+ "sep_token_id": null,
84
+ "suppress_tokens": null,
85
+ "task_specific_params": null,
86
+ "temperature": 1.0,
87
+ "tf_legacy_loss": false,
88
+ "tie_encoder_decoder": false,
89
+ "tie_word_embeddings": true,
90
+ "tokenizer_class": null,
91
+ "top_k": 50,
92
+ "top_p": 1.0,
93
+ "torch_dtype": null,
94
+ "torchscript": false,
95
+ "transformers_version": "4.46.0",
96
+ "typical_p": 1.0,
97
+ "use_bfloat16": false
98
+ },
99
+ "pixel_shuffle_factor": 4,
100
+ "qk_layer_norms": false,
101
+ "rms_norm_eps": 1e-05,
102
+ "rope_interleaved": false,
103
+ "rope_theta": 100000,
104
+ "torch_dtype": "bfloat16",
105
+ "transformers.js_config": {
106
+ "kv_cache_dtype": {
107
+ "fp16": "float16",
108
+ "q4f16": "float16"
109
+ }
110
+ },
111
+ "use_resampler": false,
112
+ "vocab_size": 49280
113
+ },
114
+ "tie_word_embeddings": false,
115
+ "torch_dtype": "float32",
116
+ "transformers.js_config": {
117
+ "kv_cache_dtype": {
118
+ "fp16": "float16",
119
+ "q4f16": "float16"
120
+ }
121
+ },
122
+ "transformers_version": "4.47.1",
123
+ "use_cache": false,
124
+ "use_reentrant_checkpointing": false,
125
+ "vision_config": {
126
+ "hidden_size": 768,
127
+ "image_size": 512,
128
+ "max_image_size": {
129
+ "longest_edge": 512
130
+ },
131
+ "model_type": "smolvlm_vision",
132
+ "num_attention_heads": 12,
133
+ "patch_size": 16,
134
+ "size": {
135
+ "longest_edge": 2048
136
+ },
137
+ "tie_word_embeddings": false,
138
+ "use_base_siglip": false
139
+ },
140
+ "vocab_size": 49280
141
+ }
smolvlm2_tokenizer/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 49279,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.47.1"
7
+ }
smolvlm2_tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
smolvlm2_tokenizer/preprocessor_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_image_splitting": true,
4
+ "do_normalize": true,
5
+ "do_pad": true,
6
+ "do_rescale": true,
7
+ "do_resize": true,
8
+ "image_mean": [
9
+ 0.5,
10
+ 0.5,
11
+ 0.5
12
+ ],
13
+ "image_processor_type": "SmolVLMImageProcessor",
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "max_image_size": {
20
+ "longest_edge": 512
21
+ },
22
+ "processor_class": "SmolVLMProcessor",
23
+ "resample": 1,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "longest_edge": 2048
27
+ },
28
+ "video_sampling": {
29
+ "fps": 1,
30
+ "max_frames": 64,
31
+ "video_size": {
32
+ "longest_edge": 512
33
+ }
34
+ }
35
+ }
smolvlm2_tokenizer/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_len": 64,
3
+ "processor_class": "SmolVLMProcessor"
4
+ }
smolvlm2_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<fake_token_around_image>",
4
+ "<image>",
5
+ "<end_of_utterance>"
6
+ ],
7
+ "bos_token": {
8
+ "content": "<|im_start|>",
9
+ "lstrip": false,
10
+ "normalized": false,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "end_of_utterance_token": "<end_of_utterance>",
15
+ "eos_token": {
16
+ "content": "<end_of_utterance>",
17
+ "lstrip": false,
18
+ "normalized": false,
19
+ "rstrip": false,
20
+ "single_word": false
21
+ },
22
+ "fake_image_token": "<fake_token_around_image>",
23
+ "global_image_token": "<global-img>",
24
+ "image_token": "<image>",
25
+ "pad_token": {
26
+ "content": "<|im_end|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "unk_token": {
33
+ "content": "<|endoftext|>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
smolvlm2_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
smolvlm2_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,1192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "49152": {
141
+ "content": "<global-img>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "49153": {
149
+ "content": "<row_1_col_1>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "49154": {
157
+ "content": "<row_1_col_2>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "49155": {
165
+ "content": "<row_1_col_3>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "49156": {
173
+ "content": "<row_1_col_4>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "49157": {
181
+ "content": "<row_1_col_5>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "49158": {
189
+ "content": "<row_1_col_6>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "49159": {
197
+ "content": "<row_2_col_1>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "49160": {
205
+ "content": "<row_2_col_2>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "49161": {
213
+ "content": "<row_2_col_3>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "49162": {
221
+ "content": "<row_2_col_4>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "49163": {
229
+ "content": "<row_2_col_5>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "49164": {
237
+ "content": "<row_2_col_6>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "49165": {
245
+ "content": "<row_3_col_1>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "49166": {
253
+ "content": "<row_3_col_2>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "49167": {
261
+ "content": "<row_3_col_3>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "49168": {
269
+ "content": "<row_3_col_4>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "49169": {
277
+ "content": "<row_3_col_5>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "49170": {
285
+ "content": "<row_3_col_6>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "49171": {
293
+ "content": "<row_4_col_1>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "49172": {
301
+ "content": "<row_4_col_2>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "49173": {
309
+ "content": "<row_4_col_3>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "49174": {
317
+ "content": "<row_4_col_4>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "49175": {
325
+ "content": "<row_4_col_5>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "49176": {
333
+ "content": "<row_4_col_6>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "49177": {
341
+ "content": "<row_5_col_1>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "49178": {
349
+ "content": "<row_5_col_2>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "49179": {
357
+ "content": "<row_5_col_3>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "49180": {
365
+ "content": "<row_5_col_4>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "49181": {
373
+ "content": "<row_5_col_5>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "49182": {
381
+ "content": "<row_5_col_6>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "49183": {
389
+ "content": "<row_6_col_1>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "49184": {
397
+ "content": "<row_6_col_2>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "49185": {
405
+ "content": "<row_6_col_3>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "49186": {
413
+ "content": "<row_6_col_4>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "49187": {
421
+ "content": "<row_6_col_5>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "49188": {
429
+ "content": "<row_6_col_6>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "49189": {
437
+ "content": "<fake_token_around_image>",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "49190": {
445
+ "content": "<image>",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "49191": {
453
+ "content": "<|reserved_special_token_0|>",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "49192": {
461
+ "content": "<|reserved_special_token_1|>",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "49193": {
469
+ "content": "<|reserved_special_token_2|>",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "49194": {
477
+ "content": "<|reserved_special_token_3|>",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "49195": {
485
+ "content": "<|reserved_special_token_4|>",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "49196": {
493
+ "content": "<|reserved_special_token_5|>",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "49197": {
501
+ "content": "<|reserved_special_token_6|>",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "49198": {
509
+ "content": "<|reserved_special_token_7|>",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "49199": {
517
+ "content": "<|reserved_special_token_8|>",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "49200": {
525
+ "content": "<|reserved_special_token_9|>",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "49201": {
533
+ "content": "<|reserved_special_token_10|>",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "49202": {
541
+ "content": "<|reserved_special_token_11|>",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "49203": {
549
+ "content": "<|reserved_special_token_12|>",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "49204": {
557
+ "content": "<|reserved_special_token_13|>",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "49205": {
565
+ "content": "<|reserved_special_token_14|>",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "49206": {
573
+ "content": "<|reserved_special_token_15|>",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "49207": {
581
+ "content": "<|reserved_special_token_16|>",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "49208": {
589
+ "content": "<|reserved_special_token_17|>",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "49209": {
597
+ "content": "<|reserved_special_token_18|>",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "49210": {
605
+ "content": "<|reserved_special_token_19|>",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "49211": {
613
+ "content": "<|reserved_special_token_20|>",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "49212": {
621
+ "content": "<|reserved_special_token_21|>",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "49213": {
629
+ "content": "<|reserved_special_token_22|>",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "49214": {
637
+ "content": "<|reserved_special_token_23|>",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "49215": {
645
+ "content": "<|reserved_special_token_24|>",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "49216": {
653
+ "content": "<|reserved_special_token_25|>",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "49217": {
661
+ "content": "<|reserved_special_token_26|>",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "49218": {
669
+ "content": "<|reserved_special_token_27|>",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "49219": {
677
+ "content": "<|reserved_special_token_28|>",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "49220": {
685
+ "content": "<|reserved_special_token_29|>",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "49221": {
693
+ "content": "<|reserved_special_token_30|>",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "49222": {
701
+ "content": "<|reserved_special_token_31|>",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "49223": {
709
+ "content": "<|reserved_special_token_32|>",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "49224": {
717
+ "content": "<|reserved_special_token_33|>",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "49225": {
725
+ "content": "<|reserved_special_token_34|>",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "49226": {
733
+ "content": "<|reserved_special_token_35|>",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "49227": {
741
+ "content": "<|reserved_special_token_36|>",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "49228": {
749
+ "content": "<|reserved_special_token_37|>",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "49229": {
757
+ "content": "<|reserved_special_token_38|>",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "49230": {
765
+ "content": "<|reserved_special_token_39|>",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "49231": {
773
+ "content": "<|reserved_special_token_40|>",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "49232": {
781
+ "content": "<|reserved_special_token_41|>",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "49233": {
789
+ "content": "<|reserved_special_token_42|>",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "49234": {
797
+ "content": "<|reserved_special_token_43|>",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "49235": {
805
+ "content": "<|reserved_special_token_44|>",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "49236": {
813
+ "content": "<|reserved_special_token_45|>",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "49237": {
821
+ "content": "<|reserved_special_token_46|>",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "49238": {
829
+ "content": "<|reserved_special_token_47|>",
830
+ "lstrip": false,
831
+ "normalized": false,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "49239": {
837
+ "content": "<|reserved_special_token_48|>",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ },
844
+ "49240": {
845
+ "content": "<|reserved_special_token_49|>",
846
+ "lstrip": false,
847
+ "normalized": false,
848
+ "rstrip": false,
849
+ "single_word": false,
850
+ "special": true
851
+ },
852
+ "49241": {
853
+ "content": "<|reserved_special_token_50|>",
854
+ "lstrip": false,
855
+ "normalized": false,
856
+ "rstrip": false,
857
+ "single_word": false,
858
+ "special": true
859
+ },
860
+ "49242": {
861
+ "content": "<|reserved_special_token_51|>",
862
+ "lstrip": false,
863
+ "normalized": false,
864
+ "rstrip": false,
865
+ "single_word": false,
866
+ "special": true
867
+ },
868
+ "49243": {
869
+ "content": "<|reserved_special_token_52|>",
870
+ "lstrip": false,
871
+ "normalized": false,
872
+ "rstrip": false,
873
+ "single_word": false,
874
+ "special": true
875
+ },
876
+ "49244": {
877
+ "content": "<|reserved_special_token_53|>",
878
+ "lstrip": false,
879
+ "normalized": false,
880
+ "rstrip": false,
881
+ "single_word": false,
882
+ "special": true
883
+ },
884
+ "49245": {
885
+ "content": "<|reserved_special_token_54|>",
886
+ "lstrip": false,
887
+ "normalized": false,
888
+ "rstrip": false,
889
+ "single_word": false,
890
+ "special": true
891
+ },
892
+ "49246": {
893
+ "content": "<|reserved_special_token_55|>",
894
+ "lstrip": false,
895
+ "normalized": false,
896
+ "rstrip": false,
897
+ "single_word": false,
898
+ "special": true
899
+ },
900
+ "49247": {
901
+ "content": "<|reserved_special_token_56|>",
902
+ "lstrip": false,
903
+ "normalized": false,
904
+ "rstrip": false,
905
+ "single_word": false,
906
+ "special": true
907
+ },
908
+ "49248": {
909
+ "content": "<|reserved_special_token_57|>",
910
+ "lstrip": false,
911
+ "normalized": false,
912
+ "rstrip": false,
913
+ "single_word": false,
914
+ "special": true
915
+ },
916
+ "49249": {
917
+ "content": "<|reserved_special_token_58|>",
918
+ "lstrip": false,
919
+ "normalized": false,
920
+ "rstrip": false,
921
+ "single_word": false,
922
+ "special": true
923
+ },
924
+ "49250": {
925
+ "content": "<|reserved_special_token_59|>",
926
+ "lstrip": false,
927
+ "normalized": false,
928
+ "rstrip": false,
929
+ "single_word": false,
930
+ "special": true
931
+ },
932
+ "49251": {
933
+ "content": "<|reserved_special_token_60|>",
934
+ "lstrip": false,
935
+ "normalized": false,
936
+ "rstrip": false,
937
+ "single_word": false,
938
+ "special": true
939
+ },
940
+ "49252": {
941
+ "content": "<|reserved_special_token_61|>",
942
+ "lstrip": false,
943
+ "normalized": false,
944
+ "rstrip": false,
945
+ "single_word": false,
946
+ "special": true
947
+ },
948
+ "49253": {
949
+ "content": "<|reserved_special_token_62|>",
950
+ "lstrip": false,
951
+ "normalized": false,
952
+ "rstrip": false,
953
+ "single_word": false,
954
+ "special": true
955
+ },
956
+ "49254": {
957
+ "content": "<|reserved_special_token_63|>",
958
+ "lstrip": false,
959
+ "normalized": false,
960
+ "rstrip": false,
961
+ "single_word": false,
962
+ "special": true
963
+ },
964
+ "49255": {
965
+ "content": "<|reserved_special_token_64|>",
966
+ "lstrip": false,
967
+ "normalized": false,
968
+ "rstrip": false,
969
+ "single_word": false,
970
+ "special": true
971
+ },
972
+ "49256": {
973
+ "content": "<|reserved_special_token_65|>",
974
+ "lstrip": false,
975
+ "normalized": false,
976
+ "rstrip": false,
977
+ "single_word": false,
978
+ "special": true
979
+ },
980
+ "49257": {
981
+ "content": "<|reserved_special_token_66|>",
982
+ "lstrip": false,
983
+ "normalized": false,
984
+ "rstrip": false,
985
+ "single_word": false,
986
+ "special": true
987
+ },
988
+ "49258": {
989
+ "content": "<|reserved_special_token_67|>",
990
+ "lstrip": false,
991
+ "normalized": false,
992
+ "rstrip": false,
993
+ "single_word": false,
994
+ "special": true
995
+ },
996
+ "49259": {
997
+ "content": "<|reserved_special_token_68|>",
998
+ "lstrip": false,
999
+ "normalized": false,
1000
+ "rstrip": false,
1001
+ "single_word": false,
1002
+ "special": true
1003
+ },
1004
+ "49260": {
1005
+ "content": "<|reserved_special_token_69|>",
1006
+ "lstrip": false,
1007
+ "normalized": false,
1008
+ "rstrip": false,
1009
+ "single_word": false,
1010
+ "special": true
1011
+ },
1012
+ "49261": {
1013
+ "content": "<|reserved_special_token_70|>",
1014
+ "lstrip": false,
1015
+ "normalized": false,
1016
+ "rstrip": false,
1017
+ "single_word": false,
1018
+ "special": true
1019
+ },
1020
+ "49262": {
1021
+ "content": "<|reserved_special_token_71|>",
1022
+ "lstrip": false,
1023
+ "normalized": false,
1024
+ "rstrip": false,
1025
+ "single_word": false,
1026
+ "special": true
1027
+ },
1028
+ "49263": {
1029
+ "content": "<|reserved_special_token_72|>",
1030
+ "lstrip": false,
1031
+ "normalized": false,
1032
+ "rstrip": false,
1033
+ "single_word": false,
1034
+ "special": true
1035
+ },
1036
+ "49264": {
1037
+ "content": "<|reserved_special_token_73|>",
1038
+ "lstrip": false,
1039
+ "normalized": false,
1040
+ "rstrip": false,
1041
+ "single_word": false,
1042
+ "special": true
1043
+ },
1044
+ "49265": {
1045
+ "content": "<|reserved_special_token_74|>",
1046
+ "lstrip": false,
1047
+ "normalized": false,
1048
+ "rstrip": false,
1049
+ "single_word": false,
1050
+ "special": true
1051
+ },
1052
+ "49266": {
1053
+ "content": "<|reserved_special_token_75|>",
1054
+ "lstrip": false,
1055
+ "normalized": false,
1056
+ "rstrip": false,
1057
+ "single_word": false,
1058
+ "special": true
1059
+ },
1060
+ "49267": {
1061
+ "content": "<|reserved_special_token_76|>",
1062
+ "lstrip": false,
1063
+ "normalized": false,
1064
+ "rstrip": false,
1065
+ "single_word": false,
1066
+ "special": true
1067
+ },
1068
+ "49268": {
1069
+ "content": "<|reserved_special_token_77|>",
1070
+ "lstrip": false,
1071
+ "normalized": false,
1072
+ "rstrip": false,
1073
+ "single_word": false,
1074
+ "special": true
1075
+ },
1076
+ "49269": {
1077
+ "content": "<|reserved_special_token_78|>",
1078
+ "lstrip": false,
1079
+ "normalized": false,
1080
+ "rstrip": false,
1081
+ "single_word": false,
1082
+ "special": true
1083
+ },
1084
+ "49270": {
1085
+ "content": "<|reserved_special_token_79|>",
1086
+ "lstrip": false,
1087
+ "normalized": false,
1088
+ "rstrip": false,
1089
+ "single_word": false,
1090
+ "special": true
1091
+ },
1092
+ "49271": {
1093
+ "content": "<|reserved_special_token_80|>",
1094
+ "lstrip": false,
1095
+ "normalized": false,
1096
+ "rstrip": false,
1097
+ "single_word": false,
1098
+ "special": true
1099
+ },
1100
+ "49272": {
1101
+ "content": "<|reserved_special_token_81|>",
1102
+ "lstrip": false,
1103
+ "normalized": false,
1104
+ "rstrip": false,
1105
+ "single_word": false,
1106
+ "special": true
1107
+ },
1108
+ "49273": {
1109
+ "content": "<|reserved_special_token_82|>",
1110
+ "lstrip": false,
1111
+ "normalized": false,
1112
+ "rstrip": false,
1113
+ "single_word": false,
1114
+ "special": true
1115
+ },
1116
+ "49274": {
1117
+ "content": "<|reserved_special_token_83|>",
1118
+ "lstrip": false,
1119
+ "normalized": false,
1120
+ "rstrip": false,
1121
+ "single_word": false,
1122
+ "special": true
1123
+ },
1124
+ "49275": {
1125
+ "content": "<|reserved_special_token_84|>",
1126
+ "lstrip": false,
1127
+ "normalized": false,
1128
+ "rstrip": false,
1129
+ "single_word": false,
1130
+ "special": true
1131
+ },
1132
+ "49276": {
1133
+ "content": "<|reserved_special_token_85|>",
1134
+ "lstrip": false,
1135
+ "normalized": false,
1136
+ "rstrip": false,
1137
+ "single_word": false,
1138
+ "special": true
1139
+ },
1140
+ "49277": {
1141
+ "content": "<|reserved_special_token_86|>",
1142
+ "lstrip": false,
1143
+ "normalized": false,
1144
+ "rstrip": false,
1145
+ "single_word": false,
1146
+ "special": true
1147
+ },
1148
+ "49278": {
1149
+ "content": "<|reserved_special_token_87|>",
1150
+ "lstrip": false,
1151
+ "normalized": false,
1152
+ "rstrip": false,
1153
+ "single_word": false,
1154
+ "special": true
1155
+ },
1156
+ "49279": {
1157
+ "content": "<end_of_utterance>",
1158
+ "lstrip": false,
1159
+ "normalized": false,
1160
+ "rstrip": false,
1161
+ "single_word": false,
1162
+ "special": true
1163
+ }
1164
+ },
1165
+ "additional_special_tokens": [
1166
+ "<fake_token_around_image>",
1167
+ "<image>",
1168
+ "<end_of_utterance>"
1169
+ ],
1170
+ "bos_token": "<|im_start|>",
1171
+ "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
1172
+ "clean_up_tokenization_spaces": false,
1173
+ "end_of_utterance_token": "<end_of_utterance>",
1174
+ "eos_token": "<end_of_utterance>",
1175
+ "extra_special_tokens": {
1176
+ "end_of_utterance_token": "<end_of_utterance>",
1177
+ "fake_image_token": "<fake_token_around_image>",
1178
+ "global_image_token": "<global-img>",
1179
+ "image_token": "<image>"
1180
+ },
1181
+ "fake_image_token": "<fake_token_around_image>",
1182
+ "global_image_token": "<global-img>",
1183
+ "image_token": "<image>",
1184
+ "legacy": false,
1185
+ "model_max_length": 8192,
1186
+ "pad_token": "<|im_end|>",
1187
+ "processor_class": "SmolVLMProcessor",
1188
+ "tokenizer_class": "GPT2Tokenizer",
1189
+ "truncation_side": "left",
1190
+ "unk_token": "<|endoftext|>",
1191
+ "vocab_size": 49152
1192
+ }
smolvlm2_tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
vit_mdoel/vision_model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5b317aa656fc27e49745a23253ee9adcd14ca90e3a9145bdd4568a5a18b2f41
3
+ size 387531753