Zhoues committed (verified)
Commit 4b60fd3 · Parent(s): 318a967

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,478 @@
+ {
+   "Ubit": 100,
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model",
+   "architectures": [
+     "LlavaLlamaModel"
+   ],
+   "babit": "E5M2",
+   "bobit": "E5M2",
+   "bwbit": "E5M2",
+   "chat_template": "qwen2",
+   "col_blocksize": -1,
+   "col_blocksize_optimizer": 128,
+   "depth_encoder": {
+     "_target_": "llava.model.encoders.BasicImageEncoder"
+   },
+   "depth_projector": "mlp_downsample_3x3_fix",
+   "depth_projector_cfg": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/depth_projector",
+     "add_cross_attention": false,
+     "architectures": [
+       "MultimodalProjector"
+     ],
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "mm_projector_type": "mlp_downsample_3x3_fix",
+     "model_type": "v2l_projector",
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "bfloat16",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "depth_tower": "/share/project/zhouenshen/hpfs/ckpt/vlm/paligemma-siglip-so400m-patch14-448",
+   "depth_tower_cfg": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/depth_tower",
+     "add_cross_attention": false,
+     "architectures": [
+       "SiglipVisionModel"
+     ],
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu_pytorch_tanh",
+     "hidden_size": 1152,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 448,
+     "intermediate_size": 4304,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-06,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "siglip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 27,
+     "num_image_tokens": 256,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 2048,
+     "projector_hidden_act": "gelu_fast",
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "bfloat16",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "vision_use_head": false
+   },
+   "draw_distribution_backward": false,
+   "draw_distribution_forward": false,
+   "drop_path_rate": 0.0,
+   "dynamic_s2": false,
+   "enable_depth": true,
+   "epsilon": 1e-10,
+   "epsilon_optimizer": 1e-15,
+   "fabit": "E4M3",
+   "first_order_bit": null,
+   "first_order_quant_type": null,
+   "fobit": "E4M3",
+   "fps": 0.0,
+   "fwbit": "E4M3",
+   "group_size": -1,
+   "hidden_size": 1536,
+   "image_aspect_ratio": "dynamic",
+   "image_encoder": {
+     "_target_": "llava.model.encoders.BasicImageEncoder"
+   },
+   "interpolate_mode": "linear",
+   "llm_cfg": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/llm",
+     "add_cross_attention": false,
+     "architectures": [
+       "Qwen2ForCausalLM"
+     ],
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 151643,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 151645,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "silu",
+     "hidden_size": 1536,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "intermediate_size": 8960,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 32768,
+     "max_window_layers": 28,
+     "min_length": 0,
+     "model_max_length": 16384,
+     "model_type": "qwen2",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 28,
+     "num_key_value_heads": 2,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "rms_norm_eps": 1e-06,
+     "rope_scaling": null,
+     "rope_theta": 1000000.0,
+     "sep_token_id": null,
+     "sliding_window": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "tokenizer_model_max_length": 4096,
+     "tokenizer_padding_side": "right",
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "bfloat16",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "use_sliding_window": false,
+     "vocab_size": 151652
+   },
+   "max_tiles": 12,
+   "min_blockunit_col": 4,
+   "min_blockunit_row": 4,
+   "min_tiles": 1,
+   "mlp_path": null,
+   "mm_hidden_size": 1152,
+   "mm_projector": "mlp_downsample_3x3_fix",
+   "mm_projector_cfg": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/mm_projector",
+     "add_cross_attention": false,
+     "architectures": [
+       "MultimodalProjector"
+     ],
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "mm_projector_type": "mlp_downsample_3x3_fix",
+     "model_type": "v2l_projector",
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "bfloat16",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "mm_projector_lr": null,
+   "mm_use_im_patch_token": false,
+   "mm_use_im_start_end": false,
+   "mm_vision_select_feature": "cls_patch",
+   "mm_vision_select_layer": -2,
+   "model_dtype": "torch.bfloat16",
+   "model_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-align-new_placement+new_simulator/model",
+   "model_type": "llava_llama",
+   "num_time_tokens": 0,
+   "num_video_frames": 8,
+   "pad_block": false,
+   "pad_to_multiple_of": 0,
+   "qchoice": "none",
+   "quantize_model": false,
+   "refine_attn_blocksize": false,
+   "refine_col_blocksize": 4,
+   "refine_ln_blocksize": false,
+   "refine_ln_blocksize_but_only_backward": false,
+   "refine_ln_blocksize_but_only_forward": false,
+   "refine_ln_pertoken": false,
+   "refine_mlp_blocksize": false,
+   "refine_residual_fp": false,
+   "refine_row_blocksize": 4,
+   "resume_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model",
+   "row_blocksize": -1,
+   "row_blocksize_optimizer": 1,
+   "s2": false,
+   "s2_max_split_size": 336,
+   "s2_resize_output_to_scale_idx": 0,
+   "s2_scales": "336,672,1008",
+   "second_order_bit": null,
+   "second_order_quant_type": null,
+   "soft_ce_std": 1.0,
+   "symm": true,
+   "time_token_format": "<t{t}>",
+   "time_token_ids": [],
+   "transformers_version": "4.49.0",
+   "tune_depth_projector": true,
+   "tune_depth_tower": true,
+   "tune_language_model": true,
+   "tune_mm_projector": true,
+   "tune_vision_tower": true,
+   "use_depth_tower": true,
+   "use_quantize_optimizer": false,
+   "version": "auto",
+   "video_encoder": {
+     "_target_": "llava.model.encoders.BasicVideoEncoder"
+   },
+   "vision_resolution": -1,
+   "vision_tower": "/share/project/zhouenshen/hpfs/ckpt/vlm/paligemma-siglip-so400m-patch14-448",
+   "vision_tower_cfg": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/vision_tower",
+     "add_cross_attention": false,
+     "architectures": [
+       "SiglipVisionModel"
+     ],
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu_pytorch_tanh",
+     "hidden_size": 1152,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 448,
+     "intermediate_size": 4304,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-06,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "siglip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 27,
+     "num_image_tokens": 256,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 2048,
+     "projector_hidden_act": "gelu_fast",
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "bfloat16",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "vision_use_head": false
+   },
+   "vision_tower_lr": null,
+   "weight_memory_efficient": true
+ }
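
Note: this composite config describes a VILA-style LlavaLlamaModel that wires a Qwen2 causal LM (llm_cfg), two SigLIP towers for RGB and depth (vision_tower_cfg, depth_tower_cfg), and two MLP downsample projectors (mm_projector_cfg, depth_projector_cfg) into one model. A minimal sketch for inspecting the nested config, assuming the repo id is a placeholder ("your-org/your-repo") and without loading the custom "llava_llama" model class:

    # Hedged sketch: download and inspect config.json; the custom "llava_llama"
    # model_type needs the VILA code base to instantiate, so we only read JSON here.
    import json
    from huggingface_hub import hf_hub_download

    path = hf_hub_download(repo_id="your-org/your-repo", filename="config.json")  # placeholder repo id
    with open(path) as f:
        cfg = json.load(f)

    print(cfg["model_type"])                       # llava_llama
    print(cfg["llm_cfg"]["architectures"])         # ['Qwen2ForCausalLM']
    print(cfg["vision_tower_cfg"]["hidden_size"])  # 1152 (SigLIP features -> mm_hidden_size)
    print(cfg["depth_tower_cfg"]["image_size"])    # 448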
depth_projector/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/depth_projector",
+   "architectures": [
+     "MultimodalProjector"
+   ],
+   "mm_projector_type": "mlp_downsample_3x3_fix",
+   "model_type": "v2l_projector",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.49.0"
+ }
depth_projector/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04d8d30f7ada9c8912a155a8e3286b4be9f356d57402783d5257cd098a3a0075
+ size 87068272
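
Note: the three-line `version/oid/size` entries here and below are Git LFS pointer files, not the weights themselves; the binaries arrive via `git lfs pull` or a hub download. A minimal verification sketch, assuming hypothetical local paths, using the oid and size from this pointer:

    # Hedged sketch: confirm a downloaded file matches its LFS pointer.
    import hashlib
    import os

    def check_lfs(real_file, expected_sha256, expected_size):
        h = hashlib.sha256()
        with open(real_file, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
                h.update(chunk)
        return h.hexdigest() == expected_sha256 and os.path.getsize(real_file) == expected_size

    print(check_lfs("depth_projector/model.safetensors",  # hypothetical local path
                    "04d8d30f7ada9c8912a155a8e3286b4be9f356d57402783d5257cd098a3a0075",
                    87068272))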
depth_tower/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/depth_tower",
+   "architectures": [
+     "SiglipVisionModel"
+   ],
+   "attention_dropout": 0.0,
+   "hidden_act": "gelu_pytorch_tanh",
+   "hidden_size": 1152,
+   "image_size": 448,
+   "intermediate_size": 4304,
+   "layer_norm_eps": 1e-06,
+   "model_type": "siglip_vision_model",
+   "num_attention_heads": 16,
+   "num_channels": 3,
+   "num_hidden_layers": 27,
+   "num_image_tokens": 256,
+   "patch_size": 14,
+   "projection_dim": 2048,
+   "projector_hidden_act": "gelu_fast",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.49.0",
+   "vision_use_head": false
+ }
depth_tower/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ab0cdbfe8518b7af42d78413b5f41250492b4da2532020e63e76896d16f3565
+ size 826707904
depth_tower/preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "do_convert_rgb": null,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "SiglipImageProcessor",
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "processor_class": "SiglipProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 448,
+     "width": 448
+   }
+ }
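
Note: this is a stock SiglipImageProcessor config (resize to 448x448, rescale by 1/255, normalize with mean/std 0.5). A minimal sketch of the preprocessing path, assuming "depth_tower" is a local directory containing this preprocessor_config.json:

    # Hedged sketch: run an image through the depth tower's preprocessor.
    from PIL import Image
    from transformers import SiglipImageProcessor

    processor = SiglipImageProcessor.from_pretrained("depth_tower")  # assumed local dir
    image = Image.new("RGB", (640, 480))  # stand-in for a real RGB-rendered depth map
    inputs = processor(images=image, return_tensors="pt")
    print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 448, 448])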
llm/added_tokens.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "<depth>": 151651,
+   "<image>": 151649,
+   "<vila/sentinel>": 151648,
+   "<vila/video>": 151650,
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "[BOS]": 151646,
+   "[PAD]": 151647
+ }
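
Note: the multimodal placeholders (<image>, <vila/video>, <depth>, <vila/sentinel>) extend the Qwen2 vocabulary up to the vocab_size of 151652 seen in the configs. A minimal lookup sketch, assuming "llm" is the local tokenizer directory from this commit:

    # Hedged sketch: verify the added special-token ids.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("llm")  # assumed local dir
    for tok in ["<image>", "<vila/video>", "<depth>", "<|im_end|>"]:
        print(tok, tokenizer.convert_tokens_to_ids(tok))
    # Expected: <image> 151649, <vila/video> 151650, <depth> 151651, <|im_end|> 151645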
llm/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/llm",
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 1536,
+   "initializer_range": 0.02,
+   "intermediate_size": 8960,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_max_length": 16384,
+   "model_type": "qwen2",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "tokenizer_model_max_length": 4096,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.49.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151652
+ }
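
Note: the backbone is a standard Qwen2 causal LM with grouped-query attention; the derived shapes below follow directly from the config above. A minimal sketch constructing the same config in transformers:

    # Hedged sketch: rebuild the LLM config and check the attention geometry.
    from transformers import Qwen2Config

    cfg = Qwen2Config(
        hidden_size=1536,
        intermediate_size=8960,
        num_hidden_layers=28,
        num_attention_heads=12,
        num_key_value_heads=2,
        vocab_size=151652,
        rope_theta=1000000.0,
        tie_word_embeddings=True,
    )
    print(cfg.hidden_size // cfg.num_attention_heads)          # 128 (head_dim)
    print(cfg.num_attention_heads // cfg.num_key_value_heads)  # 6 query heads share each KV head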
llm/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.1,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.49.0"
+ }
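
Note: these defaults enable sampling (temperature 0.7, top-p 0.8, top-k 20, repetition penalty 1.1) and stop on either <|im_end|> or <|endoftext|>. A minimal sketch of loading them, assuming "llm" is the local directory containing this generation_config.json:

    # Hedged sketch: load the generation defaults shipped with the checkpoint.
    from transformers import GenerationConfig

    gen_cfg = GenerationConfig.from_pretrained("llm")  # assumed local dir
    print(gen_cfg.do_sample, gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k)
    # True 0.7 0.8 20
    # model.generate(**inputs, generation_config=gen_cfg) would pick these up.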
llm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
llm/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b93b78db7e60f4afe47dfede4f31cb3c4c5df5a10a29a7957a8c1172a96011be
+ size 3086594696
llm/special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": {
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
llm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2e99f750957188ad0c9c3cd55d518b66c1527f44853aad1f5ec04f127681512
+ size 11419373
llm/tokenizer_config.json ADDED
@@ -0,0 +1,94 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<vila/sentinel>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<image>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<vila/video>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<depth>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "[BOS]",
+   "chat_template": "{% if messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{% for message in messages if message['content'] is not none %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "legacy": false,
+   "model_max_length": 16384,
+   "pad_token": "[PAD]",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
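
Note: the chat_template above is the Qwen2/ChatML format, which injects a default system turn when the first message is not a system message. A minimal sketch of rendering it, assuming "llm" is the local tokenizer directory:

    # Hedged sketch: render the exact prompt format this checkpoint expects.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("llm")  # assumed local dir
    messages = [{"role": "user", "content": "Describe the scene."}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)
    # <|im_start|>system
    # You are a helpful assistant<|im_end|>
    # <|im_start|>user
    # Describe the scene.<|im_end|>
    # <|im_start|>assistant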
llm/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
mm_projector/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/mm_projector",
+   "architectures": [
+     "MultimodalProjector"
+   ],
+   "mm_projector_type": "mlp_downsample_3x3_fix",
+   "model_type": "v2l_projector",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.49.0"
+ }
mm_projector/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02752ffd6a4c1d45ca48cfed496f019f43311d09be43f5f1c944eae15e363e3e
+ size 87068272
runs/Jun18_07-08-47_job-af63a4e8-da56-4186-a961-12d161efe7c0-master-0/events.out.tfevents.1750231024.job-af63a4e8-da56-4186-a961-12d161efe7c0-master-0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92707d1fd64ef547f37aab8dd2ada85d91e0ba7e514f341bdb9309ad12c64599
+ size 3695845
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
vision_tower/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_name_or_path": "/share/project/zhouenshen/hpfs/code/VILA/runs/train/NVILA-Lite-2B-depth-sft-new_placement+new_simulator-10-nodes/model/vision_tower",
+   "architectures": [
+     "SiglipVisionModel"
+   ],
+   "attention_dropout": 0.0,
+   "hidden_act": "gelu_pytorch_tanh",
+   "hidden_size": 1152,
+   "image_size": 448,
+   "intermediate_size": 4304,
+   "layer_norm_eps": 1e-06,
+   "model_type": "siglip_vision_model",
+   "num_attention_heads": 16,
+   "num_channels": 3,
+   "num_hidden_layers": 27,
+   "num_image_tokens": 256,
+   "patch_size": 14,
+   "projection_dim": 2048,
+   "projector_hidden_act": "gelu_fast",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.49.0",
+   "vision_use_head": false
+ }
vision_tower/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ed468183a3f5823cce92e3362c4a3b9e18de39cfec67bc573dfca6bd82df82c
+ size 826707904
vision_tower/preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "do_convert_rgb": null,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "SiglipImageProcessor",
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "processor_class": "SiglipProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 448,
+     "width": 448
+   }
+ }
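
Note: this file is identical to depth_tower/preprocessor_config.json, so both towers share one preprocessing pipeline. The rescale_factor 0.00392156862745098 is exactly 1/255, and with mean/std 0.5 the pipeline maps uint8 pixels from [0, 255] into [-1, 1]. A minimal sketch of that arithmetic:

    # Hedged sketch: reproduce the rescale + normalize math from the config.
    import numpy as np

    pixels = np.array([0, 128, 255], dtype=np.float64)
    rescaled = pixels * (1.0 / 255.0)    # matches rescale_factor
    normalized = (rescaled - 0.5) / 0.5  # matches image_mean/image_std of 0.5
    print(normalized)                    # [-1.  0.00392157  1.]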