tcm03 committed
Commit 475fbaa · Parent: cb14d91

Track log files with git lfs
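For context, the diffs below are what Git LFS tracking produces for these files. A minimal sketch of one way such a commit is typically made, driven from Python purely for illustration; it assumes a local clone with the git and git-lfs CLIs installed, and the pattern runtime_logs/*.log is inferred from the paths in this commit rather than taken from the author's actual command:

import subprocess

def run(*cmd: str) -> None:
    # Thin wrapper so each git invocation fails loudly if something is off.
    subprocess.run(cmd, check=True)

# Register the LFS filters for this clone (one-time setup).
run("git", "lfs", "install")
# Ask LFS to track the runtime logs; this writes a rule into .gitattributes.
run("git", "lfs", "track", "runtime_logs/*.log")
# --renormalize re-runs the clean filter on files Git already tracks, so the
# existing logs are converted to LFS pointers instead of staying as blobs.
run("git", "add", "--renormalize", ".gitattributes", "runtime_logs")
run("git", "commit", "-m", "Track log files with git lfs")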

runtime_logs/run_2025-02-14_04-23-49.log CHANGED
@@ -1,382 +1,3 @@
1
- 2025-02-14 04:23:49,942 - training_args.py:2100 - _setup_devices - INFO - PyTorch: setting up devices
2
- 2025-02-14 04:23:49,975 - training_args.py:1837 - __post_init__ - WARNING - When using FSDP full shard, instead of using `gradient_checkpointing` in TrainingArguments, please use `activation_checkpointing` in `fsdp_config`. The former introduces a redundant AllGather operation in backward pass. Reference: https://github.com/huggingface/transformers/issues/30404
3
- 2025-02-14 04:23:50,508 - configuration_utils.py:731 - _get_config_dict - INFO - loading configuration file ./checkpoints/longvu_llama3_2/config.json
4
- 2025-02-14 04:23:50,511 - configuration_utils.py:800 - from_dict - INFO - Model config CambrianConfig {
5
- "_name_or_path": "/tmp/iopath_cache/manifold_cache/tree/users/shenx/finetune/09281004-cambrian_llama3_2_t576_ov",
6
- "architectures": [
7
- "CambrianLlamaForCausalLM"
8
- ],
9
- "attention_bias": false,
10
- "attention_dropout": 0.0,
11
- "bos_token_id": 128000,
12
- "connect_layer": 2,
13
- "connector_depth": 3,
14
- "connector_only": true,
15
- "dino_threshold": 0.83,
16
- "drop_threshold": 0.8,
17
- "eos_token_id": [
18
- 128001,
19
- 128008,
20
- 128009
21
- ],
22
- "frame_pos": false,
23
- "freeze_mm_mlp_adapter": false,
24
- "hidden_act": "silu",
25
- "hidden_size": 3072,
26
- "highres": true,
27
- "highres_connect": false,
28
- "image_aspect_ratio": "pad",
29
- "image_position": 91,
30
- "image_token_len": 144,
31
- "initializer_range": 0.02,
32
- "intermediate_size": 8192,
33
- "is_image_newline": true,
34
- "is_st_sampler": false,
35
- "lowres_token": 8,
36
- "max_position_embeddings": 131072,
37
- "mlp_bias": false,
38
- "mm_patch_merge_type": "flat",
39
- "mm_projector_lr": null,
40
- "mm_projector_type": "sva",
41
- "mm_use_im_patch_token": false,
42
- "mm_use_im_start_end": false,
43
- "mm_vision_sampler_lr": null,
44
- "mm_vision_select_feature": "patch",
45
- "mm_vision_select_layer": -2,
46
- "mm_vision_tower_aux_list": [
47
- "siglip/CLIP-ViT-SO400M-14-384",
48
- "facebook/dinov2-giant-res378"
49
- ],
50
- "mm_vision_tower_aux_token_len_list": [
51
- 576,
52
- 576
53
- ],
54
- "mm_vision_tower_lr": null,
55
- "model_type": "cambrian_llama",
56
- "num_attention_heads": 24,
57
- "num_hidden_layers": 28,
58
- "num_key_value_heads": 8,
59
- "num_of_vision_sampler_layers": 10,
60
- "num_query_group": 1,
61
- "pretraining_tp": 1,
62
- "query_num_list": [
63
- 144
64
- ],
65
- "rms_norm_eps": 1e-05,
66
- "rope_scaling": {
67
- "factor": 32.0,
68
- "high_freq_factor": 4.0,
69
- "low_freq_factor": 1.0,
70
- "original_max_position_embeddings": 8192,
71
- "rope_type": "llama3"
72
- },
73
- "rope_theta": 500000.0,
74
- "spmd_debug": null,
75
- "spmd_fsdp_sharding": null,
76
- "spmd_mesh": null,
77
- "start_of_vision_sampler_layers": 0,
78
- "stride_of_vision_sampler_layers": 3,
79
- "tie_word_embeddings": false,
80
- "tokenizer_model_max_length": 8192,
81
- "tokenizer_padding_side": "right",
82
- "torch_dtype": "float32",
83
- "transformers_version": "4.43.1",
84
- "tune_mm_mlp_adapter": false,
85
- "unfreeze_mm_vision_tower": false,
86
- "use_cache": false,
87
- "use_mm_proj": true,
88
- "vision_hidden_size": 1024,
89
- "vision_tower_aux_token_len_list": [
90
- 576,
91
- 576
92
- ],
93
- "vocab_size": 128256
94
- }
95
-
96
- 2025-02-14 04:23:50,511 - modeling_utils.py:3618 - from_pretrained - INFO - loading weights file ./checkpoints/longvu_llama3_2/pytorch_model.bin
97
- 2025-02-14 04:23:50,551 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
98
- "bos_token_id": 128000,
99
- "eos_token_id": [
100
- 128001,
101
- 128008,
102
- 128009
103
- ],
104
- "use_cache": false
105
- }
106
-
107
- 2025-02-14 04:23:50,771 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
108
- 2025-02-14 04:23:50,774 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
109
- "apply_layernorm": true,
110
- "architectures": [
111
- "Dinov2Model"
112
- ],
113
- "attention_probs_dropout_prob": 0.0,
114
- "drop_path_rate": 0.0,
115
- "hidden_act": "gelu",
116
- "hidden_dropout_prob": 0.0,
117
- "hidden_size": 1536,
118
- "image_size": 518,
119
- "initializer_range": 0.02,
120
- "layer_norm_eps": 1e-06,
121
- "layerscale_value": 1.0,
122
- "mlp_ratio": 4,
123
- "model_type": "dinov2",
124
- "num_attention_heads": 24,
125
- "num_channels": 3,
126
- "num_hidden_layers": 40,
127
- "out_features": [
128
- "stage40"
129
- ],
130
- "out_indices": [
131
- 40
132
- ],
133
- "patch_size": 14,
134
- "qkv_bias": true,
135
- "reshape_hidden_states": true,
136
- "stage_names": [
137
- "stem",
138
- "stage1",
139
- "stage2",
140
- "stage3",
141
- "stage4",
142
- "stage5",
143
- "stage6",
144
- "stage7",
145
- "stage8",
146
- "stage9",
147
- "stage10",
148
- "stage11",
149
- "stage12",
150
- "stage13",
151
- "stage14",
152
- "stage15",
153
- "stage16",
154
- "stage17",
155
- "stage18",
156
- "stage19",
157
- "stage20",
158
- "stage21",
159
- "stage22",
160
- "stage23",
161
- "stage24",
162
- "stage25",
163
- "stage26",
164
- "stage27",
165
- "stage28",
166
- "stage29",
167
- "stage30",
168
- "stage31",
169
- "stage32",
170
- "stage33",
171
- "stage34",
172
- "stage35",
173
- "stage36",
174
- "stage37",
175
- "stage38",
176
- "stage39",
177
- "stage40"
178
- ],
179
- "torch_dtype": "float32",
180
- "transformers_version": "4.43.1",
181
- "use_swiglu_ffn": true
182
- }
183
-
184
- 2025-02-14 04:23:52,141 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing CambrianLlamaForCausalLM.
185
-
186
- 2025-02-14 04:23:52,141 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of CambrianLlamaForCausalLM were initialized from the model checkpoint at ./checkpoints/longvu_llama3_2.
187
- If your task is similar to the task the model of the checkpoint was trained on, you can already use CambrianLlamaForCausalLM for predictions without further training.
188
- 2025-02-14 04:23:52,147 - configuration_utils.py:991 - from_pretrained - INFO - loading configuration file ./checkpoints/longvu_llama3_2/generation_config.json
189
- 2025-02-14 04:23:52,147 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
190
- "bos_token_id": 128000,
191
- "do_sample": true,
192
- "eos_token_id": [
193
- 128001,
194
- 128008,
195
- 128009
196
- ],
197
- "temperature": 0.6,
198
- "top_p": 0.9
199
- }
200
-
201
- 2025-02-14 04:23:52,674 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer.json
202
- 2025-02-14 04:23:52,674 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file added_tokens.json
203
- 2025-02-14 04:23:52,674 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file special_tokens_map.json
204
- 2025-02-14 04:23:52,674 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer_config.json
205
- 2025-02-14 04:23:53,030 - tokenization_utils_base.py:2533 - _from_pretrained - INFO - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
206
- 2025-02-14 04:23:53,702 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/config.json
207
- 2025-02-14 04:23:53,704 - configuration_utils.py:800 - from_dict - INFO - Model config SiglipVisionConfig {
208
- "attention_dropout": 0.0,
209
- "hidden_act": "gelu_pytorch_tanh",
210
- "hidden_size": 1152,
211
- "image_size": 384,
212
- "intermediate_size": 4304,
213
- "layer_norm_eps": 1e-06,
214
- "model_type": "siglip_vision_model",
215
- "num_attention_heads": 16,
216
- "num_channels": 3,
217
- "num_hidden_layers": 27,
218
- "patch_size": 14,
219
- "transformers_version": "4.43.1"
220
- }
221
-
222
- 2025-02-14 04:23:53,704 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/model.safetensors
223
- 2025-02-14 04:23:53,969 - modeling_utils.py:4440 - _load_pretrained_model - INFO - Some weights of the model checkpoint at google/siglip-so400m-patch14-384 were not used when initializing SiglipVisionModel: ['logit_bias', 'logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 
'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.12.layer_norm1.bias', 'text_model.encoder.layers.12.layer_norm1.weight', 'text_model.encoder.layers.12.layer_norm2.bias', 'text_model.encoder.layers.12.layer_norm2.weight', 'text_model.encoder.layers.12.mlp.fc1.bias', 'text_model.encoder.layers.12.mlp.fc1.weight', 'text_model.encoder.layers.12.mlp.fc2.bias', 'text_model.encoder.layers.12.mlp.fc2.weight', 'text_model.encoder.layers.12.self_attn.k_proj.bias', 'text_model.encoder.layers.12.self_attn.k_proj.weight', 'text_model.encoder.layers.12.self_attn.out_proj.bias', 'text_model.encoder.layers.12.self_attn.out_proj.weight', 'text_model.encoder.layers.12.self_attn.q_proj.bias', 'text_model.encoder.layers.12.self_attn.q_proj.weight', 'text_model.encoder.layers.12.self_attn.v_proj.bias', 'text_model.encoder.layers.12.self_attn.v_proj.weight', 'text_model.encoder.layers.13.layer_norm1.bias', 'text_model.encoder.layers.13.layer_norm1.weight', 'text_model.encoder.layers.13.layer_norm2.bias', 'text_model.encoder.layers.13.layer_norm2.weight', 'text_model.encoder.layers.13.mlp.fc1.bias', 'text_model.encoder.layers.13.mlp.fc1.weight', 'text_model.encoder.layers.13.mlp.fc2.bias', 'text_model.encoder.layers.13.mlp.fc2.weight', 'text_model.encoder.layers.13.self_attn.k_proj.bias', 'text_model.encoder.layers.13.self_attn.k_proj.weight', 'text_model.encoder.layers.13.self_attn.out_proj.bias', 'text_model.encoder.layers.13.self_attn.out_proj.weight', 'text_model.encoder.layers.13.self_attn.q_proj.bias', 'text_model.encoder.layers.13.self_attn.q_proj.weight', 'text_model.encoder.layers.13.self_attn.v_proj.bias', 'text_model.encoder.layers.13.self_attn.v_proj.weight', 'text_model.encoder.layers.14.layer_norm1.bias', 'text_model.encoder.layers.14.layer_norm1.weight', 'text_model.encoder.layers.14.layer_norm2.bias', 'text_model.encoder.layers.14.layer_norm2.weight', 'text_model.encoder.layers.14.mlp.fc1.bias', 'text_model.encoder.layers.14.mlp.fc1.weight', 'text_model.encoder.layers.14.mlp.fc2.bias', 'text_model.encoder.layers.14.mlp.fc2.weight', 'text_model.encoder.layers.14.self_attn.k_proj.bias', 'text_model.encoder.layers.14.self_attn.k_proj.weight', 'text_model.encoder.layers.14.self_attn.out_proj.bias', 'text_model.encoder.layers.14.self_attn.out_proj.weight', 'text_model.encoder.layers.14.self_attn.q_proj.bias', 'text_model.encoder.layers.14.self_attn.q_proj.weight', 'text_model.encoder.layers.14.self_attn.v_proj.bias', 'text_model.encoder.layers.14.self_attn.v_proj.weight', 'text_model.encoder.layers.15.layer_norm1.bias', 'text_model.encoder.layers.15.layer_norm1.weight', 'text_model.encoder.layers.15.layer_norm2.bias', 'text_model.encoder.layers.15.layer_norm2.weight', 'text_model.encoder.layers.15.mlp.fc1.bias', 'text_model.encoder.layers.15.mlp.fc1.weight', 'text_model.encoder.layers.15.mlp.fc2.bias', 'text_model.encoder.layers.15.mlp.fc2.weight', 'text_model.encoder.layers.15.self_attn.k_proj.bias', 'text_model.encoder.layers.15.self_attn.k_proj.weight', 'text_model.encoder.layers.15.self_attn.out_proj.bias', 'text_model.encoder.layers.15.self_attn.out_proj.weight', 'text_model.encoder.layers.15.self_attn.q_proj.bias', 'text_model.encoder.layers.15.self_attn.q_proj.weight', 'text_model.encoder.layers.15.self_attn.v_proj.bias', 'text_model.encoder.layers.15.self_attn.v_proj.weight', 'text_model.encoder.layers.16.layer_norm1.bias', 'text_model.encoder.layers.16.layer_norm1.weight', 
'text_model.encoder.layers.16.layer_norm2.bias', 'text_model.encoder.layers.16.layer_norm2.weight', 'text_model.encoder.layers.16.mlp.fc1.bias', 'text_model.encoder.layers.16.mlp.fc1.weight', 'text_model.encoder.layers.16.mlp.fc2.bias', 'text_model.encoder.layers.16.mlp.fc2.weight', 'text_model.encoder.layers.16.self_attn.k_proj.bias', 'text_model.encoder.layers.16.self_attn.k_proj.weight', 'text_model.encoder.layers.16.self_attn.out_proj.bias', 'text_model.encoder.layers.16.self_attn.out_proj.weight', 'text_model.encoder.layers.16.self_attn.q_proj.bias', 'text_model.encoder.layers.16.self_attn.q_proj.weight', 'text_model.encoder.layers.16.self_attn.v_proj.bias', 'text_model.encoder.layers.16.self_attn.v_proj.weight', 'text_model.encoder.layers.17.layer_norm1.bias', 'text_model.encoder.layers.17.layer_norm1.weight', 'text_model.encoder.layers.17.layer_norm2.bias', 'text_model.encoder.layers.17.layer_norm2.weight', 'text_model.encoder.layers.17.mlp.fc1.bias', 'text_model.encoder.layers.17.mlp.fc1.weight', 'text_model.encoder.layers.17.mlp.fc2.bias', 'text_model.encoder.layers.17.mlp.fc2.weight', 'text_model.encoder.layers.17.self_attn.k_proj.bias', 'text_model.encoder.layers.17.self_attn.k_proj.weight', 'text_model.encoder.layers.17.self_attn.out_proj.bias', 'text_model.encoder.layers.17.self_attn.out_proj.weight', 'text_model.encoder.layers.17.self_attn.q_proj.bias', 'text_model.encoder.layers.17.self_attn.q_proj.weight', 'text_model.encoder.layers.17.self_attn.v_proj.bias', 'text_model.encoder.layers.17.self_attn.v_proj.weight', 'text_model.encoder.layers.18.layer_norm1.bias', 'text_model.encoder.layers.18.layer_norm1.weight', 'text_model.encoder.layers.18.layer_norm2.bias', 'text_model.encoder.layers.18.layer_norm2.weight', 'text_model.encoder.layers.18.mlp.fc1.bias', 'text_model.encoder.layers.18.mlp.fc1.weight', 'text_model.encoder.layers.18.mlp.fc2.bias', 'text_model.encoder.layers.18.mlp.fc2.weight', 'text_model.encoder.layers.18.self_attn.k_proj.bias', 'text_model.encoder.layers.18.self_attn.k_proj.weight', 'text_model.encoder.layers.18.self_attn.out_proj.bias', 'text_model.encoder.layers.18.self_attn.out_proj.weight', 'text_model.encoder.layers.18.self_attn.q_proj.bias', 'text_model.encoder.layers.18.self_attn.q_proj.weight', 'text_model.encoder.layers.18.self_attn.v_proj.bias', 'text_model.encoder.layers.18.self_attn.v_proj.weight', 'text_model.encoder.layers.19.layer_norm1.bias', 'text_model.encoder.layers.19.layer_norm1.weight', 'text_model.encoder.layers.19.layer_norm2.bias', 'text_model.encoder.layers.19.layer_norm2.weight', 'text_model.encoder.layers.19.mlp.fc1.bias', 'text_model.encoder.layers.19.mlp.fc1.weight', 'text_model.encoder.layers.19.mlp.fc2.bias', 'text_model.encoder.layers.19.mlp.fc2.weight', 'text_model.encoder.layers.19.self_attn.k_proj.bias', 'text_model.encoder.layers.19.self_attn.k_proj.weight', 'text_model.encoder.layers.19.self_attn.out_proj.bias', 'text_model.encoder.layers.19.self_attn.out_proj.weight', 'text_model.encoder.layers.19.self_attn.q_proj.bias', 'text_model.encoder.layers.19.self_attn.q_proj.weight', 'text_model.encoder.layers.19.self_attn.v_proj.bias', 'text_model.encoder.layers.19.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 
'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.20.layer_norm1.bias', 'text_model.encoder.layers.20.layer_norm1.weight', 'text_model.encoder.layers.20.layer_norm2.bias', 'text_model.encoder.layers.20.layer_norm2.weight', 'text_model.encoder.layers.20.mlp.fc1.bias', 'text_model.encoder.layers.20.mlp.fc1.weight', 'text_model.encoder.layers.20.mlp.fc2.bias', 'text_model.encoder.layers.20.mlp.fc2.weight', 'text_model.encoder.layers.20.self_attn.k_proj.bias', 'text_model.encoder.layers.20.self_attn.k_proj.weight', 'text_model.encoder.layers.20.self_attn.out_proj.bias', 'text_model.encoder.layers.20.self_attn.out_proj.weight', 'text_model.encoder.layers.20.self_attn.q_proj.bias', 'text_model.encoder.layers.20.self_attn.q_proj.weight', 'text_model.encoder.layers.20.self_attn.v_proj.bias', 'text_model.encoder.layers.20.self_attn.v_proj.weight', 'text_model.encoder.layers.21.layer_norm1.bias', 'text_model.encoder.layers.21.layer_norm1.weight', 'text_model.encoder.layers.21.layer_norm2.bias', 'text_model.encoder.layers.21.layer_norm2.weight', 'text_model.encoder.layers.21.mlp.fc1.bias', 'text_model.encoder.layers.21.mlp.fc1.weight', 'text_model.encoder.layers.21.mlp.fc2.bias', 'text_model.encoder.layers.21.mlp.fc2.weight', 'text_model.encoder.layers.21.self_attn.k_proj.bias', 'text_model.encoder.layers.21.self_attn.k_proj.weight', 'text_model.encoder.layers.21.self_attn.out_proj.bias', 'text_model.encoder.layers.21.self_attn.out_proj.weight', 'text_model.encoder.layers.21.self_attn.q_proj.bias', 'text_model.encoder.layers.21.self_attn.q_proj.weight', 'text_model.encoder.layers.21.self_attn.v_proj.bias', 'text_model.encoder.layers.21.self_attn.v_proj.weight', 'text_model.encoder.layers.22.layer_norm1.bias', 'text_model.encoder.layers.22.layer_norm1.weight', 'text_model.encoder.layers.22.layer_norm2.bias', 'text_model.encoder.layers.22.layer_norm2.weight', 'text_model.encoder.layers.22.mlp.fc1.bias', 'text_model.encoder.layers.22.mlp.fc1.weight', 'text_model.encoder.layers.22.mlp.fc2.bias', 'text_model.encoder.layers.22.mlp.fc2.weight', 'text_model.encoder.layers.22.self_attn.k_proj.bias', 'text_model.encoder.layers.22.self_attn.k_proj.weight', 'text_model.encoder.layers.22.self_attn.out_proj.bias', 'text_model.encoder.layers.22.self_attn.out_proj.weight', 'text_model.encoder.layers.22.self_attn.q_proj.bias', 'text_model.encoder.layers.22.self_attn.q_proj.weight', 'text_model.encoder.layers.22.self_attn.v_proj.bias', 'text_model.encoder.layers.22.self_attn.v_proj.weight', 'text_model.encoder.layers.23.layer_norm1.bias', 'text_model.encoder.layers.23.layer_norm1.weight', 'text_model.encoder.layers.23.layer_norm2.bias', 'text_model.encoder.layers.23.layer_norm2.weight', 'text_model.encoder.layers.23.mlp.fc1.bias', 'text_model.encoder.layers.23.mlp.fc1.weight', 'text_model.encoder.layers.23.mlp.fc2.bias', 'text_model.encoder.layers.23.mlp.fc2.weight', 'text_model.encoder.layers.23.self_attn.k_proj.bias', 'text_model.encoder.layers.23.self_attn.k_proj.weight', 'text_model.encoder.layers.23.self_attn.out_proj.bias', 
'text_model.encoder.layers.23.self_attn.out_proj.weight', 'text_model.encoder.layers.23.self_attn.q_proj.bias', 'text_model.encoder.layers.23.self_attn.q_proj.weight', 'text_model.encoder.layers.23.self_attn.v_proj.bias', 'text_model.encoder.layers.23.self_attn.v_proj.weight', 'text_model.encoder.layers.24.layer_norm1.bias', 'text_model.encoder.layers.24.layer_norm1.weight', 'text_model.encoder.layers.24.layer_norm2.bias', 'text_model.encoder.layers.24.layer_norm2.weight', 'text_model.encoder.layers.24.mlp.fc1.bias', 'text_model.encoder.layers.24.mlp.fc1.weight', 'text_model.encoder.layers.24.mlp.fc2.bias', 'text_model.encoder.layers.24.mlp.fc2.weight', 'text_model.encoder.layers.24.self_attn.k_proj.bias', 'text_model.encoder.layers.24.self_attn.k_proj.weight', 'text_model.encoder.layers.24.self_attn.out_proj.bias', 'text_model.encoder.layers.24.self_attn.out_proj.weight', 'text_model.encoder.layers.24.self_attn.q_proj.bias', 'text_model.encoder.layers.24.self_attn.q_proj.weight', 'text_model.encoder.layers.24.self_attn.v_proj.bias', 'text_model.encoder.layers.24.self_attn.v_proj.weight', 'text_model.encoder.layers.25.layer_norm1.bias', 'text_model.encoder.layers.25.layer_norm1.weight', 'text_model.encoder.layers.25.layer_norm2.bias', 'text_model.encoder.layers.25.layer_norm2.weight', 'text_model.encoder.layers.25.mlp.fc1.bias', 'text_model.encoder.layers.25.mlp.fc1.weight', 'text_model.encoder.layers.25.mlp.fc2.bias', 'text_model.encoder.layers.25.mlp.fc2.weight', 'text_model.encoder.layers.25.self_attn.k_proj.bias', 'text_model.encoder.layers.25.self_attn.k_proj.weight', 'text_model.encoder.layers.25.self_attn.out_proj.bias', 'text_model.encoder.layers.25.self_attn.out_proj.weight', 'text_model.encoder.layers.25.self_attn.q_proj.bias', 'text_model.encoder.layers.25.self_attn.q_proj.weight', 'text_model.encoder.layers.25.self_attn.v_proj.bias', 'text_model.encoder.layers.25.self_attn.v_proj.weight', 'text_model.encoder.layers.26.layer_norm1.bias', 'text_model.encoder.layers.26.layer_norm1.weight', 'text_model.encoder.layers.26.layer_norm2.bias', 'text_model.encoder.layers.26.layer_norm2.weight', 'text_model.encoder.layers.26.mlp.fc1.bias', 'text_model.encoder.layers.26.mlp.fc1.weight', 'text_model.encoder.layers.26.mlp.fc2.bias', 'text_model.encoder.layers.26.mlp.fc2.weight', 'text_model.encoder.layers.26.self_attn.k_proj.bias', 'text_model.encoder.layers.26.self_attn.k_proj.weight', 'text_model.encoder.layers.26.self_attn.out_proj.bias', 'text_model.encoder.layers.26.self_attn.out_proj.weight', 'text_model.encoder.layers.26.self_attn.q_proj.bias', 'text_model.encoder.layers.26.self_attn.q_proj.weight', 'text_model.encoder.layers.26.self_attn.v_proj.bias', 'text_model.encoder.layers.26.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 
'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 
'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_model.head.bias', 'text_model.head.weight']
224
- - This IS expected if you are initializing SiglipVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
225
- - This IS NOT expected if you are initializing SiglipVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
226
- 2025-02-14 04:23:53,971 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of SiglipVisionModel were initialized from the model checkpoint at google/siglip-so400m-patch14-384.
227
- If your task is similar to the task the model of the checkpoint was trained on, you can already use SiglipVisionModel for predictions without further training.
228
- 2025-02-14 04:23:54,163 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/preprocessor_config.json
229
- 2025-02-14 04:23:54,164 - image_processing_base.py:429 - from_dict - INFO - Image processor SiglipImageProcessor {
230
- "do_convert_rgb": null,
231
- "do_normalize": true,
232
- "do_rescale": true,
233
- "do_resize": true,
234
- "image_mean": [
235
- 0.5,
236
- 0.5,
237
- 0.5
238
- ],
239
- "image_processor_type": "SiglipImageProcessor",
240
- "image_std": [
241
- 0.5,
242
- 0.5,
243
- 0.5
244
- ],
245
- "processor_class": "SiglipProcessor",
246
- "resample": 3,
247
- "rescale_factor": 0.00392156862745098,
248
- "size": {
249
- "height": 384,
250
- "width": 384
251
- }
252
- }
253
-
254
- 2025-02-14 04:23:54,834 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
255
- 2025-02-14 04:23:54,837 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
256
- "apply_layernorm": true,
257
- "architectures": [
258
- "Dinov2Model"
259
- ],
260
- "attention_probs_dropout_prob": 0.0,
261
- "drop_path_rate": 0.0,
262
- "hidden_act": "gelu",
263
- "hidden_dropout_prob": 0.0,
264
- "hidden_size": 1536,
265
- "image_size": 518,
266
- "initializer_range": 0.02,
267
- "layer_norm_eps": 1e-06,
268
- "layerscale_value": 1.0,
269
- "mlp_ratio": 4,
270
- "model_type": "dinov2",
271
- "num_attention_heads": 24,
272
- "num_channels": 3,
273
- "num_hidden_layers": 40,
274
- "out_features": [
275
- "stage40"
276
- ],
277
- "out_indices": [
278
- 40
279
- ],
280
- "patch_size": 14,
281
- "qkv_bias": true,
282
- "reshape_hidden_states": true,
283
- "stage_names": [
284
- "stem",
285
- "stage1",
286
- "stage2",
287
- "stage3",
288
- "stage4",
289
- "stage5",
290
- "stage6",
291
- "stage7",
292
- "stage8",
293
- "stage9",
294
- "stage10",
295
- "stage11",
296
- "stage12",
297
- "stage13",
298
- "stage14",
299
- "stage15",
300
- "stage16",
301
- "stage17",
302
- "stage18",
303
- "stage19",
304
- "stage20",
305
- "stage21",
306
- "stage22",
307
- "stage23",
308
- "stage24",
309
- "stage25",
310
- "stage26",
311
- "stage27",
312
- "stage28",
313
- "stage29",
314
- "stage30",
315
- "stage31",
316
- "stage32",
317
- "stage33",
318
- "stage34",
319
- "stage35",
320
- "stage36",
321
- "stage37",
322
- "stage38",
323
- "stage39",
324
- "stage40"
325
- ],
326
- "torch_dtype": "float32",
327
- "transformers_version": "4.43.1",
328
- "use_swiglu_ffn": true
329
- }
330
-
331
- 2025-02-14 04:23:54,838 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/model.safetensors
332
- 2025-02-14 04:23:55,363 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing Dinov2Model.
333
-
334
- 2025-02-14 04:23:55,364 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of Dinov2Model were initialized from the model checkpoint at facebook/dinov2-giant.
335
- If your task is similar to the task the model of the checkpoint was trained on, you can already use Dinov2Model for predictions without further training.
336
- 2025-02-14 04:23:55,551 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/preprocessor_config.json
337
- 2025-02-14 04:23:55,554 - image_processing_base.py:429 - from_dict - INFO - Image processor BitImageProcessor {
338
- "crop_size": {
339
- "height": 378,
340
- "width": 378
341
- },
342
- "do_center_crop": true,
343
- "do_convert_rgb": true,
344
- "do_normalize": true,
345
- "do_rescale": true,
346
- "do_resize": true,
347
- "image_mean": [
348
- 0.485,
349
- 0.456,
350
- 0.406
351
- ],
352
- "image_processor_type": "BitImageProcessor",
353
- "image_std": [
354
- 0.229,
355
- 0.224,
356
- 0.225
357
- ],
358
- "resample": 3,
359
- "rescale_factor": 0.00392156862745098,
360
- "size": {
361
- "shortest_edge": 378
362
- }
363
- }
364
-
365
- 2025-02-14 04:23:56,625 - finetune_llama.py:1239 - train - INFO - Total params: 3264865280
366
- 2025-02-14 04:23:56,625 - finetune_llama.py:1240 - train - INFO - Trainable params: 12589056
367
- 2025-02-14 04:23:56,625 - finetune_llama.py:1241 - train - INFO - LM head params: 394002432
368
- 2025-02-14 04:23:57,952 - trainer_callback.py:423 - add_callback - WARNING - You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
369
- :DefaultFlowCallback
370
- TensorBoardCallback
371
- 2025-02-14 04:23:57,952 - trainer.py:648 - __init__ - INFO - Using auto half precision backend
372
- 2025-02-14 04:24:00,013 - trainer.py:2134 - _inner_training_loop - INFO - ***** Running training *****
373
- 2025-02-14 04:24:00,013 - trainer.py:2135 - _inner_training_loop - INFO - Num examples = 554
374
- 2025-02-14 04:24:00,013 - trainer.py:2136 - _inner_training_loop - INFO - Num Epochs = 2
375
- 2025-02-14 04:24:00,013 - trainer.py:2137 - _inner_training_loop - INFO - Instantaneous batch size per device = 1
376
- 2025-02-14 04:24:00,013 - trainer.py:2140 - _inner_training_loop - INFO - Total train batch size (w. parallel, distributed & accumulation) = 1
377
- 2025-02-14 04:24:00,013 - trainer.py:2141 - _inner_training_loop - INFO - Gradient Accumulation steps = 1
378
- 2025-02-14 04:24:00,013 - trainer.py:2142 - _inner_training_loop - INFO - Total optimization steps = 1,108
379
- 2025-02-14 04:24:00,015 - trainer.py:2143 - _inner_training_loop - INFO - Number of trainable parameters = 406,591,488
380
- 2025-02-14 04:24:26,002 - resource_logging.py:42 - debug_tensor - DEBUG - File: Unknown, Line: Unknown
381
- 2025-02-14 04:24:26,003 - resource_logging.py:45 - debug_tensor - DEBUG - In compute_loss(): inputs['labels']: [torch.Size([1, 8192]), torch.int64, cuda:0]
382
- 2025-02-14 04:24:26,038 - mm_trainer.py:618 - compute_loss - DEBUG - In compute_loss(): assistant token at position 224
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c214f7ce3a6683296aa31dd99ef51f5692f03033e916f7973d03b3cb7b3484d
3
+ size 37176
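The three added lines above are the entire new file: a Git LFS pointer recording the spec version, the sha256 of the real log, and its size in bytes. The log contents themselves now live in LFS storage and are materialized on checkout when git-lfs is installed. A small sketch of reading such a pointer, assuming only the three-field format visible in this diff (the helper name is ours, not part of any git-lfs API):

from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    # Parse the "key value" lines of a Git LFS pointer file such as the
    # runtime_logs/*.log stubs committed here.
    fields = {}
    for line in Path(path).read_text().splitlines():
        if line.strip():
            key, _, value = line.partition(" ")
            fields[key] = value
    return {
        "version": fields.get("version"),
        # str.removeprefix requires Python 3.9+.
        "sha256": fields.get("oid", "").removeprefix("sha256:"),
        "size_bytes": int(fields.get("size", "0")),
    }

# For the pointer above this would yield:
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'sha256': '6c214f7ce3a6683296aa31dd99ef51f5692f03033e916f7973d03b3cb7b3484d',
#  'size_bytes': 37176}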
runtime_logs/run_2025-02-14_04-25-33.log CHANGED
@@ -1,378 +1,3 @@
1
- 2025-02-14 04:25:33,522 - training_args.py:2100 - _setup_devices - INFO - PyTorch: setting up devices
2
- 2025-02-14 04:25:34,088 - configuration_utils.py:731 - _get_config_dict - INFO - loading configuration file ./checkpoints/longvu_llama3_2/config.json
3
- 2025-02-14 04:25:34,092 - configuration_utils.py:800 - from_dict - INFO - Model config CambrianConfig {
4
- "_name_or_path": "/tmp/iopath_cache/manifold_cache/tree/users/shenx/finetune/09281004-cambrian_llama3_2_t576_ov",
5
- "architectures": [
6
- "CambrianLlamaForCausalLM"
7
- ],
8
- "attention_bias": false,
9
- "attention_dropout": 0.0,
10
- "bos_token_id": 128000,
11
- "connect_layer": 2,
12
- "connector_depth": 3,
13
- "connector_only": true,
14
- "dino_threshold": 0.83,
15
- "drop_threshold": 0.8,
16
- "eos_token_id": [
17
- 128001,
18
- 128008,
19
- 128009
20
- ],
21
- "frame_pos": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "highres": true,
26
- "highres_connect": false,
27
- "image_aspect_ratio": "pad",
28
- "image_position": 91,
29
- "image_token_len": 144,
30
- "initializer_range": 0.02,
31
- "intermediate_size": 8192,
32
- "is_image_newline": true,
33
- "is_st_sampler": false,
34
- "lowres_token": 8,
35
- "max_position_embeddings": 131072,
36
- "mlp_bias": false,
37
- "mm_patch_merge_type": "flat",
38
- "mm_projector_lr": null,
39
- "mm_projector_type": "sva",
40
- "mm_use_im_patch_token": false,
41
- "mm_use_im_start_end": false,
42
- "mm_vision_sampler_lr": null,
43
- "mm_vision_select_feature": "patch",
44
- "mm_vision_select_layer": -2,
45
- "mm_vision_tower_aux_list": [
46
- "siglip/CLIP-ViT-SO400M-14-384",
47
- "facebook/dinov2-giant-res378"
48
- ],
49
- "mm_vision_tower_aux_token_len_list": [
50
- 576,
51
- 576
52
- ],
53
- "mm_vision_tower_lr": null,
54
- "model_type": "cambrian_llama",
55
- "num_attention_heads": 24,
56
- "num_hidden_layers": 28,
57
- "num_key_value_heads": 8,
58
- "num_of_vision_sampler_layers": 10,
59
- "num_query_group": 1,
60
- "pretraining_tp": 1,
61
- "query_num_list": [
62
- 144
63
- ],
64
- "rms_norm_eps": 1e-05,
65
- "rope_scaling": {
66
- "factor": 32.0,
67
- "high_freq_factor": 4.0,
68
- "low_freq_factor": 1.0,
69
- "original_max_position_embeddings": 8192,
70
- "rope_type": "llama3"
71
- },
72
- "rope_theta": 500000.0,
73
- "spmd_debug": null,
74
- "spmd_fsdp_sharding": null,
75
- "spmd_mesh": null,
76
- "start_of_vision_sampler_layers": 0,
77
- "stride_of_vision_sampler_layers": 3,
78
- "tie_word_embeddings": false,
79
- "tokenizer_model_max_length": 8192,
80
- "tokenizer_padding_side": "right",
81
- "torch_dtype": "float32",
82
- "transformers_version": "4.43.1",
83
- "tune_mm_mlp_adapter": false,
84
- "unfreeze_mm_vision_tower": false,
85
- "use_cache": false,
86
- "use_mm_proj": true,
87
- "vision_hidden_size": 1024,
88
- "vision_tower_aux_token_len_list": [
89
- 576,
90
- 576
91
- ],
92
- "vocab_size": 128256
93
- }
94
-
95
- 2025-02-14 04:25:34,092 - modeling_utils.py:3618 - from_pretrained - INFO - loading weights file ./checkpoints/longvu_llama3_2/pytorch_model.bin
96
- 2025-02-14 04:25:34,151 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
97
- "bos_token_id": 128000,
98
- "eos_token_id": [
99
- 128001,
100
- 128008,
101
- 128009
102
- ],
103
- "use_cache": false
104
- }
105
-
106
- 2025-02-14 04:25:34,708 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
107
- 2025-02-14 04:25:34,712 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
108
- "apply_layernorm": true,
109
- "architectures": [
110
- "Dinov2Model"
111
- ],
112
- "attention_probs_dropout_prob": 0.0,
113
- "drop_path_rate": 0.0,
114
- "hidden_act": "gelu",
115
- "hidden_dropout_prob": 0.0,
116
- "hidden_size": 1536,
117
- "image_size": 518,
118
- "initializer_range": 0.02,
119
- "layer_norm_eps": 1e-06,
120
- "layerscale_value": 1.0,
121
- "mlp_ratio": 4,
122
- "model_type": "dinov2",
123
- "num_attention_heads": 24,
124
- "num_channels": 3,
125
- "num_hidden_layers": 40,
126
- "out_features": [
127
- "stage40"
128
- ],
129
- "out_indices": [
130
- 40
131
- ],
132
- "patch_size": 14,
133
- "qkv_bias": true,
134
- "reshape_hidden_states": true,
135
- "stage_names": [
136
- "stem",
137
- "stage1",
138
- "stage2",
139
- "stage3",
140
- "stage4",
141
- "stage5",
142
- "stage6",
143
- "stage7",
144
- "stage8",
145
- "stage9",
146
- "stage10",
147
- "stage11",
148
- "stage12",
149
- "stage13",
150
- "stage14",
151
- "stage15",
152
- "stage16",
153
- "stage17",
154
- "stage18",
155
- "stage19",
156
- "stage20",
157
- "stage21",
158
- "stage22",
159
- "stage23",
160
- "stage24",
161
- "stage25",
162
- "stage26",
163
- "stage27",
164
- "stage28",
165
- "stage29",
166
- "stage30",
167
- "stage31",
168
- "stage32",
169
- "stage33",
170
- "stage34",
171
- "stage35",
172
- "stage36",
173
- "stage37",
174
- "stage38",
175
- "stage39",
176
- "stage40"
177
- ],
178
- "torch_dtype": "float32",
179
- "transformers_version": "4.43.1",
180
- "use_swiglu_ffn": true
181
- }
182
-
183
- 2025-02-14 04:25:36,246 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing CambrianLlamaForCausalLM.
184
-
185
- 2025-02-14 04:25:36,247 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of CambrianLlamaForCausalLM were initialized from the model checkpoint at ./checkpoints/longvu_llama3_2.
186
- If your task is similar to the task the model of the checkpoint was trained on, you can already use CambrianLlamaForCausalLM for predictions without further training.
187
- 2025-02-14 04:25:36,252 - configuration_utils.py:991 - from_pretrained - INFO - loading configuration file ./checkpoints/longvu_llama3_2/generation_config.json
188
- 2025-02-14 04:25:36,253 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
189
- "bos_token_id": 128000,
190
- "do_sample": true,
191
- "eos_token_id": [
192
- 128001,
193
- 128008,
194
- 128009
195
- ],
196
- "temperature": 0.6,
197
- "top_p": 0.9
198
- }
199
-
200
- 2025-02-14 04:25:36,601 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer.json
201
- 2025-02-14 04:25:36,601 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file added_tokens.json
202
- 2025-02-14 04:25:36,601 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file special_tokens_map.json
203
- 2025-02-14 04:25:36,601 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer_config.json
204
- 2025-02-14 04:25:37,048 - tokenization_utils_base.py:2533 - _from_pretrained - INFO - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
205
- 2025-02-14 04:25:37,745 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/config.json
206
- 2025-02-14 04:25:37,748 - configuration_utils.py:800 - from_dict - INFO - Model config SiglipVisionConfig {
207
- "attention_dropout": 0.0,
208
- "hidden_act": "gelu_pytorch_tanh",
209
- "hidden_size": 1152,
210
- "image_size": 384,
211
- "intermediate_size": 4304,
212
- "layer_norm_eps": 1e-06,
213
- "model_type": "siglip_vision_model",
214
- "num_attention_heads": 16,
215
- "num_channels": 3,
216
- "num_hidden_layers": 27,
217
- "patch_size": 14,
218
- "transformers_version": "4.43.1"
219
- }
220
-
221
- 2025-02-14 04:25:37,749 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/model.safetensors
222
- 2025-02-14 04:25:38,020 - modeling_utils.py:4440 - _load_pretrained_model - INFO - Some weights of the model checkpoint at google/siglip-so400m-patch14-384 were not used when initializing SiglipVisionModel: ['logit_bias', 'logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 
'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.12.layer_norm1.bias', 'text_model.encoder.layers.12.layer_norm1.weight', 'text_model.encoder.layers.12.layer_norm2.bias', 'text_model.encoder.layers.12.layer_norm2.weight', 'text_model.encoder.layers.12.mlp.fc1.bias', 'text_model.encoder.layers.12.mlp.fc1.weight', 'text_model.encoder.layers.12.mlp.fc2.bias', 'text_model.encoder.layers.12.mlp.fc2.weight', 'text_model.encoder.layers.12.self_attn.k_proj.bias', 'text_model.encoder.layers.12.self_attn.k_proj.weight', 'text_model.encoder.layers.12.self_attn.out_proj.bias', 'text_model.encoder.layers.12.self_attn.out_proj.weight', 'text_model.encoder.layers.12.self_attn.q_proj.bias', 'text_model.encoder.layers.12.self_attn.q_proj.weight', 'text_model.encoder.layers.12.self_attn.v_proj.bias', 'text_model.encoder.layers.12.self_attn.v_proj.weight', 'text_model.encoder.layers.13.layer_norm1.bias', 'text_model.encoder.layers.13.layer_norm1.weight', 'text_model.encoder.layers.13.layer_norm2.bias', 'text_model.encoder.layers.13.layer_norm2.weight', 'text_model.encoder.layers.13.mlp.fc1.bias', 'text_model.encoder.layers.13.mlp.fc1.weight', 'text_model.encoder.layers.13.mlp.fc2.bias', 'text_model.encoder.layers.13.mlp.fc2.weight', 'text_model.encoder.layers.13.self_attn.k_proj.bias', 'text_model.encoder.layers.13.self_attn.k_proj.weight', 'text_model.encoder.layers.13.self_attn.out_proj.bias', 'text_model.encoder.layers.13.self_attn.out_proj.weight', 'text_model.encoder.layers.13.self_attn.q_proj.bias', 'text_model.encoder.layers.13.self_attn.q_proj.weight', 'text_model.encoder.layers.13.self_attn.v_proj.bias', 'text_model.encoder.layers.13.self_attn.v_proj.weight', 'text_model.encoder.layers.14.layer_norm1.bias', 'text_model.encoder.layers.14.layer_norm1.weight', 'text_model.encoder.layers.14.layer_norm2.bias', 'text_model.encoder.layers.14.layer_norm2.weight', 'text_model.encoder.layers.14.mlp.fc1.bias', 'text_model.encoder.layers.14.mlp.fc1.weight', 'text_model.encoder.layers.14.mlp.fc2.bias', 'text_model.encoder.layers.14.mlp.fc2.weight', 'text_model.encoder.layers.14.self_attn.k_proj.bias', 'text_model.encoder.layers.14.self_attn.k_proj.weight', 'text_model.encoder.layers.14.self_attn.out_proj.bias', 'text_model.encoder.layers.14.self_attn.out_proj.weight', 'text_model.encoder.layers.14.self_attn.q_proj.bias', 'text_model.encoder.layers.14.self_attn.q_proj.weight', 'text_model.encoder.layers.14.self_attn.v_proj.bias', 'text_model.encoder.layers.14.self_attn.v_proj.weight', 'text_model.encoder.layers.15.layer_norm1.bias', 'text_model.encoder.layers.15.layer_norm1.weight', 'text_model.encoder.layers.15.layer_norm2.bias', 'text_model.encoder.layers.15.layer_norm2.weight', 'text_model.encoder.layers.15.mlp.fc1.bias', 'text_model.encoder.layers.15.mlp.fc1.weight', 'text_model.encoder.layers.15.mlp.fc2.bias', 'text_model.encoder.layers.15.mlp.fc2.weight', 'text_model.encoder.layers.15.self_attn.k_proj.bias', 'text_model.encoder.layers.15.self_attn.k_proj.weight', 'text_model.encoder.layers.15.self_attn.out_proj.bias', 'text_model.encoder.layers.15.self_attn.out_proj.weight', 'text_model.encoder.layers.15.self_attn.q_proj.bias', 'text_model.encoder.layers.15.self_attn.q_proj.weight', 'text_model.encoder.layers.15.self_attn.v_proj.bias', 'text_model.encoder.layers.15.self_attn.v_proj.weight', 'text_model.encoder.layers.16.layer_norm1.bias', 'text_model.encoder.layers.16.layer_norm1.weight', 
'text_model.encoder.layers.16.layer_norm2.bias', 'text_model.encoder.layers.16.layer_norm2.weight', 'text_model.encoder.layers.16.mlp.fc1.bias', 'text_model.encoder.layers.16.mlp.fc1.weight', 'text_model.encoder.layers.16.mlp.fc2.bias', 'text_model.encoder.layers.16.mlp.fc2.weight', 'text_model.encoder.layers.16.self_attn.k_proj.bias', 'text_model.encoder.layers.16.self_attn.k_proj.weight', 'text_model.encoder.layers.16.self_attn.out_proj.bias', 'text_model.encoder.layers.16.self_attn.out_proj.weight', 'text_model.encoder.layers.16.self_attn.q_proj.bias', 'text_model.encoder.layers.16.self_attn.q_proj.weight', 'text_model.encoder.layers.16.self_attn.v_proj.bias', 'text_model.encoder.layers.16.self_attn.v_proj.weight', 'text_model.encoder.layers.17.layer_norm1.bias', 'text_model.encoder.layers.17.layer_norm1.weight', 'text_model.encoder.layers.17.layer_norm2.bias', 'text_model.encoder.layers.17.layer_norm2.weight', 'text_model.encoder.layers.17.mlp.fc1.bias', 'text_model.encoder.layers.17.mlp.fc1.weight', 'text_model.encoder.layers.17.mlp.fc2.bias', 'text_model.encoder.layers.17.mlp.fc2.weight', 'text_model.encoder.layers.17.self_attn.k_proj.bias', 'text_model.encoder.layers.17.self_attn.k_proj.weight', 'text_model.encoder.layers.17.self_attn.out_proj.bias', 'text_model.encoder.layers.17.self_attn.out_proj.weight', 'text_model.encoder.layers.17.self_attn.q_proj.bias', 'text_model.encoder.layers.17.self_attn.q_proj.weight', 'text_model.encoder.layers.17.self_attn.v_proj.bias', 'text_model.encoder.layers.17.self_attn.v_proj.weight', 'text_model.encoder.layers.18.layer_norm1.bias', 'text_model.encoder.layers.18.layer_norm1.weight', 'text_model.encoder.layers.18.layer_norm2.bias', 'text_model.encoder.layers.18.layer_norm2.weight', 'text_model.encoder.layers.18.mlp.fc1.bias', 'text_model.encoder.layers.18.mlp.fc1.weight', 'text_model.encoder.layers.18.mlp.fc2.bias', 'text_model.encoder.layers.18.mlp.fc2.weight', 'text_model.encoder.layers.18.self_attn.k_proj.bias', 'text_model.encoder.layers.18.self_attn.k_proj.weight', 'text_model.encoder.layers.18.self_attn.out_proj.bias', 'text_model.encoder.layers.18.self_attn.out_proj.weight', 'text_model.encoder.layers.18.self_attn.q_proj.bias', 'text_model.encoder.layers.18.self_attn.q_proj.weight', 'text_model.encoder.layers.18.self_attn.v_proj.bias', 'text_model.encoder.layers.18.self_attn.v_proj.weight', 'text_model.encoder.layers.19.layer_norm1.bias', 'text_model.encoder.layers.19.layer_norm1.weight', 'text_model.encoder.layers.19.layer_norm2.bias', 'text_model.encoder.layers.19.layer_norm2.weight', 'text_model.encoder.layers.19.mlp.fc1.bias', 'text_model.encoder.layers.19.mlp.fc1.weight', 'text_model.encoder.layers.19.mlp.fc2.bias', 'text_model.encoder.layers.19.mlp.fc2.weight', 'text_model.encoder.layers.19.self_attn.k_proj.bias', 'text_model.encoder.layers.19.self_attn.k_proj.weight', 'text_model.encoder.layers.19.self_attn.out_proj.bias', 'text_model.encoder.layers.19.self_attn.out_proj.weight', 'text_model.encoder.layers.19.self_attn.q_proj.bias', 'text_model.encoder.layers.19.self_attn.q_proj.weight', 'text_model.encoder.layers.19.self_attn.v_proj.bias', 'text_model.encoder.layers.19.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 
'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.20.layer_norm1.bias', 'text_model.encoder.layers.20.layer_norm1.weight', 'text_model.encoder.layers.20.layer_norm2.bias', 'text_model.encoder.layers.20.layer_norm2.weight', 'text_model.encoder.layers.20.mlp.fc1.bias', 'text_model.encoder.layers.20.mlp.fc1.weight', 'text_model.encoder.layers.20.mlp.fc2.bias', 'text_model.encoder.layers.20.mlp.fc2.weight', 'text_model.encoder.layers.20.self_attn.k_proj.bias', 'text_model.encoder.layers.20.self_attn.k_proj.weight', 'text_model.encoder.layers.20.self_attn.out_proj.bias', 'text_model.encoder.layers.20.self_attn.out_proj.weight', 'text_model.encoder.layers.20.self_attn.q_proj.bias', 'text_model.encoder.layers.20.self_attn.q_proj.weight', 'text_model.encoder.layers.20.self_attn.v_proj.bias', 'text_model.encoder.layers.20.self_attn.v_proj.weight', 'text_model.encoder.layers.21.layer_norm1.bias', 'text_model.encoder.layers.21.layer_norm1.weight', 'text_model.encoder.layers.21.layer_norm2.bias', 'text_model.encoder.layers.21.layer_norm2.weight', 'text_model.encoder.layers.21.mlp.fc1.bias', 'text_model.encoder.layers.21.mlp.fc1.weight', 'text_model.encoder.layers.21.mlp.fc2.bias', 'text_model.encoder.layers.21.mlp.fc2.weight', 'text_model.encoder.layers.21.self_attn.k_proj.bias', 'text_model.encoder.layers.21.self_attn.k_proj.weight', 'text_model.encoder.layers.21.self_attn.out_proj.bias', 'text_model.encoder.layers.21.self_attn.out_proj.weight', 'text_model.encoder.layers.21.self_attn.q_proj.bias', 'text_model.encoder.layers.21.self_attn.q_proj.weight', 'text_model.encoder.layers.21.self_attn.v_proj.bias', 'text_model.encoder.layers.21.self_attn.v_proj.weight', 'text_model.encoder.layers.22.layer_norm1.bias', 'text_model.encoder.layers.22.layer_norm1.weight', 'text_model.encoder.layers.22.layer_norm2.bias', 'text_model.encoder.layers.22.layer_norm2.weight', 'text_model.encoder.layers.22.mlp.fc1.bias', 'text_model.encoder.layers.22.mlp.fc1.weight', 'text_model.encoder.layers.22.mlp.fc2.bias', 'text_model.encoder.layers.22.mlp.fc2.weight', 'text_model.encoder.layers.22.self_attn.k_proj.bias', 'text_model.encoder.layers.22.self_attn.k_proj.weight', 'text_model.encoder.layers.22.self_attn.out_proj.bias', 'text_model.encoder.layers.22.self_attn.out_proj.weight', 'text_model.encoder.layers.22.self_attn.q_proj.bias', 'text_model.encoder.layers.22.self_attn.q_proj.weight', 'text_model.encoder.layers.22.self_attn.v_proj.bias', 'text_model.encoder.layers.22.self_attn.v_proj.weight', 'text_model.encoder.layers.23.layer_norm1.bias', 'text_model.encoder.layers.23.layer_norm1.weight', 'text_model.encoder.layers.23.layer_norm2.bias', 'text_model.encoder.layers.23.layer_norm2.weight', 'text_model.encoder.layers.23.mlp.fc1.bias', 'text_model.encoder.layers.23.mlp.fc1.weight', 'text_model.encoder.layers.23.mlp.fc2.bias', 'text_model.encoder.layers.23.mlp.fc2.weight', 'text_model.encoder.layers.23.self_attn.k_proj.bias', 'text_model.encoder.layers.23.self_attn.k_proj.weight', 'text_model.encoder.layers.23.self_attn.out_proj.bias', 
'text_model.encoder.layers.23.self_attn.out_proj.weight', 'text_model.encoder.layers.23.self_attn.q_proj.bias', 'text_model.encoder.layers.23.self_attn.q_proj.weight', 'text_model.encoder.layers.23.self_attn.v_proj.bias', 'text_model.encoder.layers.23.self_attn.v_proj.weight', 'text_model.encoder.layers.24.layer_norm1.bias', 'text_model.encoder.layers.24.layer_norm1.weight', 'text_model.encoder.layers.24.layer_norm2.bias', 'text_model.encoder.layers.24.layer_norm2.weight', 'text_model.encoder.layers.24.mlp.fc1.bias', 'text_model.encoder.layers.24.mlp.fc1.weight', 'text_model.encoder.layers.24.mlp.fc2.bias', 'text_model.encoder.layers.24.mlp.fc2.weight', 'text_model.encoder.layers.24.self_attn.k_proj.bias', 'text_model.encoder.layers.24.self_attn.k_proj.weight', 'text_model.encoder.layers.24.self_attn.out_proj.bias', 'text_model.encoder.layers.24.self_attn.out_proj.weight', 'text_model.encoder.layers.24.self_attn.q_proj.bias', 'text_model.encoder.layers.24.self_attn.q_proj.weight', 'text_model.encoder.layers.24.self_attn.v_proj.bias', 'text_model.encoder.layers.24.self_attn.v_proj.weight', 'text_model.encoder.layers.25.layer_norm1.bias', 'text_model.encoder.layers.25.layer_norm1.weight', 'text_model.encoder.layers.25.layer_norm2.bias', 'text_model.encoder.layers.25.layer_norm2.weight', 'text_model.encoder.layers.25.mlp.fc1.bias', 'text_model.encoder.layers.25.mlp.fc1.weight', 'text_model.encoder.layers.25.mlp.fc2.bias', 'text_model.encoder.layers.25.mlp.fc2.weight', 'text_model.encoder.layers.25.self_attn.k_proj.bias', 'text_model.encoder.layers.25.self_attn.k_proj.weight', 'text_model.encoder.layers.25.self_attn.out_proj.bias', 'text_model.encoder.layers.25.self_attn.out_proj.weight', 'text_model.encoder.layers.25.self_attn.q_proj.bias', 'text_model.encoder.layers.25.self_attn.q_proj.weight', 'text_model.encoder.layers.25.self_attn.v_proj.bias', 'text_model.encoder.layers.25.self_attn.v_proj.weight', 'text_model.encoder.layers.26.layer_norm1.bias', 'text_model.encoder.layers.26.layer_norm1.weight', 'text_model.encoder.layers.26.layer_norm2.bias', 'text_model.encoder.layers.26.layer_norm2.weight', 'text_model.encoder.layers.26.mlp.fc1.bias', 'text_model.encoder.layers.26.mlp.fc1.weight', 'text_model.encoder.layers.26.mlp.fc2.bias', 'text_model.encoder.layers.26.mlp.fc2.weight', 'text_model.encoder.layers.26.self_attn.k_proj.bias', 'text_model.encoder.layers.26.self_attn.k_proj.weight', 'text_model.encoder.layers.26.self_attn.out_proj.bias', 'text_model.encoder.layers.26.self_attn.out_proj.weight', 'text_model.encoder.layers.26.self_attn.q_proj.bias', 'text_model.encoder.layers.26.self_attn.q_proj.weight', 'text_model.encoder.layers.26.self_attn.v_proj.bias', 'text_model.encoder.layers.26.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 
'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 
'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_model.head.bias', 'text_model.head.weight']
223
- - This IS expected if you are initializing SiglipVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
224
- - This IS NOT expected if you are initializing SiglipVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
225
- 2025-02-14 04:25:38,022 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of SiglipVisionModel were initialized from the model checkpoint at google/siglip-so400m-patch14-384.
226
- If your task is similar to the task the model of the checkpoint was trained on, you can already use SiglipVisionModel for predictions without further training.
227
- 2025-02-14 04:25:38,212 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/preprocessor_config.json
228
- 2025-02-14 04:25:38,212 - image_processing_base.py:429 - from_dict - INFO - Image processor SiglipImageProcessor {
229
- "do_convert_rgb": null,
230
- "do_normalize": true,
231
- "do_rescale": true,
232
- "do_resize": true,
233
- "image_mean": [
234
- 0.5,
235
- 0.5,
236
- 0.5
237
- ],
238
- "image_processor_type": "SiglipImageProcessor",
239
- "image_std": [
240
- 0.5,
241
- 0.5,
242
- 0.5
243
- ],
244
- "processor_class": "SiglipProcessor",
245
- "resample": 3,
246
- "rescale_factor": 0.00392156862745098,
247
- "size": {
248
- "height": 384,
249
- "width": 384
250
- }
251
- }
252
-
253
- 2025-02-14 04:25:38,584 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
254
- 2025-02-14 04:25:38,587 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
255
- "apply_layernorm": true,
256
- "architectures": [
257
- "Dinov2Model"
258
- ],
259
- "attention_probs_dropout_prob": 0.0,
260
- "drop_path_rate": 0.0,
261
- "hidden_act": "gelu",
262
- "hidden_dropout_prob": 0.0,
263
- "hidden_size": 1536,
264
- "image_size": 518,
265
- "initializer_range": 0.02,
266
- "layer_norm_eps": 1e-06,
267
- "layerscale_value": 1.0,
268
- "mlp_ratio": 4,
269
- "model_type": "dinov2",
270
- "num_attention_heads": 24,
271
- "num_channels": 3,
272
- "num_hidden_layers": 40,
273
- "out_features": [
274
- "stage40"
275
- ],
276
- "out_indices": [
277
- 40
278
- ],
279
- "patch_size": 14,
280
- "qkv_bias": true,
281
- "reshape_hidden_states": true,
282
- "stage_names": [
283
- "stem",
284
- "stage1",
285
- "stage2",
286
- "stage3",
287
- "stage4",
288
- "stage5",
289
- "stage6",
290
- "stage7",
291
- "stage8",
292
- "stage9",
293
- "stage10",
294
- "stage11",
295
- "stage12",
296
- "stage13",
297
- "stage14",
298
- "stage15",
299
- "stage16",
300
- "stage17",
301
- "stage18",
302
- "stage19",
303
- "stage20",
304
- "stage21",
305
- "stage22",
306
- "stage23",
307
- "stage24",
308
- "stage25",
309
- "stage26",
310
- "stage27",
311
- "stage28",
312
- "stage29",
313
- "stage30",
314
- "stage31",
315
- "stage32",
316
- "stage33",
317
- "stage34",
318
- "stage35",
319
- "stage36",
320
- "stage37",
321
- "stage38",
322
- "stage39",
323
- "stage40"
324
- ],
325
- "torch_dtype": "float32",
326
- "transformers_version": "4.43.1",
327
- "use_swiglu_ffn": true
328
- }
329
-
330
- 2025-02-14 04:25:38,588 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/model.safetensors
331
- 2025-02-14 04:25:39,038 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing Dinov2Model.
332
-
333
- 2025-02-14 04:25:39,038 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of Dinov2Model were initialized from the model checkpoint at facebook/dinov2-giant.
334
- If your task is similar to the task the model of the checkpoint was trained on, you can already use Dinov2Model for predictions without further training.
335
- 2025-02-14 04:25:39,536 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/preprocessor_config.json
336
- 2025-02-14 04:25:39,540 - image_processing_base.py:429 - from_dict - INFO - Image processor BitImageProcessor {
337
- "crop_size": {
338
- "height": 378,
339
- "width": 378
340
- },
341
- "do_center_crop": true,
342
- "do_convert_rgb": true,
343
- "do_normalize": true,
344
- "do_rescale": true,
345
- "do_resize": true,
346
- "image_mean": [
347
- 0.485,
348
- 0.456,
349
- 0.406
350
- ],
351
- "image_processor_type": "BitImageProcessor",
352
- "image_std": [
353
- 0.229,
354
- 0.224,
355
- 0.225
356
- ],
357
- "resample": 3,
358
- "rescale_factor": 0.00392156862745098,
359
- "size": {
360
- "shortest_edge": 378
361
- }
362
- }
363
-
364
- 2025-02-14 04:25:40,430 - finetune_llama.py:1239 - train - INFO - Total params: 3264865280
365
- 2025-02-14 04:25:40,430 - finetune_llama.py:1240 - train - INFO - Trainable params: 12589056
366
- 2025-02-14 04:25:40,430 - finetune_llama.py:1241 - train - INFO - LM head params: 394002432
367
- 2025-02-14 04:25:42,550 - trainer_callback.py:423 - add_callback - WARNING - You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
368
- :DefaultFlowCallback
369
- TensorBoardCallback
370
- 2025-02-14 04:25:42,550 - trainer.py:648 - __init__ - INFO - Using auto half precision backend
371
- 2025-02-14 04:25:42,857 - trainer.py:2134 - _inner_training_loop - INFO - ***** Running training *****
372
- 2025-02-14 04:25:42,857 - trainer.py:2135 - _inner_training_loop - INFO - Num examples = 554
373
- 2025-02-14 04:25:42,857 - trainer.py:2136 - _inner_training_loop - INFO - Num Epochs = 2
374
- 2025-02-14 04:25:42,857 - trainer.py:2137 - _inner_training_loop - INFO - Instantaneous batch size per device = 1
375
- 2025-02-14 04:25:42,857 - trainer.py:2140 - _inner_training_loop - INFO - Total train batch size (w. parallel, distributed & accumulation) = 1
376
- 2025-02-14 04:25:42,857 - trainer.py:2141 - _inner_training_loop - INFO - Gradient Accumulation steps = 1
377
- 2025-02-14 04:25:42,857 - trainer.py:2142 - _inner_training_loop - INFO - Total optimization steps = 1,108
378
- 2025-02-14 04:25:42,859 - trainer.py:2143 - _inner_training_loop - INFO - Number of trainable parameters = 406,591,488
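For reference, the counts logged for this run are internally consistent: with 554 training examples, a total train batch size of 1 and no gradient accumulation, two epochs give 554 × 2 = 1,108 optimization steps, and the 406,591,488 trainable parameters seen by the Trainer equal the 12,589,056 trainable parameters plus the 394,002,432 LM head parameters reported earlier by finetune_llama.py.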
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b74b5d170fd29d8ff01362cf2bd56abc278847a760cc589f3e9f2986ccc2a236
3
+ size 36433
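The three "+" lines above are the Git LFS pointer that now replaces the full log in the repository: only the pointer spec version, the SHA-256 object id, and the file size in bytes are versioned, while the log contents move to LFS storage. The sketch below is a minimal illustration of how such pointer text is derived; the lfs_pointer helper name and the 1 MiB chunk size are illustrative choices rather than tooling from this repo, but run against the actual log it should reproduce the oid and size recorded above.

import hashlib
import os

def lfs_pointer(path: str) -> str:
    # Hash the file in 1 MiB chunks and emit the three lines of a Git LFS pointer.
    sha = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    return (
        "version https://git-lfs.github.com/spec/v1\n"
        f"oid sha256:{sha.hexdigest()}\n"
        f"size {os.path.getsize(path)}\n"
    )

# Path taken from this commit; any LFS-tracked log file works the same way.
print(lfs_pointer("runtime_logs/run_2025-02-14_04-23-49.log"), end="")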
 
runtime_logs/run_2025-02-14_04-26-13.log CHANGED
The diff for this file is too large to render.
 
runtime_logs/run_2025-02-14_05-01-33.log CHANGED
@@ -1,366 +1,3 @@
1
- 2025-02-14 05:01:33,998 - training_args.py:2100 - _setup_devices - INFO - PyTorch: setting up devices
2
- 2025-02-14 05:01:34,505 - configuration_utils.py:731 - _get_config_dict - INFO - loading configuration file ./checkpoints/longvu_llama3_2/config.json
3
- 2025-02-14 05:01:34,508 - configuration_utils.py:800 - from_dict - INFO - Model config CambrianConfig {
4
- "_name_or_path": "/tmp/iopath_cache/manifold_cache/tree/users/shenx/finetune/09281004-cambrian_llama3_2_t576_ov",
5
- "architectures": [
6
- "CambrianLlamaForCausalLM"
7
- ],
8
- "attention_bias": false,
9
- "attention_dropout": 0.0,
10
- "bos_token_id": 128000,
11
- "connect_layer": 2,
12
- "connector_depth": 3,
13
- "connector_only": true,
14
- "dino_threshold": 0.83,
15
- "drop_threshold": 0.8,
16
- "eos_token_id": [
17
- 128001,
18
- 128008,
19
- 128009
20
- ],
21
- "frame_pos": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "highres": true,
26
- "highres_connect": false,
27
- "image_aspect_ratio": "pad",
28
- "image_position": 91,
29
- "image_token_len": 144,
30
- "initializer_range": 0.02,
31
- "intermediate_size": 8192,
32
- "is_image_newline": true,
33
- "is_st_sampler": false,
34
- "lowres_token": 8,
35
- "max_position_embeddings": 131072,
36
- "mlp_bias": false,
37
- "mm_patch_merge_type": "flat",
38
- "mm_projector_lr": null,
39
- "mm_projector_type": "sva",
40
- "mm_use_im_patch_token": false,
41
- "mm_use_im_start_end": false,
42
- "mm_vision_sampler_lr": null,
43
- "mm_vision_select_feature": "patch",
44
- "mm_vision_select_layer": -2,
45
- "mm_vision_tower_aux_list": [
46
- "siglip/CLIP-ViT-SO400M-14-384",
47
- "facebook/dinov2-giant-res378"
48
- ],
49
- "mm_vision_tower_aux_token_len_list": [
50
- 576,
51
- 576
52
- ],
53
- "mm_vision_tower_lr": null,
54
- "model_type": "cambrian_llama",
55
- "num_attention_heads": 24,
56
- "num_hidden_layers": 28,
57
- "num_key_value_heads": 8,
58
- "num_of_vision_sampler_layers": 10,
59
- "num_query_group": 1,
60
- "pretraining_tp": 1,
61
- "query_num_list": [
62
- 144
63
- ],
64
- "rms_norm_eps": 1e-05,
65
- "rope_scaling": {
66
- "factor": 32.0,
67
- "high_freq_factor": 4.0,
68
- "low_freq_factor": 1.0,
69
- "original_max_position_embeddings": 8192,
70
- "rope_type": "llama3"
71
- },
72
- "rope_theta": 500000.0,
73
- "spmd_debug": null,
74
- "spmd_fsdp_sharding": null,
75
- "spmd_mesh": null,
76
- "start_of_vision_sampler_layers": 0,
77
- "stride_of_vision_sampler_layers": 3,
78
- "tie_word_embeddings": false,
79
- "tokenizer_model_max_length": 8192,
80
- "tokenizer_padding_side": "right",
81
- "torch_dtype": "float32",
82
- "transformers_version": "4.43.1",
83
- "tune_mm_mlp_adapter": false,
84
- "unfreeze_mm_vision_tower": false,
85
- "use_cache": false,
86
- "use_mm_proj": true,
87
- "vision_hidden_size": 1024,
88
- "vision_tower_aux_token_len_list": [
89
- 576,
90
- 576
91
- ],
92
- "vocab_size": 128256
93
- }
94
-
95
- 2025-02-14 05:01:34,508 - modeling_utils.py:3618 - from_pretrained - INFO - loading weights file ./checkpoints/longvu_llama3_2/pytorch_model.bin
96
- 2025-02-14 05:01:34,547 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
97
- "bos_token_id": 128000,
98
- "eos_token_id": [
99
- 128001,
100
- 128008,
101
- 128009
102
- ],
103
- "use_cache": false
104
- }
105
-
106
- 2025-02-14 05:01:34,753 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
107
- 2025-02-14 05:01:34,757 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
108
- "apply_layernorm": true,
109
- "architectures": [
110
- "Dinov2Model"
111
- ],
112
- "attention_probs_dropout_prob": 0.0,
113
- "drop_path_rate": 0.0,
114
- "hidden_act": "gelu",
115
- "hidden_dropout_prob": 0.0,
116
- "hidden_size": 1536,
117
- "image_size": 518,
118
- "initializer_range": 0.02,
119
- "layer_norm_eps": 1e-06,
120
- "layerscale_value": 1.0,
121
- "mlp_ratio": 4,
122
- "model_type": "dinov2",
123
- "num_attention_heads": 24,
124
- "num_channels": 3,
125
- "num_hidden_layers": 40,
126
- "out_features": [
127
- "stage40"
128
- ],
129
- "out_indices": [
130
- 40
131
- ],
132
- "patch_size": 14,
133
- "qkv_bias": true,
134
- "reshape_hidden_states": true,
135
- "stage_names": [
136
- "stem",
137
- "stage1",
138
- "stage2",
139
- "stage3",
140
- "stage4",
141
- "stage5",
142
- "stage6",
143
- "stage7",
144
- "stage8",
145
- "stage9",
146
- "stage10",
147
- "stage11",
148
- "stage12",
149
- "stage13",
150
- "stage14",
151
- "stage15",
152
- "stage16",
153
- "stage17",
154
- "stage18",
155
- "stage19",
156
- "stage20",
157
- "stage21",
158
- "stage22",
159
- "stage23",
160
- "stage24",
161
- "stage25",
162
- "stage26",
163
- "stage27",
164
- "stage28",
165
- "stage29",
166
- "stage30",
167
- "stage31",
168
- "stage32",
169
- "stage33",
170
- "stage34",
171
- "stage35",
172
- "stage36",
173
- "stage37",
174
- "stage38",
175
- "stage39",
176
- "stage40"
177
- ],
178
- "torch_dtype": "float32",
179
- "transformers_version": "4.43.1",
180
- "use_swiglu_ffn": true
181
- }
182
-
183
- 2025-02-14 05:01:36,113 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing CambrianLlamaForCausalLM.
184
-
185
- 2025-02-14 05:01:36,113 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of CambrianLlamaForCausalLM were initialized from the model checkpoint at ./checkpoints/longvu_llama3_2.
186
- If your task is similar to the task the model of the checkpoint was trained on, you can already use CambrianLlamaForCausalLM for predictions without further training.
187
- 2025-02-14 05:01:36,119 - configuration_utils.py:991 - from_pretrained - INFO - loading configuration file ./checkpoints/longvu_llama3_2/generation_config.json
188
- 2025-02-14 05:01:36,119 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
189
- "bos_token_id": 128000,
190
- "do_sample": true,
191
- "eos_token_id": [
192
- 128001,
193
- 128008,
194
- 128009
195
- ],
196
- "temperature": 0.6,
197
- "top_p": 0.9
198
- }
199
-
200
- 2025-02-14 05:01:36,370 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer.json
201
- 2025-02-14 05:01:36,370 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file added_tokens.json
202
- 2025-02-14 05:01:36,370 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file special_tokens_map.json
203
- 2025-02-14 05:01:36,370 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer_config.json
204
- 2025-02-14 05:01:36,806 - tokenization_utils_base.py:2533 - _from_pretrained - INFO - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
205
- 2025-02-14 05:01:37,166 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/config.json
206
- 2025-02-14 05:01:37,167 - configuration_utils.py:800 - from_dict - INFO - Model config SiglipVisionConfig {
207
- "attention_dropout": 0.0,
208
- "hidden_act": "gelu_pytorch_tanh",
209
- "hidden_size": 1152,
210
- "image_size": 384,
211
- "intermediate_size": 4304,
212
- "layer_norm_eps": 1e-06,
213
- "model_type": "siglip_vision_model",
214
- "num_attention_heads": 16,
215
- "num_channels": 3,
216
- "num_hidden_layers": 27,
217
- "patch_size": 14,
218
- "transformers_version": "4.43.1"
219
- }
220
-
221
- 2025-02-14 05:01:37,168 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/model.safetensors
222
- 2025-02-14 05:01:37,440 - modeling_utils.py:4440 - _load_pretrained_model - INFO - Some weights of the model checkpoint at google/siglip-so400m-patch14-384 were not used when initializing SiglipVisionModel: ['logit_bias', 'logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 
'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.12.layer_norm1.bias', 'text_model.encoder.layers.12.layer_norm1.weight', 'text_model.encoder.layers.12.layer_norm2.bias', 'text_model.encoder.layers.12.layer_norm2.weight', 'text_model.encoder.layers.12.mlp.fc1.bias', 'text_model.encoder.layers.12.mlp.fc1.weight', 'text_model.encoder.layers.12.mlp.fc2.bias', 'text_model.encoder.layers.12.mlp.fc2.weight', 'text_model.encoder.layers.12.self_attn.k_proj.bias', 'text_model.encoder.layers.12.self_attn.k_proj.weight', 'text_model.encoder.layers.12.self_attn.out_proj.bias', 'text_model.encoder.layers.12.self_attn.out_proj.weight', 'text_model.encoder.layers.12.self_attn.q_proj.bias', 'text_model.encoder.layers.12.self_attn.q_proj.weight', 'text_model.encoder.layers.12.self_attn.v_proj.bias', 'text_model.encoder.layers.12.self_attn.v_proj.weight', 'text_model.encoder.layers.13.layer_norm1.bias', 'text_model.encoder.layers.13.layer_norm1.weight', 'text_model.encoder.layers.13.layer_norm2.bias', 'text_model.encoder.layers.13.layer_norm2.weight', 'text_model.encoder.layers.13.mlp.fc1.bias', 'text_model.encoder.layers.13.mlp.fc1.weight', 'text_model.encoder.layers.13.mlp.fc2.bias', 'text_model.encoder.layers.13.mlp.fc2.weight', 'text_model.encoder.layers.13.self_attn.k_proj.bias', 'text_model.encoder.layers.13.self_attn.k_proj.weight', 'text_model.encoder.layers.13.self_attn.out_proj.bias', 'text_model.encoder.layers.13.self_attn.out_proj.weight', 'text_model.encoder.layers.13.self_attn.q_proj.bias', 'text_model.encoder.layers.13.self_attn.q_proj.weight', 'text_model.encoder.layers.13.self_attn.v_proj.bias', 'text_model.encoder.layers.13.self_attn.v_proj.weight', 'text_model.encoder.layers.14.layer_norm1.bias', 'text_model.encoder.layers.14.layer_norm1.weight', 'text_model.encoder.layers.14.layer_norm2.bias', 'text_model.encoder.layers.14.layer_norm2.weight', 'text_model.encoder.layers.14.mlp.fc1.bias', 'text_model.encoder.layers.14.mlp.fc1.weight', 'text_model.encoder.layers.14.mlp.fc2.bias', 'text_model.encoder.layers.14.mlp.fc2.weight', 'text_model.encoder.layers.14.self_attn.k_proj.bias', 'text_model.encoder.layers.14.self_attn.k_proj.weight', 'text_model.encoder.layers.14.self_attn.out_proj.bias', 'text_model.encoder.layers.14.self_attn.out_proj.weight', 'text_model.encoder.layers.14.self_attn.q_proj.bias', 'text_model.encoder.layers.14.self_attn.q_proj.weight', 'text_model.encoder.layers.14.self_attn.v_proj.bias', 'text_model.encoder.layers.14.self_attn.v_proj.weight', 'text_model.encoder.layers.15.layer_norm1.bias', 'text_model.encoder.layers.15.layer_norm1.weight', 'text_model.encoder.layers.15.layer_norm2.bias', 'text_model.encoder.layers.15.layer_norm2.weight', 'text_model.encoder.layers.15.mlp.fc1.bias', 'text_model.encoder.layers.15.mlp.fc1.weight', 'text_model.encoder.layers.15.mlp.fc2.bias', 'text_model.encoder.layers.15.mlp.fc2.weight', 'text_model.encoder.layers.15.self_attn.k_proj.bias', 'text_model.encoder.layers.15.self_attn.k_proj.weight', 'text_model.encoder.layers.15.self_attn.out_proj.bias', 'text_model.encoder.layers.15.self_attn.out_proj.weight', 'text_model.encoder.layers.15.self_attn.q_proj.bias', 'text_model.encoder.layers.15.self_attn.q_proj.weight', 'text_model.encoder.layers.15.self_attn.v_proj.bias', 'text_model.encoder.layers.15.self_attn.v_proj.weight', 'text_model.encoder.layers.16.layer_norm1.bias', 'text_model.encoder.layers.16.layer_norm1.weight', 
'text_model.encoder.layers.16.layer_norm2.bias', 'text_model.encoder.layers.16.layer_norm2.weight', 'text_model.encoder.layers.16.mlp.fc1.bias', 'text_model.encoder.layers.16.mlp.fc1.weight', 'text_model.encoder.layers.16.mlp.fc2.bias', 'text_model.encoder.layers.16.mlp.fc2.weight', 'text_model.encoder.layers.16.self_attn.k_proj.bias', 'text_model.encoder.layers.16.self_attn.k_proj.weight', 'text_model.encoder.layers.16.self_attn.out_proj.bias', 'text_model.encoder.layers.16.self_attn.out_proj.weight', 'text_model.encoder.layers.16.self_attn.q_proj.bias', 'text_model.encoder.layers.16.self_attn.q_proj.weight', 'text_model.encoder.layers.16.self_attn.v_proj.bias', 'text_model.encoder.layers.16.self_attn.v_proj.weight', 'text_model.encoder.layers.17.layer_norm1.bias', 'text_model.encoder.layers.17.layer_norm1.weight', 'text_model.encoder.layers.17.layer_norm2.bias', 'text_model.encoder.layers.17.layer_norm2.weight', 'text_model.encoder.layers.17.mlp.fc1.bias', 'text_model.encoder.layers.17.mlp.fc1.weight', 'text_model.encoder.layers.17.mlp.fc2.bias', 'text_model.encoder.layers.17.mlp.fc2.weight', 'text_model.encoder.layers.17.self_attn.k_proj.bias', 'text_model.encoder.layers.17.self_attn.k_proj.weight', 'text_model.encoder.layers.17.self_attn.out_proj.bias', 'text_model.encoder.layers.17.self_attn.out_proj.weight', 'text_model.encoder.layers.17.self_attn.q_proj.bias', 'text_model.encoder.layers.17.self_attn.q_proj.weight', 'text_model.encoder.layers.17.self_attn.v_proj.bias', 'text_model.encoder.layers.17.self_attn.v_proj.weight', 'text_model.encoder.layers.18.layer_norm1.bias', 'text_model.encoder.layers.18.layer_norm1.weight', 'text_model.encoder.layers.18.layer_norm2.bias', 'text_model.encoder.layers.18.layer_norm2.weight', 'text_model.encoder.layers.18.mlp.fc1.bias', 'text_model.encoder.layers.18.mlp.fc1.weight', 'text_model.encoder.layers.18.mlp.fc2.bias', 'text_model.encoder.layers.18.mlp.fc2.weight', 'text_model.encoder.layers.18.self_attn.k_proj.bias', 'text_model.encoder.layers.18.self_attn.k_proj.weight', 'text_model.encoder.layers.18.self_attn.out_proj.bias', 'text_model.encoder.layers.18.self_attn.out_proj.weight', 'text_model.encoder.layers.18.self_attn.q_proj.bias', 'text_model.encoder.layers.18.self_attn.q_proj.weight', 'text_model.encoder.layers.18.self_attn.v_proj.bias', 'text_model.encoder.layers.18.self_attn.v_proj.weight', 'text_model.encoder.layers.19.layer_norm1.bias', 'text_model.encoder.layers.19.layer_norm1.weight', 'text_model.encoder.layers.19.layer_norm2.bias', 'text_model.encoder.layers.19.layer_norm2.weight', 'text_model.encoder.layers.19.mlp.fc1.bias', 'text_model.encoder.layers.19.mlp.fc1.weight', 'text_model.encoder.layers.19.mlp.fc2.bias', 'text_model.encoder.layers.19.mlp.fc2.weight', 'text_model.encoder.layers.19.self_attn.k_proj.bias', 'text_model.encoder.layers.19.self_attn.k_proj.weight', 'text_model.encoder.layers.19.self_attn.out_proj.bias', 'text_model.encoder.layers.19.self_attn.out_proj.weight', 'text_model.encoder.layers.19.self_attn.q_proj.bias', 'text_model.encoder.layers.19.self_attn.q_proj.weight', 'text_model.encoder.layers.19.self_attn.v_proj.bias', 'text_model.encoder.layers.19.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 
'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.20.layer_norm1.bias', 'text_model.encoder.layers.20.layer_norm1.weight', 'text_model.encoder.layers.20.layer_norm2.bias', 'text_model.encoder.layers.20.layer_norm2.weight', 'text_model.encoder.layers.20.mlp.fc1.bias', 'text_model.encoder.layers.20.mlp.fc1.weight', 'text_model.encoder.layers.20.mlp.fc2.bias', 'text_model.encoder.layers.20.mlp.fc2.weight', 'text_model.encoder.layers.20.self_attn.k_proj.bias', 'text_model.encoder.layers.20.self_attn.k_proj.weight', 'text_model.encoder.layers.20.self_attn.out_proj.bias', 'text_model.encoder.layers.20.self_attn.out_proj.weight', 'text_model.encoder.layers.20.self_attn.q_proj.bias', 'text_model.encoder.layers.20.self_attn.q_proj.weight', 'text_model.encoder.layers.20.self_attn.v_proj.bias', 'text_model.encoder.layers.20.self_attn.v_proj.weight', 'text_model.encoder.layers.21.layer_norm1.bias', 'text_model.encoder.layers.21.layer_norm1.weight', 'text_model.encoder.layers.21.layer_norm2.bias', 'text_model.encoder.layers.21.layer_norm2.weight', 'text_model.encoder.layers.21.mlp.fc1.bias', 'text_model.encoder.layers.21.mlp.fc1.weight', 'text_model.encoder.layers.21.mlp.fc2.bias', 'text_model.encoder.layers.21.mlp.fc2.weight', 'text_model.encoder.layers.21.self_attn.k_proj.bias', 'text_model.encoder.layers.21.self_attn.k_proj.weight', 'text_model.encoder.layers.21.self_attn.out_proj.bias', 'text_model.encoder.layers.21.self_attn.out_proj.weight', 'text_model.encoder.layers.21.self_attn.q_proj.bias', 'text_model.encoder.layers.21.self_attn.q_proj.weight', 'text_model.encoder.layers.21.self_attn.v_proj.bias', 'text_model.encoder.layers.21.self_attn.v_proj.weight', 'text_model.encoder.layers.22.layer_norm1.bias', 'text_model.encoder.layers.22.layer_norm1.weight', 'text_model.encoder.layers.22.layer_norm2.bias', 'text_model.encoder.layers.22.layer_norm2.weight', 'text_model.encoder.layers.22.mlp.fc1.bias', 'text_model.encoder.layers.22.mlp.fc1.weight', 'text_model.encoder.layers.22.mlp.fc2.bias', 'text_model.encoder.layers.22.mlp.fc2.weight', 'text_model.encoder.layers.22.self_attn.k_proj.bias', 'text_model.encoder.layers.22.self_attn.k_proj.weight', 'text_model.encoder.layers.22.self_attn.out_proj.bias', 'text_model.encoder.layers.22.self_attn.out_proj.weight', 'text_model.encoder.layers.22.self_attn.q_proj.bias', 'text_model.encoder.layers.22.self_attn.q_proj.weight', 'text_model.encoder.layers.22.self_attn.v_proj.bias', 'text_model.encoder.layers.22.self_attn.v_proj.weight', 'text_model.encoder.layers.23.layer_norm1.bias', 'text_model.encoder.layers.23.layer_norm1.weight', 'text_model.encoder.layers.23.layer_norm2.bias', 'text_model.encoder.layers.23.layer_norm2.weight', 'text_model.encoder.layers.23.mlp.fc1.bias', 'text_model.encoder.layers.23.mlp.fc1.weight', 'text_model.encoder.layers.23.mlp.fc2.bias', 'text_model.encoder.layers.23.mlp.fc2.weight', 'text_model.encoder.layers.23.self_attn.k_proj.bias', 'text_model.encoder.layers.23.self_attn.k_proj.weight', 'text_model.encoder.layers.23.self_attn.out_proj.bias', 
'text_model.encoder.layers.23.self_attn.out_proj.weight', 'text_model.encoder.layers.23.self_attn.q_proj.bias', 'text_model.encoder.layers.23.self_attn.q_proj.weight', 'text_model.encoder.layers.23.self_attn.v_proj.bias', 'text_model.encoder.layers.23.self_attn.v_proj.weight', 'text_model.encoder.layers.24.layer_norm1.bias', 'text_model.encoder.layers.24.layer_norm1.weight', 'text_model.encoder.layers.24.layer_norm2.bias', 'text_model.encoder.layers.24.layer_norm2.weight', 'text_model.encoder.layers.24.mlp.fc1.bias', 'text_model.encoder.layers.24.mlp.fc1.weight', 'text_model.encoder.layers.24.mlp.fc2.bias', 'text_model.encoder.layers.24.mlp.fc2.weight', 'text_model.encoder.layers.24.self_attn.k_proj.bias', 'text_model.encoder.layers.24.self_attn.k_proj.weight', 'text_model.encoder.layers.24.self_attn.out_proj.bias', 'text_model.encoder.layers.24.self_attn.out_proj.weight', 'text_model.encoder.layers.24.self_attn.q_proj.bias', 'text_model.encoder.layers.24.self_attn.q_proj.weight', 'text_model.encoder.layers.24.self_attn.v_proj.bias', 'text_model.encoder.layers.24.self_attn.v_proj.weight', 'text_model.encoder.layers.25.layer_norm1.bias', 'text_model.encoder.layers.25.layer_norm1.weight', 'text_model.encoder.layers.25.layer_norm2.bias', 'text_model.encoder.layers.25.layer_norm2.weight', 'text_model.encoder.layers.25.mlp.fc1.bias', 'text_model.encoder.layers.25.mlp.fc1.weight', 'text_model.encoder.layers.25.mlp.fc2.bias', 'text_model.encoder.layers.25.mlp.fc2.weight', 'text_model.encoder.layers.25.self_attn.k_proj.bias', 'text_model.encoder.layers.25.self_attn.k_proj.weight', 'text_model.encoder.layers.25.self_attn.out_proj.bias', 'text_model.encoder.layers.25.self_attn.out_proj.weight', 'text_model.encoder.layers.25.self_attn.q_proj.bias', 'text_model.encoder.layers.25.self_attn.q_proj.weight', 'text_model.encoder.layers.25.self_attn.v_proj.bias', 'text_model.encoder.layers.25.self_attn.v_proj.weight', 'text_model.encoder.layers.26.layer_norm1.bias', 'text_model.encoder.layers.26.layer_norm1.weight', 'text_model.encoder.layers.26.layer_norm2.bias', 'text_model.encoder.layers.26.layer_norm2.weight', 'text_model.encoder.layers.26.mlp.fc1.bias', 'text_model.encoder.layers.26.mlp.fc1.weight', 'text_model.encoder.layers.26.mlp.fc2.bias', 'text_model.encoder.layers.26.mlp.fc2.weight', 'text_model.encoder.layers.26.self_attn.k_proj.bias', 'text_model.encoder.layers.26.self_attn.k_proj.weight', 'text_model.encoder.layers.26.self_attn.out_proj.bias', 'text_model.encoder.layers.26.self_attn.out_proj.weight', 'text_model.encoder.layers.26.self_attn.q_proj.bias', 'text_model.encoder.layers.26.self_attn.q_proj.weight', 'text_model.encoder.layers.26.self_attn.v_proj.bias', 'text_model.encoder.layers.26.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 
'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 
'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_model.head.bias', 'text_model.head.weight']
223
- - This IS expected if you are initializing SiglipVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
224
- - This IS NOT expected if you are initializing SiglipVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
225
- 2025-02-14 05:01:37,442 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of SiglipVisionModel were initialized from the model checkpoint at google/siglip-so400m-patch14-384.
226
- If your task is similar to the task the model of the checkpoint was trained on, you can already use SiglipVisionModel for predictions without further training.
227
- 2025-02-14 05:01:37,626 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/preprocessor_config.json
228
- 2025-02-14 05:01:37,627 - image_processing_base.py:429 - from_dict - INFO - Image processor SiglipImageProcessor {
229
- "do_convert_rgb": null,
230
- "do_normalize": true,
231
- "do_rescale": true,
232
- "do_resize": true,
233
- "image_mean": [
234
- 0.5,
235
- 0.5,
236
- 0.5
237
- ],
238
- "image_processor_type": "SiglipImageProcessor",
239
- "image_std": [
240
- 0.5,
241
- 0.5,
242
- 0.5
243
- ],
244
- "processor_class": "SiglipProcessor",
245
- "resample": 3,
246
- "rescale_factor": 0.00392156862745098,
247
- "size": {
248
- "height": 384,
249
- "width": 384
250
- }
251
- }
252
-
253
- 2025-02-14 05:01:37,991 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
254
- 2025-02-14 05:01:37,995 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
255
- "apply_layernorm": true,
256
- "architectures": [
257
- "Dinov2Model"
258
- ],
259
- "attention_probs_dropout_prob": 0.0,
260
- "drop_path_rate": 0.0,
261
- "hidden_act": "gelu",
262
- "hidden_dropout_prob": 0.0,
263
- "hidden_size": 1536,
264
- "image_size": 518,
265
- "initializer_range": 0.02,
266
- "layer_norm_eps": 1e-06,
267
- "layerscale_value": 1.0,
268
- "mlp_ratio": 4,
269
- "model_type": "dinov2",
270
- "num_attention_heads": 24,
271
- "num_channels": 3,
272
- "num_hidden_layers": 40,
273
- "out_features": [
274
- "stage40"
275
- ],
276
- "out_indices": [
277
- 40
278
- ],
279
- "patch_size": 14,
280
- "qkv_bias": true,
281
- "reshape_hidden_states": true,
282
- "stage_names": [
283
- "stem",
284
- "stage1",
285
- "stage2",
286
- "stage3",
287
- "stage4",
288
- "stage5",
289
- "stage6",
290
- "stage7",
291
- "stage8",
292
- "stage9",
293
- "stage10",
294
- "stage11",
295
- "stage12",
296
- "stage13",
297
- "stage14",
298
- "stage15",
299
- "stage16",
300
- "stage17",
301
- "stage18",
302
- "stage19",
303
- "stage20",
304
- "stage21",
305
- "stage22",
306
- "stage23",
307
- "stage24",
308
- "stage25",
309
- "stage26",
310
- "stage27",
311
- "stage28",
312
- "stage29",
313
- "stage30",
314
- "stage31",
315
- "stage32",
316
- "stage33",
317
- "stage34",
318
- "stage35",
319
- "stage36",
320
- "stage37",
321
- "stage38",
322
- "stage39",
323
- "stage40"
324
- ],
325
- "torch_dtype": "float32",
326
- "transformers_version": "4.43.1",
327
- "use_swiglu_ffn": true
328
- }
329
-
330
- 2025-02-14 05:01:37,995 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/model.safetensors
331
- 2025-02-14 05:01:38,584 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing Dinov2Model.
332
-
333
- 2025-02-14 05:01:38,584 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of Dinov2Model were initialized from the model checkpoint at facebook/dinov2-giant.
334
- If your task is similar to the task the model of the checkpoint was trained on, you can already use Dinov2Model for predictions without further training.
335
- 2025-02-14 05:01:38,771 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/preprocessor_config.json
336
- 2025-02-14 05:01:38,774 - image_processing_base.py:429 - from_dict - INFO - Image processor BitImageProcessor {
337
- "crop_size": {
338
- "height": 378,
339
- "width": 378
340
- },
341
- "do_center_crop": true,
342
- "do_convert_rgb": true,
343
- "do_normalize": true,
344
- "do_rescale": true,
345
- "do_resize": true,
346
- "image_mean": [
347
- 0.485,
348
- 0.456,
349
- 0.406
350
- ],
351
- "image_processor_type": "BitImageProcessor",
352
- "image_std": [
353
- 0.229,
354
- 0.224,
355
- 0.225
356
- ],
357
- "resample": 3,
358
- "rescale_factor": 0.00392156862745098,
359
- "size": {
360
- "shortest_edge": 378
361
- }
362
- }
363
-
364
- 2025-02-14 05:01:39,569 - finetune_llama.py:1239 - train - INFO - Total params: 3264865280
365
- 2025-02-14 05:01:39,569 - finetune_llama.py:1240 - train - INFO - Trainable params: 12589056
366
- 2025-02-14 05:01:39,569 - finetune_llama.py:1241 - train - INFO - LM head params: 394002432
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae628e76df568dd5e80f7f8759f86215724ca5777ddddb01d6531886e78a9ce
3
+ size 35149
 
runtime_logs/run_2025-02-14_05-02-58.log CHANGED
The diff for this file is too large to render. See raw diff
 
runtime_logs/run_2025-02-14_17-32-33.log CHANGED
@@ -1,378 +1,3 @@
1
- 2025-02-14 17:32:33,162 - training_args.py:2100 - _setup_devices - INFO - PyTorch: setting up devices
2
- 2025-02-14 17:32:33,810 - configuration_utils.py:731 - _get_config_dict - INFO - loading configuration file ./checkpoints/longvu_llama3_2/config.json
3
- 2025-02-14 17:32:33,813 - configuration_utils.py:800 - from_dict - INFO - Model config CambrianConfig {
4
- "_name_or_path": "/tmp/iopath_cache/manifold_cache/tree/users/shenx/finetune/09281004-cambrian_llama3_2_t576_ov",
5
- "architectures": [
6
- "CambrianLlamaForCausalLM"
7
- ],
8
- "attention_bias": false,
9
- "attention_dropout": 0.0,
10
- "bos_token_id": 128000,
11
- "connect_layer": 2,
12
- "connector_depth": 3,
13
- "connector_only": true,
14
- "dino_threshold": 0.83,
15
- "drop_threshold": 0.8,
16
- "eos_token_id": [
17
- 128001,
18
- 128008,
19
- 128009
20
- ],
21
- "frame_pos": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "highres": true,
26
- "highres_connect": false,
27
- "image_aspect_ratio": "pad",
28
- "image_position": 91,
29
- "image_token_len": 144,
30
- "initializer_range": 0.02,
31
- "intermediate_size": 8192,
32
- "is_image_newline": true,
33
- "is_st_sampler": false,
34
- "lowres_token": 8,
35
- "max_position_embeddings": 131072,
36
- "mlp_bias": false,
37
- "mm_patch_merge_type": "flat",
38
- "mm_projector_lr": null,
39
- "mm_projector_type": "sva",
40
- "mm_use_im_patch_token": false,
41
- "mm_use_im_start_end": false,
42
- "mm_vision_sampler_lr": null,
43
- "mm_vision_select_feature": "patch",
44
- "mm_vision_select_layer": -2,
45
- "mm_vision_tower_aux_list": [
46
- "siglip/CLIP-ViT-SO400M-14-384",
47
- "facebook/dinov2-giant-res378"
48
- ],
49
- "mm_vision_tower_aux_token_len_list": [
50
- 576,
51
- 576
52
- ],
53
- "mm_vision_tower_lr": null,
54
- "model_type": "cambrian_llama",
55
- "num_attention_heads": 24,
56
- "num_hidden_layers": 28,
57
- "num_key_value_heads": 8,
58
- "num_of_vision_sampler_layers": 10,
59
- "num_query_group": 1,
60
- "pretraining_tp": 1,
61
- "query_num_list": [
62
- 144
63
- ],
64
- "rms_norm_eps": 1e-05,
65
- "rope_scaling": {
66
- "factor": 32.0,
67
- "high_freq_factor": 4.0,
68
- "low_freq_factor": 1.0,
69
- "original_max_position_embeddings": 8192,
70
- "rope_type": "llama3"
71
- },
72
- "rope_theta": 500000.0,
73
- "spmd_debug": null,
74
- "spmd_fsdp_sharding": null,
75
- "spmd_mesh": null,
76
- "start_of_vision_sampler_layers": 0,
77
- "stride_of_vision_sampler_layers": 3,
78
- "tie_word_embeddings": false,
79
- "tokenizer_model_max_length": 8192,
80
- "tokenizer_padding_side": "right",
81
- "torch_dtype": "float32",
82
- "transformers_version": "4.43.1",
83
- "tune_mm_mlp_adapter": false,
84
- "unfreeze_mm_vision_tower": false,
85
- "use_cache": false,
86
- "use_mm_proj": true,
87
- "vision_hidden_size": 1024,
88
- "vision_tower_aux_token_len_list": [
89
- 576,
90
- 576
91
- ],
92
- "vocab_size": 128256
93
- }
94
-
95
- 2025-02-14 17:32:33,813 - modeling_utils.py:3618 - from_pretrained - INFO - loading weights file ./checkpoints/longvu_llama3_2/pytorch_model.bin
96
- 2025-02-14 17:32:33,874 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
97
- "bos_token_id": 128000,
98
- "eos_token_id": [
99
- 128001,
100
- 128008,
101
- 128009
102
- ],
103
- "use_cache": false
104
- }
105
-
106
- 2025-02-14 17:32:34,467 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
107
- 2025-02-14 17:32:34,470 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
108
- "apply_layernorm": true,
109
- "architectures": [
110
- "Dinov2Model"
111
- ],
112
- "attention_probs_dropout_prob": 0.0,
113
- "drop_path_rate": 0.0,
114
- "hidden_act": "gelu",
115
- "hidden_dropout_prob": 0.0,
116
- "hidden_size": 1536,
117
- "image_size": 518,
118
- "initializer_range": 0.02,
119
- "layer_norm_eps": 1e-06,
120
- "layerscale_value": 1.0,
121
- "mlp_ratio": 4,
122
- "model_type": "dinov2",
123
- "num_attention_heads": 24,
124
- "num_channels": 3,
125
- "num_hidden_layers": 40,
126
- "out_features": [
127
- "stage40"
128
- ],
129
- "out_indices": [
130
- 40
131
- ],
132
- "patch_size": 14,
133
- "qkv_bias": true,
134
- "reshape_hidden_states": true,
135
- "stage_names": [
136
- "stem",
137
- "stage1",
138
- "stage2",
139
- "stage3",
140
- "stage4",
141
- "stage5",
142
- "stage6",
143
- "stage7",
144
- "stage8",
145
- "stage9",
146
- "stage10",
147
- "stage11",
148
- "stage12",
149
- "stage13",
150
- "stage14",
151
- "stage15",
152
- "stage16",
153
- "stage17",
154
- "stage18",
155
- "stage19",
156
- "stage20",
157
- "stage21",
158
- "stage22",
159
- "stage23",
160
- "stage24",
161
- "stage25",
162
- "stage26",
163
- "stage27",
164
- "stage28",
165
- "stage29",
166
- "stage30",
167
- "stage31",
168
- "stage32",
169
- "stage33",
170
- "stage34",
171
- "stage35",
172
- "stage36",
173
- "stage37",
174
- "stage38",
175
- "stage39",
176
- "stage40"
177
- ],
178
- "torch_dtype": "float32",
179
- "transformers_version": "4.43.1",
180
- "use_swiglu_ffn": true
181
- }
182
-
183
- 2025-02-14 17:32:35,842 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing CambrianLlamaForCausalLM.
184
-
185
- 2025-02-14 17:32:35,843 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of CambrianLlamaForCausalLM were initialized from the model checkpoint at ./checkpoints/longvu_llama3_2.
186
- If your task is similar to the task the model of the checkpoint was trained on, you can already use CambrianLlamaForCausalLM for predictions without further training.
187
- 2025-02-14 17:32:35,848 - configuration_utils.py:991 - from_pretrained - INFO - loading configuration file ./checkpoints/longvu_llama3_2/generation_config.json
188
- 2025-02-14 17:32:35,848 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
189
- "bos_token_id": 128000,
190
- "do_sample": true,
191
- "eos_token_id": [
192
- 128001,
193
- 128008,
194
- 128009
195
- ],
196
- "temperature": 0.6,
197
- "top_p": 0.9
198
- }
199
-
200
- 2025-02-14 17:32:36,138 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer.json
201
- 2025-02-14 17:32:36,138 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file added_tokens.json
202
- 2025-02-14 17:32:36,139 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file special_tokens_map.json
203
- 2025-02-14 17:32:36,139 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer_config.json
204
- 2025-02-14 17:32:36,540 - tokenization_utils_base.py:2533 - _from_pretrained - INFO - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
205
- 2025-02-14 17:32:36,920 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/config.json
206
- 2025-02-14 17:32:36,923 - configuration_utils.py:800 - from_dict - INFO - Model config SiglipVisionConfig {
207
- "attention_dropout": 0.0,
208
- "hidden_act": "gelu_pytorch_tanh",
209
- "hidden_size": 1152,
210
- "image_size": 384,
211
- "intermediate_size": 4304,
212
- "layer_norm_eps": 1e-06,
213
- "model_type": "siglip_vision_model",
214
- "num_attention_heads": 16,
215
- "num_channels": 3,
216
- "num_hidden_layers": 27,
217
- "patch_size": 14,
218
- "transformers_version": "4.43.1"
219
- }
220
-
221
- 2025-02-14 17:32:36,924 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/model.safetensors
222
- 2025-02-14 17:32:37,192 - modeling_utils.py:4440 - _load_pretrained_model - INFO - Some weights of the model checkpoint at google/siglip-so400m-patch14-384 were not used when initializing SiglipVisionModel: ['logit_bias', 'logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 
'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.12.layer_norm1.bias', 'text_model.encoder.layers.12.layer_norm1.weight', 'text_model.encoder.layers.12.layer_norm2.bias', 'text_model.encoder.layers.12.layer_norm2.weight', 'text_model.encoder.layers.12.mlp.fc1.bias', 'text_model.encoder.layers.12.mlp.fc1.weight', 'text_model.encoder.layers.12.mlp.fc2.bias', 'text_model.encoder.layers.12.mlp.fc2.weight', 'text_model.encoder.layers.12.self_attn.k_proj.bias', 'text_model.encoder.layers.12.self_attn.k_proj.weight', 'text_model.encoder.layers.12.self_attn.out_proj.bias', 'text_model.encoder.layers.12.self_attn.out_proj.weight', 'text_model.encoder.layers.12.self_attn.q_proj.bias', 'text_model.encoder.layers.12.self_attn.q_proj.weight', 'text_model.encoder.layers.12.self_attn.v_proj.bias', 'text_model.encoder.layers.12.self_attn.v_proj.weight', 'text_model.encoder.layers.13.layer_norm1.bias', 'text_model.encoder.layers.13.layer_norm1.weight', 'text_model.encoder.layers.13.layer_norm2.bias', 'text_model.encoder.layers.13.layer_norm2.weight', 'text_model.encoder.layers.13.mlp.fc1.bias', 'text_model.encoder.layers.13.mlp.fc1.weight', 'text_model.encoder.layers.13.mlp.fc2.bias', 'text_model.encoder.layers.13.mlp.fc2.weight', 'text_model.encoder.layers.13.self_attn.k_proj.bias', 'text_model.encoder.layers.13.self_attn.k_proj.weight', 'text_model.encoder.layers.13.self_attn.out_proj.bias', 'text_model.encoder.layers.13.self_attn.out_proj.weight', 'text_model.encoder.layers.13.self_attn.q_proj.bias', 'text_model.encoder.layers.13.self_attn.q_proj.weight', 'text_model.encoder.layers.13.self_attn.v_proj.bias', 'text_model.encoder.layers.13.self_attn.v_proj.weight', 'text_model.encoder.layers.14.layer_norm1.bias', 'text_model.encoder.layers.14.layer_norm1.weight', 'text_model.encoder.layers.14.layer_norm2.bias', 'text_model.encoder.layers.14.layer_norm2.weight', 'text_model.encoder.layers.14.mlp.fc1.bias', 'text_model.encoder.layers.14.mlp.fc1.weight', 'text_model.encoder.layers.14.mlp.fc2.bias', 'text_model.encoder.layers.14.mlp.fc2.weight', 'text_model.encoder.layers.14.self_attn.k_proj.bias', 'text_model.encoder.layers.14.self_attn.k_proj.weight', 'text_model.encoder.layers.14.self_attn.out_proj.bias', 'text_model.encoder.layers.14.self_attn.out_proj.weight', 'text_model.encoder.layers.14.self_attn.q_proj.bias', 'text_model.encoder.layers.14.self_attn.q_proj.weight', 'text_model.encoder.layers.14.self_attn.v_proj.bias', 'text_model.encoder.layers.14.self_attn.v_proj.weight', 'text_model.encoder.layers.15.layer_norm1.bias', 'text_model.encoder.layers.15.layer_norm1.weight', 'text_model.encoder.layers.15.layer_norm2.bias', 'text_model.encoder.layers.15.layer_norm2.weight', 'text_model.encoder.layers.15.mlp.fc1.bias', 'text_model.encoder.layers.15.mlp.fc1.weight', 'text_model.encoder.layers.15.mlp.fc2.bias', 'text_model.encoder.layers.15.mlp.fc2.weight', 'text_model.encoder.layers.15.self_attn.k_proj.bias', 'text_model.encoder.layers.15.self_attn.k_proj.weight', 'text_model.encoder.layers.15.self_attn.out_proj.bias', 'text_model.encoder.layers.15.self_attn.out_proj.weight', 'text_model.encoder.layers.15.self_attn.q_proj.bias', 'text_model.encoder.layers.15.self_attn.q_proj.weight', 'text_model.encoder.layers.15.self_attn.v_proj.bias', 'text_model.encoder.layers.15.self_attn.v_proj.weight', 'text_model.encoder.layers.16.layer_norm1.bias', 'text_model.encoder.layers.16.layer_norm1.weight', 
'text_model.encoder.layers.16.layer_norm2.bias', 'text_model.encoder.layers.16.layer_norm2.weight', 'text_model.encoder.layers.16.mlp.fc1.bias', 'text_model.encoder.layers.16.mlp.fc1.weight', 'text_model.encoder.layers.16.mlp.fc2.bias', 'text_model.encoder.layers.16.mlp.fc2.weight', 'text_model.encoder.layers.16.self_attn.k_proj.bias', 'text_model.encoder.layers.16.self_attn.k_proj.weight', 'text_model.encoder.layers.16.self_attn.out_proj.bias', 'text_model.encoder.layers.16.self_attn.out_proj.weight', 'text_model.encoder.layers.16.self_attn.q_proj.bias', 'text_model.encoder.layers.16.self_attn.q_proj.weight', 'text_model.encoder.layers.16.self_attn.v_proj.bias', 'text_model.encoder.layers.16.self_attn.v_proj.weight', 'text_model.encoder.layers.17.layer_norm1.bias', 'text_model.encoder.layers.17.layer_norm1.weight', 'text_model.encoder.layers.17.layer_norm2.bias', 'text_model.encoder.layers.17.layer_norm2.weight', 'text_model.encoder.layers.17.mlp.fc1.bias', 'text_model.encoder.layers.17.mlp.fc1.weight', 'text_model.encoder.layers.17.mlp.fc2.bias', 'text_model.encoder.layers.17.mlp.fc2.weight', 'text_model.encoder.layers.17.self_attn.k_proj.bias', 'text_model.encoder.layers.17.self_attn.k_proj.weight', 'text_model.encoder.layers.17.self_attn.out_proj.bias', 'text_model.encoder.layers.17.self_attn.out_proj.weight', 'text_model.encoder.layers.17.self_attn.q_proj.bias', 'text_model.encoder.layers.17.self_attn.q_proj.weight', 'text_model.encoder.layers.17.self_attn.v_proj.bias', 'text_model.encoder.layers.17.self_attn.v_proj.weight', 'text_model.encoder.layers.18.layer_norm1.bias', 'text_model.encoder.layers.18.layer_norm1.weight', 'text_model.encoder.layers.18.layer_norm2.bias', 'text_model.encoder.layers.18.layer_norm2.weight', 'text_model.encoder.layers.18.mlp.fc1.bias', 'text_model.encoder.layers.18.mlp.fc1.weight', 'text_model.encoder.layers.18.mlp.fc2.bias', 'text_model.encoder.layers.18.mlp.fc2.weight', 'text_model.encoder.layers.18.self_attn.k_proj.bias', 'text_model.encoder.layers.18.self_attn.k_proj.weight', 'text_model.encoder.layers.18.self_attn.out_proj.bias', 'text_model.encoder.layers.18.self_attn.out_proj.weight', 'text_model.encoder.layers.18.self_attn.q_proj.bias', 'text_model.encoder.layers.18.self_attn.q_proj.weight', 'text_model.encoder.layers.18.self_attn.v_proj.bias', 'text_model.encoder.layers.18.self_attn.v_proj.weight', 'text_model.encoder.layers.19.layer_norm1.bias', 'text_model.encoder.layers.19.layer_norm1.weight', 'text_model.encoder.layers.19.layer_norm2.bias', 'text_model.encoder.layers.19.layer_norm2.weight', 'text_model.encoder.layers.19.mlp.fc1.bias', 'text_model.encoder.layers.19.mlp.fc1.weight', 'text_model.encoder.layers.19.mlp.fc2.bias', 'text_model.encoder.layers.19.mlp.fc2.weight', 'text_model.encoder.layers.19.self_attn.k_proj.bias', 'text_model.encoder.layers.19.self_attn.k_proj.weight', 'text_model.encoder.layers.19.self_attn.out_proj.bias', 'text_model.encoder.layers.19.self_attn.out_proj.weight', 'text_model.encoder.layers.19.self_attn.q_proj.bias', 'text_model.encoder.layers.19.self_attn.q_proj.weight', 'text_model.encoder.layers.19.self_attn.v_proj.bias', 'text_model.encoder.layers.19.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 
'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.20.layer_norm1.bias', 'text_model.encoder.layers.20.layer_norm1.weight', 'text_model.encoder.layers.20.layer_norm2.bias', 'text_model.encoder.layers.20.layer_norm2.weight', 'text_model.encoder.layers.20.mlp.fc1.bias', 'text_model.encoder.layers.20.mlp.fc1.weight', 'text_model.encoder.layers.20.mlp.fc2.bias', 'text_model.encoder.layers.20.mlp.fc2.weight', 'text_model.encoder.layers.20.self_attn.k_proj.bias', 'text_model.encoder.layers.20.self_attn.k_proj.weight', 'text_model.encoder.layers.20.self_attn.out_proj.bias', 'text_model.encoder.layers.20.self_attn.out_proj.weight', 'text_model.encoder.layers.20.self_attn.q_proj.bias', 'text_model.encoder.layers.20.self_attn.q_proj.weight', 'text_model.encoder.layers.20.self_attn.v_proj.bias', 'text_model.encoder.layers.20.self_attn.v_proj.weight', 'text_model.encoder.layers.21.layer_norm1.bias', 'text_model.encoder.layers.21.layer_norm1.weight', 'text_model.encoder.layers.21.layer_norm2.bias', 'text_model.encoder.layers.21.layer_norm2.weight', 'text_model.encoder.layers.21.mlp.fc1.bias', 'text_model.encoder.layers.21.mlp.fc1.weight', 'text_model.encoder.layers.21.mlp.fc2.bias', 'text_model.encoder.layers.21.mlp.fc2.weight', 'text_model.encoder.layers.21.self_attn.k_proj.bias', 'text_model.encoder.layers.21.self_attn.k_proj.weight', 'text_model.encoder.layers.21.self_attn.out_proj.bias', 'text_model.encoder.layers.21.self_attn.out_proj.weight', 'text_model.encoder.layers.21.self_attn.q_proj.bias', 'text_model.encoder.layers.21.self_attn.q_proj.weight', 'text_model.encoder.layers.21.self_attn.v_proj.bias', 'text_model.encoder.layers.21.self_attn.v_proj.weight', 'text_model.encoder.layers.22.layer_norm1.bias', 'text_model.encoder.layers.22.layer_norm1.weight', 'text_model.encoder.layers.22.layer_norm2.bias', 'text_model.encoder.layers.22.layer_norm2.weight', 'text_model.encoder.layers.22.mlp.fc1.bias', 'text_model.encoder.layers.22.mlp.fc1.weight', 'text_model.encoder.layers.22.mlp.fc2.bias', 'text_model.encoder.layers.22.mlp.fc2.weight', 'text_model.encoder.layers.22.self_attn.k_proj.bias', 'text_model.encoder.layers.22.self_attn.k_proj.weight', 'text_model.encoder.layers.22.self_attn.out_proj.bias', 'text_model.encoder.layers.22.self_attn.out_proj.weight', 'text_model.encoder.layers.22.self_attn.q_proj.bias', 'text_model.encoder.layers.22.self_attn.q_proj.weight', 'text_model.encoder.layers.22.self_attn.v_proj.bias', 'text_model.encoder.layers.22.self_attn.v_proj.weight', 'text_model.encoder.layers.23.layer_norm1.bias', 'text_model.encoder.layers.23.layer_norm1.weight', 'text_model.encoder.layers.23.layer_norm2.bias', 'text_model.encoder.layers.23.layer_norm2.weight', 'text_model.encoder.layers.23.mlp.fc1.bias', 'text_model.encoder.layers.23.mlp.fc1.weight', 'text_model.encoder.layers.23.mlp.fc2.bias', 'text_model.encoder.layers.23.mlp.fc2.weight', 'text_model.encoder.layers.23.self_attn.k_proj.bias', 'text_model.encoder.layers.23.self_attn.k_proj.weight', 'text_model.encoder.layers.23.self_attn.out_proj.bias', 
'text_model.encoder.layers.23.self_attn.out_proj.weight', 'text_model.encoder.layers.23.self_attn.q_proj.bias', 'text_model.encoder.layers.23.self_attn.q_proj.weight', 'text_model.encoder.layers.23.self_attn.v_proj.bias', 'text_model.encoder.layers.23.self_attn.v_proj.weight', 'text_model.encoder.layers.24.layer_norm1.bias', 'text_model.encoder.layers.24.layer_norm1.weight', 'text_model.encoder.layers.24.layer_norm2.bias', 'text_model.encoder.layers.24.layer_norm2.weight', 'text_model.encoder.layers.24.mlp.fc1.bias', 'text_model.encoder.layers.24.mlp.fc1.weight', 'text_model.encoder.layers.24.mlp.fc2.bias', 'text_model.encoder.layers.24.mlp.fc2.weight', 'text_model.encoder.layers.24.self_attn.k_proj.bias', 'text_model.encoder.layers.24.self_attn.k_proj.weight', 'text_model.encoder.layers.24.self_attn.out_proj.bias', 'text_model.encoder.layers.24.self_attn.out_proj.weight', 'text_model.encoder.layers.24.self_attn.q_proj.bias', 'text_model.encoder.layers.24.self_attn.q_proj.weight', 'text_model.encoder.layers.24.self_attn.v_proj.bias', 'text_model.encoder.layers.24.self_attn.v_proj.weight', 'text_model.encoder.layers.25.layer_norm1.bias', 'text_model.encoder.layers.25.layer_norm1.weight', 'text_model.encoder.layers.25.layer_norm2.bias', 'text_model.encoder.layers.25.layer_norm2.weight', 'text_model.encoder.layers.25.mlp.fc1.bias', 'text_model.encoder.layers.25.mlp.fc1.weight', 'text_model.encoder.layers.25.mlp.fc2.bias', 'text_model.encoder.layers.25.mlp.fc2.weight', 'text_model.encoder.layers.25.self_attn.k_proj.bias', 'text_model.encoder.layers.25.self_attn.k_proj.weight', 'text_model.encoder.layers.25.self_attn.out_proj.bias', 'text_model.encoder.layers.25.self_attn.out_proj.weight', 'text_model.encoder.layers.25.self_attn.q_proj.bias', 'text_model.encoder.layers.25.self_attn.q_proj.weight', 'text_model.encoder.layers.25.self_attn.v_proj.bias', 'text_model.encoder.layers.25.self_attn.v_proj.weight', 'text_model.encoder.layers.26.layer_norm1.bias', 'text_model.encoder.layers.26.layer_norm1.weight', 'text_model.encoder.layers.26.layer_norm2.bias', 'text_model.encoder.layers.26.layer_norm2.weight', 'text_model.encoder.layers.26.mlp.fc1.bias', 'text_model.encoder.layers.26.mlp.fc1.weight', 'text_model.encoder.layers.26.mlp.fc2.bias', 'text_model.encoder.layers.26.mlp.fc2.weight', 'text_model.encoder.layers.26.self_attn.k_proj.bias', 'text_model.encoder.layers.26.self_attn.k_proj.weight', 'text_model.encoder.layers.26.self_attn.out_proj.bias', 'text_model.encoder.layers.26.self_attn.out_proj.weight', 'text_model.encoder.layers.26.self_attn.q_proj.bias', 'text_model.encoder.layers.26.self_attn.q_proj.weight', 'text_model.encoder.layers.26.self_attn.v_proj.bias', 'text_model.encoder.layers.26.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 
'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 
'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_model.head.bias', 'text_model.head.weight']
223
- - This IS expected if you are initializing SiglipVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
224
- - This IS NOT expected if you are initializing SiglipVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
225
- 2025-02-14 17:32:37,194 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of SiglipVisionModel were initialized from the model checkpoint at google/siglip-so400m-patch14-384.
226
- If your task is similar to the task the model of the checkpoint was trained on, you can already use SiglipVisionModel for predictions without further training.
227
- 2025-02-14 17:32:37,381 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/preprocessor_config.json
228
- 2025-02-14 17:32:37,382 - image_processing_base.py:429 - from_dict - INFO - Image processor SiglipImageProcessor {
229
- "do_convert_rgb": null,
230
- "do_normalize": true,
231
- "do_rescale": true,
232
- "do_resize": true,
233
- "image_mean": [
234
- 0.5,
235
- 0.5,
236
- 0.5
237
- ],
238
- "image_processor_type": "SiglipImageProcessor",
239
- "image_std": [
240
- 0.5,
241
- 0.5,
242
- 0.5
243
- ],
244
- "processor_class": "SiglipProcessor",
245
- "resample": 3,
246
- "rescale_factor": 0.00392156862745098,
247
- "size": {
248
- "height": 384,
249
- "width": 384
250
- }
251
- }
252
-
253
- 2025-02-14 17:32:38,400 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
254
- 2025-02-14 17:32:38,403 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
255
- "apply_layernorm": true,
256
- "architectures": [
257
- "Dinov2Model"
258
- ],
259
- "attention_probs_dropout_prob": 0.0,
260
- "drop_path_rate": 0.0,
261
- "hidden_act": "gelu",
262
- "hidden_dropout_prob": 0.0,
263
- "hidden_size": 1536,
264
- "image_size": 518,
265
- "initializer_range": 0.02,
266
- "layer_norm_eps": 1e-06,
267
- "layerscale_value": 1.0,
268
- "mlp_ratio": 4,
269
- "model_type": "dinov2",
270
- "num_attention_heads": 24,
271
- "num_channels": 3,
272
- "num_hidden_layers": 40,
273
- "out_features": [
274
- "stage40"
275
- ],
276
- "out_indices": [
277
- 40
278
- ],
279
- "patch_size": 14,
280
- "qkv_bias": true,
281
- "reshape_hidden_states": true,
282
- "stage_names": [
283
- "stem",
284
- "stage1",
285
- "stage2",
286
- "stage3",
287
- "stage4",
288
- "stage5",
289
- "stage6",
290
- "stage7",
291
- "stage8",
292
- "stage9",
293
- "stage10",
294
- "stage11",
295
- "stage12",
296
- "stage13",
297
- "stage14",
298
- "stage15",
299
- "stage16",
300
- "stage17",
301
- "stage18",
302
- "stage19",
303
- "stage20",
304
- "stage21",
305
- "stage22",
306
- "stage23",
307
- "stage24",
308
- "stage25",
309
- "stage26",
310
- "stage27",
311
- "stage28",
312
- "stage29",
313
- "stage30",
314
- "stage31",
315
- "stage32",
316
- "stage33",
317
- "stage34",
318
- "stage35",
319
- "stage36",
320
- "stage37",
321
- "stage38",
322
- "stage39",
323
- "stage40"
324
- ],
325
- "torch_dtype": "float32",
326
- "transformers_version": "4.43.1",
327
- "use_swiglu_ffn": true
328
- }
329
-
330
- 2025-02-14 17:32:38,404 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/model.safetensors
331
- 2025-02-14 17:32:38,981 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing Dinov2Model.
332
-
333
- 2025-02-14 17:32:38,981 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of Dinov2Model were initialized from the model checkpoint at facebook/dinov2-giant.
334
- If your task is similar to the task the model of the checkpoint was trained on, you can already use Dinov2Model for predictions without further training.
335
- 2025-02-14 17:32:39,176 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/preprocessor_config.json
336
- 2025-02-14 17:32:39,179 - image_processing_base.py:429 - from_dict - INFO - Image processor BitImageProcessor {
337
- "crop_size": {
338
- "height": 378,
339
- "width": 378
340
- },
341
- "do_center_crop": true,
342
- "do_convert_rgb": true,
343
- "do_normalize": true,
344
- "do_rescale": true,
345
- "do_resize": true,
346
- "image_mean": [
347
- 0.485,
348
- 0.456,
349
- 0.406
350
- ],
351
- "image_processor_type": "BitImageProcessor",
352
- "image_std": [
353
- 0.229,
354
- 0.224,
355
- 0.225
356
- ],
357
- "resample": 3,
358
- "rescale_factor": 0.00392156862745098,
359
- "size": {
360
- "shortest_edge": 378
361
- }
362
- }
363
-
364
- 2025-02-14 17:32:39,938 - finetune_llama.py:1239 - train - INFO - Total params: 3264865280
365
- 2025-02-14 17:32:39,938 - finetune_llama.py:1240 - train - INFO - Trainable params: 12589056
366
- 2025-02-14 17:32:39,938 - finetune_llama.py:1241 - train - INFO - LM head params: 394002432
367
- 2025-02-14 17:32:42,649 - trainer_callback.py:423 - add_callback - WARNING - You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
368
- :DefaultFlowCallback
369
- TensorBoardCallback
370
- 2025-02-14 17:32:42,649 - trainer.py:648 - __init__ - INFO - Using auto half precision backend
371
- 2025-02-14 17:32:43,183 - trainer.py:2134 - _inner_training_loop - INFO - ***** Running training *****
372
- 2025-02-14 17:32:43,184 - trainer.py:2135 - _inner_training_loop - INFO - Num examples = 550
373
- 2025-02-14 17:32:43,184 - trainer.py:2136 - _inner_training_loop - INFO - Num Epochs = 2
374
- 2025-02-14 17:32:43,184 - trainer.py:2137 - _inner_training_loop - INFO - Instantaneous batch size per device = 1
375
- 2025-02-14 17:32:43,184 - trainer.py:2140 - _inner_training_loop - INFO - Total train batch size (w. parallel, distributed & accumulation) = 1
376
- 2025-02-14 17:32:43,184 - trainer.py:2141 - _inner_training_loop - INFO - Gradient Accumulation steps = 1
377
- 2025-02-14 17:32:43,184 - trainer.py:2142 - _inner_training_loop - INFO - Total optimization steps = 1,100
378
- 2025-02-14 17:32:43,186 - trainer.py:2143 - _inner_training_loop - INFO - Number of trainable parameters = 406,591,488
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:056216c6b30ea50e9aeb44579620fe3d934c8fcb178331c5e9d64e261c2eada6
3
+ size 36433
 
runtime_logs/run_2025-02-14_17-36-05.log CHANGED
The diff for this file is too large to render. See raw diff
 
runtime_logs/run_2025-02-15_02-18-13.log CHANGED
The diff for this file is too large to render. See raw diff
 
runtime_logs/run_2025-02-15_02-47-54.log CHANGED
The diff for this file is too large to render. See raw diff
 
runtime_logs/run_2025-02-15_02-56-12.log CHANGED
@@ -1,378 +1,3 @@
1
- 2025-02-15 02:56:12,191 - training_args.py:2100 - _setup_devices - INFO - PyTorch: setting up devices
2
- 2025-02-15 02:56:12,699 - configuration_utils.py:731 - _get_config_dict - INFO - loading configuration file ./checkpoints/longvu_llama3_2/config.json
3
- 2025-02-15 02:56:12,703 - configuration_utils.py:800 - from_dict - INFO - Model config CambrianConfig {
4
- "_name_or_path": "/tmp/iopath_cache/manifold_cache/tree/users/shenx/finetune/09281004-cambrian_llama3_2_t576_ov",
5
- "architectures": [
6
- "CambrianLlamaForCausalLM"
7
- ],
8
- "attention_bias": false,
9
- "attention_dropout": 0.0,
10
- "bos_token_id": 128000,
11
- "connect_layer": 2,
12
- "connector_depth": 3,
13
- "connector_only": true,
14
- "dino_threshold": 0.83,
15
- "drop_threshold": 0.8,
16
- "eos_token_id": [
17
- 128001,
18
- 128008,
19
- 128009
20
- ],
21
- "frame_pos": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "highres": true,
26
- "highres_connect": false,
27
- "image_aspect_ratio": "pad",
28
- "image_position": 91,
29
- "image_token_len": 144,
30
- "initializer_range": 0.02,
31
- "intermediate_size": 8192,
32
- "is_image_newline": true,
33
- "is_st_sampler": false,
34
- "lowres_token": 8,
35
- "max_position_embeddings": 131072,
36
- "mlp_bias": false,
37
- "mm_patch_merge_type": "flat",
38
- "mm_projector_lr": null,
39
- "mm_projector_type": "sva",
40
- "mm_use_im_patch_token": false,
41
- "mm_use_im_start_end": false,
42
- "mm_vision_sampler_lr": null,
43
- "mm_vision_select_feature": "patch",
44
- "mm_vision_select_layer": -2,
45
- "mm_vision_tower_aux_list": [
46
- "siglip/CLIP-ViT-SO400M-14-384",
47
- "facebook/dinov2-giant-res378"
48
- ],
49
- "mm_vision_tower_aux_token_len_list": [
50
- 576,
51
- 576
52
- ],
53
- "mm_vision_tower_lr": null,
54
- "model_type": "cambrian_llama",
55
- "num_attention_heads": 24,
56
- "num_hidden_layers": 28,
57
- "num_key_value_heads": 8,
58
- "num_of_vision_sampler_layers": 10,
59
- "num_query_group": 1,
60
- "pretraining_tp": 1,
61
- "query_num_list": [
62
- 144
63
- ],
64
- "rms_norm_eps": 1e-05,
65
- "rope_scaling": {
66
- "factor": 32.0,
67
- "high_freq_factor": 4.0,
68
- "low_freq_factor": 1.0,
69
- "original_max_position_embeddings": 8192,
70
- "rope_type": "llama3"
71
- },
72
- "rope_theta": 500000.0,
73
- "spmd_debug": null,
74
- "spmd_fsdp_sharding": null,
75
- "spmd_mesh": null,
76
- "start_of_vision_sampler_layers": 0,
77
- "stride_of_vision_sampler_layers": 3,
78
- "tie_word_embeddings": false,
79
- "tokenizer_model_max_length": 8192,
80
- "tokenizer_padding_side": "right",
81
- "torch_dtype": "float32",
82
- "transformers_version": "4.43.1",
83
- "tune_mm_mlp_adapter": false,
84
- "unfreeze_mm_vision_tower": false,
85
- "use_cache": false,
86
- "use_mm_proj": true,
87
- "vision_hidden_size": 1024,
88
- "vision_tower_aux_token_len_list": [
89
- 576,
90
- 576
91
- ],
92
- "vocab_size": 128256
93
- }
94
-
95
- 2025-02-15 02:56:12,703 - modeling_utils.py:3618 - from_pretrained - INFO - loading weights file ./checkpoints/longvu_llama3_2/pytorch_model.bin
96
- 2025-02-15 02:56:12,742 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
97
- "bos_token_id": 128000,
98
- "eos_token_id": [
99
- 128001,
100
- 128008,
101
- 128009
102
- ],
103
- "use_cache": false
104
- }
105
-
106
- 2025-02-15 02:56:12,964 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
107
- 2025-02-15 02:56:12,967 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
108
- "apply_layernorm": true,
109
- "architectures": [
110
- "Dinov2Model"
111
- ],
112
- "attention_probs_dropout_prob": 0.0,
113
- "drop_path_rate": 0.0,
114
- "hidden_act": "gelu",
115
- "hidden_dropout_prob": 0.0,
116
- "hidden_size": 1536,
117
- "image_size": 518,
118
- "initializer_range": 0.02,
119
- "layer_norm_eps": 1e-06,
120
- "layerscale_value": 1.0,
121
- "mlp_ratio": 4,
122
- "model_type": "dinov2",
123
- "num_attention_heads": 24,
124
- "num_channels": 3,
125
- "num_hidden_layers": 40,
126
- "out_features": [
127
- "stage40"
128
- ],
129
- "out_indices": [
130
- 40
131
- ],
132
- "patch_size": 14,
133
- "qkv_bias": true,
134
- "reshape_hidden_states": true,
135
- "stage_names": [
136
- "stem",
137
- "stage1",
138
- "stage2",
139
- "stage3",
140
- "stage4",
141
- "stage5",
142
- "stage6",
143
- "stage7",
144
- "stage8",
145
- "stage9",
146
- "stage10",
147
- "stage11",
148
- "stage12",
149
- "stage13",
150
- "stage14",
151
- "stage15",
152
- "stage16",
153
- "stage17",
154
- "stage18",
155
- "stage19",
156
- "stage20",
157
- "stage21",
158
- "stage22",
159
- "stage23",
160
- "stage24",
161
- "stage25",
162
- "stage26",
163
- "stage27",
164
- "stage28",
165
- "stage29",
166
- "stage30",
167
- "stage31",
168
- "stage32",
169
- "stage33",
170
- "stage34",
171
- "stage35",
172
- "stage36",
173
- "stage37",
174
- "stage38",
175
- "stage39",
176
- "stage40"
177
- ],
178
- "torch_dtype": "float32",
179
- "transformers_version": "4.43.1",
180
- "use_swiglu_ffn": true
181
- }
182
-
183
- 2025-02-15 02:56:14,324 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing CambrianLlamaForCausalLM.
184
-
185
- 2025-02-15 02:56:14,324 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of CambrianLlamaForCausalLM were initialized from the model checkpoint at ./checkpoints/longvu_llama3_2.
186
- If your task is similar to the task the model of the checkpoint was trained on, you can already use CambrianLlamaForCausalLM for predictions without further training.
187
- 2025-02-15 02:56:14,328 - configuration_utils.py:991 - from_pretrained - INFO - loading configuration file ./checkpoints/longvu_llama3_2/generation_config.json
188
- 2025-02-15 02:56:14,328 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
189
- "bos_token_id": 128000,
190
- "do_sample": true,
191
- "eos_token_id": [
192
- 128001,
193
- 128008,
194
- 128009
195
- ],
196
- "temperature": 0.6,
197
- "top_p": 0.9
198
- }
199
-
200
- 2025-02-15 02:56:14,485 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer.json
201
- 2025-02-15 02:56:14,485 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file added_tokens.json
202
- 2025-02-15 02:56:14,485 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file special_tokens_map.json
203
- 2025-02-15 02:56:14,485 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer_config.json
204
- 2025-02-15 02:56:14,682 - tokenization_utils_base.py:2533 - _from_pretrained - INFO - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
205
- 2025-02-15 02:56:15,344 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/config.json
206
- 2025-02-15 02:56:15,347 - configuration_utils.py:800 - from_dict - INFO - Model config SiglipVisionConfig {
207
- "attention_dropout": 0.0,
208
- "hidden_act": "gelu_pytorch_tanh",
209
- "hidden_size": 1152,
210
- "image_size": 384,
211
- "intermediate_size": 4304,
212
- "layer_norm_eps": 1e-06,
213
- "model_type": "siglip_vision_model",
214
- "num_attention_heads": 16,
215
- "num_channels": 3,
216
- "num_hidden_layers": 27,
217
- "patch_size": 14,
218
- "transformers_version": "4.43.1"
219
- }
220
-
221
- 2025-02-15 02:56:15,347 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/model.safetensors
222
- 2025-02-15 02:56:15,490 - modeling_utils.py:4440 - _load_pretrained_model - INFO - Some weights of the model checkpoint at google/siglip-so400m-patch14-384 were not used when initializing SiglipVisionModel: ['logit_bias', 'logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 
'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.12.layer_norm1.bias', 'text_model.encoder.layers.12.layer_norm1.weight', 'text_model.encoder.layers.12.layer_norm2.bias', 'text_model.encoder.layers.12.layer_norm2.weight', 'text_model.encoder.layers.12.mlp.fc1.bias', 'text_model.encoder.layers.12.mlp.fc1.weight', 'text_model.encoder.layers.12.mlp.fc2.bias', 'text_model.encoder.layers.12.mlp.fc2.weight', 'text_model.encoder.layers.12.self_attn.k_proj.bias', 'text_model.encoder.layers.12.self_attn.k_proj.weight', 'text_model.encoder.layers.12.self_attn.out_proj.bias', 'text_model.encoder.layers.12.self_attn.out_proj.weight', 'text_model.encoder.layers.12.self_attn.q_proj.bias', 'text_model.encoder.layers.12.self_attn.q_proj.weight', 'text_model.encoder.layers.12.self_attn.v_proj.bias', 'text_model.encoder.layers.12.self_attn.v_proj.weight', 'text_model.encoder.layers.13.layer_norm1.bias', 'text_model.encoder.layers.13.layer_norm1.weight', 'text_model.encoder.layers.13.layer_norm2.bias', 'text_model.encoder.layers.13.layer_norm2.weight', 'text_model.encoder.layers.13.mlp.fc1.bias', 'text_model.encoder.layers.13.mlp.fc1.weight', 'text_model.encoder.layers.13.mlp.fc2.bias', 'text_model.encoder.layers.13.mlp.fc2.weight', 'text_model.encoder.layers.13.self_attn.k_proj.bias', 'text_model.encoder.layers.13.self_attn.k_proj.weight', 'text_model.encoder.layers.13.self_attn.out_proj.bias', 'text_model.encoder.layers.13.self_attn.out_proj.weight', 'text_model.encoder.layers.13.self_attn.q_proj.bias', 'text_model.encoder.layers.13.self_attn.q_proj.weight', 'text_model.encoder.layers.13.self_attn.v_proj.bias', 'text_model.encoder.layers.13.self_attn.v_proj.weight', 'text_model.encoder.layers.14.layer_norm1.bias', 'text_model.encoder.layers.14.layer_norm1.weight', 'text_model.encoder.layers.14.layer_norm2.bias', 'text_model.encoder.layers.14.layer_norm2.weight', 'text_model.encoder.layers.14.mlp.fc1.bias', 'text_model.encoder.layers.14.mlp.fc1.weight', 'text_model.encoder.layers.14.mlp.fc2.bias', 'text_model.encoder.layers.14.mlp.fc2.weight', 'text_model.encoder.layers.14.self_attn.k_proj.bias', 'text_model.encoder.layers.14.self_attn.k_proj.weight', 'text_model.encoder.layers.14.self_attn.out_proj.bias', 'text_model.encoder.layers.14.self_attn.out_proj.weight', 'text_model.encoder.layers.14.self_attn.q_proj.bias', 'text_model.encoder.layers.14.self_attn.q_proj.weight', 'text_model.encoder.layers.14.self_attn.v_proj.bias', 'text_model.encoder.layers.14.self_attn.v_proj.weight', 'text_model.encoder.layers.15.layer_norm1.bias', 'text_model.encoder.layers.15.layer_norm1.weight', 'text_model.encoder.layers.15.layer_norm2.bias', 'text_model.encoder.layers.15.layer_norm2.weight', 'text_model.encoder.layers.15.mlp.fc1.bias', 'text_model.encoder.layers.15.mlp.fc1.weight', 'text_model.encoder.layers.15.mlp.fc2.bias', 'text_model.encoder.layers.15.mlp.fc2.weight', 'text_model.encoder.layers.15.self_attn.k_proj.bias', 'text_model.encoder.layers.15.self_attn.k_proj.weight', 'text_model.encoder.layers.15.self_attn.out_proj.bias', 'text_model.encoder.layers.15.self_attn.out_proj.weight', 'text_model.encoder.layers.15.self_attn.q_proj.bias', 'text_model.encoder.layers.15.self_attn.q_proj.weight', 'text_model.encoder.layers.15.self_attn.v_proj.bias', 'text_model.encoder.layers.15.self_attn.v_proj.weight', 'text_model.encoder.layers.16.layer_norm1.bias', 'text_model.encoder.layers.16.layer_norm1.weight', 
'text_model.encoder.layers.16.layer_norm2.bias', 'text_model.encoder.layers.16.layer_norm2.weight', 'text_model.encoder.layers.16.mlp.fc1.bias', 'text_model.encoder.layers.16.mlp.fc1.weight', 'text_model.encoder.layers.16.mlp.fc2.bias', 'text_model.encoder.layers.16.mlp.fc2.weight', 'text_model.encoder.layers.16.self_attn.k_proj.bias', 'text_model.encoder.layers.16.self_attn.k_proj.weight', 'text_model.encoder.layers.16.self_attn.out_proj.bias', 'text_model.encoder.layers.16.self_attn.out_proj.weight', 'text_model.encoder.layers.16.self_attn.q_proj.bias', 'text_model.encoder.layers.16.self_attn.q_proj.weight', 'text_model.encoder.layers.16.self_attn.v_proj.bias', 'text_model.encoder.layers.16.self_attn.v_proj.weight', 'text_model.encoder.layers.17.layer_norm1.bias', 'text_model.encoder.layers.17.layer_norm1.weight', 'text_model.encoder.layers.17.layer_norm2.bias', 'text_model.encoder.layers.17.layer_norm2.weight', 'text_model.encoder.layers.17.mlp.fc1.bias', 'text_model.encoder.layers.17.mlp.fc1.weight', 'text_model.encoder.layers.17.mlp.fc2.bias', 'text_model.encoder.layers.17.mlp.fc2.weight', 'text_model.encoder.layers.17.self_attn.k_proj.bias', 'text_model.encoder.layers.17.self_attn.k_proj.weight', 'text_model.encoder.layers.17.self_attn.out_proj.bias', 'text_model.encoder.layers.17.self_attn.out_proj.weight', 'text_model.encoder.layers.17.self_attn.q_proj.bias', 'text_model.encoder.layers.17.self_attn.q_proj.weight', 'text_model.encoder.layers.17.self_attn.v_proj.bias', 'text_model.encoder.layers.17.self_attn.v_proj.weight', 'text_model.encoder.layers.18.layer_norm1.bias', 'text_model.encoder.layers.18.layer_norm1.weight', 'text_model.encoder.layers.18.layer_norm2.bias', 'text_model.encoder.layers.18.layer_norm2.weight', 'text_model.encoder.layers.18.mlp.fc1.bias', 'text_model.encoder.layers.18.mlp.fc1.weight', 'text_model.encoder.layers.18.mlp.fc2.bias', 'text_model.encoder.layers.18.mlp.fc2.weight', 'text_model.encoder.layers.18.self_attn.k_proj.bias', 'text_model.encoder.layers.18.self_attn.k_proj.weight', 'text_model.encoder.layers.18.self_attn.out_proj.bias', 'text_model.encoder.layers.18.self_attn.out_proj.weight', 'text_model.encoder.layers.18.self_attn.q_proj.bias', 'text_model.encoder.layers.18.self_attn.q_proj.weight', 'text_model.encoder.layers.18.self_attn.v_proj.bias', 'text_model.encoder.layers.18.self_attn.v_proj.weight', 'text_model.encoder.layers.19.layer_norm1.bias', 'text_model.encoder.layers.19.layer_norm1.weight', 'text_model.encoder.layers.19.layer_norm2.bias', 'text_model.encoder.layers.19.layer_norm2.weight', 'text_model.encoder.layers.19.mlp.fc1.bias', 'text_model.encoder.layers.19.mlp.fc1.weight', 'text_model.encoder.layers.19.mlp.fc2.bias', 'text_model.encoder.layers.19.mlp.fc2.weight', 'text_model.encoder.layers.19.self_attn.k_proj.bias', 'text_model.encoder.layers.19.self_attn.k_proj.weight', 'text_model.encoder.layers.19.self_attn.out_proj.bias', 'text_model.encoder.layers.19.self_attn.out_proj.weight', 'text_model.encoder.layers.19.self_attn.q_proj.bias', 'text_model.encoder.layers.19.self_attn.q_proj.weight', 'text_model.encoder.layers.19.self_attn.v_proj.bias', 'text_model.encoder.layers.19.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 
'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.20.layer_norm1.bias', 'text_model.encoder.layers.20.layer_norm1.weight', 'text_model.encoder.layers.20.layer_norm2.bias', 'text_model.encoder.layers.20.layer_norm2.weight', 'text_model.encoder.layers.20.mlp.fc1.bias', 'text_model.encoder.layers.20.mlp.fc1.weight', 'text_model.encoder.layers.20.mlp.fc2.bias', 'text_model.encoder.layers.20.mlp.fc2.weight', 'text_model.encoder.layers.20.self_attn.k_proj.bias', 'text_model.encoder.layers.20.self_attn.k_proj.weight', 'text_model.encoder.layers.20.self_attn.out_proj.bias', 'text_model.encoder.layers.20.self_attn.out_proj.weight', 'text_model.encoder.layers.20.self_attn.q_proj.bias', 'text_model.encoder.layers.20.self_attn.q_proj.weight', 'text_model.encoder.layers.20.self_attn.v_proj.bias', 'text_model.encoder.layers.20.self_attn.v_proj.weight', 'text_model.encoder.layers.21.layer_norm1.bias', 'text_model.encoder.layers.21.layer_norm1.weight', 'text_model.encoder.layers.21.layer_norm2.bias', 'text_model.encoder.layers.21.layer_norm2.weight', 'text_model.encoder.layers.21.mlp.fc1.bias', 'text_model.encoder.layers.21.mlp.fc1.weight', 'text_model.encoder.layers.21.mlp.fc2.bias', 'text_model.encoder.layers.21.mlp.fc2.weight', 'text_model.encoder.layers.21.self_attn.k_proj.bias', 'text_model.encoder.layers.21.self_attn.k_proj.weight', 'text_model.encoder.layers.21.self_attn.out_proj.bias', 'text_model.encoder.layers.21.self_attn.out_proj.weight', 'text_model.encoder.layers.21.self_attn.q_proj.bias', 'text_model.encoder.layers.21.self_attn.q_proj.weight', 'text_model.encoder.layers.21.self_attn.v_proj.bias', 'text_model.encoder.layers.21.self_attn.v_proj.weight', 'text_model.encoder.layers.22.layer_norm1.bias', 'text_model.encoder.layers.22.layer_norm1.weight', 'text_model.encoder.layers.22.layer_norm2.bias', 'text_model.encoder.layers.22.layer_norm2.weight', 'text_model.encoder.layers.22.mlp.fc1.bias', 'text_model.encoder.layers.22.mlp.fc1.weight', 'text_model.encoder.layers.22.mlp.fc2.bias', 'text_model.encoder.layers.22.mlp.fc2.weight', 'text_model.encoder.layers.22.self_attn.k_proj.bias', 'text_model.encoder.layers.22.self_attn.k_proj.weight', 'text_model.encoder.layers.22.self_attn.out_proj.bias', 'text_model.encoder.layers.22.self_attn.out_proj.weight', 'text_model.encoder.layers.22.self_attn.q_proj.bias', 'text_model.encoder.layers.22.self_attn.q_proj.weight', 'text_model.encoder.layers.22.self_attn.v_proj.bias', 'text_model.encoder.layers.22.self_attn.v_proj.weight', 'text_model.encoder.layers.23.layer_norm1.bias', 'text_model.encoder.layers.23.layer_norm1.weight', 'text_model.encoder.layers.23.layer_norm2.bias', 'text_model.encoder.layers.23.layer_norm2.weight', 'text_model.encoder.layers.23.mlp.fc1.bias', 'text_model.encoder.layers.23.mlp.fc1.weight', 'text_model.encoder.layers.23.mlp.fc2.bias', 'text_model.encoder.layers.23.mlp.fc2.weight', 'text_model.encoder.layers.23.self_attn.k_proj.bias', 'text_model.encoder.layers.23.self_attn.k_proj.weight', 'text_model.encoder.layers.23.self_attn.out_proj.bias', 
'text_model.encoder.layers.23.self_attn.out_proj.weight', 'text_model.encoder.layers.23.self_attn.q_proj.bias', 'text_model.encoder.layers.23.self_attn.q_proj.weight', 'text_model.encoder.layers.23.self_attn.v_proj.bias', 'text_model.encoder.layers.23.self_attn.v_proj.weight', 'text_model.encoder.layers.24.layer_norm1.bias', 'text_model.encoder.layers.24.layer_norm1.weight', 'text_model.encoder.layers.24.layer_norm2.bias', 'text_model.encoder.layers.24.layer_norm2.weight', 'text_model.encoder.layers.24.mlp.fc1.bias', 'text_model.encoder.layers.24.mlp.fc1.weight', 'text_model.encoder.layers.24.mlp.fc2.bias', 'text_model.encoder.layers.24.mlp.fc2.weight', 'text_model.encoder.layers.24.self_attn.k_proj.bias', 'text_model.encoder.layers.24.self_attn.k_proj.weight', 'text_model.encoder.layers.24.self_attn.out_proj.bias', 'text_model.encoder.layers.24.self_attn.out_proj.weight', 'text_model.encoder.layers.24.self_attn.q_proj.bias', 'text_model.encoder.layers.24.self_attn.q_proj.weight', 'text_model.encoder.layers.24.self_attn.v_proj.bias', 'text_model.encoder.layers.24.self_attn.v_proj.weight', 'text_model.encoder.layers.25.layer_norm1.bias', 'text_model.encoder.layers.25.layer_norm1.weight', 'text_model.encoder.layers.25.layer_norm2.bias', 'text_model.encoder.layers.25.layer_norm2.weight', 'text_model.encoder.layers.25.mlp.fc1.bias', 'text_model.encoder.layers.25.mlp.fc1.weight', 'text_model.encoder.layers.25.mlp.fc2.bias', 'text_model.encoder.layers.25.mlp.fc2.weight', 'text_model.encoder.layers.25.self_attn.k_proj.bias', 'text_model.encoder.layers.25.self_attn.k_proj.weight', 'text_model.encoder.layers.25.self_attn.out_proj.bias', 'text_model.encoder.layers.25.self_attn.out_proj.weight', 'text_model.encoder.layers.25.self_attn.q_proj.bias', 'text_model.encoder.layers.25.self_attn.q_proj.weight', 'text_model.encoder.layers.25.self_attn.v_proj.bias', 'text_model.encoder.layers.25.self_attn.v_proj.weight', 'text_model.encoder.layers.26.layer_norm1.bias', 'text_model.encoder.layers.26.layer_norm1.weight', 'text_model.encoder.layers.26.layer_norm2.bias', 'text_model.encoder.layers.26.layer_norm2.weight', 'text_model.encoder.layers.26.mlp.fc1.bias', 'text_model.encoder.layers.26.mlp.fc1.weight', 'text_model.encoder.layers.26.mlp.fc2.bias', 'text_model.encoder.layers.26.mlp.fc2.weight', 'text_model.encoder.layers.26.self_attn.k_proj.bias', 'text_model.encoder.layers.26.self_attn.k_proj.weight', 'text_model.encoder.layers.26.self_attn.out_proj.bias', 'text_model.encoder.layers.26.self_attn.out_proj.weight', 'text_model.encoder.layers.26.self_attn.q_proj.bias', 'text_model.encoder.layers.26.self_attn.q_proj.weight', 'text_model.encoder.layers.26.self_attn.v_proj.bias', 'text_model.encoder.layers.26.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 
'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 
'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_model.head.bias', 'text_model.head.weight']
223
- - This IS expected if you are initializing SiglipVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
224
- - This IS NOT expected if you are initializing SiglipVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
225
- 2025-02-15 02:56:15,491 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of SiglipVisionModel were initialized from the model checkpoint at google/siglip-so400m-patch14-384.
226
- If your task is similar to the task the model of the checkpoint was trained on, you can already use SiglipVisionModel for predictions without further training.
227
- 2025-02-15 02:56:15,674 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/preprocessor_config.json
228
- 2025-02-15 02:56:15,674 - image_processing_base.py:429 - from_dict - INFO - Image processor SiglipImageProcessor {
229
- "do_convert_rgb": null,
230
- "do_normalize": true,
231
- "do_rescale": true,
232
- "do_resize": true,
233
- "image_mean": [
234
- 0.5,
235
- 0.5,
236
- 0.5
237
- ],
238
- "image_processor_type": "SiglipImageProcessor",
239
- "image_std": [
240
- 0.5,
241
- 0.5,
242
- 0.5
243
- ],
244
- "processor_class": "SiglipProcessor",
245
- "resample": 3,
246
- "rescale_factor": 0.00392156862745098,
247
- "size": {
248
- "height": 384,
249
- "width": 384
250
- }
251
- }
252
-
253
- 2025-02-15 02:56:16,050 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
254
- 2025-02-15 02:56:16,052 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
255
- "apply_layernorm": true,
256
- "architectures": [
257
- "Dinov2Model"
258
- ],
259
- "attention_probs_dropout_prob": 0.0,
260
- "drop_path_rate": 0.0,
261
- "hidden_act": "gelu",
262
- "hidden_dropout_prob": 0.0,
263
- "hidden_size": 1536,
264
- "image_size": 518,
265
- "initializer_range": 0.02,
266
- "layer_norm_eps": 1e-06,
267
- "layerscale_value": 1.0,
268
- "mlp_ratio": 4,
269
- "model_type": "dinov2",
270
- "num_attention_heads": 24,
271
- "num_channels": 3,
272
- "num_hidden_layers": 40,
273
- "out_features": [
274
- "stage40"
275
- ],
276
- "out_indices": [
277
- 40
278
- ],
279
- "patch_size": 14,
280
- "qkv_bias": true,
281
- "reshape_hidden_states": true,
282
- "stage_names": [
283
- "stem",
284
- "stage1",
285
- "stage2",
286
- "stage3",
287
- "stage4",
288
- "stage5",
289
- "stage6",
290
- "stage7",
291
- "stage8",
292
- "stage9",
293
- "stage10",
294
- "stage11",
295
- "stage12",
296
- "stage13",
297
- "stage14",
298
- "stage15",
299
- "stage16",
300
- "stage17",
301
- "stage18",
302
- "stage19",
303
- "stage20",
304
- "stage21",
305
- "stage22",
306
- "stage23",
307
- "stage24",
308
- "stage25",
309
- "stage26",
310
- "stage27",
311
- "stage28",
312
- "stage29",
313
- "stage30",
314
- "stage31",
315
- "stage32",
316
- "stage33",
317
- "stage34",
318
- "stage35",
319
- "stage36",
320
- "stage37",
321
- "stage38",
322
- "stage39",
323
- "stage40"
324
- ],
325
- "torch_dtype": "float32",
326
- "transformers_version": "4.43.1",
327
- "use_swiglu_ffn": true
328
- }
329
-
330
- 2025-02-15 02:56:16,053 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/model.safetensors
331
- 2025-02-15 02:56:16,377 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing Dinov2Model.
332
-
333
- 2025-02-15 02:56:16,377 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of Dinov2Model were initialized from the model checkpoint at facebook/dinov2-giant.
334
- If your task is similar to the task the model of the checkpoint was trained on, you can already use Dinov2Model for predictions without further training.
335
- 2025-02-15 02:56:16,563 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/preprocessor_config.json
336
- 2025-02-15 02:56:16,566 - image_processing_base.py:429 - from_dict - INFO - Image processor BitImageProcessor {
337
- "crop_size": {
338
- "height": 378,
339
- "width": 378
340
- },
341
- "do_center_crop": true,
342
- "do_convert_rgb": true,
343
- "do_normalize": true,
344
- "do_rescale": true,
345
- "do_resize": true,
346
- "image_mean": [
347
- 0.485,
348
- 0.456,
349
- 0.406
350
- ],
351
- "image_processor_type": "BitImageProcessor",
352
- "image_std": [
353
- 0.229,
354
- 0.224,
355
- 0.225
356
- ],
357
- "resample": 3,
358
- "rescale_factor": 0.00392156862745098,
359
- "size": {
360
- "shortest_edge": 378
361
- }
362
- }
363
-
364
- 2025-02-15 02:56:17,230 - finetune_llama.py:1239 - train - INFO - Total params: 3264865280
365
- 2025-02-15 02:56:17,230 - finetune_llama.py:1240 - train - INFO - Trainable params: 12589056
366
- 2025-02-15 02:56:17,230 - finetune_llama.py:1241 - train - INFO - LM head params: 394002432
367
- 2025-02-15 02:56:19,287 - trainer_callback.py:423 - add_callback - WARNING - You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
368
- :DefaultFlowCallback
369
- TensorBoardCallback
370
- 2025-02-15 02:56:19,287 - trainer.py:648 - __init__ - INFO - Using auto half precision backend
371
- 2025-02-15 02:56:19,591 - trainer.py:2134 - _inner_training_loop - INFO - ***** Running training *****
372
- 2025-02-15 02:56:19,591 - trainer.py:2135 - _inner_training_loop - INFO - Num examples = 2
373
- 2025-02-15 02:56:19,591 - trainer.py:2136 - _inner_training_loop - INFO - Num Epochs = 2
374
- 2025-02-15 02:56:19,591 - trainer.py:2137 - _inner_training_loop - INFO - Instantaneous batch size per device = 1
375
- 2025-02-15 02:56:19,591 - trainer.py:2140 - _inner_training_loop - INFO - Total train batch size (w. parallel, distributed & accumulation) = 1
376
- 2025-02-15 02:56:19,591 - trainer.py:2141 - _inner_training_loop - INFO - Gradient Accumulation steps = 1
377
- 2025-02-15 02:56:19,591 - trainer.py:2142 - _inner_training_loop - INFO - Total optimization steps = 4
378
- 2025-02-15 02:56:19,593 - trainer.py:2143 - _inner_training_loop - INFO - Number of trainable parameters = 406,591,488
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb53b35784cd4c3deffdb4f7fbb4b55778d0370f0a55cccf53af92339759f3a2
3
+ size 36427
 
runtime_logs/run_2025-02-15_03-00-20.log CHANGED
The diff for this file is too large to render.
 
runtime_logs/run_2025-02-15_03-02-43.log CHANGED
@@ -1,383 +1,3 @@
1
- 2025-02-15 03:02:43,575 - training_args.py:2100 - _setup_devices - INFO - PyTorch: setting up devices
2
- 2025-02-15 03:02:44,236 - configuration_utils.py:731 - _get_config_dict - INFO - loading configuration file ./checkpoints/longvu_llama3_2/config.json
3
- 2025-02-15 03:02:44,240 - configuration_utils.py:800 - from_dict - INFO - Model config CambrianConfig {
4
- "_name_or_path": "/tmp/iopath_cache/manifold_cache/tree/users/shenx/finetune/09281004-cambrian_llama3_2_t576_ov",
5
- "architectures": [
6
- "CambrianLlamaForCausalLM"
7
- ],
8
- "attention_bias": false,
9
- "attention_dropout": 0.0,
10
- "bos_token_id": 128000,
11
- "connect_layer": 2,
12
- "connector_depth": 3,
13
- "connector_only": true,
14
- "dino_threshold": 0.83,
15
- "drop_threshold": 0.8,
16
- "eos_token_id": [
17
- 128001,
18
- 128008,
19
- 128009
20
- ],
21
- "frame_pos": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "highres": true,
26
- "highres_connect": false,
27
- "image_aspect_ratio": "pad",
28
- "image_position": 91,
29
- "image_token_len": 144,
30
- "initializer_range": 0.02,
31
- "intermediate_size": 8192,
32
- "is_image_newline": true,
33
- "is_st_sampler": false,
34
- "lowres_token": 8,
35
- "max_position_embeddings": 131072,
36
- "mlp_bias": false,
37
- "mm_patch_merge_type": "flat",
38
- "mm_projector_lr": null,
39
- "mm_projector_type": "sva",
40
- "mm_use_im_patch_token": false,
41
- "mm_use_im_start_end": false,
42
- "mm_vision_sampler_lr": null,
43
- "mm_vision_select_feature": "patch",
44
- "mm_vision_select_layer": -2,
45
- "mm_vision_tower_aux_list": [
46
- "siglip/CLIP-ViT-SO400M-14-384",
47
- "facebook/dinov2-giant-res378"
48
- ],
49
- "mm_vision_tower_aux_token_len_list": [
50
- 576,
51
- 576
52
- ],
53
- "mm_vision_tower_lr": null,
54
- "model_type": "cambrian_llama",
55
- "num_attention_heads": 24,
56
- "num_hidden_layers": 28,
57
- "num_key_value_heads": 8,
58
- "num_of_vision_sampler_layers": 10,
59
- "num_query_group": 1,
60
- "pretraining_tp": 1,
61
- "query_num_list": [
62
- 144
63
- ],
64
- "rms_norm_eps": 1e-05,
65
- "rope_scaling": {
66
- "factor": 32.0,
67
- "high_freq_factor": 4.0,
68
- "low_freq_factor": 1.0,
69
- "original_max_position_embeddings": 8192,
70
- "rope_type": "llama3"
71
- },
72
- "rope_theta": 500000.0,
73
- "spmd_debug": null,
74
- "spmd_fsdp_sharding": null,
75
- "spmd_mesh": null,
76
- "start_of_vision_sampler_layers": 0,
77
- "stride_of_vision_sampler_layers": 3,
78
- "tie_word_embeddings": false,
79
- "tokenizer_model_max_length": 8192,
80
- "tokenizer_padding_side": "right",
81
- "torch_dtype": "float32",
82
- "transformers_version": "4.43.1",
83
- "tune_mm_mlp_adapter": false,
84
- "unfreeze_mm_vision_tower": false,
85
- "use_cache": false,
86
- "use_mm_proj": true,
87
- "vision_hidden_size": 1024,
88
- "vision_tower_aux_token_len_list": [
89
- 576,
90
- 576
91
- ],
92
- "vocab_size": 128256
93
- }
94
-
95
- 2025-02-15 03:02:44,240 - modeling_utils.py:3618 - from_pretrained - INFO - loading weights file ./checkpoints/longvu_llama3_2/pytorch_model.bin
96
- 2025-02-15 03:02:44,301 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
97
- "bos_token_id": 128000,
98
- "eos_token_id": [
99
- 128001,
100
- 128008,
101
- 128009
102
- ],
103
- "use_cache": false
104
- }
105
-
106
- 2025-02-15 03:02:44,543 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
107
- 2025-02-15 03:02:44,547 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
108
- "apply_layernorm": true,
109
- "architectures": [
110
- "Dinov2Model"
111
- ],
112
- "attention_probs_dropout_prob": 0.0,
113
- "drop_path_rate": 0.0,
114
- "hidden_act": "gelu",
115
- "hidden_dropout_prob": 0.0,
116
- "hidden_size": 1536,
117
- "image_size": 518,
118
- "initializer_range": 0.02,
119
- "layer_norm_eps": 1e-06,
120
- "layerscale_value": 1.0,
121
- "mlp_ratio": 4,
122
- "model_type": "dinov2",
123
- "num_attention_heads": 24,
124
- "num_channels": 3,
125
- "num_hidden_layers": 40,
126
- "out_features": [
127
- "stage40"
128
- ],
129
- "out_indices": [
130
- 40
131
- ],
132
- "patch_size": 14,
133
- "qkv_bias": true,
134
- "reshape_hidden_states": true,
135
- "stage_names": [
136
- "stem",
137
- "stage1",
138
- "stage2",
139
- "stage3",
140
- "stage4",
141
- "stage5",
142
- "stage6",
143
- "stage7",
144
- "stage8",
145
- "stage9",
146
- "stage10",
147
- "stage11",
148
- "stage12",
149
- "stage13",
150
- "stage14",
151
- "stage15",
152
- "stage16",
153
- "stage17",
154
- "stage18",
155
- "stage19",
156
- "stage20",
157
- "stage21",
158
- "stage22",
159
- "stage23",
160
- "stage24",
161
- "stage25",
162
- "stage26",
163
- "stage27",
164
- "stage28",
165
- "stage29",
166
- "stage30",
167
- "stage31",
168
- "stage32",
169
- "stage33",
170
- "stage34",
171
- "stage35",
172
- "stage36",
173
- "stage37",
174
- "stage38",
175
- "stage39",
176
- "stage40"
177
- ],
178
- "torch_dtype": "float32",
179
- "transformers_version": "4.43.1",
180
- "use_swiglu_ffn": true
181
- }
182
-
183
- 2025-02-15 03:02:46,020 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing CambrianLlamaForCausalLM.
184
-
185
- 2025-02-15 03:02:46,020 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of CambrianLlamaForCausalLM were initialized from the model checkpoint at ./checkpoints/longvu_llama3_2.
186
- If your task is similar to the task the model of the checkpoint was trained on, you can already use CambrianLlamaForCausalLM for predictions without further training.
187
- 2025-02-15 03:02:46,026 - configuration_utils.py:991 - from_pretrained - INFO - loading configuration file ./checkpoints/longvu_llama3_2/generation_config.json
188
- 2025-02-15 03:02:46,026 - configuration_utils.py:1038 - from_dict - INFO - Generate config GenerationConfig {
189
- "bos_token_id": 128000,
190
- "do_sample": true,
191
- "eos_token_id": [
192
- 128001,
193
- 128008,
194
- 128009
195
- ],
196
- "temperature": 0.6,
197
- "top_p": 0.9
198
- }
199
-
200
- 2025-02-15 03:02:46,298 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer.json
201
- 2025-02-15 03:02:46,298 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file added_tokens.json
202
- 2025-02-15 03:02:46,299 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file special_tokens_map.json
203
- 2025-02-15 03:02:46,299 - tokenization_utils_base.py:2287 - from_pretrained - INFO - loading file tokenizer_config.json
204
- 2025-02-15 03:02:46,709 - tokenization_utils_base.py:2533 - _from_pretrained - INFO - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
205
- 2025-02-15 03:02:47,394 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/config.json
206
- 2025-02-15 03:02:47,396 - configuration_utils.py:800 - from_dict - INFO - Model config SiglipVisionConfig {
207
- "attention_dropout": 0.0,
208
- "hidden_act": "gelu_pytorch_tanh",
209
- "hidden_size": 1152,
210
- "image_size": 384,
211
- "intermediate_size": 4304,
212
- "layer_norm_eps": 1e-06,
213
- "model_type": "siglip_vision_model",
214
- "num_attention_heads": 16,
215
- "num_channels": 3,
216
- "num_hidden_layers": 27,
217
- "patch_size": 14,
218
- "transformers_version": "4.43.1"
219
- }
220
-
221
- 2025-02-15 03:02:47,397 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/model.safetensors
222
- 2025-02-15 03:02:47,666 - modeling_utils.py:4440 - _load_pretrained_model - INFO - Some weights of the model checkpoint at google/siglip-so400m-patch14-384 were not used when initializing SiglipVisionModel: ['logit_bias', 'logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 
'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.12.layer_norm1.bias', 'text_model.encoder.layers.12.layer_norm1.weight', 'text_model.encoder.layers.12.layer_norm2.bias', 'text_model.encoder.layers.12.layer_norm2.weight', 'text_model.encoder.layers.12.mlp.fc1.bias', 'text_model.encoder.layers.12.mlp.fc1.weight', 'text_model.encoder.layers.12.mlp.fc2.bias', 'text_model.encoder.layers.12.mlp.fc2.weight', 'text_model.encoder.layers.12.self_attn.k_proj.bias', 'text_model.encoder.layers.12.self_attn.k_proj.weight', 'text_model.encoder.layers.12.self_attn.out_proj.bias', 'text_model.encoder.layers.12.self_attn.out_proj.weight', 'text_model.encoder.layers.12.self_attn.q_proj.bias', 'text_model.encoder.layers.12.self_attn.q_proj.weight', 'text_model.encoder.layers.12.self_attn.v_proj.bias', 'text_model.encoder.layers.12.self_attn.v_proj.weight', 'text_model.encoder.layers.13.layer_norm1.bias', 'text_model.encoder.layers.13.layer_norm1.weight', 'text_model.encoder.layers.13.layer_norm2.bias', 'text_model.encoder.layers.13.layer_norm2.weight', 'text_model.encoder.layers.13.mlp.fc1.bias', 'text_model.encoder.layers.13.mlp.fc1.weight', 'text_model.encoder.layers.13.mlp.fc2.bias', 'text_model.encoder.layers.13.mlp.fc2.weight', 'text_model.encoder.layers.13.self_attn.k_proj.bias', 'text_model.encoder.layers.13.self_attn.k_proj.weight', 'text_model.encoder.layers.13.self_attn.out_proj.bias', 'text_model.encoder.layers.13.self_attn.out_proj.weight', 'text_model.encoder.layers.13.self_attn.q_proj.bias', 'text_model.encoder.layers.13.self_attn.q_proj.weight', 'text_model.encoder.layers.13.self_attn.v_proj.bias', 'text_model.encoder.layers.13.self_attn.v_proj.weight', 'text_model.encoder.layers.14.layer_norm1.bias', 'text_model.encoder.layers.14.layer_norm1.weight', 'text_model.encoder.layers.14.layer_norm2.bias', 'text_model.encoder.layers.14.layer_norm2.weight', 'text_model.encoder.layers.14.mlp.fc1.bias', 'text_model.encoder.layers.14.mlp.fc1.weight', 'text_model.encoder.layers.14.mlp.fc2.bias', 'text_model.encoder.layers.14.mlp.fc2.weight', 'text_model.encoder.layers.14.self_attn.k_proj.bias', 'text_model.encoder.layers.14.self_attn.k_proj.weight', 'text_model.encoder.layers.14.self_attn.out_proj.bias', 'text_model.encoder.layers.14.self_attn.out_proj.weight', 'text_model.encoder.layers.14.self_attn.q_proj.bias', 'text_model.encoder.layers.14.self_attn.q_proj.weight', 'text_model.encoder.layers.14.self_attn.v_proj.bias', 'text_model.encoder.layers.14.self_attn.v_proj.weight', 'text_model.encoder.layers.15.layer_norm1.bias', 'text_model.encoder.layers.15.layer_norm1.weight', 'text_model.encoder.layers.15.layer_norm2.bias', 'text_model.encoder.layers.15.layer_norm2.weight', 'text_model.encoder.layers.15.mlp.fc1.bias', 'text_model.encoder.layers.15.mlp.fc1.weight', 'text_model.encoder.layers.15.mlp.fc2.bias', 'text_model.encoder.layers.15.mlp.fc2.weight', 'text_model.encoder.layers.15.self_attn.k_proj.bias', 'text_model.encoder.layers.15.self_attn.k_proj.weight', 'text_model.encoder.layers.15.self_attn.out_proj.bias', 'text_model.encoder.layers.15.self_attn.out_proj.weight', 'text_model.encoder.layers.15.self_attn.q_proj.bias', 'text_model.encoder.layers.15.self_attn.q_proj.weight', 'text_model.encoder.layers.15.self_attn.v_proj.bias', 'text_model.encoder.layers.15.self_attn.v_proj.weight', 'text_model.encoder.layers.16.layer_norm1.bias', 'text_model.encoder.layers.16.layer_norm1.weight', 
'text_model.encoder.layers.16.layer_norm2.bias', 'text_model.encoder.layers.16.layer_norm2.weight', 'text_model.encoder.layers.16.mlp.fc1.bias', 'text_model.encoder.layers.16.mlp.fc1.weight', 'text_model.encoder.layers.16.mlp.fc2.bias', 'text_model.encoder.layers.16.mlp.fc2.weight', 'text_model.encoder.layers.16.self_attn.k_proj.bias', 'text_model.encoder.layers.16.self_attn.k_proj.weight', 'text_model.encoder.layers.16.self_attn.out_proj.bias', 'text_model.encoder.layers.16.self_attn.out_proj.weight', 'text_model.encoder.layers.16.self_attn.q_proj.bias', 'text_model.encoder.layers.16.self_attn.q_proj.weight', 'text_model.encoder.layers.16.self_attn.v_proj.bias', 'text_model.encoder.layers.16.self_attn.v_proj.weight', 'text_model.encoder.layers.17.layer_norm1.bias', 'text_model.encoder.layers.17.layer_norm1.weight', 'text_model.encoder.layers.17.layer_norm2.bias', 'text_model.encoder.layers.17.layer_norm2.weight', 'text_model.encoder.layers.17.mlp.fc1.bias', 'text_model.encoder.layers.17.mlp.fc1.weight', 'text_model.encoder.layers.17.mlp.fc2.bias', 'text_model.encoder.layers.17.mlp.fc2.weight', 'text_model.encoder.layers.17.self_attn.k_proj.bias', 'text_model.encoder.layers.17.self_attn.k_proj.weight', 'text_model.encoder.layers.17.self_attn.out_proj.bias', 'text_model.encoder.layers.17.self_attn.out_proj.weight', 'text_model.encoder.layers.17.self_attn.q_proj.bias', 'text_model.encoder.layers.17.self_attn.q_proj.weight', 'text_model.encoder.layers.17.self_attn.v_proj.bias', 'text_model.encoder.layers.17.self_attn.v_proj.weight', 'text_model.encoder.layers.18.layer_norm1.bias', 'text_model.encoder.layers.18.layer_norm1.weight', 'text_model.encoder.layers.18.layer_norm2.bias', 'text_model.encoder.layers.18.layer_norm2.weight', 'text_model.encoder.layers.18.mlp.fc1.bias', 'text_model.encoder.layers.18.mlp.fc1.weight', 'text_model.encoder.layers.18.mlp.fc2.bias', 'text_model.encoder.layers.18.mlp.fc2.weight', 'text_model.encoder.layers.18.self_attn.k_proj.bias', 'text_model.encoder.layers.18.self_attn.k_proj.weight', 'text_model.encoder.layers.18.self_attn.out_proj.bias', 'text_model.encoder.layers.18.self_attn.out_proj.weight', 'text_model.encoder.layers.18.self_attn.q_proj.bias', 'text_model.encoder.layers.18.self_attn.q_proj.weight', 'text_model.encoder.layers.18.self_attn.v_proj.bias', 'text_model.encoder.layers.18.self_attn.v_proj.weight', 'text_model.encoder.layers.19.layer_norm1.bias', 'text_model.encoder.layers.19.layer_norm1.weight', 'text_model.encoder.layers.19.layer_norm2.bias', 'text_model.encoder.layers.19.layer_norm2.weight', 'text_model.encoder.layers.19.mlp.fc1.bias', 'text_model.encoder.layers.19.mlp.fc1.weight', 'text_model.encoder.layers.19.mlp.fc2.bias', 'text_model.encoder.layers.19.mlp.fc2.weight', 'text_model.encoder.layers.19.self_attn.k_proj.bias', 'text_model.encoder.layers.19.self_attn.k_proj.weight', 'text_model.encoder.layers.19.self_attn.out_proj.bias', 'text_model.encoder.layers.19.self_attn.out_proj.weight', 'text_model.encoder.layers.19.self_attn.q_proj.bias', 'text_model.encoder.layers.19.self_attn.q_proj.weight', 'text_model.encoder.layers.19.self_attn.v_proj.bias', 'text_model.encoder.layers.19.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 
'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.20.layer_norm1.bias', 'text_model.encoder.layers.20.layer_norm1.weight', 'text_model.encoder.layers.20.layer_norm2.bias', 'text_model.encoder.layers.20.layer_norm2.weight', 'text_model.encoder.layers.20.mlp.fc1.bias', 'text_model.encoder.layers.20.mlp.fc1.weight', 'text_model.encoder.layers.20.mlp.fc2.bias', 'text_model.encoder.layers.20.mlp.fc2.weight', 'text_model.encoder.layers.20.self_attn.k_proj.bias', 'text_model.encoder.layers.20.self_attn.k_proj.weight', 'text_model.encoder.layers.20.self_attn.out_proj.bias', 'text_model.encoder.layers.20.self_attn.out_proj.weight', 'text_model.encoder.layers.20.self_attn.q_proj.bias', 'text_model.encoder.layers.20.self_attn.q_proj.weight', 'text_model.encoder.layers.20.self_attn.v_proj.bias', 'text_model.encoder.layers.20.self_attn.v_proj.weight', 'text_model.encoder.layers.21.layer_norm1.bias', 'text_model.encoder.layers.21.layer_norm1.weight', 'text_model.encoder.layers.21.layer_norm2.bias', 'text_model.encoder.layers.21.layer_norm2.weight', 'text_model.encoder.layers.21.mlp.fc1.bias', 'text_model.encoder.layers.21.mlp.fc1.weight', 'text_model.encoder.layers.21.mlp.fc2.bias', 'text_model.encoder.layers.21.mlp.fc2.weight', 'text_model.encoder.layers.21.self_attn.k_proj.bias', 'text_model.encoder.layers.21.self_attn.k_proj.weight', 'text_model.encoder.layers.21.self_attn.out_proj.bias', 'text_model.encoder.layers.21.self_attn.out_proj.weight', 'text_model.encoder.layers.21.self_attn.q_proj.bias', 'text_model.encoder.layers.21.self_attn.q_proj.weight', 'text_model.encoder.layers.21.self_attn.v_proj.bias', 'text_model.encoder.layers.21.self_attn.v_proj.weight', 'text_model.encoder.layers.22.layer_norm1.bias', 'text_model.encoder.layers.22.layer_norm1.weight', 'text_model.encoder.layers.22.layer_norm2.bias', 'text_model.encoder.layers.22.layer_norm2.weight', 'text_model.encoder.layers.22.mlp.fc1.bias', 'text_model.encoder.layers.22.mlp.fc1.weight', 'text_model.encoder.layers.22.mlp.fc2.bias', 'text_model.encoder.layers.22.mlp.fc2.weight', 'text_model.encoder.layers.22.self_attn.k_proj.bias', 'text_model.encoder.layers.22.self_attn.k_proj.weight', 'text_model.encoder.layers.22.self_attn.out_proj.bias', 'text_model.encoder.layers.22.self_attn.out_proj.weight', 'text_model.encoder.layers.22.self_attn.q_proj.bias', 'text_model.encoder.layers.22.self_attn.q_proj.weight', 'text_model.encoder.layers.22.self_attn.v_proj.bias', 'text_model.encoder.layers.22.self_attn.v_proj.weight', 'text_model.encoder.layers.23.layer_norm1.bias', 'text_model.encoder.layers.23.layer_norm1.weight', 'text_model.encoder.layers.23.layer_norm2.bias', 'text_model.encoder.layers.23.layer_norm2.weight', 'text_model.encoder.layers.23.mlp.fc1.bias', 'text_model.encoder.layers.23.mlp.fc1.weight', 'text_model.encoder.layers.23.mlp.fc2.bias', 'text_model.encoder.layers.23.mlp.fc2.weight', 'text_model.encoder.layers.23.self_attn.k_proj.bias', 'text_model.encoder.layers.23.self_attn.k_proj.weight', 'text_model.encoder.layers.23.self_attn.out_proj.bias', 
'text_model.encoder.layers.23.self_attn.out_proj.weight', 'text_model.encoder.layers.23.self_attn.q_proj.bias', 'text_model.encoder.layers.23.self_attn.q_proj.weight', 'text_model.encoder.layers.23.self_attn.v_proj.bias', 'text_model.encoder.layers.23.self_attn.v_proj.weight', 'text_model.encoder.layers.24.layer_norm1.bias', 'text_model.encoder.layers.24.layer_norm1.weight', 'text_model.encoder.layers.24.layer_norm2.bias', 'text_model.encoder.layers.24.layer_norm2.weight', 'text_model.encoder.layers.24.mlp.fc1.bias', 'text_model.encoder.layers.24.mlp.fc1.weight', 'text_model.encoder.layers.24.mlp.fc2.bias', 'text_model.encoder.layers.24.mlp.fc2.weight', 'text_model.encoder.layers.24.self_attn.k_proj.bias', 'text_model.encoder.layers.24.self_attn.k_proj.weight', 'text_model.encoder.layers.24.self_attn.out_proj.bias', 'text_model.encoder.layers.24.self_attn.out_proj.weight', 'text_model.encoder.layers.24.self_attn.q_proj.bias', 'text_model.encoder.layers.24.self_attn.q_proj.weight', 'text_model.encoder.layers.24.self_attn.v_proj.bias', 'text_model.encoder.layers.24.self_attn.v_proj.weight', 'text_model.encoder.layers.25.layer_norm1.bias', 'text_model.encoder.layers.25.layer_norm1.weight', 'text_model.encoder.layers.25.layer_norm2.bias', 'text_model.encoder.layers.25.layer_norm2.weight', 'text_model.encoder.layers.25.mlp.fc1.bias', 'text_model.encoder.layers.25.mlp.fc1.weight', 'text_model.encoder.layers.25.mlp.fc2.bias', 'text_model.encoder.layers.25.mlp.fc2.weight', 'text_model.encoder.layers.25.self_attn.k_proj.bias', 'text_model.encoder.layers.25.self_attn.k_proj.weight', 'text_model.encoder.layers.25.self_attn.out_proj.bias', 'text_model.encoder.layers.25.self_attn.out_proj.weight', 'text_model.encoder.layers.25.self_attn.q_proj.bias', 'text_model.encoder.layers.25.self_attn.q_proj.weight', 'text_model.encoder.layers.25.self_attn.v_proj.bias', 'text_model.encoder.layers.25.self_attn.v_proj.weight', 'text_model.encoder.layers.26.layer_norm1.bias', 'text_model.encoder.layers.26.layer_norm1.weight', 'text_model.encoder.layers.26.layer_norm2.bias', 'text_model.encoder.layers.26.layer_norm2.weight', 'text_model.encoder.layers.26.mlp.fc1.bias', 'text_model.encoder.layers.26.mlp.fc1.weight', 'text_model.encoder.layers.26.mlp.fc2.bias', 'text_model.encoder.layers.26.mlp.fc2.weight', 'text_model.encoder.layers.26.self_attn.k_proj.bias', 'text_model.encoder.layers.26.self_attn.k_proj.weight', 'text_model.encoder.layers.26.self_attn.out_proj.bias', 'text_model.encoder.layers.26.self_attn.out_proj.weight', 'text_model.encoder.layers.26.self_attn.q_proj.bias', 'text_model.encoder.layers.26.self_attn.q_proj.weight', 'text_model.encoder.layers.26.self_attn.v_proj.bias', 'text_model.encoder.layers.26.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 
'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 
'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_model.head.bias', 'text_model.head.weight']
223
- - This IS expected if you are initializing SiglipVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
224
- - This IS NOT expected if you are initializing SiglipVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
225
- 2025-02-15 03:02:47,668 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of SiglipVisionModel were initialized from the model checkpoint at google/siglip-so400m-patch14-384.
226
- If your task is similar to the task the model of the checkpoint was trained on, you can already use SiglipVisionModel for predictions without further training.
227
- 2025-02-15 03:02:47,864 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/preprocessor_config.json
228
- 2025-02-15 03:02:47,865 - image_processing_base.py:429 - from_dict - INFO - Image processor SiglipImageProcessor {
229
- "do_convert_rgb": null,
230
- "do_normalize": true,
231
- "do_rescale": true,
232
- "do_resize": true,
233
- "image_mean": [
234
- 0.5,
235
- 0.5,
236
- 0.5
237
- ],
238
- "image_processor_type": "SiglipImageProcessor",
239
- "image_std": [
240
- 0.5,
241
- 0.5,
242
- 0.5
243
- ],
244
- "processor_class": "SiglipProcessor",
245
- "resample": 3,
246
- "rescale_factor": 0.00392156862745098,
247
- "size": {
248
- "height": 384,
249
- "width": 384
250
- }
251
- }
252
-
253
- 2025-02-15 03:02:48,307 - configuration_utils.py:733 - _get_config_dict - INFO - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/config.json
254
- 2025-02-15 03:02:48,310 - configuration_utils.py:800 - from_dict - INFO - Model config Dinov2Config {
255
- "apply_layernorm": true,
256
- "architectures": [
257
- "Dinov2Model"
258
- ],
259
- "attention_probs_dropout_prob": 0.0,
260
- "drop_path_rate": 0.0,
261
- "hidden_act": "gelu",
262
- "hidden_dropout_prob": 0.0,
263
- "hidden_size": 1536,
264
- "image_size": 518,
265
- "initializer_range": 0.02,
266
- "layer_norm_eps": 1e-06,
267
- "layerscale_value": 1.0,
268
- "mlp_ratio": 4,
269
- "model_type": "dinov2",
270
- "num_attention_heads": 24,
271
- "num_channels": 3,
272
- "num_hidden_layers": 40,
273
- "out_features": [
274
- "stage40"
275
- ],
276
- "out_indices": [
277
- 40
278
- ],
279
- "patch_size": 14,
280
- "qkv_bias": true,
281
- "reshape_hidden_states": true,
282
- "stage_names": [
283
- "stem",
284
- "stage1",
285
- "stage2",
286
- "stage3",
287
- "stage4",
288
- "stage5",
289
- "stage6",
290
- "stage7",
291
- "stage8",
292
- "stage9",
293
- "stage10",
294
- "stage11",
295
- "stage12",
296
- "stage13",
297
- "stage14",
298
- "stage15",
299
- "stage16",
300
- "stage17",
301
- "stage18",
302
- "stage19",
303
- "stage20",
304
- "stage21",
305
- "stage22",
306
- "stage23",
307
- "stage24",
308
- "stage25",
309
- "stage26",
310
- "stage27",
311
- "stage28",
312
- "stage29",
313
- "stage30",
314
- "stage31",
315
- "stage32",
316
- "stage33",
317
- "stage34",
318
- "stage35",
319
- "stage36",
320
- "stage37",
321
- "stage38",
322
- "stage39",
323
- "stage40"
324
- ],
325
- "torch_dtype": "float32",
326
- "transformers_version": "4.43.1",
327
- "use_swiglu_ffn": true
328
- }
329
-
330
- 2025-02-15 03:02:48,310 - modeling_utils.py:3621 - from_pretrained - INFO - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/model.safetensors
331
- 2025-02-15 03:02:48,895 - modeling_utils.py:4450 - _load_pretrained_model - INFO - All model checkpoint weights were used when initializing Dinov2Model.
332
-
333
- 2025-02-15 03:02:48,895 - modeling_utils.py:4458 - _load_pretrained_model - INFO - All the weights of Dinov2Model were initialized from the model checkpoint at facebook/dinov2-giant.
334
- If your task is similar to the task the model of the checkpoint was trained on, you can already use Dinov2Model for predictions without further training.
335
- 2025-02-15 03:02:49,091 - image_processing_base.py:375 - get_image_processor_dict - INFO - loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--dinov2-giant/snapshots/611a9d42f2335e0f921f1e313ad3c1b7178d206d/preprocessor_config.json
336
- 2025-02-15 03:02:49,094 - image_processing_base.py:429 - from_dict - INFO - Image processor BitImageProcessor {
337
- "crop_size": {
338
- "height": 378,
339
- "width": 378
340
- },
341
- "do_center_crop": true,
342
- "do_convert_rgb": true,
343
- "do_normalize": true,
344
- "do_rescale": true,
345
- "do_resize": true,
346
- "image_mean": [
347
- 0.485,
348
- 0.456,
349
- 0.406
350
- ],
351
- "image_processor_type": "BitImageProcessor",
352
- "image_std": [
353
- 0.229,
354
- 0.224,
355
- 0.225
356
- ],
357
- "resample": 3,
358
- "rescale_factor": 0.00392156862745098,
359
- "size": {
360
- "shortest_edge": 378
361
- }
362
- }
363
-
364
- 2025-02-15 03:02:50,031 - finetune_llama.py:1239 - train - INFO - Total params: 3264865280
365
- 2025-02-15 03:02:50,031 - finetune_llama.py:1240 - train - INFO - Trainable params: 12589056
366
- 2025-02-15 03:02:50,031 - finetune_llama.py:1241 - train - INFO - LM head params: 394002432
367
- 2025-02-15 03:02:52,464 - trainer_callback.py:423 - add_callback - WARNING - You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
368
- :DefaultFlowCallback
369
- TensorBoardCallback
370
- 2025-02-15 03:02:52,464 - trainer.py:648 - __init__ - INFO - Using auto half precision backend
371
- 2025-02-15 03:02:52,465 - trainer.py:2526 - _load_from_checkpoint - INFO - Loading model from ./checkpoints/cambrian_llama3_2/checkpoint-4.
372
- 2025-02-15 03:02:55,358 - trainer.py:2134 - _inner_training_loop - INFO - ***** Running training *****
373
- 2025-02-15 03:02:55,358 - trainer.py:2135 - _inner_training_loop - INFO - Num examples = 540
374
- 2025-02-15 03:02:55,358 - trainer.py:2136 - _inner_training_loop - INFO - Num Epochs = 2
375
- 2025-02-15 03:02:55,358 - trainer.py:2137 - _inner_training_loop - INFO - Instantaneous batch size per device = 1
376
- 2025-02-15 03:02:55,358 - trainer.py:2140 - _inner_training_loop - INFO - Total train batch size (w. parallel, distributed & accumulation) = 1
377
- 2025-02-15 03:02:55,358 - trainer.py:2141 - _inner_training_loop - INFO - Gradient Accumulation steps = 1
378
- 2025-02-15 03:02:55,358 - trainer.py:2142 - _inner_training_loop - INFO - Total optimization steps = 1,080
379
- 2025-02-15 03:02:55,359 - trainer.py:2143 - _inner_training_loop - INFO - Number of trainable parameters = 406,591,488
380
- 2025-02-15 03:02:55,359 - trainer.py:2165 - _inner_training_loop - INFO - Continuing training from checkpoint, will skip to saved global_step
381
- 2025-02-15 03:02:55,359 - trainer.py:2166 - _inner_training_loop - INFO - Continuing training from epoch 0
382
- 2025-02-15 03:02:55,359 - trainer.py:2167 - _inner_training_loop - INFO - Continuing training from global step 4
383
- 2025-02-15 03:02:55,359 - trainer.py:2169 - _inner_training_loop - INFO - Will skip the first 0 epochs then the first 4 batches in the first epoch.
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8ca82d50a27c49e4b6b579372386820290395eb6a9384bbeba1d94c653c460a
3
+ size 37091