Safetensors
idefics3
Noé BRANDOLINI committed
Commit 543c848 · verified · 1 parent: 804fe28

Update config.json

patch padding config

Files changed (1)
  1. config.json +16 -146
config.json CHANGED
@@ -1,68 +1,28 @@
 {
+  "_name_or_path": "HuggingFaceTB/SmolVLM-Instruct",
   "architectures": [
     "Idefics3ForConditionalGeneration"
   ],
   "image_seq_len": 81,
   "image_token_id": 49153,
   "model_type": "idefics3",
+  "pad_token_id": 128002,
   "scale_factor": 3,
   "text_config": {
-    "_attn_implementation_autoset": false,
     "_flash_attn_2_enabled": true,
     "_name_or_path": "/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_324_opt_400/unwrapped_model",
-    "add_cross_attention": false,
     "architectures": [
       "VLlama3ForCausalLM"
     ],
-    "attention_bias": false,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
     "bos_token_id": 0,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
     "eos_token_id": 0,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
     "head_dim": 64,
-    "hidden_act": "silu",
     "hidden_size": 2048,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_range": 0.02,
     "intermediate_size": 8192,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "length_penalty": 1.0,
-    "max_length": 20,
     "max_position_embeddings": 16384,
-    "min_length": 0,
-    "mlp_bias": false,
     "model_type": "llama",
     "neftune_noise_alpha": 0.0,
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 32,
-    "num_beam_groups": 1,
-    "num_beams": 1,
     "num_hidden_layers": 24,
-    "num_key_value_heads": 32,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
     "pad_token_id": 2,
     "perceiver_config": {
       "_attn_implementation_autoset": false,
@@ -137,131 +97,41 @@
       "typical_p": 1.0,
       "use_bfloat16": false
     },
-    "prefix": null,
-    "pretraining_tp": 1,
-    "problem_type": null,
-    "pruned_heads": {},
     "qk_layer_norms": false,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
     "rms_norm_eps": 1e-05,
-    "rope_scaling": null,
     "rope_theta": 273768.0,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": false,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
     "torch_dtype": "bfloat16",
-    "torchscript": false,
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "use_cache": true,
     "use_resampler": false,
     "vocab_size": 49155
   },
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.46.0",
   "transformers.js_config": {
-    "kv_cache_dtype": {
-      "q4f16": "float16",
-      "fp16": "float16"
-    },
     "dtype": {
+      "decoder_model_merged": "q4",
       "embed_tokens": "auto",
-      "vision_encoder": "auto",
-      "decoder_model_merged": "q4"
+      "vision_encoder": "auto"
     },
-    "use_external_data_format": {
-      "decoder_model_merged.onnx": true,
-      "decoder_model_merged_fp16.onnx": true
+    "kv_cache_dtype": {
+      "fp16": "float16",
+      "q4f16": "float16"
     }
   },
-  "use_cache": true,
+  "transformers_version": "4.48.0",
+  "use_cache": false,
   "vision_config": {
-    "size": {"longest_edge": 1920},
-    "max_image_size": {"longest_edge": 384},
-    "_attn_implementation_autoset": false,
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu_pytorch_tanh",
-    "hidden_size": 1152,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
     "image_size": 384,
-    "initializer_range": 0.02,
     "intermediate_size": 4304,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
+    "max_image_size": {
+      "longest_edge": 384
     },
-    "layer_norm_eps": 1e-06,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "idefics3",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
+    "model_type": "idefics3_vision",
     "num_hidden_layers": 27,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
     "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": false,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "typical_p": 1.0,
-    "use_bfloat16": false
+    "size": {
+      "longest_edge": 1920
+    },
+    "tie_word_embeddings": false
   },
   "vocab_size": 49155
 }
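
In short, the commit adds a top-level "pad_token_id" of 128002, fixes the vision tower's "model_type" to "idefics3_vision", nests the image sizing keys ("size", "max_image_size") inside "vision_config", prunes the generation defaults that newer transformers releases no longer serialize, and bumps "transformers_version" to 4.48.0. A minimal sketch of how the patched config can be checked after download; the repo id below is a hypothetical placeholder, since this page does not show the repository's full name:

from transformers import AutoConfig

# "<user>/<repo>" is a placeholder for this repository's actual id.
config = AutoConfig.from_pretrained("<user>/<repo>")

print(config.pad_token_id)              # 128002, the top-level pad id added here
print(config.text_config.pad_token_id)  # 2, the inner text model's pad id (unchanged)
print(config.vision_config.model_type)  # "idefics3_vision" after this patch

Note that the new top-level pad_token_id (128002) sits alongside, and differs from, the text model's internal pad_token_id of 2, which this commit leaves untouched.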