maikezu committed on
Commit
45922c4
·
1 Parent(s): f7ac02a

Upload model weights

Browse files
config.json ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../mw_checkpoints/speechlmm-v1/v1_01_ft/l",
3
+ "add_all_multimodal_tokens": true,
4
+ "add_lm_head": true,
5
+ "align_text_to_audio": false,
6
+ "architectures": [
7
+ "SpeechLmmModel"
8
+ ],
9
+ "audio_adapter": {
10
+ "add_cross_attention": true,
11
+ "attention_probs_dropout_prob": 0.1,
12
+ "compress_factor": 2,
13
+ "cross_attention_every_n_layers": 1,
14
+ "hidden_act": "gelu",
15
+ "hidden_dropout_prob": 0.1,
16
+ "hidden_size": 768,
17
+ "hop_size": 0,
18
+ "initializer_range": 0.02,
19
+ "input_dim": 1024,
20
+ "intermediate_size": 3072,
21
+ "layer_norm_eps": 1e-12,
22
+ "model_type": "qformer",
23
+ "num_attention_heads": 12,
24
+ "num_hidden_layers": 4,
25
+ "num_queries": 1,
26
+ "output_dim": 4096,
27
+ "transformers_version": "4.45.0",
28
+ "triplet_loss": false,
29
+ "window_size_in_seconds": 0.3333333333333
30
+ },
31
+ "audio_encoder": {
32
+ "_name_or_path": "/net/tscratch/people/plgzuefle/meetween_code/pretrained_components/facebook/seamless-m4t-v2-large",
33
+ "activation_dropout": 0.0,
34
+ "activation_function": "relu",
35
+ "adaptor_dropout": 0.1,
36
+ "adaptor_kernel_size": 8,
37
+ "adaptor_stride": 8,
38
+ "add_adapter": true,
39
+ "architectures": [
40
+ "SeamlessM4Tv2SpeechEncoder"
41
+ ],
42
+ "attention_dropout": 0.1,
43
+ "bos_token_id": 2,
44
+ "char_vocab_size": 10943,
45
+ "conv_depthwise_kernel_size": 31,
46
+ "decoder_attention_heads": 16,
47
+ "decoder_ffn_dim": 8192,
48
+ "decoder_layerdrop": 0.05,
49
+ "decoder_layers": 24,
50
+ "decoder_start_token_id": 3,
51
+ "dropout": 0.1,
52
+ "encoder_attention_heads": 16,
53
+ "encoder_ffn_dim": 8192,
54
+ "encoder_layerdrop": 0.05,
55
+ "encoder_layers": 24,
56
+ "eos_token_id": 3,
57
+ "feature_projection_input_dim": 160,
58
+ "hidden_size": 1024,
59
+ "initializer_range": 0.02,
60
+ "is_encoder_decoder": true,
61
+ "lang_embed_dim": 256,
62
+ "layer_norm_eps": 1e-05,
63
+ "leaky_relu_slope": 0.1,
64
+ "left_max_position_embeddings": 64,
65
+ "max_new_tokens": 256,
66
+ "max_position_embeddings": 4096,
67
+ "model_type": "seamless_m4t_v2",
68
+ "num_adapter_layers": 1,
69
+ "num_attention_heads": 16,
70
+ "num_hidden_layers": 24,
71
+ "pad_token_id": 0,
72
+ "position_embeddings_type": "relative_key",
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "resblock_kernel_sizes": [
91
+ 3,
92
+ 7,
93
+ 11
94
+ ],
95
+ "right_max_position_embeddings": 8,
96
+ "sampling_rate": 16000,
97
+ "scale_embedding": true,
98
+ "speech_encoder_attention_heads": 16,
99
+ "speech_encoder_chunk_size": 20000,
100
+ "speech_encoder_dropout": 0.0,
101
+ "speech_encoder_hidden_act": "swish",
102
+ "speech_encoder_intermediate_size": 4096,
103
+ "speech_encoder_layerdrop": 0.1,
104
+ "speech_encoder_layers": 24,
105
+ "speech_encoder_left_chunk_num": 128,
106
+ "spkr_embed_dim": 256,
107
+ "t2u_bos_token_id": 0,
108
+ "t2u_decoder_attention_heads": 16,
109
+ "t2u_decoder_ffn_dim": 8192,
110
+ "t2u_decoder_layers": 6,
111
+ "t2u_encoder_attention_heads": 16,
112
+ "t2u_encoder_ffn_dim": 8192,
113
+ "t2u_encoder_layers": 6,
114
+ "t2u_eos_token_id": 2,
115
+ "t2u_max_position_embeddings": 4096,
116
+ "t2u_pad_token_id": 1,
117
+ "t2u_variance_pred_dropout": 0.5,
118
+ "t2u_variance_predictor_embed_dim": 1024,
119
+ "t2u_variance_predictor_hidden_dim": 256,
120
+ "t2u_variance_predictor_kernel_size": 3,
121
+ "t2u_vocab_size": 10082,
122
+ "torch_dtype": "float32",
123
+ "transformers_version": "4.45.0",
124
+ "unit_embed_dim": 1280,
125
+ "unit_hifi_gan_vocab_size": 10000,
126
+ "upsample_initial_channel": 512,
127
+ "upsample_kernel_sizes": [
128
+ 11,
129
+ 8,
130
+ 8,
131
+ 4,
132
+ 4
133
+ ],
134
+ "upsample_rates": [
135
+ 5,
136
+ 4,
137
+ 4,
138
+ 2,
139
+ 2
140
+ ],
141
+ "use_cache": true,
142
+ "var_pred_dropout": 0.5,
143
+ "variance_predictor_kernel_size": 3,
144
+ "vocab_size": 256102,
145
+ "vocoder_num_langs": 36,
146
+ "vocoder_num_spkrs": 200,
147
+ "vocoder_offset": 4
148
+ },
149
+ "chunk_encoding_strategy": "loop",
150
+ "chunk_overlap_in_seconds": 1,
151
+ "chunk_size_in_seconds": 15,
152
+ "codebook_weights": [
153
+ 1,
154
+ 1,
155
+ 1,
156
+ 1,
157
+ 1,
158
+ 1,
159
+ 1,
160
+ 1,
161
+ 1,
162
+ 1,
163
+ 1,
164
+ 1,
165
+ 1,
166
+ 1,
167
+ 1,
168
+ 1,
169
+ 1,
170
+ 1,
171
+ 1,
172
+ 1,
173
+ 1,
174
+ 1,
175
+ 1,
176
+ 1,
177
+ 1,
178
+ 1,
179
+ 1,
180
+ 1,
181
+ 1,
182
+ 1,
183
+ 1,
184
+ 1
185
+ ],
186
+ "conversation_version": null,
187
+ "hidden_size": 4096,
188
+ "mm_use_audio_start_end": false,
189
+ "mm_use_im_start_end": false,
190
+ "mm_use_video_start_end": false,
191
+ "model_type": "speechlmm",
192
+ "text_decoder": {
193
+ "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
194
+ "architectures": [
195
+ "LlamaForCausalLM"
196
+ ],
197
+ "attention_bias": false,
198
+ "attention_dropout": 0.0,
199
+ "bos_token_id": 128000,
200
+ "conversation_version": "llama_3_1",
201
+ "eos_token_id": [
202
+ 128001,
203
+ 128008,
204
+ 128009
205
+ ],
206
+ "head_dim": 128,
207
+ "hidden_act": "silu",
208
+ "hidden_size": 4096,
209
+ "initializer_range": 0.02,
210
+ "intermediate_size": 14336,
211
+ "max_position_embeddings": 131072,
212
+ "mlp_bias": false,
213
+ "model_type": "llama",
214
+ "num_attention_heads": 32,
215
+ "num_hidden_layers": 32,
216
+ "num_key_value_heads": 8,
217
+ "pretraining_tp": 1,
218
+ "rms_norm_eps": 1e-05,
219
+ "rope_scaling": {
220
+ "factor": 8.0,
221
+ "high_freq_factor": 4.0,
222
+ "low_freq_factor": 1.0,
223
+ "original_max_position_embeddings": 8192,
224
+ "rope_type": "llama3"
225
+ },
226
+ "rope_theta": 500000.0,
227
+ "tie_word_embeddings": false,
228
+ "torch_dtype": "bfloat16",
229
+ "transformers_version": "4.45.0",
230
+ "use_cache": true,
231
+ "vocab_size": 128256
232
+ },
233
+ "tokenizer_padding_side": "right",
234
+ "torch_dtype": "bfloat16",
235
+ "transformers_version": "4.45.0",
236
+ "video_adapter": {
237
+ "force_input_projection": true,
238
+ "force_output_projection": true,
239
+ "hidden_layers": 4,
240
+ "hidden_size": 4096,
241
+ "input_dim": 768,
242
+ "model_type": "mlp",
243
+ "output_dim": 4096,
244
+ "residual_type": "interpolation",
245
+ "transformers_version": "4.45.0"
246
+ },
247
+ "video_encoder": {
248
+ "_name_or_path": "/net/tscratch/people/plgzuefle/meetween_code/pretrained_components/vsr_trlrs3vox2_base.pth",
249
+ "a_upsample_ratio": 1,
250
+ "adim": 768,
251
+ "aheads": 12,
252
+ "cnn_module_kernel": 31,
253
+ "ctc_type": "builtin",
254
+ "ddim": 768,
255
+ "dheads": 12,
256
+ "dlayers": 6,
257
+ "dropout_rate": 0.1,
258
+ "dunits": 3072,
259
+ "elayers": 12,
260
+ "eunits": 3072,
261
+ "hidden_size": 768,
262
+ "lsm_weight": 0.1,
263
+ "macaron_style": true,
264
+ "mtlalpha": 0.1,
265
+ "rel_pos_type": "latest",
266
+ "relu_type": "swish",
267
+ "transformer_attn_dropout_rate": 0.1,
268
+ "transformer_encoder_attn_layer_type": "rel_mha",
269
+ "transformer_input_layer": "conv3d",
270
+ "transformer_length_normalized_loss": false,
271
+ "transformers_version": "4.45.0",
272
+ "use_cnn_module": true,
273
+ "zero_triu": false
274
+ },
275
+ "vision_patch_merge_type": "flat",
276
+ "vision_select_feature": "patch",
277
+ "vision_select_layer": -1,
278
+ "vision_use_patch_token": true
279
+ }
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.45.0"
4
+ }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38709cdd94804771f259b1b48e47371423becfcba0efc66db0d74c259ebded26
3
+ size 4976733000
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b2bdabf270ae51d0ce75245d5fd02938428ae564c16014b515c7f7848bcc438
3
+ size 4999804696
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22761736de181dd5841e55c275f769ce1b9890c6d3b824c0c3d215cabd472b28
3
+ size 4915918080
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0349047229d05dadb20bce042052647a5fa52d20463fff1c9d07591d11e815d3
3
+ size 3061868136
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff