Upload model

#1
by ArthurZ HF staff - opened
config.json CHANGED
@@ -1,227 +1,321 @@
1
  {
2
- "_name_or_path": "jukebox-5b-lyrics",
 
3
  "architectures": [
4
  "JukeboxModel"
5
  ],
6
- "cond_c_res": [
7
- 0,
8
- 1,
9
- 1
10
- ],
11
- "cond_depth": [
12
- 3,
13
- 16,
14
- 16
15
- ],
16
- "cond_dilation_cycle": [
17
- null,
18
- 8,
19
- 8
20
- ],
21
- "cond_dilation_growth_rate": [
22
- 1,
23
- 3,
24
- 3
25
- ],
26
- "cond_downs_t": [
27
- 3,
28
- 2,
29
- 2
30
- ],
31
- "cond_m_conv": 1,
32
- "cond_res_scale": [
33
- null,
34
- true,
35
- false
36
- ],
37
- "cond_strides_t": [
38
- 2,
39
- 2,
40
- 2
41
- ],
42
- "cond_width": [
43
- 128,
44
- 1024,
45
- 1024
46
- ],
47
- "cond_zero_out": false,
48
- "copy_input": false,
49
- "fp16_params": true,
50
  "hop_fraction": [
51
  0.125,
52
  0.5,
53
  0.5
54
  ],
55
  "init_std": 0.2,
56
- "lyric_conditioning": [
57
- true,
58
- false,
59
- false
60
- ],
61
  "max_duration": 600.0,
62
  "max_nb_genres": 5,
63
- "merged_decoder": [
64
- true,
65
- false,
66
- false
67
- ],
68
  "metadata_conditioning": true,
69
- "metadata_dims": [
70
- [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  120,
72
  4111
73
  ],
74
- [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  120,
76
  4111
77
  ],
78
- [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  120,
80
  4111
81
- ]
82
- ],
83
- "min_duration": 23.8,
84
- "model_type": "jukebox",
85
- "nb_priors": 3,
86
- "nb_relevant_lyric_tokens": [
87
- 512,
88
- 0,
89
- 0
90
- ],
91
- "lyric_enc_attn_dropout": 0.0,
92
- "lyric_enc_attn_order": [
93
- 2,
94
- 0,
95
- 0
96
- ],
97
- "lyric_enc_blocks": 32,
98
- "lyric_enc_depth": [
99
- 18,
100
- 3,
101
- 3
102
- ],
103
- "lyric_enc_emb_dropout": 0.0,
104
- "lyric_enc_heads": 4,
105
- "lyric_enc_init_scale": [
106
- 0.1,
107
- 0.4,
108
- 0.4
109
- ],
110
- "lyric_enc_loss_fraction": [
111
- 0.4,
112
- 0.0,
113
- 0.0
114
- ],
115
- "lyric_enc_m_attn": 0.25,
116
- "lyric_enc_m_mlp": 1.0,
117
- "lyric_enc_n_vocab": 80,
118
- "lyric_enc_pos_init": false,
119
- "lyric_enc_res_scale": false,
120
- "lyric_enc_resid_dropout": 0.0,
121
- "lyric_enc_spread": null,
122
- "lyric_enc_width": [
123
- 1280,
124
- 128,
125
- 128
126
- ],
127
- "lyric_enc_zero_out": false,
128
- "prior_alignment_head": [
129
- 2,
130
- null,
131
- null
132
- ],
133
- "prior_alignment_layer": [
134
- 68,
135
- null,
136
- null
137
- ],
138
- "prior_attn_dropout": 0,
139
- "prior_attn_order": [
140
- 10,
141
- 2,
142
- 2
143
- ],
144
- "prior_blocks": 128,
145
- "prior_depth": [
146
- 79,
147
- 72,
148
- 72
149
- ],
150
- "prior_emb_dropout": 0,
151
- "prior_init_scale": [
152
- 0.2,
153
- 1,
154
- 1
155
- ],
156
- "prior_latent_dim": 2048,
157
- "prior_m_attn": 0.25,
158
- "prior_n_ctx": [
159
- 8192,
160
- 8192,
161
- 8192
162
- ],
163
- "prior_n_heads": [
164
- 8,
165
- 1,
166
- 1
167
- ],
168
- "prior_pos_init": false,
169
- "prior_res_scale": false,
170
- "prior_resid_dropout": 0,
171
- "prior_spread": null,
172
- "prior_width": [
173
- 4800,
174
- 1920,
175
- 1920
176
- ],
177
- "prior_zero_out": false,
178
- "sample_length": 1058304,
179
  "sampling_rate": 44100,
180
- "single_enc_dec": [
181
- false,
182
- false,
183
- false
184
- ],
185
  "timing_dims": 128,
186
  "torch_dtype": "float32",
187
- "transformers_version": "4.22.0.dev0",
188
- "vqvae_codebook_dimension": 2048,
189
- "vqvae_commit": 0.02,
190
- "vqvae_conv_block_depth": 4,
191
- "vqvae_conv_block_width": 32,
192
- "vqvae_depth": 4,
193
- "vqvae_dilation_cycle": null,
194
- "vqvae_dilation_growth_rate": 3,
195
- "vqvae_downs_t": [
196
- 3,
197
- 2,
198
- 2
199
- ],
200
- "vqvae_emmbedding_width": 64,
201
- "vqvae_levels": 3,
202
- "vqvae_lmu": 0.99,
203
- "vqvae_m_conv": 1,
204
- "vqvae_multipliers": [
205
- 2,
206
- 1,
207
- 1
208
- ],
209
- "vqvae_music_tokens_shapes": [
210
- [
211
- 8268
212
  ],
213
- [
214
- 33072
 
 
 
 
 
215
  ],
216
- [
217
- 132288
218
- ]
219
- ],
220
- "vqvae_reverse_decoder_dilation": 1,
221
- "vqvae_strides_t": [
222
- 2,
223
- 2,
224
- 2
225
- ],
226
- "vqvae_width": 64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  }
 
1
  {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/home/arthur_huggingface_co/transformers/jukebox-5b-lyrics-converted",
4
  "architectures": [
5
  "JukeboxModel"
6
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  "hop_fraction": [
8
  0.125,
9
  0.5,
10
  0.5
11
  ],
12
  "init_std": 0.2,
 
 
 
 
 
13
  "max_duration": 600.0,
14
  "max_nb_genres": 5,
 
 
 
 
 
15
  "metadata_conditioning": true,
16
+ "min_duration": 23.8,
17
+ "model_type": "jukebox",
18
+ "nb_priors": 3,
19
+ "prior_0": {
20
+ "act_fn": "quick_gelu",
21
+ "alignment_head": 2,
22
+ "alignment_layer": 68,
23
+ "attention_multiplier": 0.25,
24
+ "attention_pattern": "large_separated_enc_dec_w_lyrics",
25
+ "attn_dropout": 0,
26
+ "attn_res_scale": false,
27
+ "blocks": 64,
28
+ "conv_res_scale": null,
29
+ "depth": 79,
30
+ "emb_dropout": 0,
31
+ "embed_dim": 2048,
32
+ "encoder_attention_multiplier": 0.25,
33
+ "encoder_attention_pattern": "RawColumnPreviousRowAttention",
34
+ "encoder_attn_dropout": 0.0,
35
+ "encoder_attn_res_scale": false,
36
+ "encoder_blocks": 32,
37
+ "encoder_depth": 18,
38
+ "encoder_emb_dropout": 0.0,
39
+ "encoder_heads": 4,
40
+ "encoder_init_scale": 0.1,
41
+ "encoder_loss_fraction": [
42
+ 0.4,
43
+ 0.0,
44
+ 0.0
45
+ ],
46
+ "encoder_mlp_multiplier": 1.0,
47
+ "encoder_n_vocab": 80,
48
+ "encoder_resid_dropout": 0.0,
49
+ "encoder_spread": null,
50
+ "encoder_width": 1280,
51
+ "encoder_zero_out": false,
52
+ "init_scale": 0.2,
53
+ "is_encoder_decoder": false,
54
+ "lyric_conditioning": true,
55
+ "mask": true,
56
+ "max_duration": 600.0,
57
+ "max_nb_genres": 5,
58
+ "merged_decoder": true,
59
+ "metadata_conditioning": true,
60
+ "metadata_dims": [
61
  120,
62
  4111
63
  ],
64
+ "min_duration": 23.8,
65
+ "mlp_multiplier": 1.0,
66
+ "model_type": "jukebox",
67
+ "n_ctx": 8192,
68
+ "n_heads": 8,
69
+ "nb_relevant_lyric_tokens": 512,
70
+ "res_conv_depth": null,
71
+ "res_conv_width": null,
72
+ "res_convolution_multiplier": null,
73
+ "res_dilation_cycle": null,
74
+ "res_dilation_growth_rate": null,
75
+ "res_downs_t": [
76
+ 3,
77
+ 2,
78
+ 2
79
+ ],
80
+ "res_strides_t": [
81
+ 2,
82
+ 2,
83
+ 2
84
+ ],
85
+ "resid_dropout": 0,
86
+ "sampling_rate": 44100,
87
+ "spread": null,
88
+ "timing_dims": 128,
89
+ "transformers_version": "4.25.0.dev0",
90
+ "width": 4800,
91
+ "zero_out": false
92
+ },
93
+ "prior_1": {
94
+ "act_fn": "quick_gelu",
95
+ "alignment_head": null,
96
+ "alignment_layer": null,
97
+ "attention_multiplier": 0.25,
98
+ "attention_pattern": "RawColumnPreviousRowAttention",
99
+ "attn_dropout": 0,
100
+ "attn_res_scale": false,
101
+ "blocks": 64,
102
+ "conv_res_scale": true,
103
+ "depth": 72,
104
+ "emb_dropout": 0,
105
+ "embed_dim": 2048,
106
+ "encoder_attention_multiplier": null,
107
+ "encoder_attention_pattern": null,
108
+ "encoder_attn_dropout": null,
109
+ "encoder_attn_res_scale": false,
110
+ "encoder_blocks": null,
111
+ "encoder_depth": null,
112
+ "encoder_emb_dropout": null,
113
+ "encoder_heads": null,
114
+ "encoder_init_scale": null,
115
+ "encoder_loss_fraction": [
116
+ 0.4,
117
+ 0.0,
118
+ 0.0
119
+ ],
120
+ "encoder_mlp_multiplier": null,
121
+ "encoder_n_vocab": 79,
122
+ "encoder_resid_dropout": null,
123
+ "encoder_spread": null,
124
+ "encoder_width": null,
125
+ "encoder_zero_out": null,
126
+ "init_scale": 1,
127
+ "is_encoder_decoder": false,
128
+ "lyric_conditioning": true,
129
+ "mask": true,
130
+ "max_duration": 600.0,
131
+ "max_nb_genres": 5,
132
+ "merged_decoder": false,
133
+ "metadata_conditioning": true,
134
+ "metadata_dims": [
135
  120,
136
  4111
137
  ],
138
+ "min_duration": 23.8,
139
+ "mlp_multiplier": 1.0,
140
+ "model_type": "jukebox",
141
+ "n_ctx": 8192,
142
+ "n_heads": 1,
143
+ "nb_relevant_lyric_tokens": 0,
144
+ "res_conv_depth": 16,
145
+ "res_conv_width": 1024,
146
+ "res_convolution_multiplier": 1,
147
+ "res_dilation_cycle": 8,
148
+ "res_dilation_growth_rate": 3,
149
+ "res_downs_t": [
150
+ 3,
151
+ 2,
152
+ 2
153
+ ],
154
+ "res_strides_t": [
155
+ 2,
156
+ 2,
157
+ 2
158
+ ],
159
+ "resid_dropout": 0,
160
+ "sampling_rate": 44100,
161
+ "spread": null,
162
+ "timing_dims": 128,
163
+ "transformers_version": "4.25.0.dev0",
164
+ "width": 1920,
165
+ "zero_out": false
166
+ },
167
+ "prior_2": {
168
+ "act_fn": "quick_gelu",
169
+ "alignment_head": null,
170
+ "alignment_layer": null,
171
+ "attention_multiplier": 0.25,
172
+ "attention_pattern": "RawColumnPreviousRowAttention",
173
+ "attn_dropout": 0,
174
+ "attn_res_scale": false,
175
+ "blocks": 64,
176
+ "conv_res_scale": false,
177
+ "depth": 72,
178
+ "emb_dropout": 0,
179
+ "embed_dim": 2048,
180
+ "encoder_attention_multiplier": null,
181
+ "encoder_attention_pattern": null,
182
+ "encoder_attn_dropout": null,
183
+ "encoder_attn_res_scale": false,
184
+ "encoder_blocks": null,
185
+ "encoder_depth": null,
186
+ "encoder_emb_dropout": null,
187
+ "encoder_heads": null,
188
+ "encoder_init_scale": null,
189
+ "encoder_loss_fraction": [
190
+ 0.4,
191
+ 0.0,
192
+ 0.0
193
+ ],
194
+ "encoder_mlp_multiplier": null,
195
+ "encoder_n_vocab": 79,
196
+ "encoder_resid_dropout": null,
197
+ "encoder_spread": null,
198
+ "encoder_width": null,
199
+ "encoder_zero_out": null,
200
+ "init_scale": 1,
201
+ "is_encoder_decoder": false,
202
+ "lyric_conditioning": false,
203
+ "mask": true,
204
+ "max_duration": 600.0,
205
+ "max_nb_genres": 5,
206
+ "merged_decoder": false,
207
+ "metadata_conditioning": true,
208
+ "metadata_dims": [
209
  120,
210
  4111
211
+ ],
212
+ "min_duration": 23.8,
213
+ "mlp_multiplier": 1.0,
214
+ "model_type": "jukebox",
215
+ "n_ctx": 8192,
216
+ "n_heads": 1,
217
+ "nb_relevant_lyric_tokens": 0,
218
+ "res_conv_depth": 16,
219
+ "res_conv_width": 1024,
220
+ "res_convolution_multiplier": 1,
221
+ "res_dilation_cycle": 8,
222
+ "res_dilation_growth_rate": 3,
223
+ "res_downs_t": [
224
+ 3,
225
+ 2,
226
+ 2
227
+ ],
228
+ "res_strides_t": [
229
+ 2,
230
+ 2,
231
+ 2
232
+ ],
233
+ "resid_dropout": 0,
234
+ "sampling_rate": 44100,
235
+ "spread": null,
236
+ "timing_dims": 128,
237
+ "transformers_version": "4.25.0.dev0",
238
+ "width": 1920,
239
+ "zero_out": false
240
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  "sampling_rate": 44100,
 
 
 
 
 
242
  "timing_dims": 128,
243
  "torch_dtype": "float32",
244
+ "transformers_version": null,
245
+ "vqvae_config": {
246
+ "act_fn": "relu",
247
+ "codebook_dimension": 2048,
248
+ "commit": 0.02,
249
+ "conv_input_shape": 1,
250
+ "conv_res_scale": false,
251
+ "embed_dim": 64,
252
+ "hop_fraction": [
253
+ 0.125,
254
+ 0.5,
255
+ 0.5
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  ],
257
+ "levels": 3,
258
+ "lmu": 0.99,
259
+ "model_type": "",
260
+ "multipliers": [
261
+ 2,
262
+ 1,
263
+ 1
264
  ],
265
+ "res_conv_depth": 4,
266
+ "res_conv_width": 32,
267
+ "res_convolution_multiplier": 1,
268
+ "res_dilation_cycle": null,
269
+ "res_dilation_growth_rate": 3,
270
+ "res_downs_t": [
271
+ 3,
272
+ 2,
273
+ 2
274
+ ],
275
+ "res_strides_t": [
276
+ 2,
277
+ 2,
278
+ 2
279
+ ],
280
+ "sample_length": 1058304,
281
+ "transformers_version": "4.25.0.dev0"
282
+ },
283
+ "vqvae_config_dict": {
284
+ "act_fn": "relu",
285
+ "codebook_dimension": 2048,
286
+ "commit": 0.02,
287
+ "conv_input_shape": 1,
288
+ "conv_res_scale": false,
289
+ "embed_dim": 64,
290
+ "hop_fraction": [
291
+ 0.125,
292
+ 0.5,
293
+ 0.5
294
+ ],
295
+ "levels": 3,
296
+ "lmu": 0.99,
297
+ "model_type": "",
298
+ "multipliers": [
299
+ 2,
300
+ 1,
301
+ 1
302
+ ],
303
+ "res_conv_depth": 4,
304
+ "res_conv_width": 32,
305
+ "res_convolution_multiplier": 1,
306
+ "res_dilation_cycle": null,
307
+ "res_dilation_growth_rate": 3,
308
+ "res_downs_t": [
309
+ 3,
310
+ 2,
311
+ 2
312
+ ],
313
+ "res_strides_t": [
314
+ 2,
315
+ 2,
316
+ 2
317
+ ],
318
+ "sample_length": 1058304,
319
+ "transformers_version": "4.24.0.dev0"
320
+ }
321
  }
pytorch_model-00001-of-00004.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e39936eb64985f559f496871e544cd36b0d0387bc399d767cc6c5b6d083bd84
3
- size 9911113867
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2aa4a5f1a79e9a66573cd86175e6a0dfb20ba1f2422ca4444efdba5ba504d845
3
+ size 9990169993
pytorch_model-00002-of-00004.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7018f8e2e8817f8ccd09731101a00b3558eed729ad57d04b7eb70adf754f2a7a
3
- size 9958785645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d50e7689ebecd75bc8f68b2025493d9bd23d36c37269a346a4d87c478dd4d33a
3
+ size 9958786369
pytorch_model-00003-of-00004.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:163ba8a3a74431a5c48a61716931260c1a9cecdb377ef7623ba73fe101511722
3
- size 9958785685
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b2759ceda225e9f888ee1bc8dfca7bcc031ab79ac2f2c0a2b81621232ff9659
3
+ size 9970242288
pytorch_model-00004-of-00004.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e9a5cdaa4bdbb9aabe609486df9c86e727e74413f90e1b87dd843490fa26865
3
- size 592587412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91ec3904c3a0f16b47058864036ecb5db0c8aa0b33e19f899d74d45eaeb5a423
3
+ size 517801519
pytorch_model.bin.index.json CHANGED
The diff for this file is too large to render. See raw diff