TF-Keras
chrisdonahue-goog committed on
Commit
eecec21
·
verified ·
1 Parent(s): a8392d9

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +224 -3
  2. checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/.zarray +1 -0
  3. checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/0 +0 -0
  4. checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_cross_attention_layer_norm.scale.v/.zarray +1 -0
  5. checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_cross_attention_layer_norm.scale.v/0 +0 -0
  6. checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_12.pre_self_attention_layer_norm.scale.v/.zarray +1 -0
  7. checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_12.pre_self_attention_layer_norm.scale.v/0 +0 -0
  8. checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_4.pre_cross_attention_layer_norm.scale.v/.zarray +1 -0
  9. checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_4.pre_cross_attention_layer_norm.scale.v/0 +0 -0
  10. checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_12.pre_attention_layer_norm.scale.v/.zarray +1 -0
  11. checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_12.pre_attention_layer_norm.scale.v/0 +0 -0
  12. checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_19.pre_attention_layer_norm.scale.v/.zarray +1 -0
  13. checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_19.pre_attention_layer_norm.scale.v/0 +0 -0
  14. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.self_attention.key.kernel/.zarray +1 -0
  15. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_0.encoder_decoder_attention.value.kernel/.zarray +1 -0
  16. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_10.encoder_decoder_attention.value.kernel/.zarray +1 -0
  17. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.query.kernel/.zarray +1 -0
  18. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.pre_cross_attention_layer_norm.scale/.zarray +1 -0
  19. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.pre_cross_attention_layer_norm.scale/0 +0 -0
  20. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_13.mlp.wi_0.kernel/.zarray +1 -0
  21. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.pre_cross_attention_layer_norm.scale/.zarray +1 -0
  22. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.pre_cross_attention_layer_norm.scale/0 +0 -0
  23. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.self_attention.out.kernel/.zarray +1 -0
  24. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_17.mlp.wi_1.kernel/.zarray +1 -0
  25. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wi_0.kernel/.zarray +1 -0
  26. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wo.kernel/.zarray +1 -0
  27. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.value.kernel/.zarray +1 -0
  28. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_5.self_attention.out.kernel/.zarray +1 -0
  29. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.key.kernel/.zarray +1 -0
  30. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.self_attention.out.kernel/.zarray +1 -0
  31. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.out.kernel/.zarray +1 -0
  32. checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.value.kernel/.zarray +1 -0
  33. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_1.mlp.wi_0.kernel/.zarray +1 -0
  34. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_11.pre_attention_layer_norm.scale/.zarray +1 -0
  35. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_11.pre_attention_layer_norm.scale/0 +0 -0
  36. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.mlp.wo.kernel/.zarray +1 -0
  37. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_16.mlp.wi_0.kernel/.zarray +1 -0
  38. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_17.mlp.wo.kernel/.zarray +1 -0
  39. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.attention.query.kernel/.zarray +1 -0
  40. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.pre_mlp_layer_norm.scale/.zarray +1 -0
  41. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.pre_mlp_layer_norm.scale/0 +0 -0
  42. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_2.attention.query.kernel/.zarray +1 -0
  43. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_21.attention.value.kernel/.zarray +1 -0
  44. checkpoints/llm_large_x3047_c1860k/target.encoder.layers_3.mlp.wi_1.kernel/.zarray +1 -0
  45. savedmodels/musiccoca_mv212_quant/variables/variables.index +0 -0
  46. savedmodels/musiccoca_mv212f_cpu_compat/variables/variables.index +0 -0
  47. savedmodels/ssv2_48k_stereo/decoder/variables/variables.index +0 -0
  48. savedmodels/ssv2_48k_stereo/encoder/variables/variables.index +0 -0
  49. savedmodels/ssv2_48k_stereo/quantizer/variables/variables.index +0 -0
  50. testdata/musiccoca_mv212/inputs.txt +26 -0
README.md CHANGED
@@ -1,3 +1,224 @@
1
- ---
2
- license: cc-by-4.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ ---
4
+
5
+ # Model Card for Magenta RT
6
+
7
+ **Authors**: Google DeepMind
8
+
9
+ **Resources**:
10
+
11
+ - [Blog Post](https://g.co/magenta/rt)
12
+ - [Colab Demo](https://colab.research.google.com/github/magenta/magenta-realtime/blob/main/notebooks/Magenta_RT_Demo.ipynb)
13
+ - [Repository](https://github.com/magenta/magenta-realtime)
14
+ - [HuggingFace](https://huggingface.co/google/magenta-realtime)
15
+
16
+ ## Terms of Use
17
+
18
+ Magenta RealTime is offered under a combination of licenses: the codebase is
19
+ licensed under
20
+ [Apache 2.0](https://github.com/magenta/magenta-realtime/blob/main/LICENSE), and
21
+ the model weights under
22
+ [Creative Commons Attribution 4.0 International](https://creativecommons.org/licenses/by/4.0/legalcode).
23
+ In addition, we specify the following usage terms:
24
+
25
+ Copyright 2025 Google LLC
26
+
27
+ Use these materials responsibly and do not generate content, including outputs,
28
+ that infringe or violate the rights of others, including rights in copyrighted
29
+ content.
30
+
31
+ Google claims no rights in outputs you generate using Magenta RealTime. You and
32
+ your users are solely responsible for outputs and their subsequent uses.
33
+
34
+ Unless required by applicable law or agreed to in writing, all software and
35
+ materials distributed here under the Apache 2.0 or CC-BY licenses are
36
+ distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
37
+ either express or implied. See the licenses for the specific language governing
38
+ permissions and limitations under those licenses. You are solely responsible for
39
+ determining the appropriateness of using, reproducing, modifying, performing,
40
+ displaying or distributing the software and materials, and any outputs, and
41
+ assume any and all risks associated with your use or distribution of any of the
42
+ software and materials, and any outputs, and your exercise of rights and
43
+ permissions under the licenses.
44
+
45
+ ## Model Details
46
+
47
+ Magenta RealTime is an open music generation model from Google built from the
48
+ same research and technology used to create
49
+ [MusicFX DJ](https://labs.google/fx/tools/music-fx-dj) and
50
+ [Lyria RealTime](http://goo.gle/lyria-realtime). Magenta RealTime enables the
51
+ continuous generation of musical audio steered by a text prompt, an audio
52
+ example, or a weighted combination of multiple text prompts and/or audio
53
+ examples. Its relatively small size makes it possible to deploy in environments
54
+ with limited resources, including live performance settings or freely available
55
+ Colab TPUs.
56
+
57
+ ### System Components
58
+
59
+ Magenta RealTime is composed of three components: SpectroStream, MusicCoCa, and
60
+ an LLM. A full technical report is forthcoming that will explain each component
61
+ in more detail.
62
+
63
+ 1. **SpectroStream** is a discrete audio codec that converts stereo 48kHz audio
64
+ into tokens, building on the SoundStream RVQ codec from
65
+ [Zeghidour+ 21](https://arxiv.org/abs/2107.03312)
66
+ 1. **MusicCoCa** is a contrastive-trained model capable of embedding audio and
67
+ text into a common embedding space, building on
68
+ [Yu+ 22](https://arxiv.org/abs/2205.01917) and
69
+ [Huang+ 22](https://arxiv.org/abs/2208.12415).
70
+ 1. An **encoder-decoder Transformer LLM** generates audio tokens given context
71
+ audio tokens and a tokenized MusicCoCa embedding, building on the MusicLM
72
+ method from [Agostinelli+ 23](https://arxiv.org/abs/2301.11325)
73
+
74
+ ### Inputs and outputs
75
+
76
+ - **SpectroStream RVQ codec**: Tokenizes high-fidelity music audio
77
+ - **Encoder input / Decoder output**: Music audio waveforms, 48kHz stereo
78
+ - **Encoder output / Decoder input**: Discrete audio tokens, 25Hz frame
79
+ rate, 64 RVQ depth, 10 bit codes, 16kbps
80
+ - **MusicCoCa**: Joint embeddings of text and music audio
81
+ - **Input**: Music audio waveforms, 16kHz mono, or text representation of
82
+ music style e.g. "heavy metal"
83
+ - **Output**: 768 dimensional embedding, quantized to 12 RVQ depth, 10 bit
84
+ codes
85
+ - **Encoder-decoder Transformer LLM**: Generates audio tokens given context
86
+ and style
87
+ - **Encoder Input**: (Context, 1000 tokens) 10s of audio context tokens w/
88
+ 4 RVQ depth, (Style, 6 tokens) Quantized MusicCoCa style embedding
89
+ - **Decoder Output**: (Generated, 800 tokens) 2s of audio w/ 16 RVQ depth
90
+
91
+ ## Uses
92
+
93
+ Music generation models, in particular ones targeted for continuous real-time
94
+ generation and control, have a wide range of applications across various
95
+ industries and domains. The following list of potential uses is not
96
+ comprehensive. The purpose of this list is to provide contextual information
97
+ about the possible use-cases that the model creators considered as part of model
98
+ training and development.
99
+
100
+ - **Interactive Music Creation**
101
+ - Live Performance / Improvisation: These models can be used to generate
102
+ music in a live performance setting, controlled by performers
103
+ manipulating style embeddings or the audio context
104
+ - Accessible Music-Making & Music Therapy: People with impediments to
105
+ using traditional instruments (skill gaps, disabilities, etc.) can
106
+ participate in communal jam sessions or solo music creation.
107
+ - Video Games: Developers can create a custom soundtrack for users in
108
+ real-time based on their actions and environment.
109
+ - **Research**
110
+ - Transfer learning: Researchers can leverage representations from
111
+ MusicCoCa and Magenta RT to recognize musical information.
112
+ - **Personalization**
113
+ - Musicians can finetune models with their own catalog to customize the
114
+ model to their style (fine tuning support coming soon).
115
+ - **Education**
116
+ - Exploring Genres, Instruments, and History: Natural language prompting
117
+ enables users to quickly learn about and experiment with musical
118
+ concepts.
119
+
120
+ ### Out-of-Scope Use
121
+
122
+ See our [Terms of Use](#terms-of-use) above for usage we consider out of scope.
123
+
124
+ ## Bias, Risks, and Limitations
125
+
126
+ Magenta RT supports the real-time generation and steering of instrumental music.
127
+ The purpose and intention of this capability is to foster the development of new
128
+ real-time, interactive co-creation workflows that seamlessly integrate with
129
+ human-centered forms of musical creativity.
130
+
131
+ Every AI music generation model, including Magenta RT, carries a risk of
132
+ impacting the economic and cultural landscape of music. We aim to mitigate these
133
+ risks through the following avenues:
134
+
135
+ - Prioritizing human-AI interaction as fundamental in the design of Magenta
136
+ RT.
137
+ - Distributing the model under terms of service that prohibit developers
138
+ from generating outputs that infringe or violate the rights of others,
139
+ including rights in copyrighted content.
140
+ - Training on primarily instrumental data. With specific prompting, this model
141
+ has been observed to generate some vocal sounds and effects, though those
142
+ vocal sounds and effects tend to be non-lexical.
143
+
144
+ ### Known limitations
145
+
146
+ **Coverage of broad musical styles**. Magenta RT's training data primarily
147
+ consists of Western instrumental music. As a consequence, Magenta RT has
148
+ incomplete coverage of both vocal performance and the broader landscape of rich
149
+ musical traditions worldwide. For real-time generation with broader style
150
+ coverage, we refer users to our
151
+ [Lyria RealTime API](https://g.co/magenta/lyria-realtime).
152
+
153
+ **Vocals**. While the model is capable of generating non-lexical vocalizations
154
+ and humming, it is not conditioned on lyrics and is unlikely to generate actual
155
+ words. However, there remains some risk of generating explicit or
156
+ culturally-insensitive lyrical content.
157
+
158
+ **Latency**. Because the Magenta RT LLM operates on two-second chunks, user
159
+ inputs for the style prompt may take two or more seconds to influence the
160
+ musical output.
161
+
162
+ **Limited context**. Because the Magenta RT encoder has a maximum audio context
163
+ window of ten seconds, the model is unable to directly reference music that has
164
+ been output earlier than that. While the context is sufficient to enable the
165
+ model to create melodies, rhythms, and chord progressions, the model is not
166
+ capable of automatically creating longer-term song structures.
167
+
168
+ ### Benefits
169
+
170
+ At the time of release, Magenta RealTime represents the only open weights model
171
+ supporting real-time, continuous musical audio generation. It is designed
172
+ specifically to enable live, interactive musical creation, bringing new
173
+ capabilities to musical performances, art installations, video games, and many
174
+ other applications.
175
+
176
+ ## How to Get Started with the Model
177
+
178
+ See our
179
+ [Colab demo](https://colab.research.google.com/github/magenta/magenta-realtime/blob/main/notebooks/Magenta_RT_Demo.ipynb)
180
+ and [GitHub repository](https://github.com/magenta/magenta-realtime) for usage
181
+ examples.
182
+
183
+ ## Training Details
184
+
185
+ ### Training Data
186
+
187
+ Magenta RealTime was trained on ~190k hours of stock music from multiple
188
+ sources, mostly instrumental.
189
+
190
+ ### Hardware
191
+
192
+ Magenta RealTime was trained using
193
+ [Tensor Processing Unit (TPU)](https://cloud.google.com/tpu/docs/intro-to-tpu)
194
+ hardware (TPUv6e / Trillium).
195
+
196
+ ### Software
197
+
198
+ Training was done using [JAX](https://github.com/jax-ml/jax) and
199
+ [T5X](https://github.com/google-research/t5x), utilizing
200
+ [SeqIO](https://github.com/google/seqio) for data pipelines. JAX allows
201
+ researchers to take advantage of the latest generation of hardware, including
202
+ TPUs, for faster and more efficient training of large models.
203
+
204
+ ## Evaluation
205
+
206
+ Model evaluation metrics and results will be shared in our forthcoming technical
207
+ report.
208
+
209
+ ## Citation
210
+
211
+ A technical report is forthcoming. For now, please cite our
212
+ [blog post](https://g.co/magenta/rt).
213
+
214
+ **BibTeX:**
215
+
216
+ ```
217
+ @article{magenta_rt,
218
+ title={Magenta RealTime},
219
+ url={https://g.co/magenta/rt},
220
+ publisher={Google DeepMind},
221
+ author={Lyria Team},
222
+ year={2025}
223
+ }
224
+ ```
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/0 ADDED
Binary file (3.68 kB). View file
 
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_cross_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_cross_attention_layer_norm.scale.v/0 ADDED
Binary file (3.85 kB). View file
 
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_12.pre_self_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_12.pre_self_attention_layer_norm.scale.v/0 ADDED
Binary file (3.75 kB). View file
 
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_4.pre_cross_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_4.pre_cross_attention_layer_norm.scale.v/0 ADDED
Binary file (3.83 kB). View file
 
checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_12.pre_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_12.pre_attention_layer_norm.scale.v/0 ADDED
Binary file (3.74 kB). View file
 
checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_19.pre_attention_layer_norm.scale.v/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_19.pre_attention_layer_norm.scale.v/0 ADDED
Binary file (3.77 kB). View file
 
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.self_attention.key.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_0.encoder_decoder_attention.value.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_10.encoder_decoder_attention.value.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.query.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.pre_cross_attention_layer_norm.scale/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.pre_cross_attention_layer_norm.scale/0 ADDED
Binary file (3.72 kB). View file
 
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_13.mlp.wi_0.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.pre_cross_attention_layer_norm.scale/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.pre_cross_attention_layer_norm.scale/0 ADDED
Binary file (3.7 kB). View file
 
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.self_attention.out.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[512,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_17.mlp.wi_1.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wi_0.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wo.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1408,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[2816,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.value.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_5.self_attention.out.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[512,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.key.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.self_attention.out.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[512,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.out.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[512,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.value.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_1.mlp.wi_0.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_11.pre_attention_layer_norm.scale/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_11.pre_attention_layer_norm.scale/0 ADDED
Binary file (3.78 kB). View file
 
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.mlp.wo.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1408,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[2816,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_16.mlp.wi_0.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_17.mlp.wo.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1408,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[2816,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.attention.query.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.pre_mlp_layer_norm.scale/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.pre_mlp_layer_norm.scale/0 ADDED
Binary file (3.63 kB). View file
 
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_2.attention.query.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_21.attention.value.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_3.mlp.wi_1.kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
savedmodels/musiccoca_mv212_quant/variables/variables.index ADDED
Binary file (437 Bytes). View file
 
savedmodels/musiccoca_mv212f_cpu_compat/variables/variables.index ADDED
Binary file (1.9 kB). View file
 
savedmodels/ssv2_48k_stereo/decoder/variables/variables.index ADDED
Binary file (2.42 kB). View file
 
savedmodels/ssv2_48k_stereo/encoder/variables/variables.index ADDED
Binary file (2.36 kB). View file
 
savedmodels/ssv2_48k_stereo/quantizer/variables/variables.index ADDED
Binary file (3.85 kB). View file
 
testdata/musiccoca_mv212/inputs.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Ambient
2
+ Blues
3
+ Classical
4
+ Dance
5
+ Electronic
6
+ Folk
7
+ Gospel
8
+ Hip hop
9
+ Indie
10
+ Jazz
11
+ K-pop
12
+ Latin
13
+ Metal
14
+ New age
15
+ Opera
16
+ Pop
17
+ Q-pop
18
+ Rock
19
+ Soul
20
+ Techno
21
+ Underground
22
+ Viking
23
+ World
24
+ Xylophone
25
+ Yacht
26
+ Zamba