Add files using upload-large-folder tool
Browse files- README.md +224 -3
- checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_cross_attention_layer_norm.scale.v/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_cross_attention_layer_norm.scale.v/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_12.pre_self_attention_layer_norm.scale.v/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_12.pre_self_attention_layer_norm.scale.v/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_4.pre_cross_attention_layer_norm.scale.v/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_4.pre_cross_attention_layer_norm.scale.v/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_12.pre_attention_layer_norm.scale.v/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_12.pre_attention_layer_norm.scale.v/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_19.pre_attention_layer_norm.scale.v/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_19.pre_attention_layer_norm.scale.v/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.self_attention.key.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_0.encoder_decoder_attention.value.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_10.encoder_decoder_attention.value.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.query.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.pre_cross_attention_layer_norm.scale/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.pre_cross_attention_layer_norm.scale/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_13.mlp.wi_0.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.pre_cross_attention_layer_norm.scale/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.pre_cross_attention_layer_norm.scale/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.self_attention.out.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_17.mlp.wi_1.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wi_0.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wo.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.value.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_5.self_attention.out.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.key.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.self_attention.out.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.out.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.value.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_1.mlp.wi_0.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_11.pre_attention_layer_norm.scale/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_11.pre_attention_layer_norm.scale/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.mlp.wo.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_16.mlp.wi_0.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_17.mlp.wo.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.attention.query.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.pre_mlp_layer_norm.scale/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.pre_mlp_layer_norm.scale/0 +0 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_2.attention.query.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_21.attention.value.kernel/.zarray +1 -0
- checkpoints/llm_large_x3047_c1860k/target.encoder.layers_3.mlp.wi_1.kernel/.zarray +1 -0
- savedmodels/musiccoca_mv212_quant/variables/variables.index +0 -0
- savedmodels/musiccoca_mv212f_cpu_compat/variables/variables.index +0 -0
- savedmodels/ssv2_48k_stereo/decoder/variables/variables.index +0 -0
- savedmodels/ssv2_48k_stereo/encoder/variables/variables.index +0 -0
- savedmodels/ssv2_48k_stereo/quantizer/variables/variables.index +0 -0
- testdata/musiccoca_mv212/inputs.txt +26 -0
README.md
CHANGED
@@ -1,3 +1,224 @@
|
|
1 |
-
---
|
2 |
-
license: cc-by-4.0
|
3 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: cc-by-4.0
|
3 |
+
---
|
4 |
+
|
5 |
+
# Model Card for Magenta RT
|
6 |
+
|
7 |
+
**Authors**: Google DeepMind
|
8 |
+
|
9 |
+
**Resources**:
|
10 |
+
|
11 |
+
- [Blog Post](https://g.co/magenta/rt)
|
12 |
+
- [Colab Demo](https://colab.research.google.com/github/magenta/magenta-realtime/blob/main/notebooks/Magenta_RT_Demo.ipynb)
|
13 |
+
- [Repository](https://github.com/magenta/magenta-realtime)
|
14 |
+
- [HuggingFace](https://huggingface.co/google/magenta-realtime)
|
15 |
+
|
16 |
+
## Terms of Use
|
17 |
+
|
18 |
+
Magenta RealTime is offered under a combination of licenses: the codebase is
|
19 |
+
licensed under
|
20 |
+
[Apache 2.0](https://github.com/magenta/magenta-realtime/blob/main/LICENSE), and
|
21 |
+
the model weights under
|
22 |
+
[Creative Commons Attribution 4.0 International](https://creativecommons.org/licenses/by/4.0/legalcode).
|
23 |
+
In addition, we specify the following usage terms:
|
24 |
+
|
25 |
+
Copyright 2025 Google LLC
|
26 |
+
|
27 |
+
Use these materials responsibly and do not generate content, including outputs,
|
28 |
+
that infringe or violate the rights of others, including rights in copyrighted
|
29 |
+
content.
|
30 |
+
|
31 |
+
Google claims no rights in outputs you generate using Magenta RealTime. You and
|
32 |
+
your users are solely responsible for outputs and their subsequent uses.
|
33 |
+
|
34 |
+
Unless required by applicable law or agreed to in writing, all software and
|
35 |
+
materials distributed here under the Apache 2.0 or CC-BY licenses are
|
36 |
+
distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
37 |
+
either express or implied. See the licenses for the specific language governing
|
38 |
+
permissions and limitations under those licenses. You are solely responsible for
|
39 |
+
determining the appropriateness of using, reproducing, modifying, performing,
|
40 |
+
displaying or distributing the software and materials, and any outputs, and
|
41 |
+
assume any and all risks associated with your use or distribution of any of the
|
42 |
+
software and materials, and any outputs, and your exercise of rights and
|
43 |
+
permissions under the licenses.
|
44 |
+
|
45 |
+
## Model Details
|
46 |
+
|
47 |
+
Magenta RealTime is an open music generation model from Google built from the
|
48 |
+
same research and technology used to create
|
49 |
+
[MusicFX DJ](https://labs.google/fx/tools/music-fx-dj) and
|
50 |
+
[Lyria RealTime](http://goo.gle/lyria-realtime). Magenta RealTime enables the
|
51 |
+
continuous generation of musical audio steered by a text prompt, an audio
|
52 |
+
example, or a weighted combination of multiple text prompts and/or audio
|
53 |
+
examples. Its relatively small size makes it possible to deploy in environments
|
54 |
+
with limited resources, including live performance settings or freely available
|
55 |
+
Colab TPUs.
|
56 |
+
|
57 |
+
### System Components
|
58 |
+
|
59 |
+
Magenta RealTime is composed of three components: SpectroStream, MusicCoCa, and
|
60 |
+
an LLM. A full technical report is forthcoming that will explain each component
|
61 |
+
in more detail.
|
62 |
+
|
63 |
+
1. **SpectroStream** is a discrete audio codec that converts stereo 48kHz audio
|
64 |
+
into tokens, building on the SoundStream RVQ codec from
|
65 |
+
[Zeghidour+ 21](https://arxiv.org/abs/2107.03312)
|
66 |
+
1. **MusicCoCa** is a contrastive-trained model capable of embedding audio and
|
67 |
+
text into a common embedding space, building on
|
68 |
+
[Yu+ 22](https://arxiv.org/abs/2205.01917) and
|
69 |
+
[Huang+ 22](https://arxiv.org/abs/2208.12415).
|
70 |
+
1. An **encoder-decoder Transformer LLM** generates audio tokens given context
|
71 |
+
audio tokens and a tokenized MusicCoCa embedding, building on the MusicLM
|
72 |
+
method from [Agostinelli+ 23](https://arxiv.org/abs/2301.11325)
|
73 |
+
|
74 |
+
### Inputs and outputs
|
75 |
+
|
76 |
+
- **SpectroStream RVQ codec**: Tokenizes high-fidelity music audio
|
77 |
+
- **Encoder input / Decoder output**: Music audio waveforms, 48kHz stereo
|
78 |
+
- **Encoder output / Decoder input**: Discrete audio tokens, 25Hz frame
|
79 |
+
rate, 64 RVQ depth, 10 bit codes, 16kbps
|
80 |
+
- **MusicCoCa**: Joint embeddings of text and music audio
|
81 |
+
- **Input**: Music audio waveforms, 16kHz mono, or text representation of
|
82 |
+
music style e.g. "heavy metal"
|
83 |
+
- **Output**: 768 dimensional embedding, quantized to 12 RVQ depth, 10 bit
|
84 |
+
codes
|
85 |
+
- **Encoder-decoder Transformer LLM**: Generates audio tokens given context
|
86 |
+
and style
|
87 |
+
- **Encoder Input**: (Context, 1000 tokens) 10s of audio context tokens w/
|
88 |
+
4 RVQ depth, (Style, 6 tokens) Quantized MusicCoCa style embedding
|
89 |
+
- **Decoder Output**: (Generated, 800 tokens) 2s of audio w/ 16 RVQ depth
|
90 |
+
|
91 |
+
## Uses
|
92 |
+
|
93 |
+
Music generation models, in particular ones targeted for continuous real-time
|
94 |
+
generation and control, have a wide range of applications across various
|
95 |
+
industries and domains. The following list of potential uses is not
|
96 |
+
comprehensive. The purpose of this list is to provide contextual information
|
97 |
+
about the possible use-cases that the model creators considered as part of model
|
98 |
+
training and development.
|
99 |
+
|
100 |
+
- **Interactive Music Creation**
|
101 |
+
- Live Performance / Improvisation: These models can be used to generate
|
102 |
+
music in a live performance setting, controlled by performers
|
103 |
+
manipulating style embeddings or the audio context
|
104 |
+
- Accessible Music-Making & Music Therapy: People with impediments to
|
105 |
+
using traditional instruments (skill gaps, disabilities, etc.) can
|
106 |
+
participate in communal jam sessions or solo music creation.
|
107 |
+
- Video Games: Developers can create a custom soundtrack for users in
|
108 |
+
real-time based on their actions and environment.
|
109 |
+
- **Research**
|
110 |
+
- Transfer learning: Researchers can leverage representations from
|
111 |
+
MusicCoCa and Magenta RT to recognize musical information.
|
112 |
+
- **Personalization**
|
113 |
+
- Musicians can fine-tune models with their own catalog to customize the
|
114 |
+
model to their style (fine-tuning support coming soon).
|
115 |
+
- **Education**
|
116 |
+
- Exploring Genres, Instruments, and History: Natural language prompting
|
117 |
+
enables users to quickly learn about and experiment with musical
|
118 |
+
concepts.
|
119 |
+
|
120 |
+
### Out-of-Scope Use
|
121 |
+
|
122 |
+
See our [Terms of Use](#terms-of-use) above for usage we consider out of scope.
|
123 |
+
|
124 |
+
## Bias, Risks, and Limitations
|
125 |
+
|
126 |
+
Magenta RT supports the real-time generation and steering of instrumental music.
|
127 |
+
The purpose and intention of this capability is to foster the development of new
|
128 |
+
real-time, interactive co-creation workflows that seamlessly integrate with
|
129 |
+
human-centered forms of musical creativity.
|
130 |
+
|
131 |
+
Every AI music generation model, including Magenta RT, carries a risk of
|
132 |
+
impacting the economic and cultural landscape of music. We aim to mitigate these
|
133 |
+
risks through the following avenues:
|
134 |
+
|
135 |
+
- Prioritizing human-AI interaction as fundamental in the design of Magenta
|
136 |
+
RT.
|
137 |
+
- Distributing the model under terms of service that prohibit developers
|
138 |
+
from generating outputs that infringe or violate the rights of others,
|
139 |
+
including rights in copyrighted content.
|
140 |
+
- Training on primarily instrumental data. With specific prompting, this model
|
141 |
+
has been observed to generate some vocal sounds and effects, though those
|
142 |
+
vocal sounds and effects tend to be non-lexical.
|
143 |
+
|
144 |
+
### Known limitations
|
145 |
+
|
146 |
+
**Coverage of broad musical styles**. Magenta RT's training data primarily
|
147 |
+
consists of Western instrumental music. As a consequence, Magenta RT has
|
148 |
+
incomplete coverage of both vocal performance and the broader landscape of rich
|
149 |
+
musical traditions worldwide. For real-time generation with broader style
|
150 |
+
coverage, we refer users to our
|
151 |
+
[Lyria RealTime API](https://g.co/magenta/lyria-realtime).
|
152 |
+
|
153 |
+
**Vocals**. While the model is capable of generating non-lexical vocalizations
|
154 |
+
and humming, it is not conditioned on lyrics and is unlikely to generate actual
|
155 |
+
words. However, there remains some risk of generating explicit or
|
156 |
+
culturally-insensitive lyrical content.
|
157 |
+
|
158 |
+
**Latency**. Because the Magenta RT LLM operates on two-second chunks, user
|
159 |
+
inputs for the style prompt may take two or more seconds to influence the
|
160 |
+
musical output.
|
161 |
+
|
162 |
+
**Limited context**. Because the Magenta RT encoder has a maximum audio context
|
163 |
+
window of ten seconds, the model is unable to directly reference music that has
|
164 |
+
been output earlier than that. While the context is sufficient to enable the
|
165 |
+
model to create melodies, rhythms, and chord progressions, the model is not
|
166 |
+
capable of automatically creating longer-term song structures.
|
167 |
+
|
168 |
+
### Benefits
|
169 |
+
|
170 |
+
At the time of release, Magenta RealTime represents the only open weights model
|
171 |
+
supporting real-time, continuous musical audio generation. It is designed
|
172 |
+
specifically to enable live, interactive musical creation, bringing new
|
173 |
+
capabilities to musical performances, art installations, video games, and many
|
174 |
+
other applications.
|
175 |
+
|
176 |
+
## How to Get Started with the Model
|
177 |
+
|
178 |
+
See our
|
179 |
+
[Colab demo](https://colab.research.google.com/github/magenta/magenta-realtime/blob/main/notebooks/Magenta_RT_Demo.ipynb)
|
180 |
+
and [GitHub repository](https://github.com/magenta/magenta-realtime) for usage
|
181 |
+
examples.
|
182 |
+
|
183 |
+
## Training Details
|
184 |
+
|
185 |
+
### Training Data
|
186 |
+
|
187 |
+
Magenta RealTime was trained on ~190k hours of stock music from multiple
|
188 |
+
sources, mostly instrumental.
|
189 |
+
|
190 |
+
### Hardware
|
191 |
+
|
192 |
+
Magenta RealTime was trained using
|
193 |
+
[Tensor Processing Unit (TPU)](https://cloud.google.com/tpu/docs/intro-to-tpu)
|
194 |
+
hardware (TPUv6e / Trillium).
|
195 |
+
|
196 |
+
### Software
|
197 |
+
|
198 |
+
Training was done using [JAX](https://github.com/jax-ml/jax) and
|
199 |
+
[T5X](https://github.com/google-research/t5x), utilizing
|
200 |
+
[SeqIO](https://github.com/google/seqio) for data pipelines. JAX allows
|
201 |
+
researchers to take advantage of the latest generation of hardware, including
|
202 |
+
TPUs, for faster and more efficient training of large models.
|
203 |
+
|
204 |
+
## Evaluation
|
205 |
+
|
206 |
+
Model evaluation metrics and results will be shared in our forthcoming technical
|
207 |
+
report.
|
208 |
+
|
209 |
+
## Citation
|
210 |
+
|
211 |
+
A technical report is forthcoming. For now, please cite our
|
212 |
+
[blog post](https://g.co/magenta/rt).
|
213 |
+
|
214 |
+
**BibTeX:**
|
215 |
+
|
216 |
+
```
|
217 |
+
@article{magenta_rt,
|
218 |
+
title={Magenta RealTime},
|
219 |
+
url={https://g.co/magenta/rt},
|
220 |
+
publisher={Google DeepMind},
|
221 |
+
author={Lyria Team},
|
222 |
+
year={2025}
|
223 |
+
}
|
224 |
+
```
|
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.depth_decoder.depth_layers_2.pre_self_attention_layer_norm.scale.v/0
ADDED
Binary file (3.68 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_cross_attention_layer_norm.scale.v/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_0.pre_cross_attention_layer_norm.scale.v/0
ADDED
Binary file (3.85 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_12.pre_self_attention_layer_norm.scale.v/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_12.pre_self_attention_layer_norm.scale.v/0
ADDED
Binary file (3.75 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_4.pre_cross_attention_layer_norm.scale.v/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/state.param_states.decoder.decoder.temporal_decoder.layers_4.pre_cross_attention_layer_norm.scale.v/0
ADDED
Binary file (3.83 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_12.pre_attention_layer_norm.scale.v/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_12.pre_attention_layer_norm.scale.v/0
ADDED
Binary file (3.74 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_19.pre_attention_layer_norm.scale.v/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/state.param_states.encoder.layers_19.pre_attention_layer_norm.scale.v/0
ADDED
Binary file (3.77 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.depth_decoder.depth_layers_0.self_attention.key.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_0.encoder_decoder_attention.value.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_10.encoder_decoder_attention.value.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_11.encoder_decoder_attention.query.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.pre_cross_attention_layer_norm.scale/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_12.pre_cross_attention_layer_norm.scale/0
ADDED
Binary file (3.72 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_13.mlp.wi_0.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.pre_cross_attention_layer_norm.scale/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_15.pre_cross_attention_layer_norm.scale/0
ADDED
Binary file (3.7 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_16.self_attention.out.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[512,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_17.mlp.wi_1.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wi_0.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_3.mlp.wo.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1408,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[2816,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_4.self_attention.value.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_5.self_attention.out.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[512,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.encoder_decoder_attention.key.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_6.self_attention.out.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[512,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.out.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[512,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.decoder.decoder.temporal_decoder.layers_8.self_attention.value.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_1.mlp.wi_0.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_11.pre_attention_layer_norm.scale/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_11.pre_attention_layer_norm.scale/0
ADDED
Binary file (3.78 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_14.mlp.wo.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1408,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[2816,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_16.mlp.wi_0.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_17.mlp.wo.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1408,1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[2816,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.attention.query.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.pre_mlp_layer_norm.scale/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_19.pre_mlp_layer_norm.scale/0
ADDED
Binary file (3.63 kB). View file
|
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_2.attention.query.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_21.attention.value.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,512],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,1024],"zarr_format":2}
|
checkpoints/llm_large_x3047_c1860k/target.encoder.layers_3.mlp.wi_1.kernel/.zarray
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"chunks":[1024,1408],"compressor":{"id":"gzip","level":1},"dimension_separator":".","dtype":"<f4","fill_value":null,"filters":null,"order":"C","shape":[1024,2816],"zarr_format":2}
|
savedmodels/musiccoca_mv212_quant/variables/variables.index
ADDED
Binary file (437 Bytes). View file
|
|
savedmodels/musiccoca_mv212f_cpu_compat/variables/variables.index
ADDED
Binary file (1.9 kB). View file
|
|
savedmodels/ssv2_48k_stereo/decoder/variables/variables.index
ADDED
Binary file (2.42 kB). View file
|
|
savedmodels/ssv2_48k_stereo/encoder/variables/variables.index
ADDED
Binary file (2.36 kB). View file
|
|
savedmodels/ssv2_48k_stereo/quantizer/variables/variables.index
ADDED
Binary file (3.85 kB). View file
|
|
testdata/musiccoca_mv212/inputs.txt
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Ambient
|
2 |
+
Blues
|
3 |
+
Classical
|
4 |
+
Dance
|
5 |
+
Electronic
|
6 |
+
Folk
|
7 |
+
Gospel
|
8 |
+
Hip hop
|
9 |
+
Indie
|
10 |
+
Jazz
|
11 |
+
K-pop
|
12 |
+
Latin
|
13 |
+
Metal
|
14 |
+
New age
|
15 |
+
Opera
|
16 |
+
Pop
|
17 |
+
Q-pop
|
18 |
+
Rock
|
19 |
+
Soul
|
20 |
+
Techno
|
21 |
+
Underground
|
22 |
+
Viking
|
23 |
+
World
|
24 |
+
Xylophone
|
25 |
+
Yacht
|
26 |
+
Zamba
|