# @package __global__
# This is the training loop solver
# for the base MusicGen model (text-to-music)
# on monophonic audio sampled at 32 kHz
# Hydra defaults list: configs composed into this one, in order.
# `_self_` is listed last, so the values in this file are merged after
# (and therefore take precedence over) the entries above it.
defaults:
  - musicgen/default
  - /model: lm/musicgen_lm
  - override /dset: audio/default
  - _self_
# Autocast switch and the dtype used when it is enabled.
autocast: true
autocast_dtype: float16
# EnCodec large trained on mono-channel music audio sampled at 32 kHz
# with a total stride of 640, leading to 50 frames/s.
# rvq.n_q=4, rvq.bins=2048, no quantization dropout
# (transformer_lm card and n_q must be compatible)
compression_model_checkpoint: //pretrained/facebook/encodec_32khz
# Audio format: mono, 32 kHz (matches the compression model described in this file).
channels: 1
sample_rate: 32000
deadlock:
  use: true  # deadlock detection
dataset:
  batch_size: 192  # 32 GPUs
  sample_on_weight: false  # Uniform sampling all the way
  sample_on_duration: false  # Uniform sampling all the way
# Generation settings for the language model.
generate:
  lm:
    use_sampling: true
    top_k: 250
    top_p: 0.0  # NOTE(review): presumably 0.0 disables top-p so top_k applies — confirm
# Optimization settings.
optim:
  epochs: 500
  optimizer: dadam
  lr: 1
  # Settings for `ema` (presumably an exponential moving average of the
  # model weights — confirm). NOTE(review): `updates` and `device` are placed
  # under `ema` here; confirm against the base config's schema.
  ema:
    use: true
    updates: 10
    device: cuda
logging:
  log_tensorboard: true
# Learning-rate schedule.
schedule:
  lr_scheduler: cosine
  # Parameters for the `cosine` scheduler selected above.
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0