# ================ Logging ====================== #
root_dir: exp/song/${get_fname:}

# ================ Checkpoints ================== #
use_pretrained: deepspeed  # ['ddp', 'continue', 'deepspeed']
pretrained:
  ddp_checkpoint:
  deepspeed_checkpoint: ./ckpt/60000_alnew.pt
  continue_checkpoint:
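# With use_pretrained: deepspeed, only deepspeed_checkpoint above is read (an assumption
# from the option list: 'ddp' / 'continue' would use the matching fields instead).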

# ================ Data & loader ================== #
prompt_select: random
train_jsonl_list:
  - .jsonl
val_jsonl_list:
  - .jsonl
train_scp_list:
  - .scp
val_scp_list:
  - .scp
lyric_processor:
max_dur: 150
min_dur: 30
batch_size: 2
prompt_len: 10
pad_to_max: true
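# Note: max_dur / min_dur are read as clip-duration bounds in seconds (an assumption,
# consistent with prompt_len: 10 being the 10 s audio-prompt length above): samples run
# 30-150 s and are padded to the maximum length when pad_to_max is true.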

# ================ Training ======================= #
accelerator: gpu
devices: 8
num_nodes: 4
val_check_interval: 2500
accumulate_grad_batches: 1
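# Worked example (standard PyTorch Lightning semantics, not stated in this file): effective
# global batch size = batch_size * devices * num_nodes * accumulate_grad_batches
# = 2 * 8 * 4 * 1 = 64 sequences per optimizer step across 32 GPUs.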
strategy: 'deepspeed_stage_2'  # ['ddp', 'fsdp', 'deepspeed_stage_2', 'ddp_find_unused_parameters_true']
precision: 'bf16-mixed'  # ['16-mixed', 'bf16-mixed']
optim:
  optimizer: adamw
  updates_per_epoch: 1000
  epochs: 100
  old_lr: 0  # 1e-4
  new_lr: 1e-4
  max_norm: 0.5
  adam:
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.00001  # 0.1
    eps: 1e-8
schedule:
  lr_scheduler: cosine
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0
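# Schedule sketch, assuming AudioCraft-style cosine-with-warmup semantics (the key names
# warmup / lr_min_ratio / cycle_length match that scheduler; unverified here):
#   step <  warmup: lr = new_lr * step / warmup                       (linear ramp to 1e-4)
#   step >= warmup: lr = new_lr * (lr_min_ratio + 0.5 * (1 - lr_min_ratio)
#                                  * (1 + cos(pi * cycle_length * progress)))
# with progress going 0 -> 1 over the remaining updates_per_epoch * epochs steps, so the
# rate decays to 0 (lr_min_ratio: 0.0).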

# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000
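# Token-budget sketch from the values above: at 25 Hz a 150 s clip is 25 * 150 = 3750
# frames; the mixed tokenizer emits 1 code per frame (code_depth: 1), while the _sep
# tokenizer (assumed to be the separated vocal/accompaniment stream, per model_septoken)
# emits 2 codes per frame.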

# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt

# ================== LM =========================== #
lm:
  lm_type: Llama  # [Llama]
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  code_depth: 3
  code_size: 16384
  dropout: 0.0
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
codebooks_pattern:
  modeling: delay
  delay:
    delays: [ 0, 250, 250 ]
    flatten_first: 0
    empty_initial: 0
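# Delay-pattern sketch (MusicGen-style interleaving, assumed from modeling: delay):
# codebook 0 stays at its native frame t while codebooks 1 and 2 are shifted 250 frames
# (10 s at 25 Hz) later, so at step t the LM predicts [q0[t], q1[t-250], q2[t-250]],
# with the first 250 positions of q1/q2 filled by a special empty token.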

# ================ Conditioners ===================== #
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0.15
  inference_coef: 1.5
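# At inference this is typically applied as (standard CFG formulation, an assumption here):
#   logits = logits_uncond + inference_coef * (logits_cond - logits_uncond), coef = 1.5;
# the 15% joint dropout during training is what trains the unconditional branch.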
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0
use_text_training: true
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ]  # this order is the same as the input concatenation order
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2}  # 10*25+2 = 252
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50
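# Note: ${get_fname:}, ${eval:...} and ${load_yaml:...} are custom OmegaConf resolvers
# (assumed to be registered by the training entrypoint before this file is parsed), e.g.
# ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} -> 10 * 25 + 2 = 252.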