---
# Training configuration for the song-generation LM (DeepSpeed, Lightning-style
# trainer keys, OmegaConf-resolved values such as ${get_fname:} / ${eval:...}).
# NOTE(review): this file was reconstructed from a whitespace-mangled source;
# nesting choices that the flattened text left ambiguous are flagged inline.

# ================ Logging ====================== #
root_dir: exp/song/${get_fname:}

# ================ Checkpoints ================== #
use_pretrained: deepspeed  # ['ddp', 'continue', 'deepspeed']
# NOTE(review): the three checkpoint paths are grouped under `pretrained` to
# match the selector above — confirm against the loader. Unset entries are
# explicit null (the original used bare empty values, which also load as null).
pretrained:
  ddp_checkpoint: null
  deepspeed_checkpoint: ./ckpt/60000_alnew.pt
  continue_checkpoint: null

# ================ Data & loader ================== #
prompt_select: random
train_jsonl_list:
  - .jsonl
val_jsonl_list:
  - .jsonl
train_scp_list:
  - .scp
val_scp_list:
  - .scp
lyric_processor: null  # NOTE(review): empty in the original; confirm the consumer accepts null here
max_dur: 150
min_dur: 30
batch_size: 2
prompt_len: 10  # referenced below via ${prompt_len} in conditioners.prompt_audio.qt_embedding.max_len
pad_to_max: true

# ================ Training ======================= #
accelerator: gpu
devices: 8
num_nodes: 4
val_check_interval: 2500
accumulate_grad_batches: 1
strategy: 'deepspeed_stage_2'  # ['ddp', 'fsdp', 'deepspeed_stage_2', 'ddp_find_unused_parameters_true']
precision: 'bf16-mixed'  # ['16-mixed', 'bf16-mixed']

optim:
  optimizer: adamw
  updates_per_epoch: 1000
  epochs: 100
  old_lr: 0  # 1e-4
  # NOTE(review): plain `1e-4`/`1e-8` are floats under OmegaConf/YAML 1.2 but
  # strings under PyYAML (YAML 1.1) — fine if this file is only loaded via
  # OmegaConf; otherwise write them as 1.0e-4 / 1.0e-8.
  new_lr: 1e-4
  max_norm: 0.5
  adam:
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.00001  # 0.1
    eps: 1e-8

# NOTE(review): `schedule` is placed top-level (audiocraft convention), not
# under `optim` — confirm against the trainer's config schema.
schedule:
  lr_scheduler: cosine
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0

# ================ Audio tokenzier ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000

# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt

# ================== LM =========================== #
lm:
  lm_type: Llama  # [Llama]
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  code_depth: 3
  code_size: 16384
  dropout: 0.0
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1

# NOTE(review): `codebooks_pattern` is placed top-level (audiocraft convention),
# not under `lm` — confirm against the trainer's config schema.
codebooks_pattern:
  modeling: delay
  delay:
    delays: [0, 250, 250]
    flatten_first: 0
    empty_initial: 0

# ================ Conditioners ===================== #
classifier_free_guidance:  # drop all conditions simultaneously
  training_dropout: 0.15
  inference_coef: 1.5
attribute_dropout:  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0
use_text_training: true
fuser:
  sum: []
  prepend: [description, prompt_audio, type_info]  # this order is the SAME with the input concatenation order
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2}  # 25*10+2+1
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50