# SongGeneration/conf/infer.yaml
# ================ Logging ====================== #
root_dir: exp/song/${get_fname:}
# ================ Checkpoints ================== #
use_pretrained: deepspeed # ['ddp', 'continue', 'deepspeed']
pretrained:
  ddp_checkpoint:
  deepspeed_checkpoint: ./ckpt/60000_alnew.pt
  continue_checkpoint:
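# `use_pretrained` selects which checkpoint field above is actually read;
# with `deepspeed`, only `deepspeed_checkpoint` is consulted and the other
# two may stay empty. A minimal sketch of the assumed lookup (hypothetical,
# not code from this repo):
#   ckpt_path = cfg.pretrained[f"{cfg.use_pretrained}_checkpoint"]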
# ================ Data & loader ================== #
prompt_select: random
train_jsonl_list:
- .jsonl
val_jsonl_list:
- .jsonl
train_scp_list:
- .scp
val_scp_list:
- .scp
lyric_processor:
max_dur: 150
min_dur: 30
batch_size: 2
prompt_len: 10
pad_to_max: true
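# Worked example: with `prompt_len: 10` seconds of prompt audio and the
# 25 Hz tokenizer configured below, a prompt spans 10 * 25 = 250 frames,
# matching the 250-frame codebook delays further down.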
# ================ Training ======================= #
accelerator: gpu
devices: 8
num_nodes: 4
val_check_interval: 2500
accumulate_grad_batches: 1
strategy: 'deepspeed_stage_2' # ['ddp', 'fsdp', 'deepspeed_stage_2', 'ddp_find_unused_parameters_true']
precision: 'bf16-mixed' # ['16-mixed', 'bf16-mixed']
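# Worked example: the effective batch size per optimizer step is
# batch_size * devices * num_nodes * accumulate_grad_batches
# = 2 * 8 * 4 * 1 = 64 samples.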
optim:
  optimizer: adamw
  updates_per_epoch: 1000
  epochs: 100
  old_lr: 0 # 1e-4
  new_lr: 1e-4
  max_norm: 0.5
  adam:
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.00001 # 0.1
    eps: 1e-8
schedule:
  lr_scheduler: cosine
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0
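# Sketch of the assumed warmup-plus-cosine decay (audiocraft-style; the
# exact shape is defined by the training code, not this file), with
# total_updates = updates_per_epoch * epochs = 1000 * 100 = 100000:
#   lr(t) = new_lr * t / warmup                                  for t < 4000
#   lr(t) = lr_min + 0.5 * (new_lr - lr_min) * (1 + cos(pi * s)) otherwise,
#   where s = (t - warmup) / (cycle_length * total_updates - warmup)
#   and lr_min = lr_min_ratio * new_lr = 0 here.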
# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000
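# The two checkpoint strings above appear to encode "<TokenizerClass>_<path>"
# (an assumption about the loader, not stated in this file). At 25 frames/s
# the mixed tokenizer emits 1 code stream and the separated-track tokenizer
# 2 streams, so a max_dur clip of 150 s yields 150 * 25 = 3750 frames per
# stream.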
# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt
# ================== LM =========================== #
lm:
  lm_type: Llama # [Llama]
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  code_depth: 3
  code_size: 16384
  dropout: 0.0
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
codebooks_pattern:
  modeling: delay
  delay:
    delays: [ 0, 250, 250 ]
    flatten_first: 0
    empty_initial: 0
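# Illustration of the delay pattern (audiocraft-style interleaving, assumed):
# with code_depth 3 and delays [0, 250, 250], stream k is shifted right by
# delays[k] frames, so at sequence step t the model predicts
#   (stream0[t], stream1[t - 250], stream2[t - 250])
# i.e. the two lower streams trail the first by exactly one 10 s prompt
# length (250 frames at 25 Hz).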
# ================ Conditioners ===================== #
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0.15
  inference_coef: 1.5
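# At inference the standard CFG combination is assumed:
#   logits = logits_uncond + inference_coef * (logits_cond - logits_uncond)
# with inference_coef = 1.5. During training, all conditions are dropped
# together 15% of the time so the model also learns an unconditional
# distribution to contrast against.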
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0
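# Unlike the classifier-free-guidance dropout above, these rates mask each
# attribute independently: `type_info` is dropped 50% of the time, while
# `description` and `prompt_audio` are never dropped on their own (0.0).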
use_text_training: true
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ] # this order matches the input concatenation order
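# Reading of the fuser block (audiocraft-style semantics, assumed): the
# `prepend` list places conditioner embeddings in front of the code
# sequence in exactly this order, while the empty `sum` list means no
# condition is added elementwise to the sequence.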
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} # 10*25+2 = 252
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50
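# Loading note: ${get_fname:}, ${eval:...} and ${load_yaml:...} are not
# builtin OmegaConf resolvers; the training/inference code is assumed to
# register them before this file is parsed, roughly:
#   OmegaConf.register_new_resolver("eval", lambda expr: eval(expr))
# (a sketch only; the real resolvers live in the SongGeneration codebase.)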