---
# ================ Logging ====================== #
# ${get_fname:} is a custom OmegaConf resolver — presumably the config file's
# basename; verify against the loader that registers it.
root_dir: exp/song/${get_fname:}
# ================ Checkpoints ================== #
use_pretrained: deepspeed # ['ddp', 'continue', 'deepspeed']
# Unused checkpoint slots are written as explicit `null` instead of bare keys
# (yamllint `empty-values`); both load as None, so behavior is unchanged.
# Presumably only the slot matching `use_pretrained` is read — confirm in the loader.
pretrained: null
ddp_checkpoint: null
deepspeed_checkpoint: ./ckpt/60000_alnew.pt
continue_checkpoint: null
# ================ Data & loader ================== #
prompt_select: random # presumably how the prompt segment is chosen per sample — verify in dataset code
# Placeholder manifests — point these at real files before training.
train_jsonl_list:
  - .jsonl
val_jsonl_list:
  - .jsonl
train_scp_list:
  - .scp
val_scp_list:
  - .scp
lyric_processor:
  # NOTE(review): indentation was lost in this file; max_dur/min_dur are assumed
  # to nest under lyric_processor (bare key followed by related fields) — confirm
  # against the upstream config.
  max_dur: 150 # presumably seconds — TODO confirm units
  min_dur: 30
batch_size: 2 # per-device batch size — TODO confirm (vs. global)
prompt_len: 10 # must stay top-level: referenced as ${prompt_len} by conditioners.prompt_audio.qt_embedding.max_len
pad_to_max: true
# ================ Training ======================= #
accelerator: gpu
devices: 8 # GPUs per node
num_nodes: 4
val_check_interval: 2500 # run validation every N training steps
accumulate_grad_batches: 1
strategy: 'deepspeed_stage_2' # ['ddp', 'fsdp', 'deepspeed_stage_2', 'ddp_find_unused_parameters_true']
precision: 'bf16-mixed' # ['16-mixed', 'bf16-mixed']
# NOTE(review): indentation was lost in this file; the nesting below (optim/adam,
# top-level schedule) is reconstructed — verify against the upstream config.
optim:
  optimizer: adamw
  updates_per_epoch: 1000
  epochs: 100
  old_lr: 0 # 1e-4
  # Exponents written with a mantissa dot: YAML 1.1 loaders (e.g. PyYAML) resolve
  # a bare `1e-4` as the STRING "1e-4", not a float. `1.0e-4` is a float everywhere.
  new_lr: 1.0e-4
  max_norm: 0.5 # gradient-clipping norm
  adam:
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.00001 # 0.1
    eps: 1.0e-8 # mantissa dot for the same YAML 1.1 reason as new_lr
schedule:
  lr_scheduler: cosine
  cosine:
    warmup: 4000 # warmup steps
    lr_min_ratio: 0.0
    cycle_length: 1.0
# ================ Audio tokenizer ================ #
# NOTE(review): checkpoint values look like "<ModelClass>_<path>" — presumably the
# loader splits on the first '_' to select the tokenizer implementation; verify.
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25 # token frames per second
audio_tokenizer_code_depth: 1 # codebook depth (single RVQ level, per the checkpoint name)
sample_rate: 48000
# "_sep" variant — presumably a tokenizer for separated stems; confirm against usage.
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000
# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json # model architecture definition
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt # weights
# ================== LM =========================== #
# NOTE(review): indentation was lost in this file; the nesting below (everything
# through kv_repeat under `lm:`, codebooks_pattern top-level) is reconstructed —
# verify against the upstream config.
lm:
  lm_type: Llama # [Llama]
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  code_depth: 3 # number of parallel codebook streams predicted per step
  code_size: 16384 # entries per codebook
  dropout: 0.0
  activation: gelu
  norm_first: true # pre-norm transformer blocks
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch # activation checkpointing backend
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null # null → falls back to `dropout` presumably; confirm in model code
  kv_repeat: 1
codebooks_pattern:
  modeling: delay
  delay:
    # One delay per codebook stream; length matches lm.code_depth (3).
    delays: [ 0, 250, 250 ]
    flatten_first: 0
    empty_initial: 0
# ================ Conditioners ===================== #
# NOTE(review): indentation was lost in this file; the nesting below follows the
# audiocraft-style conditioner layout — verify against the upstream config.
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0.15
  inference_coef: 1.5
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0
use_text_training: true # lowercase boolean for consistency with the rest of the file (same value)
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ] # this order is the SAME as the input concatenation order
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      # 10*25+2 = 252 (original comment said "+2+1", which did not match the expression)
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2}
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      add_token_list: ${load_yaml:conf/vocab.yaml} # extra vocab loaded from a sibling config
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50