# ================ Logging ====================== #
root_dir: exp/song/${get_fname:}
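# NOTE: ${get_fname:} is presumably a custom OmegaConf resolver that expands to
# this config file's name, so each run writes its logs under exp/song/<config_name>.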

# ================ Checkpoints ================== #
use_pretrained: deepspeed # ['ddp', 'continue', 'deepspeed']
pretrained:
  ddp_checkpoint:
  deepspeed_checkpoint: ./ckpt/60000_alnew.pt
  continue_checkpoint: 
  
# ================ Data & loader ================== #
prompt_select: random
train_jsonl_list:
- .jsonl
val_jsonl_list:
- .jsonl
train_scp_list:
- .scp
val_scp_list:
- .scp

lyric_processor:
max_dur: 150
min_dur: 30
batch_size: 2
prompt_len: 10
pad_to_max: true
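# NOTE: max_dur / min_dur / prompt_len are presumably durations in seconds; these
# keys sit at the top level (rather than nested under lyric_processor) so that
# ${prompt_len} can be interpolated directly in the conditioners section below.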

# ================ Training ======================= #
accelerator: gpu
devices: 8
num_nodes: 4
val_check_interval: 2500
accumulate_grad_batches: 1
strategy: 'deepspeed_stage_2' # ['ddp', 'fsdp', 'deepspeed_stage_2', 'ddp_find_unused_parameters_true']
precision: 'bf16-mixed' # ['16-mixed', 'bf16-mixed']
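# NOTE (rough arithmetic): devices * num_nodes = 8 * 4 = 32 processes; assuming the
# loader uses batch_size: 2 per process, the effective batch is
# 2 * 32 * accumulate_grad_batches = 64 sequences per optimizer step.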

optim:
  optimizer: adamw
  updates_per_epoch: 1000
  epochs: 100
  old_lr: 0 # 1e-4
  new_lr: 1e-4
  max_norm: 0.5
  adam:
    betas:
    - 0.9
    - 0.95
    weight_decay: 0.00001 # 0.1
    eps: 1e-8
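# NOTE: old_lr / new_lr presumably configure separate learning rates for pretrained
# and newly initialized parameter groups; with old_lr: 0 the pretrained weights
# would effectively stay frozen while new modules train at new_lr = 1e-4.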

schedule:
  lr_scheduler: cosine
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0
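# NOTE: a standard cosine schedule with these settings warms the LR up over the
# first 4000 updates, then decays it along a cosine curve towards
# lr_min_ratio * lr (here 0) over cycle_length * total training steps.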

# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000

audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000
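# NOTE: at 25 frames/s, one second of 48 kHz audio maps to 25 code frames, so a
# 10 s prompt is about 250 frames (cf. max_len in the prompt_audio conditioner
# below); the *_sep tokenizer emits two codebooks per frame (code_depth 2).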

# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt

# ================== LM =========================== #
lm:
  lm_type: Llama # [Llama]
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  code_depth: 3
  code_size: 16384
  dropout: 0.0
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1

codebooks_pattern:
  modeling: delay
  delay:
    delays: [ 0, 250, 250 ]
    flatten_first: 0
    empty_initial: 0
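# NOTE: in a delay pattern, codebook k is shifted by delays[k] frames relative to
# the others; with [0, 250, 250] the 2nd and 3rd codebooks lag the 1st by
# 250 frames (10 s at 25 fps), which matches the prompt length configured above.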

# ================ Conditioners ===================== #
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0.15
  inference_coef: 1.5
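# NOTE: classifier-free guidance typically drops all conditions jointly with
# probability training_dropout during training and, at inference, combines the
# logits as uncond + inference_coef * (cond - uncond).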

attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0

use_text_training: true
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ] # this order is the SAME as the input concatenation order

conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} # 10*25+2 = 252
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50