# ================ Logging ====================== #
root_dir: exp/song/${get_fname:}

# ================ Checkpoints ================== #
use_pretrained: deepspeed  # ['ddp', 'continue', 'deepspeed']
pretrained:
  ddp_checkpoint:
  deepspeed_checkpoint: ./ckpt/60000_alnew.pt
  continue_checkpoint:
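# With use_pretrained: deepspeed, only deepspeed_checkpoint above is read (an assumption
# from the option list: 'ddp' / 'continue' would use the matching fields instead).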

# ================ Data & loader ================== #
prompt_select: random
train_jsonl_list:
  - .jsonl
val_jsonl_list:
  - .jsonl
train_scp_list:
  - .scp
val_scp_list:
  - .scp
lyric_processor:
max_dur: 150
min_dur: 30
batch_size: 2
prompt_len: 10
pad_to_max: true
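# Note: max_dur / min_dur are read as clip-duration bounds in seconds (an assumption,
# consistent with prompt_len: 10 being the 10 s audio-prompt length above): samples run
# 30-150 s and are padded to the maximum length when pad_to_max is true.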

# ================ Training ======================= #
accelerator: gpu
devices: 8
num_nodes: 4
val_check_interval: 2500
accumulate_grad_batches: 1
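# Worked example (standard PyTorch Lightning semantics, not stated in this file): effective
# global batch size = batch_size * devices * num_nodes * accumulate_grad_batches
# = 2 * 8 * 4 * 1 = 64 sequences per optimizer step across 32 GPUs.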
strategy: 'deepspeed_stage_2'  # ['ddp', 'fsdp', 'deepspeed_stage_2', 'ddp_find_unused_parameters_true']
precision: 'bf16-mixed'  # ['16-mixed', 'bf16-mixed']
optim:
  optimizer: adamw
  updates_per_epoch: 1000
  epochs: 100
  old_lr: 0  # 1e-4
  new_lr: 1e-4
  max_norm: 0.5
  adam:
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.00001  # 0.1
    eps: 1e-8
schedule:
  lr_scheduler: cosine
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0
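# Schedule sketch, assuming AudioCraft-style cosine-with-warmup semantics (the key names
# warmup / lr_min_ratio / cycle_length match that scheduler; unverified here):
#   step <  warmup: lr = new_lr * step / warmup                       (linear ramp to 1e-4)
#   step >= warmup: lr = new_lr * (lr_min_ratio + 0.5 * (1 - lr_min_ratio)
#                                  * (1 + cos(pi * cycle_length * progress)))
# with progress going 0 -> 1 over the remaining updates_per_epoch * epochs steps, so the
# rate decays to 0 (lr_min_ratio: 0.0).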

# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000
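# Token-budget sketch from the values above: at 25 Hz a 150 s clip is 25 * 150 = 3750
# frames; the mixed tokenizer emits 1 code per frame (code_depth: 1), while the _sep
# tokenizer (assumed to be the separated vocal/accompaniment stream, per model_septoken)
# emits 2 codes per frame.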

# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt

# ================== LM =========================== #
lm:
  lm_type: Llama  # [Llama]
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  code_depth: 3
  code_size: 16384
  dropout: 0.0
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
codebooks_pattern:
  modeling: delay
  delay:
    delays: [ 0, 250, 250 ]
    flatten_first: 0
    empty_initial: 0
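# Delay-pattern sketch (MusicGen-style interleaving, assumed from modeling: delay):
# codebook 0 stays at its native frame t while codebooks 1 and 2 are shifted 250 frames
# (10 s at 25 Hz) later, so at step t the LM predicts [q0[t], q1[t-250], q2[t-250]],
# with the first 250 positions of q1/q2 filled by a special empty token.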

# ================ Conditioners ===================== #
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0.15
  inference_coef: 1.5
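# At inference this is typically applied as (standard CFG formulation, an assumption here):
#   logits = logits_uncond + inference_coef * (logits_cond - logits_uncond), coef = 1.5;
# the 15% joint dropout during training is what trains the unconditional branch.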
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0
use_text_training: true
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ]  # this order is the same as the input concatenation order
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 3
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2}  # 10*25+2 = 252
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 50
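# Note: ${get_fname:}, ${eval:...} and ${load_yaml:...} are custom OmegaConf resolvers
# (assumed to be registered by the training entrypoint before this file is parsed), e.g.
# ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} -> 10 * 25 + 2 = 252.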