# File size: 3,146 Bytes
# 8fac3b9
# (web-viewer line-number gutter, 1-145, removed)
# Audio input settings.
audio:
  # Samples per chunk: samplerate * segment.
  chunk_size: 132300
  min_mean_abs: 0.001
  hop_length: 1024
# Training-run settings.
training:
  batch_size: 5
  gradient_accumulation_steps: 3
  grad_clip: 0
  segment: 3
  shift: 1
  samplerate: 44100
  channels: 2
  normalize: true
  instruments:
    - similarity
    - difference
  target_instrument: null
  num_epochs: 1000
  num_steps: 1000
  optimizer: prodigy
  lr: 1.0
  patience: 80
  reduce_factor: 0.95
  q: 0.95
  coarse_loss_clip: true
  ema_momentum: 0.999
  # Needed when checking on the multisong dataset whether "other" is
  # actually the instrumental.
  other_fix: true
  # Toggles mixed precision (float16); usually this should be true.
  use_amp: false
# Data augmentation settings.
augmentations:
  # Master switch: quickly turns every augmentation on or off.
  enable: false
  # Randomly change the loudness of each stem within the range
  # (loudness_min; loudness_max).
  loudness: true
  loudness_min: 0.5
  loudness_max: 1.5
  # Mix several stems of the same type with some probability
  # (only works for dataset types: 1, 2, 3).
  mixup: true
  mixup_probs:
    - 0.2
    - 0.02
  mixup_loudness_min: 0.5
  mixup_loudness_max: 1.5
# Inference-time settings.
inference:
  num_overlap: 4
  batch_size: 18
# Settings for the multi-STFT loss.
loss_multistft:
  fft_sizes: [2048, 4096]
  hop_sizes: [1024, 2048]
  win_lengths: [2048, 4096]
  window: 'hann_window'
  scale: 'mel'
  n_bins: 128
  sample_rate: 44100
  perceptual_weighting: true
  # Per-term loss weights.
  w_sc: 1.0
  w_log_mag: 1.0
  w_lin_mag: 0.0
  w_phs: 0.0
  mag_distance: 'L1'
# Model type. NOTE(review): presumably selects the same-named `htdemucs`
# section of this file — confirm against the config loader.
model: htdemucs
# HTDemucs model hyper-parameters;
# see demucs/htdemucs.py for a detailed description.
htdemucs:
  # Channels
  channels: 48
  # Explicit null instead of a bare empty value.
  channels_time: null
  growth: 2
  # STFT
  num_subbands: 1
  nfft: 4096
  wiener_iters: 0
  end_iters: 0
  wiener_residual: false
  cac: true
  # Main structure
  depth: 4
  rewrite: true
  # Frequency branch
  multi_freqs: []
  multi_freqs_depth: 3
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # Normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_mode: 3
  dconv_depth: 2
  dconv_comp: 8
  # Written with a decimal point so that YAML 1.1 loaders (e.g. PyYAML)
  # resolve it as a float instead of the string "1e-3".
  dconv_init: 1.0e-3
  # Before the Transformer
  bottom_channels: 512
  # CrossTransformer
  # ------ Common to all
  # Regular parameters
  t_layers: 5
  t_hidden_scale: 4.0
  t_heads: 8
  t_dropout: 0.0
  t_layer_scale: true
  t_gelu: true
  # ------------- Positional embedding
  t_emb: sin
  # For the scaled embedding.
  t_max_positions: 10000
  t_max_period: 10000.0
  t_weight_pos_embed: 1.0
  t_cape_mean_normalize: true
  t_cape_augment: true
  t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
  t_sin_random_shift: 0
  # ------------- Norm before a transformer encoder
  t_norm_in: true
  t_norm_in_group: false
  # ------------- Norm inside the encoder
  t_group_norm: false
  t_norm_first: true
  t_norm_out: true
  # ------------- Optim
  t_weight_decay: 0.0
  # Explicit null instead of a bare empty value.
  t_lr: null
  # ------------- Sparsity
  t_sparse_self_attn: false
  t_sparse_cross_attn: false
  t_mask_type: diag
  t_mask_random_seed: 42
  t_sparse_attn_window: 400
  t_global_window: 100
  t_sparsity: 0.95
  t_auto_sparsity: false
  # Cross Encoder First (False)
  t_cross_first: false
  # Weight init
  rescale: 0.1
|