File size: 3,146 Bytes
8fac3b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Audio chunking settings.
audio:
  # 132300 = 44100 * 3, i.e. training.samplerate * training.segment below;
  # keep the three values in sync when changing any of them.
  chunk_size: 132300 # samplerate * segment
  # NOTE(review): presumably a loudness floor (mean absolute amplitude) under
  # which a chunk counts as silent — confirm against the data loader.
  min_mean_abs: 0.001
  # NOTE(review): units are presumably samples — confirm where this is read.
  hop_length: 1024

# Training hyperparameters.
training:
  batch_size: 5
  gradient_accumulation_steps: 3
  # NOTE(review): 0 presumably disables gradient clipping — confirm in the
  # training loop.
  grad_clip: 0
  # segment * samplerate (3 * 44100) equals audio.chunk_size (132300).
  segment: 3
  shift: 1
  samplerate: 44100
  channels: 2
  normalize: true
  # NOTE(review): presumably the names of the stems the model outputs — confirm.
  instruments: ['similarity', 'difference']
  # Explicit null: no single target instrument is selected.
  target_instrument: null
  num_epochs: 1000
  num_steps: 1000
  # NOTE(review): lr 1.0 is the value the Prodigy optimizer is normally run
  # with, since it estimates the step size itself — confirm the trainer passes
  # it through unchanged.
  optimizer: prodigy
  lr: 1.0
  # NOTE(review): patience / reduce_factor look like learning-rate scheduler
  # settings — confirm which scheduler consumes them.
  patience: 80
  reduce_factor: 0.95
  q: 0.95
  coarse_loss_clip: true
  ema_momentum: 0.999
  other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
  # NOTE(review): set to false here although the inline note says it usually
  # must be true — confirm this is intentional for this model.
  use_amp: false # enable or disable usage of mixed precision (float16) - usually it must be true

# Data augmentation settings.
# NOTE(review): with `enable: false` the remaining keys in this section are
# presumably ignored — confirm in the dataset code.
augmentations:
  enable: false # enable or disable all augmentations (a quick way to switch them all off)
  loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
  loudness_min: 0.5
  loudness_max: 1.5
  mixup: true # mix several stems of the same type with some probability (only works for dataset types: 1, 2, 3)
  mixup_probs: [0.2, 0.02]
  mixup_loudness_min: 0.5
  mixup_loudness_max: 1.5

# Inference-time settings.
inference:
  # NOTE(review): presumably the overlap factor between consecutive chunks
  # during chunked inference — confirm where this is read.
  num_overlap: 4
  # Set independently of training.batch_size (5).
  batch_size: 18

# Multi-resolution STFT loss settings.
# NOTE(review): the key names look like keyword arguments of a
# multi-resolution STFT loss class — confirm which implementation receives
# this mapping before renaming or removing keys.
loss_multistft:
  # The three lists below have one entry per STFT resolution. In both
  # resolutions listed here, hop = fft / 2 and win_length = fft.
  fft_sizes:
  - 2048
  - 4096
  hop_sizes:
  - 1024
  - 2048
  win_lengths:
  - 2048
  - 4096
  window: "hann_window"
  scale: "mel"
  n_bins: 128
  # Same value as training.samplerate.
  sample_rate: 44100
  perceptual_weighting: true
  # Per-term weights. NOTE(review): the 0.0 weights presumably switch the
  # linear-magnitude and phase terms off — confirm.
  w_sc: 1.0
  w_log_mag: 1.0
  w_lin_mag: 0.0
  w_phs: 0.0
  mag_distance: "L1"

# The value matches the name of the top-level `htdemucs` section below.
# NOTE(review): presumably selects which section holds the model
# hyperparameters — confirm in the config loader.
model: htdemucs

htdemucs:  # see demucs/htdemucs.py for a detailed description
  # Scalars in this section are spelled so that YAML 1.1 and YAML 1.2 loaders
  # agree on their types: lowercase true/false, explicit null instead of an
  # empty value, and floats written with a decimal point.
  # Channels
  channels: 48
  # Explicit null (was an empty value, which also loads as null).
  channels_time: null
  growth: 2
  # STFT
  num_subbands: 1
  nfft: 4096
  wiener_iters: 0
  end_iters: 0
  wiener_residual: false
  cac: true
  # Main structure
  depth: 4
  rewrite: true
  # Frequency Branch
  multi_freqs: []
  multi_freqs_depth: 3
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_mode: 3
  dconv_depth: 2
  dconv_comp: 8
  # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-3` as the string '1e-3', not as the float 0.001.
  dconv_init: 1.0e-3
  # Before the Transformer
  bottom_channels: 512
  # CrossTransformer
  # ------ Common to all
  # Regular parameters
  t_layers: 5
  t_hidden_scale: 4.0
  t_heads: 8
  t_dropout: 0.0
  t_layer_scale: true
  t_gelu: true
  # ------------- Positional Embedding
  t_emb: sin
  t_max_positions: 10000 # for the scaled embedding
  t_max_period: 10000.0
  t_weight_pos_embed: 1.0
  t_cape_mean_normalize: true
  t_cape_augment: true
  t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
  t_sin_random_shift: 0
  # ------------- norm before a transformer encoder
  t_norm_in: true
  t_norm_in_group: false
  # ------------- norm inside the encoder
  t_group_norm: false
  t_norm_first: true
  t_norm_out: true
  # ------------- optim
  t_weight_decay: 0.0
  # Explicit null (was an empty value, which also loads as null).
  t_lr: null
  # ------------- sparsity
  t_sparse_self_attn: false
  t_sparse_cross_attn: false
  t_mask_type: diag
  t_mask_random_seed: 42
  t_sparse_attn_window: 400
  t_global_window: 100
  t_sparsity: 0.95
  t_auto_sparsity: false
  # Cross Encoder First (false)
  t_cross_first: false
  # Weight init
  rescale: 0.1