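# Hydra configuration for the `compression` solver.
# NOTE(review): comment lines appear to have been stripped from this file. If it
# originally began with a Hydra `# @package ...` directive, restore it on line 1.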
# Hydra defaults list: the configs composed into this one, in merge order.
defaults:
  - ../default                     # config named `default` in the parent config group
  - override /dset: audio/default  # replaces the option already selected for group `dset`
  - _self_                         # this file's own keys merge last, so they take precedence

solver: compression
# `???` is OmegaConf's mandatory-value marker: reading these keys raises an
# error unless another config or a command-line override supplies a value.
sample_rate: ???
channels: ???

# One numeric value per loss name. Presumably the relative weight of each
# term, with 0. disabling it; confirm against the solver code.
losses:
  adv: 4.
  feat: 4.
  l1: 0.1
  mel: 0.
  msspec: 2.
  sisnr: 0.
# NOTE(review): restored as a top-level section (sibling of `losses`). The
# flattened source did not preserve nesting; confirm against the consumer.
balancer:
  balance_grads: true
  ema_decay: 0.999
  per_batch_item: true
  total_norm: 1.

# Adversarial training settings.
adversarial:
  every: 1
  adversaries: [msstftd]  # presumably refers to the `msstftd` section of this config
  adv_loss: hinge
  feat_loss: l1

# Empty mappings: this file sets no parameters for these two entries.
l1: {}
l2: {}
# Parameters for the `mrstft` entry; both factors are set to the same value.
mrstft:
  factor_sc: .5
  factor_mag: .5
  normalized: false
# Parameters for the `mel` entry.
mel:
  sample_rate: ${sample_rate}  # interpolation: resolves to the top-level `sample_rate`
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: false
  floor_level: 1e-5
# Parameters for the `sisnr` entry.
sisnr:
  sample_rate: ${sample_rate}  # interpolation: resolves to the top-level `sample_rate`
  segment: 5.
# Parameters for the `msspec` entry.
msspec:
  sample_rate: ${sample_rate}  # interpolation: resolves to the top-level `sample_rate`
  range_start: 6
  range_end: 11
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: true
  alphas: false
  floor_level: 1e-5

# Metric parameters, keyed by metric name.
metrics:
  visqol:
    mode: audio
    bin: null  # no value set here; presumably a path that must be supplied to use this metric (confirm)
    model: tcdaudio14_aacvopus_coresv_svrnsim_n.68_g.01_c1.model

# Parameters for the `msstftd` entry.
msstftd:
  in_channels: 1
  out_channels: 1
  filters: 32
  norm: weight_norm
  # The three lists below have five entries each; every hop length is a quarter
  # of the n_fft at the same index and every window length equals that n_fft.
  # Presumably the entries are matched by position (confirm).
  n_ffts: [1024, 2048, 512, 256, 128]
  hop_lengths: [256, 512, 128, 64, 32]
  win_lengths: [1024, 2048, 512, 256, 128]
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
# Parameters for the `msd` entry.
msd:
  in_channels: 1
  out_channels: 1
  scale_norms: [spectral_norm, weight_norm, weight_norm]  # three entries
  kernel_sizes: [5, 3]
  filters: 16
  max_filters: 1024
  downsample_scales: [4, 4, 4, 4]  # four entries, same length as `groups`
  inner_kernel_sizes: null
  groups: [4, 4, 4, 4]
  strides: null
  paddings: null
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
# Parameters for the `mpd` entry.
mpd:
  in_channels: 1
  out_channels: 1
  periods: [2, 3, 5, 7, 11]
  n_layers: 5
  kernel_size: 5
  stride: 3
  filters: 8
  filter_scales: 4
  max_filters: 1024
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
  norm: weight_norm

# Data loading settings.
# `train`, `valid`, `evaluate` and `generate` are nested under `dataset`:
# `evaluate` and `generate` also exist as top-level sections in this file, so
# they cannot be siblings of those. Presumably the nested values override the
# shared ones for that split (confirm).
dataset:
  batch_size: 64
  num_workers: 10
  segment_duration: 1
  train:
    num_samples: 500000
  valid:
    num_samples: 10000
  evaluate:
    batch_size: 32
    num_samples: 10000
  generate:
    batch_size: 32
    num_samples: 50
    segment_duration: 10

# Evaluation stage settings.
evaluate:
  every: 25
  num_workers: 5
  # Nested under `evaluate`: a separate top-level `metrics` section already
  # exists in this file, so this mapping cannot sit at the top level.
  metrics:
    visqol: false
    sisnr: true
# Generation stage settings.
# NOTE(review): restored as a top-level sibling of `evaluate`; confirm.
generate:
  every: 25
  num_workers: 5
  audio:
    sample_rate: ${sample_rate}  # interpolation: resolves to the top-level `sample_rate`

# Checkpointing settings.
checkpoint:
  save_last: true
  save_every: 25
  keep_last: 10
  keep_every_states: null

# Optimization settings.
optim:
  epochs: 200
  updates_per_epoch: 2000
  lr: 3e-4
  max_norm: 0.
  optimizer: adam  # matches the name of the `adam` sub-section below
  adam:
    betas: [0.5, 0.9]
    weight_decay: 0.
  # NOTE(review): `ema` is restored as a child of `optim`; the flattened source
  # did not preserve nesting, so confirm against the code that reads it.
  ema:
    use: true
    updates: 1
    device: ${device}  # interpolation; `device` is not defined in this file, so it must come from a composed config
    decay: 0.99