soumi-maiti committed on
Commit
164c86c
1 Parent(s): 169cf50

Update model

Browse files
README.md ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - diarization
6
+ language: noinfo
7
+ datasets:
8
+ - librimix
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 DIAR model
13
+
14
+ ### `soumi-maiti/libri23mix_eend_ss`
15
+
16
+ This model was trained by soumimaiti using the librimix recipe in [ESPnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout d837c97c88f13ffe655a30bcff93d814f212b225
26
+ pip install -e .
27
+ cd egs2/librimix/enh_diar23
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model soumi-maiti/libri23mix_eend_ss
29
+ ```
30
+
31
+
32
+
33
+ ## DIAR config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats_adapt.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: chunk
43
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_adapt
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: null
51
+ dist_rank: null
52
+ local_rank: 0
53
+ dist_master_addr: null
54
+ dist_master_port: null
55
+ dist_launcher: null
56
+ multiprocessing_distributed: false
57
+ unused_parameters: false
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 50
65
+ patience: 4
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss_enh
76
+ - min
77
+ keep_nbest_models: 1
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 5.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 16
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ use_wandb: false
91
+ wandb_project: null
92
+ wandb_id: null
93
+ wandb_entity: null
94
+ wandb_name: null
95
+ wandb_model_log_interval: -1
96
+ detect_anomaly: false
97
+ pretrain_path: null
98
+ init_param:
99
+ - ../enh_diar1/exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/valid.loss_enh.best.pth
100
+ ignore_init_mismatch: false
101
+ freeze_param: []
102
+ num_iters_per_epoch: null
103
+ batch_size: 1
104
+ valid_batch_size: null
105
+ batch_bins: 1000000
106
+ valid_batch_bins: null
107
+ train_shape_file:
108
+ - exp/diar_enh_stats_8k/train/speech_shape
109
+ - exp/diar_enh_stats_8k/train/text_shape
110
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
111
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
112
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
113
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
114
+ valid_shape_file:
115
+ - exp/diar_enh_stats_8k/valid/speech_shape
116
+ - exp/diar_enh_stats_8k/valid/text_shape
117
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
118
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
119
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
120
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
121
+ batch_type: folded
122
+ valid_batch_type: null
123
+ fold_length:
124
+ - 800
125
+ - 80000
126
+ - 80000
127
+ - 80000
128
+ - 80000
129
+ - 80000
130
+ sort_in_batch: descending
131
+ sort_batch: descending
132
+ multiple_iterator: false
133
+ chunk_length: 24000
134
+ chunk_shift_ratio: 0.5
135
+ num_cache_chunks: 1024
136
+ train_data_path_and_name_and_type:
137
+ - - dump/raw/train/wav.scp
138
+ - speech
139
+ - sound
140
+ - - dump/raw/train/espnet_rttm
141
+ - text
142
+ - rttm
143
+ - - dump/raw/train/spk1.scp
144
+ - speech_ref1
145
+ - sound
146
+ - - dump/raw/train/spk2.scp
147
+ - speech_ref2
148
+ - sound
149
+ - - dump/raw/train/spk3.scp
150
+ - speech_ref3
151
+ - sound
152
+ - - dump/raw/train/noise1.scp
153
+ - noise_ref1
154
+ - sound
155
+ valid_data_path_and_name_and_type:
156
+ - - dump/raw/dev/wav.scp
157
+ - speech
158
+ - sound
159
+ - - dump/raw/dev/espnet_rttm
160
+ - text
161
+ - rttm
162
+ - - dump/raw/dev/spk1.scp
163
+ - speech_ref1
164
+ - sound
165
+ - - dump/raw/dev/spk2.scp
166
+ - speech_ref2
167
+ - sound
168
+ - - dump/raw/dev/spk3.scp
169
+ - speech_ref3
170
+ - sound
171
+ - - dump/raw/dev/noise1.scp
172
+ - noise_ref1
173
+ - sound
174
+ allow_variable_data_keys: false
175
+ max_cache_size: 0.0
176
+ max_cache_fd: 32
177
+ valid_max_cache_size: null
178
+ optim: adam
179
+ optim_conf:
180
+ lr: 0.001
181
+ eps: 1.0e-07
182
+ weight_decay: 0
183
+ scheduler: reducelronplateau
184
+ scheduler_conf:
185
+ mode: min
186
+ factor: 0.5
187
+ patience: 1
188
+ token_list: null
189
+ src_token_list: null
190
+ init: xavier_uniform
191
+ input_size: null
192
+ ctc_conf:
193
+ dropout_rate: 0.0
194
+ ctc_type: builtin
195
+ reduce: true
196
+ ignore_nan_grad: null
197
+ zero_infinity: true
198
+ enh_criterions:
199
+ - name: si_snr
200
+ conf:
201
+ eps: 1.0e-07
202
+ wrapper: pit
203
+ wrapper_conf:
204
+ weight: 1.0
205
+ independent_perm: true
206
+ flexible_numspk: true
207
+ diar_num_spk: 3
208
+ diar_input_size: 128
209
+ enh_model_conf:
210
+ loss_type: si_snr
211
+ asr_model_conf:
212
+ ctc_weight: 0.5
213
+ interctc_weight: 0.0
214
+ ignore_id: -1
215
+ lsm_weight: 0.0
216
+ length_normalized_loss: false
217
+ report_cer: true
218
+ report_wer: true
219
+ sym_space: <space>
220
+ sym_blank: <blank>
221
+ extract_feats_in_collect_stats: true
222
+ st_model_conf:
223
+ stft_consistency: false
224
+ loss_type: mask_mse
225
+ mask_type: null
226
+ diar_model_conf:
227
+ diar_weight: 0.2
228
+ attractor_weight: 0.2
229
+ subtask_series:
230
+ - enh
231
+ - diar
232
+ model_conf:
233
+ calc_enh_loss: true
234
+ bypass_enh_prob: 0
235
+ use_preprocessor: true
236
+ token_type: bpe
237
+ bpemodel: null
238
+ src_token_type: bpe
239
+ src_bpemodel: null
240
+ non_linguistic_symbols: null
241
+ cleaner: null
242
+ g2p: null
243
+ enh_encoder: conv
244
+ enh_encoder_conf:
245
+ channel: 512
246
+ kernel_size: 16
247
+ stride: 8
248
+ enh_separator: tcn_nomask
249
+ enh_separator_conf:
250
+ layer: 8
251
+ stack: 3
252
+ bottleneck_dim: 128
253
+ hidden_dim: 512
254
+ kernel: 3
255
+ causal: false
256
+ norm_type: gLN
257
+ enh_decoder: conv
258
+ enh_decoder_conf:
259
+ channel: 512
260
+ kernel_size: 16
261
+ stride: 8
262
+ enh_mask_module: multi_mask
263
+ enh_mask_module_conf:
264
+ max_num_spk: 3
265
+ mask_nonlinear: relu
266
+ bottleneck_dim: 128
267
+ frontend: default
268
+ frontend_conf: {}
269
+ specaug: null
270
+ specaug_conf: {}
271
+ normalize: utterance_mvn
272
+ normalize_conf: {}
273
+ asr_preencoder: null
274
+ asr_preencoder_conf: {}
275
+ asr_encoder: rnn
276
+ asr_encoder_conf: {}
277
+ asr_postencoder: null
278
+ asr_postencoder_conf: {}
279
+ asr_decoder: rnn
280
+ asr_decoder_conf: {}
281
+ st_preencoder: null
282
+ st_preencoder_conf: {}
283
+ st_encoder: rnn
284
+ st_encoder_conf: {}
285
+ st_postencoder: null
286
+ st_postencoder_conf: {}
287
+ st_decoder: rnn
288
+ st_decoder_conf: {}
289
+ st_extra_asr_decoder: rnn
290
+ st_extra_asr_decoder_conf: {}
291
+ st_extra_mt_decoder: rnn
292
+ st_extra_mt_decoder_conf: {}
293
+ diar_frontend: default
294
+ diar_frontend_conf:
295
+ hop_length: 64
296
+ fs: 8000
297
+ diar_specaug: null
298
+ diar_specaug_conf: {}
299
+ diar_normalize: utterance_mvn
300
+ diar_normalize_conf: {}
301
+ diar_encoder: transformer
302
+ diar_encoder_conf:
303
+ input_layer: conv2d8
304
+ num_blocks: 4
305
+ linear_units: 512
306
+ dropout_rate: 0.1
307
+ output_size: 256
308
+ attention_heads: 4
309
+ attention_dropout_rate: 0.1
310
+ diar_decoder: linear
311
+ diar_decoder_conf: {}
312
+ label_aggregator: label_aggregator
313
+ label_aggregator_conf:
314
+ win_length: 256
315
+ hop_length: 64
316
+ diar_attractor: rnn
317
+ diar_attractor_conf:
318
+ unit: 256
319
+ layer: 1
320
+ dropout: 0.0
321
+ attractor_grad: true
322
+ required:
323
+ - output_dir
324
+ version: '202205'
325
+ distributed: false
326
+ ```
327
+
328
+ </details>
329
+
330
+
331
+
332
+ ### Citing ESPnet
333
+
334
+ ```bibtex
335
+ @inproceedings{watanabe2018espnet,
336
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
337
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
338
+ year={2018},
339
+ booktitle={Proceedings of Interspeech},
340
+ pages={2207--2211},
341
+ doi={10.21437/Interspeech.2018-1456},
342
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
343
+ }
344
+
345
+
346
+
347
+
348
+ ```
349
+
350
+ or arXiv:
351
+
352
+ ```bibtex
353
+ @misc{watanabe2018espnet,
354
+ title={ESPnet: End-to-End Speech Processing Toolkit},
355
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
356
+ year={2018},
357
+ eprint={1804.00015},
358
+ archivePrefix={arXiv},
359
+ primaryClass={cs.CL}
360
+ }
361
+ ```
exp_bk/diar_enh_stats_8k/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9cec0a1b3324f2eaa6cbad59122b5d908db5cad3ac0b1f58a9c1c4f863ab554
3
+ size 778
exp_bk/diar_enh_train_diar_enh_convtasnet_concat_feats_adapt/16epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d07c01b1e6ecde2bfd9dce6d0646972821933bd3f293ccebd9d88bb64f49dc1b
3
+ size 38983318
exp_bk/diar_enh_train_diar_enh_convtasnet_concat_feats_adapt/DIAR_RESULTS.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Requirement already satisfied: humanfriendly in /ocean/projects/cis210027p/smaiti/espnet_diar_enh/tools/venv/lib/python3.9/site-packages (10.0)
2
+
3
+ [notice] A new release of pip available: 22.1.2 -> 23.1.1
4
+ [notice] To update, run: pip install --upgrade pip
5
+ <!-- Generated by scripts/utils/show_diar_result.sh -->
6
+ # RESULTS
7
+ ## Environments
8
+ - date: `Wed Apr 26 13:02:10 EDT 2023`
9
+ - python version: `3.9.16 | packaged by conda-forge | (main, Feb 1 2023, 21:39:03) [GCC 11.3.0]`
10
+ - espnet version: `espnet 202205`
11
+ - pytorch version: `pytorch 1.8.1+cu102`
12
+ - Git hash: `d837c97c88f13ffe655a30bcff93d814f212b225`
13
+ - Commit date: `Wed Jun 29 12:04:57 2022 -0700`
14
+
15
+ ## diar_enh_train_diar_enh_convtasnet_concat_feats_adapt
16
+ ### DER
17
+ diarized_enhanced_test_decode_diar_enh_adapt
18
+ |threshold_median_collar|DER|
19
+ |---|---|
20
+ |result_th0.3_med11_collar0.0|6.50|
21
+ |result_th0.3_med1_collar0.0|6.52|
22
+ |result_th0.4_med11_collar0.0|6.20|
23
+ |result_th0.4_med1_collar0.0|6.24|
24
+ |result_th0.5_med11_collar0.0|6.04|
25
+ |result_th0.5_med1_collar0.0|6.10|
26
+ |result_th0.6_med11_collar0.0|6.03|
27
+ |result_th0.6_med1_collar0.0|6.12|
28
+ |result_th0.7_med11_collar0.0|6.25|
29
+ |result_th0.7_med1_collar0.0|6.36|
exp_bk/diar_enh_train_diar_enh_convtasnet_concat_feats_adapt/ENH_RESULTS.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Requirement already satisfied: humanfriendly in /ocean/projects/cis210027p/smaiti/espnet_diar_enh/tools/venv/lib/python3.9/site-packages (10.0)
2
+
3
+ [notice] A new release of pip available: 22.1.2 -> 23.1.2
4
+ [notice] To update, run: pip install --upgrade pip
5
+ <!-- Generated by scripts/utils/show_enh_score.sh -->
6
+ # RESULTS
7
+ ## Environments
8
+ - date: `Thu May 4 02:08:47 EDT 2023`
9
+ - python version: `3.9.16 | packaged by conda-forge | (main, Feb 1 2023, 21:39:03) [GCC 11.3.0]`
10
+ - espnet version: `espnet 202205`
11
+ - pytorch version: `pytorch 1.8.1+cu102`
12
+ - Git hash: `d837c97c88f13ffe655a30bcff93d814f212b225`
13
+ - Commit date: `Wed Jun 29 12:04:57 2022 -0700`
14
+
15
+
16
+ ## diar_enh_train_diar_enh_convtasnet_concat_feats_adapt
17
+
18
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats_adapt.yaml
19
+
20
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|
21
+ |---|---|---|---|---|---|
22
+ |diarized_enhanced_test_decode_diar_enh_adapt|77.3102|8.3970|6.9224|15.6728|5.3881|
23
+
exp_bk/diar_enh_train_diar_enh_convtasnet_concat_feats_adapt/config.yaml ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats_adapt.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_adapt
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 50
28
+ patience: 4
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss_enh
39
+ - min
40
+ keep_nbest_models: 1
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 16
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param:
62
+ - ../enh_diar1/exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/valid.loss_enh.best.pth
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 1
67
+ valid_batch_size: null
68
+ batch_bins: 1000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/diar_enh_stats_8k/train/speech_shape
72
+ - exp/diar_enh_stats_8k/train/text_shape
73
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
74
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
75
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
76
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
77
+ valid_shape_file:
78
+ - exp/diar_enh_stats_8k/valid/speech_shape
79
+ - exp/diar_enh_stats_8k/valid/text_shape
80
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
81
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
82
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
83
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
84
+ batch_type: folded
85
+ valid_batch_type: null
86
+ fold_length:
87
+ - 800
88
+ - 80000
89
+ - 80000
90
+ - 80000
91
+ - 80000
92
+ - 80000
93
+ sort_in_batch: descending
94
+ sort_batch: descending
95
+ multiple_iterator: false
96
+ chunk_length: 24000
97
+ chunk_shift_ratio: 0.5
98
+ num_cache_chunks: 1024
99
+ train_data_path_and_name_and_type:
100
+ - - dump/raw/train/wav.scp
101
+ - speech
102
+ - sound
103
+ - - dump/raw/train/espnet_rttm
104
+ - text
105
+ - rttm
106
+ - - dump/raw/train/spk1.scp
107
+ - speech_ref1
108
+ - sound
109
+ - - dump/raw/train/spk2.scp
110
+ - speech_ref2
111
+ - sound
112
+ - - dump/raw/train/spk3.scp
113
+ - speech_ref3
114
+ - sound
115
+ - - dump/raw/train/noise1.scp
116
+ - noise_ref1
117
+ - sound
118
+ valid_data_path_and_name_and_type:
119
+ - - dump/raw/dev/wav.scp
120
+ - speech
121
+ - sound
122
+ - - dump/raw/dev/espnet_rttm
123
+ - text
124
+ - rttm
125
+ - - dump/raw/dev/spk1.scp
126
+ - speech_ref1
127
+ - sound
128
+ - - dump/raw/dev/spk2.scp
129
+ - speech_ref2
130
+ - sound
131
+ - - dump/raw/dev/spk3.scp
132
+ - speech_ref3
133
+ - sound
134
+ - - dump/raw/dev/noise1.scp
135
+ - noise_ref1
136
+ - sound
137
+ allow_variable_data_keys: false
138
+ max_cache_size: 0.0
139
+ max_cache_fd: 32
140
+ valid_max_cache_size: null
141
+ optim: adam
142
+ optim_conf:
143
+ lr: 0.001
144
+ eps: 1.0e-07
145
+ weight_decay: 0
146
+ scheduler: reducelronplateau
147
+ scheduler_conf:
148
+ mode: min
149
+ factor: 0.5
150
+ patience: 1
151
+ token_list: null
152
+ src_token_list: null
153
+ init: xavier_uniform
154
+ input_size: null
155
+ ctc_conf:
156
+ dropout_rate: 0.0
157
+ ctc_type: builtin
158
+ reduce: true
159
+ ignore_nan_grad: null
160
+ zero_infinity: true
161
+ enh_criterions:
162
+ - name: si_snr
163
+ conf:
164
+ eps: 1.0e-07
165
+ wrapper: pit
166
+ wrapper_conf:
167
+ weight: 1.0
168
+ independent_perm: true
169
+ flexible_numspk: true
170
+ diar_num_spk: 3
171
+ diar_input_size: 128
172
+ enh_model_conf:
173
+ loss_type: si_snr
174
+ asr_model_conf:
175
+ ctc_weight: 0.5
176
+ interctc_weight: 0.0
177
+ ignore_id: -1
178
+ lsm_weight: 0.0
179
+ length_normalized_loss: false
180
+ report_cer: true
181
+ report_wer: true
182
+ sym_space: <space>
183
+ sym_blank: <blank>
184
+ extract_feats_in_collect_stats: true
185
+ st_model_conf:
186
+ stft_consistency: false
187
+ loss_type: mask_mse
188
+ mask_type: null
189
+ diar_model_conf:
190
+ diar_weight: 0.2
191
+ attractor_weight: 0.2
192
+ subtask_series:
193
+ - enh
194
+ - diar
195
+ model_conf:
196
+ calc_enh_loss: true
197
+ bypass_enh_prob: 0
198
+ use_preprocessor: true
199
+ token_type: bpe
200
+ bpemodel: null
201
+ src_token_type: bpe
202
+ src_bpemodel: null
203
+ non_linguistic_symbols: null
204
+ cleaner: null
205
+ g2p: null
206
+ enh_encoder: conv
207
+ enh_encoder_conf:
208
+ channel: 512
209
+ kernel_size: 16
210
+ stride: 8
211
+ enh_separator: tcn_nomask
212
+ enh_separator_conf:
213
+ layer: 8
214
+ stack: 3
215
+ bottleneck_dim: 128
216
+ hidden_dim: 512
217
+ kernel: 3
218
+ causal: false
219
+ norm_type: gLN
220
+ enh_decoder: conv
221
+ enh_decoder_conf:
222
+ channel: 512
223
+ kernel_size: 16
224
+ stride: 8
225
+ enh_mask_module: multi_mask
226
+ enh_mask_module_conf:
227
+ max_num_spk: 3
228
+ mask_nonlinear: relu
229
+ bottleneck_dim: 128
230
+ frontend: default
231
+ frontend_conf: {}
232
+ specaug: null
233
+ specaug_conf: {}
234
+ normalize: utterance_mvn
235
+ normalize_conf: {}
236
+ asr_preencoder: null
237
+ asr_preencoder_conf: {}
238
+ asr_encoder: rnn
239
+ asr_encoder_conf: {}
240
+ asr_postencoder: null
241
+ asr_postencoder_conf: {}
242
+ asr_decoder: rnn
243
+ asr_decoder_conf: {}
244
+ st_preencoder: null
245
+ st_preencoder_conf: {}
246
+ st_encoder: rnn
247
+ st_encoder_conf: {}
248
+ st_postencoder: null
249
+ st_postencoder_conf: {}
250
+ st_decoder: rnn
251
+ st_decoder_conf: {}
252
+ st_extra_asr_decoder: rnn
253
+ st_extra_asr_decoder_conf: {}
254
+ st_extra_mt_decoder: rnn
255
+ st_extra_mt_decoder_conf: {}
256
+ diar_frontend: default
257
+ diar_frontend_conf:
258
+ hop_length: 64
259
+ fs: 8000
260
+ diar_specaug: null
261
+ diar_specaug_conf: {}
262
+ diar_normalize: utterance_mvn
263
+ diar_normalize_conf: {}
264
+ diar_encoder: transformer
265
+ diar_encoder_conf:
266
+ input_layer: conv2d8
267
+ num_blocks: 4
268
+ linear_units: 512
269
+ dropout_rate: 0.1
270
+ output_size: 256
271
+ attention_heads: 4
272
+ attention_dropout_rate: 0.1
273
+ diar_decoder: linear
274
+ diar_decoder_conf: {}
275
+ label_aggregator: label_aggregator
276
+ label_aggregator_conf:
277
+ win_length: 256
278
+ hop_length: 64
279
+ diar_attractor: rnn
280
+ diar_attractor_conf:
281
+ unit: 256
282
+ layer: 1
283
+ dropout: 0.0
284
+ attractor_grad: true
285
+ required:
286
+ - output_dir
287
+ version: '202205'
288
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202205'
2
+ files:
3
+ model_file: exp_bk/diar_enh_train_diar_enh_convtasnet_concat_feats_adapt/16epoch.pth
4
+ python: "3.9.16 | packaged by conda-forge | (main, Feb 1 2023, 21:39:03) \n[GCC 11.3.0]"
5
+ timestamp: 1683182053.532115
6
+ torch: 1.8.1+cu102
7
+ yaml_files:
8
+ train_config: exp_bk/diar_enh_train_diar_enh_convtasnet_concat_feats_adapt/config.yaml