# Text featurizer: instantiates uetasr.featurizers.text.Subword with the
# keyword arguments below (HyperPyYAML `!new:` tag).
# NOTE(review): the option names mirror SentencePiece trainer flags, so they
# are presumably forwarded to SentencePiece -- confirm against Subword.
text_encoder: !new:uetasr.featurizers.text.Subword
    model_prefix: vocabs/subword_vietnamese_500
    data_path: transcript_v3.1.txt
    character_coverage: 1.0
    model_type: bpe
    num_threads: 16
    unk_id: 1
    pad_id: 0
    eos_id: -1  # NOTE(review): in SentencePiece -1 disables the token; confirm that is the intent here
    unk_piece: <unk>
    pad_piece: <blank>
    eos_piece: </s>
    vocab_size: 500  # also read via `!ref <text_encoder.vocab_size>` by decoder_model and jointer_model

# Audio featurizer: instantiates uetasr.featurizers.audio.LogMelSpectrogram
# with the keyword arguments below.
# NOTE(review): if win_length / hop_length are expressed in samples, then at
# fs=16000 they correspond to a 25 ms window and a 10 ms shift -- confirm units.
audio_encoder: !new:uetasr.featurizers.audio.LogMelSpectrogram
    fs: 16000
    n_fft: 512
    win_length: 400
    hop_length: 160
    n_mels: 80  # encoder_model.num_features is set to the same value
    fmin: 0
    fmax: 8000  # equals fs / 2
    htk: False

# Shared width constant; consumed through `!ref <d_model>` by encoder_model,
# decoder_model and jointer_model so the three stay in agreement.
d_model: 256

# Acoustic encoder: instantiates uetasr.models.encoders.Conformer with the
# keyword arguments below.
encoder_model: !new:uetasr.models.encoders.Conformer
    num_features: 80  # same value as audio_encoder.n_mels; keep the two in sync
    window_size: 1
    d_model: !ref <d_model>
    input_layer: vgg2l
    pos_enc_layer_type: rel_pos
    dropout_rate_pos_enc: 0.2
    selfattention_layer_type: rel_selfattn
    attention_heads: 4
    dropout_rate_att: 0.1
    dropout_rate_pos_wise: 0.1
    dropout_rate: 0.1
    positionwise_layer_type: linear
    linear_units: 1024
    conv_mod_kernel: 31
    num_blocks: 18
    use_macaron: True
    use_cnn_module: True
    eps_layer_norm: 0.000000000001  # i.e. 1e-12, written out in decimal form

# Label decoder (the transducer's prediction side): instantiates
# uetasr.models.decoders.RNNDecoder with the keyword arguments below.
decoder_model: !new:uetasr.models.decoders.RNNDecoder
    vocab_size: !ref <text_encoder.vocab_size>  # follows the subword vocabulary size
    embedding_dim: 256
    num_layers: 1
    hidden_dim: !ref <d_model>  # jointer_model.decoder_dim references the same value
    dropout_embed: 0.2
    dropout_rnn: 0.1
    rnn_type: LSTM

# Joint network: instantiates uetasr.layers.jointer.RNNTJointer with the
# keyword arguments below. Both input dims reference d_model, matching
# encoder_model.d_model and decoder_model.hidden_dim.
jointer_model: !new:uetasr.layers.jointer.RNNTJointer
    encoder_dim: !ref <d_model>
    decoder_dim: !ref <d_model>
    hidden_dim: 512
    output_dim: !ref <text_encoder.vocab_size>  # follows the subword vocabulary size

# Value handed to the RNNT model's `ctc_lin` argument via `!ref <ctc_lin>`.
# NOTE(review): null presumably means no auxiliary CTC layer is attached --
# confirm against uetasr.models.rnnt.RNNT.
ctc_lin: null

# Full transducer: instantiates uetasr.models.rnnt.RNNT from the encoder,
# decoder and jointer nodes defined elsewhere in this file.
model: !new:uetasr.models.rnnt.RNNT
    encoder: !ref <encoder_model>
    decoder: !ref <decoder_model>
    jointer: !ref <jointer_model>
    ctc_lin: !ref <ctc_lin>
    ctc_dropout: 0.1  # NOTE(review): likely unused while ctc_lin is null -- confirm
    use_cmvn: True

# Search / inference object: instantiates uetasr.searchers.GreedyRNNTV2.
# Note that this top-level `decoder` key is the searcher, distinct from the
# `decoder_model` network it references.
# NOTE(review): these are `!ref` (not `!copy`) links, so the searcher should
# be wired to the same decoder/jointer nodes that `model` uses -- confirm
# against HyperPyYAML reference semantics.
decoder: !new:uetasr.searchers.GreedyRNNTV2
    decoder: !ref <decoder_model>
    jointer: !ref <jointer_model>
    text_decoder: !ref <text_encoder>
    max_symbols_per_step: 10