{
  "feature_extraction": {
    "sequence": [
      {
        "operation": {
          "name": "audio_decoder",
          "type": "AudioDecoderEx",
          "attrs": {
            "target_sample_rates": [
              8000,
              16000
            ]
          }
        }
      },
      {
        "operation": {
          "name": "phi_4_audio_embed",
          "type": "Phi4AudioEmbed",
          "attrs": {
            "audio_compression_rate": 8,
            "stft_normal/n_fft": 512,
            "stft_normal/frame_length": 400,
            "stft_normal/hop_length": 160,
            "stft_normal/win_fn": "hamming",
            "logmel/chunk_size": 30,
            "logmel/hop_length": 160,
            "logmel/n_fft": 512,
            "logmel/n_mel": 80,
            "logmel/feature_first": 0,
            "logmel/no_padding": 1,
            "stft_normal_8k/n_fft": 256,
            "stft_normal_8k/frame_length": 200,
            "stft_normal_8k/hop_length": 80,
            "stft_normal_8k/win_fn": "hamming",
            "logmel_8k/chunk_size": 30,
            "logmel_8k/hop_length": 80,
            "logmel_8k/n_fft": 512,
            "logmel_8k/n_mel": 80,
            "logmel_8k/feature_first": 0,
            "logmel_8k/no_padding": 1
          }
        }
      }
    ],
    "output_aligner": "phi4-audio-aligner"
  }
}