Spaces:
Running
Running
| # Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita) | |
| # Licensed under the MIT license. | |
| # | |
| # This module is for computing audio features | |
| import librosa | |
| import numpy as np | |
def transform(Y, dtype=np.float32):
    """Convert an STFT matrix into mean-normalized log-mel features.

    Args:
        Y: 2-D STFT array indexed as (frame, frequency bin). The FFT size is
            recovered from the second axis as ``2 * (n_bins - 1)``.
        dtype: dtype of the returned array.

    Returns:
        Array with one row per input frame and one column per mel band
        (23 bands), holding log10 mel power with the per-band mean over all
        frames subtracted, cast to ``dtype``.
    """
    Y = np.abs(Y)
    n_fft = 2 * (Y.shape[1] - 1)
    # Fixed front-end configuration: 8 kHz sampling rate, 23 mel bands.
    sr = 8000
    n_mels = 23
    # Keyword arguments are required: librosa >= 0.10 made the parameters of
    # filters.mel keyword-only, and the keyword form also works on older
    # releases.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    # Power spectrum projected onto the mel filterbank.
    Y = np.dot(Y**2, mel_basis.T)
    # Floor at 1e-10 so silent bins do not produce -inf.
    Y = np.log10(np.maximum(Y, 1e-10))
    # Mean normalization along the frame axis.
    mean = np.mean(Y, axis=0)
    Y = Y - mean
    return Y.astype(dtype)
def subsample(Y, T, subsampling=1):
    """Keep every ``subsampling``-th entry of ``Y`` and ``T``.

    Both sequences are sliced along their first axis with the same step,
    starting at index 0.

    Args:
        Y: sliceable sequence or array (e.g. features).
        T: sliceable sequence or array (e.g. labels) sliced in step with Y.
        subsampling: slice step.

    Returns:
        Tuple ``(Y_ss, T_ss)`` of the sliced inputs.
    """
    every_nth = slice(None, None, subsampling)
    return Y[every_nth], T[every_nth]
def splice(Y, context_size=0):
    """Concatenate each frame with its neighbouring frames.

    The input is zero-padded with ``context_size`` rows at both ends, then
    each output row is the flattened window of ``2 * context_size + 1``
    consecutive padded rows centred on the corresponding input frame.

    Args:
        Y: 2-D array indexed as (frame, feature).
        context_size: number of frames taken on each side of the centre.

    Returns:
        Read-only strided view of shape
        ``(Y.shape[0], Y.shape[1] * (2 * context_size + 1))`` onto the
        padded copy of ``Y``.
    """
    edge = (context_size, context_size)
    padded = np.ascontiguousarray(np.pad(Y, [edge, (0, 0)], "constant"))
    n_frames = Y.shape[0]
    n_dims = Y.shape[1]
    window = 2 * context_size + 1
    elem_bytes = Y.itemsize
    # Advancing one output row moves one padded frame forward in memory, so
    # consecutive rows overlap instead of being copied.
    return np.lib.stride_tricks.as_strided(
        padded,
        shape=(n_frames, n_dims * window),
        strides=(elem_bytes * n_dims, elem_bytes),
        writeable=False,
    )
def stft(data, frame_size=1024, frame_shift=256):
    """Compute the STFT of ``data`` with librosa and return it transposed.

    Args:
        data: input signal passed to ``librosa.stft``.
        frame_size: analysis window length (``win_length``).
        frame_shift: hop length between frames (``hop_length``).

    Returns:
        The transposed ``librosa.stft`` output. When ``len(data)`` is an
        exact multiple of ``frame_shift`` the final row is dropped.
    """
    # Decide up front whether the trailing frame must be trimmed.
    drop_last = len(data) % frame_shift == 0
    # FFT size: smallest power of two >= frame_size (for positive sizes).
    n_fft = 1 << (frame_size - 1).bit_length()
    frames = librosa.stft(
        data, n_fft=n_fft, win_length=frame_size, hop_length=frame_shift
    ).T
    if drop_last:
        return frames[:-1]
    return frames