Spaces:

stepfun-ai
/

Step-Audio

Running

Step-Audio / funasr_detach /frontends /eend_ola_feature.py

martin

initial

67c46fd 9 months ago

1.36 kB

	# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
	# Licensed under the MIT license.
	#
	# This module is for computing audio features

	import librosa
	import numpy as np


	def transform(Y, dtype=np.float32):
	    """Convert an STFT matrix into mean-normalized log-mel features.

	    Args:
	        Y: STFT output. Only its magnitude is used, so complex input is
	            accepted. Assumed to be laid out as (frames, frequency bins),
	            since the FFT size is recovered from ``Y.shape[1]`` -- TODO
	            confirm against callers.
	        dtype: NumPy dtype of the returned array.

	    Returns:
	        Array with 23 mel bands along axis 1, log10-compressed, with the
	        mean over axis 0 subtracted, cast to ``dtype``.
	    """
	    magnitude = np.abs(Y)
	    # Inverts the one-sided spectrum relation: bins = n_fft / 2 + 1.
	    n_fft = 2 * (magnitude.shape[1] - 1)
	    sr = 8000
	    n_mels = 23
	    # Pass by keyword: librosa >= 0.10 made these parameters keyword-only,
	    # so the positional form raises TypeError there. Keywords also work on
	    # older releases.
	    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
	    # Project the power spectrum onto the mel filterbank.
	    mel_power = np.dot(magnitude**2, mel_basis.T)
	    # Floor at 1e-10 so that log10 never sees zero.
	    log_mel = np.log10(np.maximum(mel_power, 1e-10))
	    # Subtract the per-band mean taken over axis 0.
	    log_mel = log_mel - np.mean(log_mel, axis=0)
	    return log_mel.astype(dtype)


	def subsample(Y, T, subsampling=1):
	    """Keep every ``subsampling``-th entry of ``Y`` and ``T`` along axis 0.

	    Args:
	        Y: Sliceable sequence or array (presumably the features).
	        T: Sliceable sequence or array (presumably the matching labels).
	        subsampling: Step of the slice; 1 keeps everything.

	    Returns:
	        Tuple ``(Y_ss, T_ss)`` of the two strided selections.
	    """
	    # One slice object applied to both inputs keeps them in lock-step.
	    every_nth = slice(None, None, subsampling)
	    return Y[every_nth], T[every_nth]


	def splice(Y, context_size=0):
	    """Stack each row of ``Y`` with its neighbouring rows.

	    ``Y`` is zero-padded with ``context_size`` rows at both ends of axis 0,
	    then every output row is the concatenation of ``2 * context_size + 1``
	    consecutive padded rows centred on the original row.

	    Args:
	        Y: 2-D NumPy array.
	        context_size: Number of rows of context taken on each side.

	    Returns:
	        Read-only strided view of shape
	        ``(Y.shape[0], Y.shape[1] * (2 * context_size + 1))``.
	    """
	    n_rows, n_cols = Y.shape[0], Y.shape[1]
	    window = 2 * context_size + 1
	    padding = [(context_size, context_size), (0, 0)]
	    # as_strided needs a C-contiguous buffer for the byte strides below.
	    padded = np.ascontiguousarray(np.pad(Y, padding, "constant"))
	    elem_bytes = Y.itemsize
	    # Advancing one output row moves one input row forward; advancing one
	    # output column moves one element forward, so rows overlap in memory.
	    out_shape = (n_rows, n_cols * window)
	    out_strides = (elem_bytes * n_cols, elem_bytes)
	    return np.lib.stride_tricks.as_strided(
	        padded, out_shape, out_strides, writeable=False
	    )


	def stft(data, frame_size=1024, frame_shift=256):
	    """Compute the transposed STFT of ``data`` with librosa.

	    Args:
	        data: Input signal passed to ``librosa.stft``.
	        frame_size: Window length; the FFT size is the smallest power of
	            two that is at least this large.
	        frame_shift: Hop length between successive windows.

	    Returns:
	        ``librosa.stft(...).T``. When ``len(data)`` is an exact multiple of
	        ``frame_shift`` the last row is dropped.
	    """
	    # Smallest power of two >= frame_size.
	    fft_size = 1 << (frame_size - 1).bit_length()
	    # Evaluated before the transform, as in the branch condition it replaces.
	    drop_last = len(data) % frame_shift == 0
	    spectrum = librosa.stft(
	        data, n_fft=fft_size, win_length=frame_size, hop_length=frame_shift
	    ).T
	    return spectrum[:-1] if drop_last else spectrum