Spaces:
Running
Running
| # Copyright (c) 2023 Amphion. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import torch | |
| import numpy as np | |
| from numpy import linalg as LA | |
| import librosa | |
| import soundfile as sf | |
| import librosa.filters | |
def load_audio_torch(wave_file, fs):
    """Load an audio file as a normalized mono torch tensor.

    Args:
        wave_file (str): path to wave file
        fs (int): target sample rate handed to ``librosa.load``; per the
            librosa documentation, ``None`` keeps the file's native rate
    Returns:
        audio (tensor): audio data in tensor, shape (T,). An empty list is
            returned instead when the normalized audio contains NaN or Inf.
        fs (int): sample rate of the returned audio
    Raises:
        AssertionError: if the loaded audio has 2 samples or fewer
    """
    audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)
    # audio: (T,)
    assert len(audio) > 2, "audio must contain more than 2 samples"

    # Check the audio type (for soundfile loading backbone) - float, 8bit or 16bit
    if np.issubdtype(audio.dtype, np.integer):
        max_mag = -np.iinfo(audio.dtype).min
    else:
        # Guess the original integer range from the observed peak so that
        # un-normalized float data is scaled back into [-1, 1].
        max_mag = max(np.amax(audio), -np.amin(audio))
        max_mag = (
            (2**31) + 1
            if max_mag > (2**15)
            else ((2**15) + 1 if max_mag > 1.01 else 1.0)
        )

    # Normalize the audio
    audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag

    # Corrupt audio: signal it with an empty list and a best-effort sample rate
    if (torch.isnan(audio) | torch.isinf(audio)).any():
        return [], sample_rate or fs or 48000

    # Resample the audio to our target samplerate
    if fs is not None and fs != sample_rate:
        audio = torch.from_numpy(
            librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
        )
        sample_rate = fs

    # Return the effective rate: equal to fs when fs was given, and the rate
    # reported by the loader (rather than None) when fs was None.
    return audio, sample_rate
def _stft(y, cfg):
    """Run ``librosa.stft`` on ``y`` using the framing settings stored in ``cfg``.

    Args:
        y: audio signal forwarded unchanged to ``librosa.stft``
        cfg: object providing ``n_fft``, ``hop_size`` and ``win_size``
    Returns:
        The value returned by ``librosa.stft``.
    """
    stft_kwargs = {
        "n_fft": cfg.n_fft,
        "hop_length": cfg.hop_size,
        "win_length": cfg.win_size,
    }
    return librosa.stft(y=y, **stft_kwargs)
def energy(wav, cfg):
    """Compute one energy value per STFT frame of ``wav``.

    Args:
        wav: audio signal forwarded to ``_stft``
        cfg: object providing the STFT settings read by ``_stft``
    Returns:
        1-D array holding the L2 norm of the magnitude spectrum of each row
        of the transposed spectrogram.
    """
    spectrogram = _stft(wav, cfg)
    # librosa.stft is documented to return (freq_bins, frames); transposing
    # puts one frame per row, i.e. [T, F].
    frame_magnitudes = np.transpose(np.abs(spectrogram))
    # Reduce over the frequency axis to get a single value per frame.
    return LA.norm(frame_magnitudes, axis=1)
def get_energy_from_tacotron(audio, _stft):
    """Compute mel spectrogram and energy through a ``mel_spectrogram`` provider.

    Args:
        audio: 1-D sequence of samples convertible by ``torch.FloatTensor``;
            values are clipped to [-1, 1] before analysis
        _stft: object exposing ``mel_spectrogram(audio)`` that returns a
            ``(mel, energy)`` pair of tensors (presumably a Tacotron-style
            STFT module -- confirm against the caller)
    Returns:
        mel: first value returned by ``_stft.mel_spectrogram``, unchanged
        energy (np.ndarray): second value with its leading batch dimension
            squeezed out, as float32
    """
    # Add a batch dimension and clip to [-1, 1]. The tensor is freshly built
    # and does not require grad, so the deprecated ``Variable`` wrapper that
    # used to be applied here is not needed.
    audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
    mel, energy = _stft.mel_spectrogram(audio)
    # detach()/cpu() keep the numpy conversion from raising when the provider
    # returns a tensor that requires grad or lives on another device.
    energy = torch.squeeze(energy, 0).detach().cpu().numpy().astype(np.float32)
    return mel, energy