Spaces:
Sleeping
Sleeping
| # Copyright (c) 2023 Amphion. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import math | |
| import librosa | |
| import torch | |
| import numpy as np | |
| from utils.util import JsonHParams | |
| from utils.f0 import get_f0_features_using_parselmouth, get_pitch_sub_median | |
# Small positive constant (1e-8). It is not referenced by the code visible in
# this section of the file.
ZERO = 1e-8
def extract_f0rmse(
    audio_ref,
    audio_deg,
    fs=None,
    hop_length=256,
    f0_min=37,
    f0_max=1000,
    pitch_bin=256,
    pitch_max=1100.0,
    pitch_min=50.0,
    need_mean=True,
    method="dtw",
):
    """Compute the F0 Root Mean Square Error (RMSE) between two audio files.

    Args:
        audio_ref: path to the ground truth audio.
        audio_deg: path to the predicted audio.
        fs: sampling rate used to load both files. If None, both files are
            loaded with librosa's default rate and that rate is used for the
            f0 extraction config.
        hop_length: hop length, stored as ``cfg.hop_size``.
        f0_min: lower limit for f0.
        f0_max: upper limit for f0.
        pitch_bin: number of bins for f0 quantization.
        pitch_max: upper limit for f0 quantization.
        pitch_min: lower limit for f0 quantization.
        need_mean: if true, both f0 tracks are passed through
            ``get_pitch_sub_median`` before being compared (the helper's name
            indicates median, not mean, subtraction).
        method: "dtw" aligns the two f0 tracks with dynamic time warping;
            "cut" truncates both tracks to the length of the shorter one.

    Returns:
        The RMSE between the aligned f0 tracks as a float, or 0 when either
        track has at most one frame.

    Raises:
        ValueError: if the two f0 tracks still differ in length after the
            alignment step. This can only happen when ``method`` is neither
            "dtw" nor "cut", because no alignment is performed in that case.
    """
    # Load audio. When no rate is given, let librosa choose and remember the
    # rate it reports so the extraction config matches the loaded signal.
    if fs is not None:
        audio_ref, _ = librosa.load(audio_ref, sr=fs)
        audio_deg, _ = librosa.load(audio_deg, sr=fs)
    else:
        audio_ref, fs = librosa.load(audio_ref)
        audio_deg, fs = librosa.load(audio_deg)

    # Initialize config for f0 extraction
    cfg = JsonHParams()
    cfg.sample_rate = fs
    cfg.hop_size = hop_length
    cfg.f0_min = f0_min
    cfg.f0_max = f0_max
    cfg.pitch_bin = pitch_bin
    cfg.pitch_max = pitch_max
    cfg.pitch_min = pitch_min

    # Extract f0 (only the first element of the helper's result is used)
    f0_ref = get_f0_features_using_parselmouth(audio_ref, cfg)[0]
    f0_deg = get_f0_features_using_parselmouth(audio_deg, cfg)[0]

    # Normalize each track via get_pitch_sub_median, which operates on torch
    # tensors, then convert back to numpy for the rest of the computation.
    if need_mean:
        f0_ref = get_pitch_sub_median(torch.from_numpy(f0_ref)).numpy()
        f0_deg = get_pitch_sub_median(torch.from_numpy(f0_deg)).numpy()

    # Avoid silence: with at most one frame there is nothing to compare.
    min_length = min(len(f0_ref), len(f0_deg))
    if min_length <= 1:
        return 0

    # F0 length alignment
    if method == "cut":
        f0_ref = f0_ref[:min_length]
        f0_deg = f0_deg[:min_length]
    elif method == "dtw":
        _, wp = librosa.sequence.dtw(f0_ref, f0_deg, backtrack=True)
        # Each row of the warping path is a (reference index, degraded index)
        # pair; gather both tracks along the path in one indexing step.
        f0_ref = np.asarray(f0_ref)[wp[:, 0]]
        f0_deg = np.asarray(f0_deg)[wp[:, 1]]

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(f0_ref) != len(f0_deg):
        raise ValueError(
            f"F0 tracks differ in length ({len(f0_ref)} vs {len(f0_deg)}); "
            f"alignment method {method!r} is not supported, "
            "expected 'dtw' or 'cut'."
        )

    # Compute RMSE
    f0_mse = np.square(np.subtract(f0_ref, f0_deg)).mean()
    return math.sqrt(f0_mse)