# Audio quality evaluation utilities: SingMOS MOS prediction, Audiobox
# aesthetics scoring, and melody/score metrics from pitch transcription.
| import librosa | |
| import numpy as np | |
| import torch | |
def singmos_warmup():
    """Load the pretrained SingMOS singing-quality predictor via torch.hub.

    Returns:
        The ``singing_ssl_mos`` model fetched from the
        ``South-Twilight/SingMOS:v0.2.0`` hub repository (downloads on
        first use).
    """
    return torch.hub.load(
        "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
    )
def singmos_evaluation(predictor, wav_info, fs):
    """Score a waveform with the SingMOS predictor.

    Args:
        predictor: Model returned by ``singmos_warmup``.
        wav_info: 1-D numpy waveform (assumed mono — TODO confirm with caller).
        fs: Sample rate of ``wav_info``.

    Returns:
        The predictor's score for the waveform resampled to 16 kHz.
    """
    # SingMOS expects 16 kHz input, so resample whatever rate we were given.
    resampled = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
    batch = torch.from_numpy(resampled).unsqueeze(0)  # add batch dimension
    lengths = torch.tensor([batch.shape[1]])
    return predictor(batch, lengths)
def initialize_audiobox_predictor():
    """Construct the Audiobox aesthetics predictor.

    The import is kept local so the dependency is only required when this
    evaluation is actually used.
    """
    from audiobox_aesthetics.infer import initialize_predictor

    return initialize_predictor()
def audiobox_aesthetics_evaluation(predictor, audio_path):
    """Run the Audiobox aesthetics model on one audio file.

    Args:
        predictor: Object exposing ``forward`` over a list of
            ``{"path": ...}`` dicts (see ``initialize_audiobox_predictor``).
        audio_path: Path to the audio file; converted to ``str``.

    Returns:
        Whatever ``predictor.forward`` returns for the single-item batch.
    """
    batch = [{"path": str(audio_path)}]
    return predictor.forward(batch)
def score_extract_warmpup():
    """Return basic_pitch's ``predict`` function, imported lazily.

    NOTE(review): the "warmpup" spelling is kept because callers reference
    this name.
    """
    from basic_pitch.inference import predict as basic_pitch_predict

    return basic_pitch_predict
def score_metric_evaluation(score_extractor, audio_path):
    """Compute melody-level metrics from a pitch transcription of a file.

    Args:
        score_extractor: Callable returning ``(model_output, midi_data,
            note_events)`` for a path (e.g. ``basic_pitch.inference.predict``).
        audio_path: Audio file to transcribe.

    Returns:
        Dict with ``pitch_range`` (semitones) plus, when at least two notes
        were detected, ``interval_mean``, ``interval_std``,
        ``interval_large_jump_ratio`` and ``dissonance_rate``. Empty dict
        when no notes are detected.

    Raises:
        ValueError: If the transcription contains more than one instrument.
    """
    model_output, midi_data, note_events = score_extractor(audio_path)
    metrics = {}
    # Raise instead of `assert`: asserts are stripped under `python -O`,
    # which would let multi-instrument transcriptions through silently.
    if len(midi_data.instruments) != 1:
        raise ValueError(
            f"Detected {len(midi_data.instruments)} instruments for {audio_path}"
        )
    midi_notes = midi_data.instruments[0].notes
    melody = [note.pitch for note in midi_notes]
    if len(melody) == 0:
        print(f"No notes detected in {audio_path}")
        return {}
    # Absolute semitone step between each pair of consecutive notes.
    intervals = [abs(melody[i + 1] - melody[i]) for i in range(len(melody) - 1)]
    metrics["pitch_range"] = max(melody) - min(melody)
    if len(intervals) > 0:
        metrics["interval_mean"] = np.mean(intervals)
        metrics["interval_std"] = np.std(intervals)
        # "Large jump" = wider than 5 semitones (more than a perfect fourth).
        metrics["interval_large_jump_ratio"] = np.mean([i > 5 for i in intervals])
        metrics["dissonance_rate"] = compute_dissonance_rate(intervals)
    return metrics
def compute_dissonance_rate(intervals, dissonant_intervals=frozenset({1, 2, 6, 10, 11})):
    """Return the fraction of intervals that are dissonant (mod 12).

    Args:
        intervals: Sequence of semitone intervals.
        dissonant_intervals: Pitch-class intervals counted as dissonant;
            defaults to seconds (1, 2), the tritone (6) and sevenths
            (10, 11). Made a ``frozenset`` so the shared default can never
            be mutated between calls (mutable-default pitfall).

    Returns:
        Mean of the boolean dissonance flags, or ``np.nan`` for empty input.
    """
    dissonant = [i % 12 in dissonant_intervals for i in intervals]
    return np.mean(dissonant) if intervals else np.nan
if __name__ == "__main__":
    import argparse
    from pathlib import Path

    # CLI: evaluate one wav file and append its metrics as a CSV row.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wav_path",
        type=Path,
        help="Path to the wav file",
    )
    parser.add_argument(
        "--results_csv",
        type=Path,
        help="csv file to save the results",
    )
    args = parser.parse_args()
    args.results_csv.parent.mkdir(parents=True, exist_ok=True)
    # sr=None keeps the file's native sample rate; singmos_evaluation
    # resamples to 16 kHz itself.
    y, fs = librosa.load(args.wav_path, sr=None)
    # warmup: load all three models up front
    predictor = singmos_warmup()
    score_extractor = score_extract_warmpup()
    aesthetic_predictor = initialize_audiobox_predictor()
    # evaluate the audio
    metrics = {}
    # singmos evaluation (value is the raw predictor output;
    # NOTE(review): stringified as-is into the CSV — confirm format)
    score = singmos_evaluation(predictor, y, fs)
    metrics["singmos"] = score
    # score metric evaluation
    # NOTE(review): returns {} when no notes are detected, so this row
    # would have fewer columns and trip the header check below — confirm
    # that is the intended failure mode.
    score_results = score_metric_evaluation(score_extractor, args.wav_path)
    metrics.update(score_results)
    # audiobox aesthetics evaluation (first element of the returned batch)
    score_results = audiobox_aesthetics_evaluation(aesthetic_predictor, args.wav_path)
    metrics.update(score_results[0])
    # save results: append one row; write the header only for a new file,
    # otherwise verify the existing header matches this run's metric keys.
    with open(args.results_csv, "a") as f:
        header = "file," + ",".join(metrics.keys()) + "\n"
        # In append mode the position starts at end-of-file, so tell() == 0
        # means the file is empty (just created).
        if f.tell() == 0:
            f.write(header)
        else:
            # Re-open read-only to compare the stored header with ours.
            with open(args.results_csv, "r") as f2:
                file_header = f2.readline()
                if file_header != header:
                    raise ValueError(f"Header mismatch: {file_header} vs {header}")
        line = (
            ",".join([str(args.wav_path)] + [str(v) for v in metrics.values()]) + "\n"
        )
        f.write(line)