import os
import torch
import librosa
import warnings
import numpy as np
import pandas as pd
import gradio as gr
import librosa.display
from model import EvalNet
from t_model import t_EvalNet
from utils import get_modelist, find_files, embed, MODEL_DIR
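# Gradio demo for frame-level guzheng playing-technique detection: the audio
# is converted to a log-mel / CQT / chroma spectrogram, cut into 3-second
# chunks, normalized, and classified by a pretrained backbone from MODEL_DIR.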
TRANSLATE = {
    "chanyin": "Vibrato",  # 颤音
    "boxian": "Plucks",  # 拨弦
    "shanghua": "Upward Portamento",  # 上滑音
    "xiahua": "Downward Portamento",  # 下滑音
    "huazhi/guazou/lianmo/liantuo": "Glissando",  # 花指\刮奏\连抹\连托
    "yaozhi": "Tremolo",  # 摇指
    "dianyin": "Point Note",  # 点音
}
CLASSES = list(TRANSLATE.keys())
TEMP_DIR = "./__pycache__/tmp"
SAMPLE_RATE = 44100  # Hz
HOP_LENGTH = 512  # samples between successive frames
TIME_LENGTH = 3  # seconds per chunk
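# At 44.1 kHz with a 512-sample hop, one 3-second chunk spans
# SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH = 258 spectrogram frames.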
def logMel(y, sr=SAMPLE_RATE):
    """Log-scaled mel spectrogram in dB, referenced to the peak power."""
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
        fmin=27.5,
    )
    return librosa.power_to_db(mel, ref=np.max)
def logCqt(y, sr=SAMPLE_RATE):
    """Constant-Q transform over the 88 piano keys (A0 = 27.5 Hz), scaled to [0, 1]."""
    cqt = librosa.cqt(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
        fmin=27.5,
        n_bins=88,
        bins_per_octave=12,
    )
    return ((1.0 / 80.0) * librosa.amplitude_to_db(np.abs(cqt), ref=np.max)) + 1.0
def logChroma(y, sr=SAMPLE_RATE):
    """Chromagram in dB, scaled to [0, 1]."""
    chroma = librosa.feature.chroma_stft(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
    )
    return ((1.0 / 80.0) * librosa.amplitude_to_db(np.abs(chroma), ref=np.max)) + 1.0
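# Note: with ref=np.max and librosa's default top_db=80, amplitude_to_db
# returns values in [-80, 0] dB, so (1 / 80) * dB + 1 rescales them to [0, 1].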
def RoW_norm(data):
    """Per-bin mean and std computed over the non-silent frames of all chunks."""
    common_sum = 0
    square_sum = 0
    tfle = 0  # count of frames with non-zero total energy
    for i in range(len(data)):
        tfle += (data[i].sum(-1).sum(0) != 0).astype("float").sum()
        common_sum += data[i].sum(-1).sum(-1)
        square_sum += (data[i] ** 2).sum(-1).sum(-1)
    common_avg = common_sum / tfle
    square_avg = square_sum / tfle
    std = np.sqrt(square_avg - common_avg**2)  # Var[X] = E[X^2] - E[X]^2
    return common_avg, std
def norm(data):
    """Standardize each frequency bin to zero mean and unit variance."""
    size = data.shape
    avg, std = RoW_norm(data)
    avg = np.tile(avg.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    std = np.tile(std.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    return (data - avg) / std
def chunk_data(f):
    """Zero-pad the spectrogram and split it into fixed-length chunks."""
    x = []
    xdata = np.transpose(f)
    s = SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH  # frames per chunk
    length = int(np.ceil((int(len(xdata) / s) + 1) * s))
    app = np.zeros((length - xdata.shape[0], xdata.shape[1]))
    xdata = np.concatenate((xdata, app), 0)
    for i in range(int(length / s)):
        data = xdata[int(i * s) : int(i * s + s)]
        x.append(np.transpose(data[:s, :]))
    return np.array(x)
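# Shape sketch: a spectrogram of shape (n_bins, n_frames) comes back as
# (n_chunks, n_bins, s); the tail is zero-padded so every chunk is full length.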
def load(audio_path: str, converto="mel"):
    """Load audio and return a list of normalized spectrogram chunks."""
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    # Dispatch table instead of the original eval() string construction
    extractors = {"mel": logMel, "cqt": logCqt, "chroma": logChroma}
    spec = extractors[converto](y, sr)
    x_spec = chunk_data(spec)
    Xtr_spec = np.expand_dims(x_spec, axis=3)
    return list(norm(Xtr_spec))
def format_second(seconds):
    """Format a duration in seconds as HH:MM:SS.mmm."""
    integer_part = int(seconds)
    millis = int((seconds - integer_part) * 1000)
    hours, remainder = divmod(integer_part, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02}.{millis:03}"
def infer(audio_path: str, log_name: str):
    if not audio_path:
        return None, "Please upload an audio file!"

    # Model logs are named "<backbone>_<spectrogram>", e.g. "VGG19_mel"
    backbone = "_".join(log_name.split("_")[:-1])
    spec = log_name.split("_")[-1]
    try:
        inputs = load(audio_path, converto=spec)
        dur = librosa.get_duration(path=audio_path)
        frames_per_3s = inputs[0].shape[1]
        # Transformer backbones use t_EvalNet; CNN backbones use EvalNet
        if "vit" in backbone or "swin" in backbone:
            eval_net = t_EvalNet(
                backbone,
                len(TRANSLATE),
                inputs[0].shape[1],
                weight_path=f"{MODEL_DIR}/{log_name}.pt",
            )
        else:
            eval_net = EvalNet(
                backbone,
                len(TRANSLATE),
                inputs[0].shape[1],
                weight_path=f"{MODEL_DIR}/{log_name}.pt",
            )
    except Exception as e:
        return None, f"{e}"

    input_size = eval_net.get_input_size()
    embedded_input = embed(inputs, input_size)
    output = list(eval_net.forward(embedded_input))
    outputs = []
    index = 0
    for y in output:
        preds = list(y.T)
        for pred in preds:
            # Each prediction frame covers TIME_LENGTH / frames_per_3s seconds
            start = index * TIME_LENGTH / frames_per_3s
            if start > dur:
                break
            to = (index + 1) * TIME_LENGTH / frames_per_3s
            outputs.append(
                {
                    "Frame": f"{format_second(start)} - {format_second(to)}",
                    "Tech": TRANSLATE[CLASSES[torch.argmax(pred).item()]],
                }
            )
            index += 1

    return os.path.basename(audio_path), pd.DataFrame(outputs)
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist(assign_model="VGG19_mel")
    examples = []
    example_wavs = find_files()
    for wav in example_wavs:
        examples.append([wav, models[0]])

    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label="Upload audio", type="filepath"),
                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
            ],
            outputs=[
                gr.Textbox(label="Audio filename", show_copy_button=True),
                gr.Dataframe(label="Frame-level guzheng playing technique detection"),
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title="It is recommended that the recording not be too long",
        )
        gr.Markdown(
            """
# Cite
```bibtex
@dataset{zhaorui_liu_2021_5676893,
  author    = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
  title     = {CCMusic: an Open and Diverse Database for Chinese Music Information Retrieval Research},
  month     = {mar},
  year      = {2024},
  publisher = {HuggingFace},
  version   = {1.2},
  url       = {https://huggingface.co/ccmusic-database}
}
```"""
        )

    demo.launch()