Commit 45381f2 by admin · 1 Parent(s): 8389616
Files changed (3):
  1. app.py           +78 -80
  2. model.py         +39 -3
  3. requirements.txt +5 -3
app.py CHANGED
@@ -11,22 +11,7 @@ import torchvision.transforms as transforms
 from collections import Counter
 from PIL import Image
 from tqdm import tqdm
-from model import net, MODEL_DIR
-
-
-MODEL = net()
-TRANS = {
-    "PearlRiver": "Pearl River",
-    "YoungChang": "YOUNG CHANG",
-    "Steinway-T": "STEINWAY Theater",
-    "Hsinghai": "HSINGHAI",
-    "Kawai": "KAWAI",
-    "Steinway": "STEINWAY",
-    "Kawai-G": "KAWAI Grand",
-    "Yamaha": "YAMAHA",
-}
-CLASSES = list(TRANS.keys())
-CACHE_DIR = "./__pycache__/tmp"
+from model import net, _L, MODEL_DIR, TMP_DIR


 def most_common_element(input_list):
@@ -36,30 +21,26 @@ def most_common_element(input_list):


 def wav_to_mel(audio_path: str, width=0.18):
-    os.makedirs(CACHE_DIR, exist_ok=True)
-    try:
-        y, sr = librosa.load(audio_path, sr=48000)
-        non_silent = y
-        mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
-        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
-        dur = librosa.get_duration(y=non_silent, sr=sr)
-        total_frames = log_mel_spec.shape[1]
-        step = int(width * total_frames / dur)
-        count = int(total_frames / step)
-        begin = int(0.5 * (total_frames - count * step))
-        end = begin + step * count
-        for i in tqdm(range(begin, end, step), desc="Converting wav to jpgs..."):
-            librosa.display.specshow(log_mel_spec[:, i : i + step])
-            plt.axis("off")
-            plt.savefig(
-                f"{CACHE_DIR}/{os.path.basename(audio_path)[:-4]}_{i}.jpg",
-                bbox_inches="tight",
-                pad_inches=0.0,
-            )
-            plt.close()
-
-    except Exception as e:
-        print(f"Error converting {audio_path} : {e}")
+    os.makedirs(TMP_DIR, exist_ok=True)
+    y, sr = librosa.load(audio_path, sr=48000)
+    non_silent = y
+    mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
+    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
+    dur = librosa.get_duration(y=non_silent, sr=sr)
+    total_frames = log_mel_spec.shape[1]
+    step = int(width * total_frames / dur)
+    count = int(total_frames / step)
+    begin = int(0.5 * (total_frames - count * step))
+    end = begin + step * count
+    for i in tqdm(range(begin, end, step), desc="Converting wav to jpgs..."):
+        librosa.display.specshow(log_mel_spec[:, i : i + step])
+        plt.axis("off")
+        plt.savefig(
+            f"{TMP_DIR}/{os.path.basename(audio_path)[:-4]}_{i}.jpg",
+            bbox_inches="tight",
+            pad_inches=0.0,
+        )
+        plt.close()


 def embed_img(img_path, input_size=224):
@@ -74,65 +55,82 @@ def embed_img(img_path, input_size=224):
     return transform(img).unsqueeze(0)


-def inference(wav_path, folder_path=CACHE_DIR):
-    if os.path.exists(folder_path):
-        shutil.rmtree(folder_path)
-
-    if not wav_path:
-        return None, "Please input an audio!"
-
-    wav_to_mel(wav_path)
-    outputs = []
-    all_files = os.listdir(folder_path)
-    for file_name in all_files:
-        if file_name.lower().endswith(".jpg"):
-            file_path = os.path.join(folder_path, file_name)
-            input = embed_img(file_path)
-            output: torch.Tensor = MODEL(input)
-            pred_id = torch.max(output.data, 1)[1]
-            outputs.append(pred_id)
-
-    max_count_item = most_common_element(outputs)
-    shutil.rmtree(folder_path)
-    return os.path.basename(wav_path), TRANS[CLASSES[max_count_item]]
+def infer(wav_path, folder_path=TMP_DIR):
+    status = "Success"
+    filename = result = None
+    try:
+        if os.path.exists(folder_path):
+            shutil.rmtree(folder_path)
+
+        if not wav_path:
+            raise ValueError("Please input an audio!")
+
+        wav_to_mel(wav_path)
+        outputs = []
+        all_files = os.listdir(folder_path)
+        for file_name in all_files:
+            if file_name.lower().endswith(".jpg"):
+                file_path = os.path.join(folder_path, file_name)
+                input = embed_img(file_path)
+                output: torch.Tensor = net()(input)
+                pred_id = torch.max(output.data, 1)[1]
+                outputs.append(pred_id)
+
+        max_count_item = most_common_element(outputs)
+        filename = os.path.basename(wav_path)
+        result = translate[classes[max_count_item]]
+
+    except Exception as e:
+        status = f"{e}"
+
+    return status, filename, result


 if __name__ == "__main__":
     warnings.filterwarnings("ignore")
+    translate = {
+        "PearlRiver": _L("珠江"),
+        "YoungChang": _L("英昌"),
+        "Steinway-T": _L("施坦威剧场"),
+        "Hsinghai": _L("星海"),
+        "Kawai": _L("卡瓦依"),
+        "Steinway": _L("施坦威"),
+        "Kawai-G": _L("卡瓦依三角"),
+        "Yamaha": _L("雅马哈"),
+    }
+    classes = list(translate.keys())
     example_wavs = []
-    for cls in CLASSES:
+    for cls in classes:
         example_wavs.append(f"{MODEL_DIR}/examples/{cls}.wav")

     with gr.Blocks() as demo:
         gr.Interface(
-            fn=inference,
-            inputs=gr.Audio(type="filepath", label="Upload a piano recording"),
+            fn=infer,
+            inputs=gr.Audio(type="filepath", label=_L("上传钢琴录音")),
             outputs=[
-                gr.Textbox(label="Audio filename", show_copy_button=True),
-                gr.Textbox(
-                    label="Piano classification result",
-                    show_copy_button=True,
-                ),
+                gr.Textbox(label=_L("状态栏"), show_copy_button=True),
+                gr.Textbox(label=_L("音频文件名"), show_copy_button=True),
+                gr.Textbox(label=_L("钢琴分类结果"), show_copy_button=True),
             ],
             examples=example_wavs,
             cache_examples=False,
             allow_flagging="never",
-            title="It is recommended to keep the duration of recording around 3s, too long will affect the recognition efficiency.",
+            title=_L("建议录音时长保持在 3s 左右, 过长会影响识别效率"),
         )

         gr.Markdown(
-            """
-# Cite
-```bibtex
-@inproceedings{zhou2023holistic,
-    title = {A Holistic Evaluation of Piano Sound Quality},
-    author = {Monan Zhou and Shangda Wu and Shaohua Ji and Zijin Li and Wei Li},
-    booktitle = {National Conference on Sound and Music Technology},
-    pages = {3--17},
-    year = {2023},
-    organization = {Springer}
-}
-```"""
+            f"# {_L('引用')}"
+            + """
+```bibtex
+@inproceedings{zhou2023holistic,
+    title = {A Holistic Evaluation of Piano Sound Quality},
+    author = {Monan Zhou and Shangda Wu and Shaohua Ji and Zijin Li and Wei Li},
+    booktitle = {National Conference on Sound and Music Technology},
+    pages = {3--17},
+    year = {2023},
+    organization = {Springer}
+}
+```"""
         )

     demo.launch()
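The windowing arithmetic in `wav_to_mel` is worth unpacking: `step` converts the `width` parameter from seconds into mel frames, `count` is how many whole windows fit, and `begin`/`end` center that run of windows in the spectrogram so the clipped frames are split evenly between both edges. A worked sketch with illustrative numbers (a 3 s clip at the app's `sr=48000` and librosa's default hop length; the concrete frame count is an assumption, not taken from the commit):

```python
# Illustrative values: a 3 s clip at sr=48000, librosa default hop_length=512
dur = 3.0
total_frames = 281  # ~ dur * 48000 / 512; librosa's exact count differs slightly
width = 0.18        # window length in seconds

step = int(width * total_frames / dur)            # 16 frames ~ 0.18 s of audio
count = int(total_frames / step)                  # 17 whole windows fit
begin = int(0.5 * (total_frames - count * step))  # 4: center the leftover frames
end = begin + step * count                        # 276

# range(begin, end, step) -> slices [4:20], [20:36], ..., [260:276],
# i.e. 17 spectrogram crops saved as jpgs and classified one by one
print(step, count, begin, end)  # 16 17 4 276
```

Each crop then passes through `embed_img` and the classifier, and `most_common_element` majority-votes the per-crop predictions into the final result.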
model.py CHANGED
@@ -1,9 +1,45 @@
+import os
 import torch
 import torch.nn as nn
-from huggingface_hub import snapshot_download
+import huggingface_hub
+import modelscope
 from torchvision.models import squeezenet1_1

-MODEL_DIR = snapshot_download("ccmusic-database/pianos", cache_dir="./__pycache__")
+TMP_DIR = "./__pycache__/tmp"
+EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
+
+ZH2EN = {
+    "上传钢琴录音": "Upload a piano recording",
+    "状态栏": "Status",
+    "音频文件名": "Audio filename",
+    "钢琴分类结果": "Piano classification result",
+    "建议录音时长保持在 3s 左右, 过长会影响识别效率": "It is recommended to keep the recording around 3s; audio that is too long will reduce recognition efficiency.",
+    "引用": "Cite",
+    "珠江": "Pearl River",
+    "英昌": "YOUNG CHANG",
+    "施坦威剧场": "STEINWAY Theater",
+    "星海": "HSINGHAI",
+    "卡瓦依": "KAWAI",
+    "施坦威": "STEINWAY",
+    "卡瓦依三角": "KAWAI Grand",
+    "雅马哈": "YAMAHA",
+}
+
+MODEL_DIR = (
+    huggingface_hub.snapshot_download(
+        "ccmusic-database/pianos",
+        cache_dir="./__pycache__",
+    )
+    if EN_US
+    else modelscope.snapshot_download(
+        "ccmusic-database/pianos",
+        cache_dir="./__pycache__",
+    )
+)
+
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt


 def Classifier(cls_num=8, output_size=512, linear_output=False):
@@ -41,7 +77,7 @@ def Classifier(cls_num=8, output_size=512, linear_output=False):
     )


-def net(weights=f"{MODEL_DIR}/save.pt"):
+def net(weights=MODEL_DIR + "/save.pt"):
     model = squeezenet1_1(pretrained=False)
     model.classifier = Classifier()
     model.load_state_dict(torch.load(weights, map_location=torch.device("cpu")))
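The localization scheme keys every user-facing string by its Chinese text: `_L` returns the key unchanged when `LANG` is `zh_CN.UTF-8` and otherwise maps it through `ZH2EN`, and the same `EN_US` flag decides whether the `ccmusic-database/pianos` snapshot is fetched from the Hugging Face Hub or from ModelScope. A minimal usage sketch (the `os.environ` line is illustrative; `EN_US` is evaluated once at import time, so `LANG` must be set before `model` is imported):

```python
import os

# Illustrative: force the Chinese UI (and the ModelScope download path).
# EN_US is fixed when model.py is imported, so set LANG first.
os.environ["LANG"] = "zh_CN.UTF-8"

from model import _L, MODEL_DIR

print(_L("状态栏"))  # "状态栏" here; "Status" under any other LANG value
print(MODEL_DIR)     # local snapshot directory under ./__pycache__
```

Because the Chinese strings double as lookup keys, they must stay verbatim at every call site; adding a new label means adding a matching `ZH2EN` entry, or `_L` raises a `KeyError` in the English locale.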
requirements.txt CHANGED
@@ -1,5 +1,7 @@
-torch
-pillow
+torch==2.6.0+cu118
+-f https://download.pytorch.org/whl/torch
+torchvision==0.21.0+cu118
+-f https://download.pytorch.org/whl/torchvision
 librosa
 matplotlib
-torchvision
+modelscope[framework]==1.21.0
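pip honors `-f`/`--find-links` options embedded in a requirements file, which is what lets the `+cu118` local-version pins resolve against the PyTorch wheel index during a plain `pip install -r requirements.txt`. A quick post-install sanity check (a sketch, not part of the commit):

```python
import torch
import torchvision

# The CUDA build tag should survive into the installed versions
print(torch.__version__)          # expected: 2.6.0+cu118
print(torchvision.__version__)    # expected: 0.21.0+cu118
print(torch.cuda.is_available())  # False on CPU-only hosts; the app loads on CPU anyway
```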