admin committed
Commit 4240e15 · 1 Parent(s): 7296665
Files changed (4)
  1. app.py +58 -57
  2. model.py +9 -4
  3. requirements.txt +5 -3
  4. utils.py +57 -10
app.py CHANGED
@@ -9,22 +9,17 @@ import librosa.display
 import matplotlib.pyplot as plt
 from collections import Counter
 from model import EvalNet
-from utils import get_modelist, find_files, embed_img
-
-
-TRANSLATE = {
-    "vibrato": "chan yin",
-    "upward_portamento": "shang hua yin",
-    "downward_portamento": "xia hua yin",
-    "returning_portamento": "hui hua yin",
-    "glissando": "gua zou, hua zhi",
-    "tremolo": "yao zhi",
-    "harmonics": "fan yin",
-    "plucks": "gou, da, mo, tuo, ...",
-}
-CLASSES = list(TRANSLATE.keys())
-TEMP_DIR = "./__pycache__/tmp"
-SAMPLE_RATE = 44100
+from utils import (
+    get_modelist,
+    find_files,
+    embed_img,
+    _L,
+    SAMPLE_RATE,
+    TEMP_DIR,
+    TRANSLATE,
+    CLASSES,
+    EN_US,
+)
 
 
 def circular_padding(spec: np.ndarray, end: int):

@@ -127,33 +122,38 @@ def most_frequent_value(lst: list):
 
 
 def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
-    if os.path.exists(folder_path):
-        shutil.rmtree(folder_path)
-
-    if not wav_path:
-        return None, "Please input an audio!"
-
-    spec = log_name.split("_")[-3]
-    os.makedirs(folder_path, exist_ok=True)
-    try:
+    status = "Success"
+    filename = result = None
+    try:
+        if os.path.exists(folder_path):
+            shutil.rmtree(folder_path)
+
+        if not wav_path:
+            raise ValueError("请输入音频!")
+
+        spec = log_name.split("_")[-3]
+        os.makedirs(folder_path, exist_ok=True)
         model = EvalNet(log_name, len(TRANSLATE)).model
         eval("wav2%s" % spec)(wav_path)
+        jpgs = find_files(folder_path, ".jpg")
+        preds = []
+        for jpg in jpgs:
+            input = embed_img(jpg)
+            output: torch.Tensor = model(input)
+            preds.append(torch.max(output.data, 1)[1])
+
+        pred_id = most_frequent_value(preds)
+        filename = os.path.basename(wav_path)
+        result = (
+            CLASSES[pred_id].capitalize()
+            if EN_US
+            else f"{TRANSLATE[CLASSES[pred_id]]} ({CLASSES[pred_id].capitalize()})"
+        )
 
     except Exception as e:
-        return None, f"{e}"
-
-    jpgs = find_files(folder_path, ".jpg")
-    preds = []
-    for jpg in jpgs:
-        input = embed_img(jpg)
-        output: torch.Tensor = model(input)
-        preds.append(torch.max(output.data, 1)[1])
-
-    pred_id = most_frequent_value(preds)
-    return (
-        os.path.basename(wav_path),
-        f"{TRANSLATE[CLASSES[pred_id]]} ({CLASSES[pred_id].capitalize()})",
-    )
+        status = f"{e}"
+
+    return status, filename, result
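The rewritten `infer` replaces the old early-return error style (`return None, "..."`) with a single `try/except` that folds every failure, including the empty-input check, into a `status` string, so the function always returns the same `(status, filename, result)` triple. Below is a minimal sketch of that contract, assuming nothing from the repo beyond the shape of the return value; `infer_like` and its hard-coded prediction are illustrative only:

```python
import os


def infer_like(wav_path: str):
    # Mirrors the new infer() contract: a fixed-arity (status, filename, result)
    # triple, with any exception text captured as the status message.
    status = "Success"
    filename = result = None
    try:
        if not wav_path:
            raise ValueError("Please input an audio!")

        filename = os.path.basename(wav_path)
        result = "Vibrato"  # placeholder for the predicted class (hypothetical)
    except Exception as e:
        status = f"{e}"

    return status, filename, result


print(infer_like(""))          # ('Please input an audio!', None, None)
print(infer_like("demo.wav"))  # ('Success', 'demo.wav', 'Vibrato')
```

A fixed-arity tuple is what lets the interface in the next hunk bind each element to its own output Textbox.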
@@ -168,39 +168,40 @@ if __name__ == "__main__":
     gr.Interface(
         fn=infer,
         inputs=[
-            gr.Audio(label="Upload a recording", type="filepath"),
-            gr.Dropdown(choices=models, label="Select a model", value=models[0]),
+            gr.Audio(label=_L("上传录音"), type="filepath"),
+            gr.Dropdown(choices=models, label=_L("选择模型"), value=models[0]),
         ],
         outputs=[
-            gr.Textbox(label="Audio filename", show_copy_button=True),
+            gr.Textbox(label=_L("状态栏"), show_copy_button=True),
+            gr.Textbox(label=_L("音频文件名"), show_copy_button=True),
             gr.Textbox(
-                label="Guzheng playing tech recognition",
+                label=_L("古筝演奏技法识别"),
                 show_copy_button=True,
             ),
         ],
         examples=examples,
         cache_examples=False,
         flagging_mode="never",
-        title="It is recommended to keep the recording length around 3s.",
+        title=_L("建议录音时长保持在 3s 左右"),
     )
 
     gr.Markdown(
-        """
-# Cite
-```bibtex
-@article{Zhou-2025,
-    author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
-    title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
-    journal = {Transactions of the International Society for Music Information Retrieval},
-    volume = {8},
-    number = {1},
-    pages = {22--38},
-    month = {Mar},
-    year = {2025},
-    url = {https://doi.org/10.5334/tismir.194},
-    doi = {10.5334/tismir.194}
-}
-```"""
+        f"# {_L('引用')}"
+        + """
+```bibtex
+@article{Zhou-2025,
+    author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
+    title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
+    journal = {Transactions of the International Society for Music Information Retrieval},
+    volume = {8},
+    number = {1},
+    pages = {22--38},
+    month = {Mar},
+    year = {2025},
+    url = {https://doi.org/10.5334/tismir.194},
+    doi = {10.5334/tismir.194}
+}
+```"""
     )
 
     demo.launch()
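For reference, Gradio maps each element of a callback's returned tuple onto one output component in order, which is why the interface gains a third Textbox: the new status field. A hedged sketch of that wiring with a stub callback; `infer_stub`, the dropdown choices, and the label texts here are illustrative, not from the repo:

```python
import gradio as gr


def infer_stub(audio_path, model_name):
    # Stand-in for infer(): always a (status, filename, result) triple
    return "Success", "demo.wav", "Vibrato"


demo = gr.Interface(
    fn=infer_stub,
    inputs=[
        gr.Audio(label="Upload a recording", type="filepath"),
        gr.Dropdown(choices=["model_a"], label="Select a model", value="model_a"),
    ],
    outputs=[
        gr.Textbox(label="Status", show_copy_button=True),              # <- status
        gr.Textbox(label="Audio filename", show_copy_button=True),      # <- filename
        gr.Textbox(label="Recognition result", show_copy_button=True),  # <- result
    ],
)
# demo.launch()
```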
model.py CHANGED
@@ -1,8 +1,9 @@
 import torch
 import torch.nn as nn
 import torchvision.models as models
+from modelscope.msdatasets import MsDataset
 from datasets import load_dataset
-from utils import MODEL_DIR
+from utils import MODEL_DIR, EN_US
 
 
 class EvalNet:

@@ -17,7 +18,7 @@ class EvalNet:
         self.m_type, self.input_size = self._model_info(m_ver)
 
         if not hasattr(models, m_ver):
-            raise Exception("Unsupported model.")
+            raise ValueError("不支持的模型")
 
         self.model = eval("models.%s()" % m_ver)
         linear_output = self._set_outsize()

@@ -34,11 +35,15 @@ class EvalNet:
             if ver == bb["ver"]:
                 return bb
 
-        print("Backbone name not found, using default option - alexnet.")
+        print("未找到骨干网络名称,使用默认选项 - alexnet")
         return backbone_list[0]
 
     def _model_info(self, m_ver: str):
-        backbone_list = load_dataset("monetjoe/cv_backbones", split="train")
+        backbone_list = (
+            load_dataset("monetjoe/cv_backbones", split="train")
+            if EN_US
+            else MsDataset.load("monetjoe/cv_backbones", split="v1")
+        )
         backbone = self._get_backbone(m_ver, backbone_list)
         m_type = str(backbone["type"])
         input_size = int(backbone["input_size"])
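The functional change here is the mirrored metadata load: the `cv_backbones` list comes from the Hugging Face Hub when `EN_US` is true and from ModelScope otherwise (note the differing split names, `train` vs `v1`). A sketch of the same gating with lazy imports, so only the hub actually used must be installed; `pick_backbone_loader` is a hypothetical helper, not in the repo:

```python
import os


def pick_backbone_loader():
    # Same locale check that utils.py uses to define EN_US
    if os.getenv("LANG") != "zh_CN.UTF-8":
        from datasets import load_dataset  # Hugging Face Hub client

        return lambda: load_dataset("monetjoe/cv_backbones", split="train")

    from modelscope.msdatasets import MsDataset  # ModelScope mirror

    return lambda: MsDataset.load("monetjoe/cv_backbones", split="v1")


# backbone_list = pick_backbone_loader()()  # network access required
```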
requirements.txt CHANGED
@@ -1,5 +1,7 @@
-torch
-pillow
+torch==2.6.0+cu118
+-f https://download.pytorch.org/whl/torch
+torchvision==0.21.0+cu118
+-f https://download.pytorch.org/whl/torchvision
 librosa
 matplotlib
-torchvision
+modelscope[framework]==1.21.0
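Two notes on the new pins: `+cu118` is a local version tag that exists only on the PyTorch wheel index, so each pinned package is paired with a `-f` (`--find-links`) line pointing pip at that index; a plain `pip install torch==2.6.0+cu118` against PyPI alone would fail to resolve. `pillow` disappears as an explicit entry, presumably because torchvision already pulls it in as a dependency.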
utils.py CHANGED
@@ -1,10 +1,60 @@
 import os
 import torch
 import torchvision.transforms as transforms
-from huggingface_hub import snapshot_download
+import huggingface_hub
+import modelscope
 from PIL import Image
 
-MODEL_DIR = snapshot_download("ccmusic-database/GZ_IsoTech", cache_dir="./__pycache__")
+EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
+
+ZH2EN = {
+    "上传录音": "Upload a recording (>40dB)",
+    "选择模型": "Select a model",
+    "状态栏": "Status",
+    "音频文件名": "Audio filename",
+    "古筝演奏技法识别": "Singing method recognition",
+    "建议录音时长保持在 3s 左右": "It is recommended to keep the recording length around 5s, too long will affect the recognition efficiency.",
+    "引用": "Cite",
+    "颤音": "chan yin",
+    "上滑音": "shang hua yin",
+    "下滑音": "xia hua yin",
+    "回滑音": "hui hua yin",
+    "刮奏, 花指": "gua zou, hua zhi",
+    "摇指": "yao zhi",
+    "泛音": "fan yin",
+    "勾, 打, 抹, 托, ...": "gou, da, mo, tuo, ...",
+}
+
+MODEL_DIR = (
+    huggingface_hub.snapshot_download(
+        "ccmusic-database/GZ_IsoTech",
+        cache_dir="./__pycache__",
+    )
+    if EN_US
+    else modelscope.snapshot_download(
+        "ccmusic-database/GZ_IsoTech",
+        cache_dir="./__pycache__",
+    )
+)
+
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
+
+
+TRANSLATE = {
+    "vibrato": _L("颤音"),
+    "upward_portamento": _L("上滑音"),
+    "downward_portamento": _L("下滑音"),
+    "returning_portamento": _L("回滑音"),
+    "glissando": _L("刮奏, 花指"),
+    "tremolo": _L("摇指"),
+    "harmonics": _L("泛音"),
+    "plucks": _L("勾, 打, 抹, 托, ..."),
+}
+CLASSES = list(TRANSLATE.keys())
+TEMP_DIR = "./__pycache__/tmp"
+SAMPLE_RATE = 44100
 
 
 def toCUDA(x):

@@ -27,19 +77,16 @@ def find_files(folder_path=f"{MODEL_DIR}/examples", ext=".wav"):
 
 
 def get_modelist(model_dir=MODEL_DIR, assign_model=""):
-    try:
-        entries = os.listdir(model_dir)
-    except OSError as e:
-        print(f"Cannot access {model_dir}: {e}")
-        return
-
     output = []
-    for entry in entries:
+    for entry in os.listdir(model_dir):
+        # 获取完整路径
         full_path = os.path.join(model_dir, entry)
+        # 跳过'.git'文件夹
         if entry == ".git" or entry == "examples":
-            print(f"Skip .git / examples dir: {full_path}")
+            print(f"跳过 .git examples 文件夹: {full_path}")
             continue
 
+        # 检查条目是文件还是目录
         if os.path.isdir(full_path):
             model = os.path.basename(full_path)
             if assign_model and assign_model.lower() in model:
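utils.py now owns both the constants formerly defined in app.py (`TRANSLATE`, `CLASSES`, `TEMP_DIR`, `SAMPLE_RATE`) and the locale plumbing: `EN_US` is derived from the `LANG` environment variable, `MODEL_DIR` snapshots from Hugging Face or ModelScope accordingly, and `_L()` is a small gettext-style lookup, so call sites keep Chinese keys (e.g. 上传录音, "upload a recording") while the English UI strings live only in `ZH2EN`. A self-contained sketch of the lookup, with the dictionary abridged:

```python
import os

# True outside a zh_CN locale; the same check utils.py uses
EN_US = os.getenv("LANG") != "zh_CN.UTF-8"

# Abridged: the real table also maps every UI label and technique name
ZH2EN = {
    "上传录音": "Upload a recording (>40dB)",
    "引用": "Cite",
}


def _L(zh_txt: str):
    # Chinese key in, English value out on en-US machines; identity under zh_CN
    return ZH2EN[zh_txt] if EN_US else zh_txt


print(_L("引用"))  # "Cite" unless LANG=zh_CN.UTF-8
```

Because `TRANSLATE` is built through `_L()` at import time, the class names shown to the user are also locale-dependent, which is why `infer` in app.py branches on `EN_US` when formatting its result.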