github-actions[bot] committed on
Commit
039e024
0 Parent(s):

Sync to HuggingFace Spaces

Browse files
Files changed (9) hide show
  1. .gitattributes +35 -0
  2. .github/workflows/sync.yml +26 -0
  3. .gitignore +6 -0
  4. README.md +31 -0
  5. app.py +175 -0
  6. headers.yaml +9 -0
  7. requirements.txt +6 -0
  8. youtube.py +42 -0
  9. zero.py +45 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/sync.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face Spaces
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ sync:
10
+ name: Sync
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - name: Checkout Repository
15
+ uses: actions/checkout@v4
16
+ with:
17
+ lfs: true
18
+
19
+ - name: Sync to Hugging Face Spaces
20
+ uses: JacobLinCool/huggingface-sync@v1
21
+ with:
22
+ github: ${{ secrets.GITHUB_TOKEN }}
23
+ user: jacoblincool # Hugging Face username or organization name
24
+ space: vocal-separation # Hugging Face space name
25
+ token: ${{ secrets.HF_TOKEN }} # Hugging Face token
26
+ configuration: headers.yaml
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .DS_Store
2
+
3
+ *.wav
4
+ *.mp3
5
+
6
+ __pycache__/
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vocal Separation SOTA
3
+ emoji: 🎤
4
+ colorFrom: red
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.37.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Vocal Separation SOTA
14
+
15
+ [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-lg-dark.svg)](https://huggingface.co/spaces/JacobLinCool/vocal-separation)
16
+
17
+ This is a demo for SOTA vocal separation models. Upload an audio file and the model will separate the vocals from the background music.
18
+
19
+ Based on the result of [MDX23](https://www.aicrowd.com/challenges/sound-demixing-challenge-2023/problems/music-demixing-track-mdx-23/leaderboards), the current SOTA model is [BS-RoFormer](https://arxiv.org/abs/2309.02612).
20
+
21
+ For comparison, you can also try the Mel-RoFormer model (a variant of BS-RoFormer) and the popular HTDemucs FT model.
22
+
23
+ ## Models
24
+
25
+ - BS-RoFormer
26
+ - Mel-RoFormer
27
+ - HTDemucs FT
28
+
29
+ > The models are trained by the [UVR project](https://github.com/Anjok07/ultimatevocalremovergui).
30
+
31
+ > The code of this app is available on [GitHub](https://github.com/JacobLinCool/vocal-separation); any contributions should go there. The Hugging Face Space is force-pushed by GitHub Actions.
app.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Tuple
3
+ import gradio as gr
4
+ import tempfile
5
+ import numpy as np
6
+ import soundfile as sf
7
+ import librosa
8
+ import matplotlib.pyplot as plt
9
+ from audio_separator.separator import Separator
10
+ from zero import dynGPU
11
+ from youtube import youtube
12
+
13
+
14
# One independent Separator per selectable model. Every instance writes MP3
# output into the system temporary directory; the keys double as the labels
# shown in the model selector of the UI.
separators = {
    model_name: Separator(output_dir=tempfile.gettempdir(), output_format="mp3")
    for model_name in ("BS-RoFormer", "Mel-RoFormer", "HTDemucs-FT")
}
19
+
20
+
21
def load():
    """Load the model file associated with each entry of ``separators``.

    Models are loaded in the order BS-RoFormer, Mel-RoFormer, HTDemucs-FT.
    Any exception raised by ``load_model`` propagates to the caller.
    """
    model_files = {
        "BS-RoFormer": "model_bs_roformer_ep_317_sdr_12.9755.ckpt",
        "Mel-RoFormer": "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt",
        "HTDemucs-FT": "htdemucs_ft.yaml",
    }
    for name, model_file in model_files.items():
        separators[name].load_model(model_file)
27
+
28
+
29
# Sometimes the network might be down, so we retry a few times. This stays
# best-effort (the app still starts if every attempt fails), but the failure
# is now reported explicitly instead of being lost among per-attempt messages.
for attempt in range(1, 4):
    try:
        load()
        break
    except Exception as e:
        print(f"Model loading failed (attempt {attempt}/3): {e}")
else:
    # Reached only when no attempt hit `break`, i.e. load() never succeeded.
    print("WARNING: models could not be loaded after 3 attempts")
36
+
37
+
38
def merge(outs):
    """Sum several audio files sample-by-sample and write the mix as an MP3.

    Args:
        outs: Paths of the audio files to mix. They must decode to arrays of
            identical shape and share one sample rate.

    Returns:
        Path of the merged file, placed in the system temporary directory and
        named after the first input with a ``_merged.mp3`` suffix.

    Raises:
        ValueError: If ``outs`` is empty or the inputs disagree on sample rate.
    """
    if not outs:
        raise ValueError("merge() needs at least one audio file")

    print(f"Merging {outs}")
    tracks = []
    sample_rates = set()
    for out in outs:
        data, sample_rate = sf.read(out)
        tracks.append(data)
        sample_rates.add(sample_rate)

    # Writing with a rate different from the source would change pitch and
    # speed, so refuse to guess when the inputs disagree.
    if len(sample_rates) != 1:
        raise ValueError(f"Cannot merge files with different sample rates: {sample_rates}")

    bgm = np.sum(np.array(tracks), axis=0)
    print(f"Merged shape: {bgm.shape}")

    merged_path = os.path.join(
        tempfile.gettempdir(), f"{os.path.basename(outs[0])}_merged.mp3"
    )
    # Use the rate the stems were actually read with rather than assuming 44100.
    sf.write(merged_path, bgm, sample_rates.pop())
    return merged_path
45
+
46
+
47
def measure_duration(audio: str, model: str) -> int:
    """Return one third of the audio length in seconds, truncated to an int.

    Used as the ``duration`` callback of ``dynGPU`` for ``separate``, so it
    takes the same arguments as that function.

    Args:
        audio: Path of the audio file.
        model: Selected model name. Unused; present only to mirror the
            signature of the decorated function.
    """
    # Ask for the duration by path so the file is not fully decoded and
    # resampled just to learn how long it is.
    return int(librosa.get_duration(path=audio) / 3.0)
50
+
51
+
52
@dynGPU(duration=measure_duration)
def separate(audio: str, model: str) -> Tuple[str, str]:
    """Split ``audio`` with the chosen model and return two file paths.

    Args:
        audio: Path of the input audio file.
        model: Key into ``separators`` selecting which model to run.

    Returns:
        A pair of paths inside the system temporary directory. With two
        outputs the pair is (second output, first output); with four outputs
        it is (fourth output, merge of the first three).

    Raises:
        gr.Error: If the separator yields neither two nor four files.
    """
    produced = separators[model].separate(audio)
    stems = [os.path.join(tempfile.gettempdir(), name) for name in produced]
    stem_count = len(stems)

    # roformers: two outputs, returned in swapped order
    if stem_count == 2:
        first, second = stems
        return second, first

    # demucs: four outputs, the first three are mixed into one track
    if stem_count == 4:
        mixed = merge(stems[:3])
        return stems[3], mixed

    raise gr.Error("Unknown output format")
65
+
66
+
67
def from_youtube(url: str, model: str) -> Tuple[str, str, str]:
    """Download the audio behind ``url`` and separate it with ``model``.

    Returns:
        The downloaded audio path followed by the two paths returned by
        ``separate``.
    """
    downloaded = youtube(url)
    first_path, second_path = separate(downloaded, model)
    return downloaded, first_path, second_path
70
+
71
+
72
def plot_spectrogram(audio: str):
    """Render a mel-frequency spectrogram of an audio file.

    Args:
        audio: Path of the audio file; it is loaded at 44100 Hz.

    Returns:
        The matplotlib figure containing the spectrogram and its colorbar.
    """
    y, sr = librosa.load(audio, sr=44100)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    S_dB = librosa.power_to_db(S, ref=np.max)

    # Draw through explicit figure/axes handles: pyplot's implicit "current
    # figure" is process-wide state, so plt.colorbar()/plt.title() could land
    # on a figure that belongs to another request being handled concurrently.
    fig, ax = plt.subplots(figsize=(15, 5))
    mesh = librosa.display.specshow(S_dB, sr=sr, x_axis="time", y_axis="mel", ax=ax)
    fig.colorbar(mesh, ax=ax, format="%+2.0f dB")
    ax.set_title("Mel-frequency spectrogram")
    fig.tight_layout()

    # Unregister the figure from pyplot so it can be garbage-collected once
    # the caller is done with it; the returned object can still be saved.
    plt.close(fig)
    return fig
82
+
83
+
84
# Build the Gradio UI: README text, input widgets, model selector, outputs,
# spectrogram plots, and the event chains that tie them together.
with gr.Blocks() as app:
    # Read as UTF-8 explicitly: the README contains non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them.
    with open(
        os.path.join(os.path.dirname(__file__), "README.md"), "r", encoding="utf-8"
    ) as f:
        README = f.read()

    # Remove the YAML front matter, but only when the file really starts with
    # one and it is properly closed. Splitting at most twice keeps any later
    # "---" in the body intact.
    if README.startswith("---"):
        front_matter_parts = README.split("---", 2)
        if len(front_matter_parts) == 3:
            README = front_matter_parts[2]

    gr.Markdown(README)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Upload an audio file")
            audio = gr.Audio(label="Upload an audio file", type="filepath")
        with gr.Column():
            gr.Markdown(
                "## or use a YouTube URL\n\nTry something on [The First Take](https://www.youtube.com/@The_FirstTake)?"
            )
            yt = gr.Textbox(
                label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..."
            )
            yt_btn = gr.Button("Use this YouTube URL")

    with gr.Row():
        model = gr.Radio(
            label="Select a model",
            choices=list(separators),
            value="BS-RoFormer",
        )
        btn = gr.Button("Separate", variant="primary")

    with gr.Row():
        with gr.Column():
            vocals = gr.Audio(
                label="Vocals", format="mp3", type="filepath", interactive=False
            )
        with gr.Column():
            bgm = gr.Audio(
                label="Background", format="mp3", type="filepath", interactive=False
            )

    with gr.Row():
        with gr.Column():
            vocal_spec = gr.Plot(label="Vocal spectrogram")
        with gr.Column():
            bgm_spec = gr.Plot(label="Background spectrogram")

    gr.Examples(
        examples=[
            # I don't have any good examples, please contribute some!
            # Suno's generated music seems to have too many artifacts
        ],
        inputs=[audio],
    )

    gr.Markdown(
        """
        - BS-RoFormer: https://arxiv.org/abs/2309.02612
        - Mel-RoFormer: https://arxiv.org/abs/2310.01809
        """
    )

    # Separate the uploaded file, then plot each result once the previous
    # step has succeeded.
    btn.click(
        fn=separate,
        inputs=[audio, model],
        outputs=[vocals, bgm],
        api_name="separate",
    ).success(
        fn=plot_spectrogram,
        inputs=[vocals],
        outputs=[vocal_spec],
    ).success(
        fn=plot_spectrogram,
        inputs=[bgm],
        outputs=[bgm_spec],
    )

    # Same pipeline for a YouTube URL; the downloaded audio is also placed in
    # the upload widget.
    yt_btn.click(
        fn=from_youtube,
        inputs=[yt, model],
        outputs=[audio, vocals, bgm],
    ).success(
        fn=plot_spectrogram,
        inputs=[vocals],
        outputs=[vocal_spec],
    ).success(
        fn=plot_spectrogram,
        inputs=[bgm],
        outputs=[bgm_spec],
    )
174
+
175
+ app.launch(show_error=True)
headers.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ title: Vocal Separation SOTA
2
+ emoji: 🎤
3
+ colorFrom: red
4
+ colorTo: gray
5
+ sdk: gradio
6
+ sdk_version: 4.37.2
7
+ app_file: app.py
8
+ pinned: false
9
+ license: mit
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ audio-separator[gpu]; sys_platform != 'darwin'
3
+ audio-separator[cpu]; sys_platform == 'darwin'
4
+ yt_dlp
5
+ librosa
6
+ spaces
youtube.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from gradio_client import Client
4
+ import yt_dlp
5
+ import tempfile
6
+ import hashlib
7
+ import shutil
8
+
9
+
10
def youtube(url: str) -> str:
    """Download the audio of a YouTube video as an MP3 file.

    A local ``yt_dlp`` download is tried first. If it raises, the
    "JacobLinCool/yt-dlp" Gradio client is used as a fallback.

    Args:
        url: URL of the video.

    Returns:
        Path of the MP3 file in the system temporary directory; the file name
        is the MD5 hex digest of ``url`` plus ``.mp3``.

    Raises:
        gr.Error: If ``url`` is empty or both download methods fail.
    """
    if not url:
        raise gr.Error("Please input a YouTube URL")

    # The digest only serves as a file name derived from the URL.
    url_digest = hashlib.md5(url.encode()).hexdigest()
    tmp_file = os.path.join(tempfile.gettempdir(), url_digest)
    mp3_path = tmp_file + ".mp3"

    ydl_opts = {
        "format": "bestaudio/best",
        # The postprocessor below produces the ".mp3" file from this template.
        "outtmpl": tmp_file,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as local_error:
        print(local_error)
        try:
            ytdl = Client("JacobLinCool/yt-dlp")
            file = ytdl.predict(api_name="/download", url=url)
            shutil.move(file, mp3_path)
        except Exception as remote_error:
            print(remote_error)
            raise gr.Error(
                f"Failed to download YouTube audio from {url}"
            ) from remote_error

    return mp3_path
zero.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable
2
+ from functools import partial
3
+ import gradio as gr
4
+ import spaces
5
+ import spaces.config
6
+ from spaces.zero.decorator import P, R
7
+
8
+
9
def _dynGPU(
    fn: Callable[P, R] | None, duration: Callable[P, int], min=30, max=300, step=10
) -> Callable[P, R]:
    """Wrap ``fn`` so each call picks a ``spaces.GPU`` duration on the fly.

    When ``spaces.config.Config.zero_gpu`` is falsy, ``fn`` is returned
    unchanged. Otherwise one ``spaces.GPU``-decorated variant of ``fn`` is
    built for every duration in ``range(min, max + 1, step)``. On each call
    ``duration`` is evaluated with the call's arguments and the first variant
    whose duration is at least that value is used; if none is large enough,
    the last (largest) variant is used.
    """
    if not spaces.config.Config.zero_gpu:
        return fn

    def _with_budget(seconds):
        # A fresh lambda per budget, so every decorated variant wraps its own
        # function object.
        return spaces.GPU(duration=seconds)(lambda *args, **kwargs: fn(*args, **kwargs))

    tiers = [(seconds, _with_budget(seconds)) for seconds in range(min, max + 1, step)]

    def wrapper(*args, **kwargs):
        needed = duration(*args, **kwargs)

        # First tier that covers the requirement, else fall back to the last.
        budget, gpu_fn = next(
            (tier for tier in tiers if tier[0] >= needed), tiers[-1]
        )
        gr.Info(f"Acquiring ZeroGPU for {budget} seconds")
        return gpu_fn(*args, **kwargs)

    return wrapper
34
+
35
+
36
def dynGPU(
    fn: Callable[P, R] | None = None,
    # The callback is invoked with the wrapped function's own arguments, so
    # the default must accept (and ignore) any of them.
    duration: Callable[P, int] = lambda *args, **kwargs: 60,
    min=30,
    max=300,
    step=10,
) -> Callable[P, R]:
    """Decorator that selects a GPU duration per call via ``_dynGPU``.

    Usable both bare (``@dynGPU``) and with arguments
    (``@dynGPU(duration=...)``).

    Args:
        fn: Function to wrap, or ``None`` when called with arguments only.
        duration: Callable receiving the same arguments as ``fn`` and
            returning the required duration. Defaults to a constant 60.
        min: Smallest duration to prepare.
        max: Largest duration to prepare.
        step: Increment between prepared durations.

    Returns:
        The wrapped function, or a decorator awaiting ``fn`` when ``fn`` is
        ``None``.
    """
    if fn is None:
        return partial(_dynGPU, duration=duration, min=min, max=max, step=step)
    return _dynGPU(fn, duration, min, max, step)