Spaces:
Running
on
Zero
Running
on
Zero
github-actions[bot]
committed on
Commit
•
039e024
0
Parent(s):
Sync to HuggingFace Spaces
Browse files- .gitattributes +35 -0
- .github/workflows/sync.yml +26 -0
- .gitignore +6 -0
- README.md +31 -0
- app.py +175 -0
- headers.yaml +9 -0
- requirements.txt +6 -0
- youtube.py +42 -0
- zero.py +45 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/sync.yml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face Spaces
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- main
|
7 |
+
|
8 |
+
jobs:
|
9 |
+
sync:
|
10 |
+
name: Sync
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
|
13 |
+
steps:
|
14 |
+
- name: Checkout Repository
|
15 |
+
uses: actions/checkout@v4
|
16 |
+
with:
|
17 |
+
lfs: true
|
18 |
+
|
19 |
+
- name: Sync to Hugging Face Spaces
|
20 |
+
uses: JacobLinCool/huggingface-sync@v1
|
21 |
+
with:
|
22 |
+
github: ${{ secrets.GITHUB_TOKEN }}
|
23 |
+
user: jacoblincool # Hugging Face username or organization name
|
24 |
+
space: vocal-separation # Hugging Face space name
|
25 |
+
token: ${{ secrets.HF_TOKEN }} # Hugging Face token
|
26 |
+
configuration: headers.yaml
|
.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.DS_Store
|
2 |
+
|
3 |
+
*.wav
|
4 |
+
*.mp3
|
5 |
+
|
6 |
+
__pycache__/
|
README.md
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Vocal Separation SOTA
|
3 |
+
emoji: 🎤
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: gray
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.37.2
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
# Vocal Separation SOTA
|
14 |
+
|
15 |
+
[![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-lg-dark.svg)](https://huggingface.co/spaces/JacobLinCool/vocal-separation)
|
16 |
+
|
17 |
+
This is a demo for SOTA vocal separation models. Upload an audio file and the model will separate the vocals from the background music.
|
18 |
+
|
19 |
+
Based on the result of [MDX23](https://www.aicrowd.com/challenges/sound-demixing-challenge-2023/problems/music-demixing-track-mdx-23/leaderboards), the current SOTA model is [BS-RoFormer](https://arxiv.org/abs/2309.02612).
|
20 |
+
|
21 |
+
For comparison, you can also try the Mel-RoFormer model (a variant of BS-RoFormer) and the popular HTDemucs FT model.
|
22 |
+
|
23 |
+
## Models
|
24 |
+
|
25 |
+
- BS-RoFormer
|
26 |
+
- Mel-RoFormer
|
27 |
+
- HTDemucs FT
|
28 |
+
|
29 |
+
> The models are trained by the [UVR project](https://github.com/Anjok07/ultimatevocalremovergui).
|
30 |
+
|
31 |
+
> The code of this app is available on [GitHub](https://github.com/JacobLinCool/vocal-separation), any contributions should go there. Hugging Face Space is force pushed by GitHub Actions.
|
app.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Tuple
|
3 |
+
import gradio as gr
|
4 |
+
import tempfile
|
5 |
+
import numpy as np
|
6 |
+
import soundfile as sf
|
7 |
+
import librosa
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
from audio_separator.separator import Separator
|
10 |
+
from zero import dynGPU
|
11 |
+
from youtube import youtube
|
12 |
+
|
13 |
+
|
14 |
+
# One Separator instance per supported model. All of them write their
# output stems into the system temp directory as mp3 files.
# NOTE(review): the actual model weights are loaded later by load();
# constructing a Separator here does not download anything yet.
separators = {
    "BS-RoFormer": Separator(output_dir=tempfile.gettempdir(), output_format="mp3"),
    "Mel-RoFormer": Separator(output_dir=tempfile.gettempdir(), output_format="mp3"),
    "HTDemucs-FT": Separator(output_dir=tempfile.gettempdir(), output_format="mp3"),
}
|
19 |
+
|
20 |
+
|
21 |
+
def load():
    """Download/load the checkpoint for each of the three separators."""
    separators["BS-RoFormer"].load_model("model_bs_roformer_ep_317_sdr_12.9755.ckpt")
    separators["Mel-RoFormer"].load_model(
        "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt"
    )
    separators["HTDemucs-FT"].load_model("htdemucs_ft.yaml")


# sometimes the network might be down, so we retry a few times
# NOTE(review): deliberately best-effort — if all three attempts fail the
# app still starts, and later separate() calls will fail on the unloaded
# models instead.
for _ in range(3):
    try:
        load()
        break
    except Exception as e:
        print(e)
|
36 |
+
|
37 |
+
|
38 |
+
def merge(outs):
    """Mix several stem files into a single track and write it as mp3.

    Args:
        outs: Paths of the stem audio files to sum. They are assumed to
            share the same length and sample rate (true for stems produced
            by one separation run).

    Returns:
        Path of the merged mp3 file in the temp directory.
    """
    print(f"Merging {outs}")
    stems = [sf.read(out) for out in outs]
    # Use the stems' real sample rate instead of hard-coding 44100 Hz;
    # writing at the wrong rate would make the merged track play at the
    # wrong speed/pitch.
    sample_rate = stems[0][1]
    bgm = np.sum(np.array([data for data, _ in stems]), axis=0)
    print(f"Merged shape: {bgm.shape}")
    # os.path.basename is portable, unlike splitting on "/".
    tmp_file = os.path.join(
        tempfile.gettempdir(), f"{os.path.basename(outs[0])}_merged"
    )
    sf.write(tmp_file + ".mp3", bgm, sample_rate)
    return tmp_file + ".mp3"
|
45 |
+
|
46 |
+
|
47 |
+
def measure_duration(audio: str, model: str) -> int:
    """Estimate the GPU seconds needed to separate *audio*.

    Used as the dynGPU duration callback, so it takes the same arguments
    as separate(); *model* is currently unused.

    Returns:
        Roughly one third of the audio length, in whole seconds.
    """
    # sr=None keeps the file's native sample rate, which skips a pointless
    # full resample to 44100 Hz — the duration is the same either way.
    y, sr = librosa.load(audio, sr=None)
    return int(librosa.get_duration(y=y, sr=sr) / 3.0)
|
50 |
+
|
51 |
+
|
52 |
+
@dynGPU(duration=measure_duration)
def separate(audio: str, model: str) -> Tuple[str, str]:
    """Split *audio* into (vocals, background) with the chosen model.

    Raises:
        gr.Error: If the separator returns an unexpected number of stems.
    """
    stems = [
        os.path.join(tempfile.gettempdir(), name)
        for name in separators[model].separate(audio)
    ]
    count = len(stems)
    if count == 2:
        # RoFormer family: [instrumental, vocals]
        return stems[1], stems[0]
    if count == 4:
        # Demucs: [drums, bass, other, vocals] — mix the first three
        # back together as the background track.
        return stems[3], merge(stems[:3])
    raise gr.Error("Unknown output format")
|
65 |
+
|
66 |
+
|
67 |
+
def from_youtube(url: str, model: str) -> Tuple[str, str, str]:
    """Download a YouTube video's audio, then separate it.

    Returns:
        (downloaded audio path, vocals path, background path).
    """
    downloaded = youtube(url)
    vocals, background = separate(downloaded, model)
    return downloaded, vocals, background
|
70 |
+
|
71 |
+
|
72 |
+
def plot_spectrogram(audio: str):
    """Render a mel-frequency spectrogram of *audio*.

    Returns:
        The matplotlib figure, suitable for a gr.Plot output.
    """
    samples, rate = librosa.load(audio, sr=44100)
    mel = librosa.feature.melspectrogram(y=samples, sr=rate, n_mels=128)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    figure = plt.figure(figsize=(15, 5))
    librosa.display.specshow(mel_db, sr=rate, x_axis="time", y_axis="mel")
    plt.colorbar(format="%+2.0f dB")
    plt.title("Mel-frequency spectrogram")
    figure.tight_layout()
    return figure
|
82 |
+
|
83 |
+
|
84 |
+
with gr.Blocks() as app:
    # Reuse the repository README as the in-app description.
    with open(os.path.join(os.path.dirname(__file__), "README.md"), "r") as f:
        README = f.read()
    # remove yaml front matter (everything between the first two "---")
    blocks = README.split("---")
    if len(blocks) > 1:
        README = "---".join(blocks[2:])

    gr.Markdown(README)

    # Input row: either a local file upload or a YouTube URL.
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Upload an audio file")
            audio = gr.Audio(label="Upload an audio file", type="filepath")
        with gr.Column():
            gr.Markdown(
                "## or use a YouTube URL\n\nTry something on [The First Take](https://www.youtube.com/@The_FirstTake)?"
            )
            yt = gr.Textbox(
                label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..."
            )
            yt_btn = gr.Button("Use this YouTube URL")

    # Model choice mirrors the keys of the separators dict.
    with gr.Row():
        model = gr.Radio(
            label="Select a model",
            choices=[s for s in separators.keys()],
            value="BS-RoFormer",
        )
        btn = gr.Button("Separate", variant="primary")

    # Output row: the two separated tracks.
    with gr.Row():
        with gr.Column():
            vocals = gr.Audio(
                label="Vocals", format="mp3", type="filepath", interactive=False
            )
        with gr.Column():
            bgm = gr.Audio(
                label="Background", format="mp3", type="filepath", interactive=False
            )

    # Spectrogram plots for each separated track.
    with gr.Row():
        with gr.Column():
            vocal_spec = gr.Plot(label="Vocal spectrogram")
        with gr.Column():
            bgm_spec = gr.Plot(label="Background spectrogram")

    gr.Examples(
        examples=[
            # I don't have any good examples, please contribute some!
            # Suno's generated music seems to have too many artifacts
        ],
        inputs=[audio],
    )

    gr.Markdown(
        """
    - BS-RoFormer: https://arxiv.org/abs/2309.02612
    - Mel-RoFormer: https://arxiv.org/abs/2310.01809
    """
    )

    # Separate the uploaded file, then plot both spectrograms on success.
    btn.click(
        fn=separate,
        inputs=[audio, model],
        outputs=[vocals, bgm],
        api_name="separate",
    ).success(
        fn=plot_spectrogram,
        inputs=[vocals],
        outputs=[vocal_spec],
    ).success(
        fn=plot_spectrogram,
        inputs=[bgm],
        outputs=[bgm_spec],
    )

    # Same pipeline, but sourcing the audio from YouTube first.
    yt_btn.click(
        fn=from_youtube,
        inputs=[yt, model],
        outputs=[audio, vocals, bgm],
    ).success(
        fn=plot_spectrogram,
        inputs=[vocals],
        outputs=[vocal_spec],
    ).success(
        fn=plot_spectrogram,
        inputs=[bgm],
        outputs=[bgm_spec],
    )

app.launch(show_error=True)
|
headers.yaml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
title: Vocal Separation SOTA
|
2 |
+
emoji: 🎤
|
3 |
+
colorFrom: red
|
4 |
+
colorTo: gray
|
5 |
+
sdk: gradio
|
6 |
+
sdk_version: 4.37.2
|
7 |
+
app_file: app.py
|
8 |
+
pinned: false
|
9 |
+
license: mit
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
audio-separator[gpu]; sys_platform != 'darwin'
|
3 |
+
audio-separator[cpu]; sys_platform == 'darwin'
|
4 |
+
yt_dlp
|
5 |
+
librosa
|
6 |
+
spaces
|
youtube.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
from gradio_client import Client
|
4 |
+
import yt_dlp
|
5 |
+
import tempfile
|
6 |
+
import hashlib
|
7 |
+
import shutil
|
8 |
+
|
9 |
+
|
10 |
+
def youtube(url: str) -> str:
    """Download the audio track of a YouTube video as an mp3.

    Tries a local yt-dlp download first; on failure, falls back to a
    remote yt-dlp Space.

    Args:
        url: The YouTube video URL.

    Returns:
        Path of the mp3 file in the temp directory.

    Raises:
        gr.Error: If *url* is empty, or if both download paths fail.
    """
    if not url:
        raise gr.Error("Please input a YouTube URL")

    # The md5 of the URL is a stable cache key (renamed from `hash`,
    # which shadowed the builtin).
    digest = hashlib.md5(url.encode()).hexdigest()
    tmp_file = os.path.join(tempfile.gettempdir(), f"{digest}")
    out_file = tmp_file + ".mp3"

    # Serve from cache when this URL was already downloaded — previously
    # the file was re-downloaded on every request despite the stable name.
    if os.path.exists(out_file):
        return out_file

    try:
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": tmp_file,
            "postprocessors": [
                {
                    "key": "FFmpegExtractAudio",
                    "preferredcodec": "mp3",
                    "preferredquality": "192",
                }
            ],
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        print(e)
        # Fallback: delegate the download to a remote yt-dlp Space.
        try:
            ytdl = Client("JacobLinCool/yt-dlp")
            file = ytdl.predict(api_name="/download", url=url)
            shutil.move(file, out_file)
        except Exception as e:
            print(e)
            raise gr.Error(f"Failed to download YouTube audio from {url}")

    return out_file
|
zero.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable
|
2 |
+
from functools import partial
|
3 |
+
import gradio as gr
|
4 |
+
import spaces
|
5 |
+
import spaces.config
|
6 |
+
from spaces.zero.decorator import P, R
|
7 |
+
|
8 |
+
|
9 |
+
def _dynGPU(
    fn: Callable[P, R] | None, duration: Callable[P, int], min=30, max=300, step=10
) -> Callable[P, R]:
    """Wrap *fn* so its ZeroGPU reservation length adapts to each call.

    Builds one spaces.GPU-wrapped variant of *fn* per candidate duration
    between *min* and *max* (inclusive, stepping by *step*), then routes
    each call to the shortest variant that covers the estimate returned
    by *duration*. Outside a ZeroGPU environment, *fn* is returned as-is.
    """
    if not spaces.config.Config.zero_gpu:
        return fn

    variants = [
        (seconds, spaces.GPU(duration=seconds)(lambda *a, **kw: fn(*a, **kw)))
        for seconds in range(min, max + 1, step)
    ]

    def wrapper(*args, **kwargs):
        needed = duration(*args, **kwargs)
        # Pick the shortest variant covering the estimate; when the
        # estimate exceeds every candidate, fall back to the longest one.
        seconds, call = next(
            (pair for pair in variants if pair[0] >= needed), variants[-1]
        )
        gr.Info(f"Acquiring ZeroGPU for {seconds} seconds")
        return call(*args, **kwargs)

    return wrapper
|
34 |
+
|
35 |
+
|
36 |
+
def dynGPU(
    fn: Callable[P, R] | None = None,
    duration: Callable[P, int] = lambda: 60,
    min=30,
    max=300,
    step=10,
) -> Callable[P, R]:
    """Decorator form of _dynGPU, usable bare or with arguments.

    `@dynGPU` wraps the function directly; `@dynGPU(duration=...)` first
    returns a configured decorator.
    """
    if fn is not None:
        # Bare usage: wrap immediately.
        return _dynGPU(fn, duration, min, max, step)
    # Called with arguments: return a decorator awaiting the function.
    return partial(_dynGPU, duration=duration, min=min, max=max, step=step)
|