import gradio as gr
from gradio_client import Client
import torch
import os
from scipy.io.wavfile import write
hf_token = os.environ.get('HF_TOKEN')
#splt_client = Client("https://fffiloni-splittrack2musicgen.hf.space/")
def split_process(audio, chosen_out_track):
    os.makedirs("out", exist_ok=True)
    # audio arrives as a (sample_rate, data) tuple from the "numpy" Audio component
    write('test.wav', audio[0], audio[1])
    # Separate the track into stems with the Demucs mdx_extra_q model
    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
    #return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
    if chosen_out_track == "vocals":
        return "./out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile
MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
    token=hf_token,
)
#@spaces.GPU
def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text
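
# Illustrative call, assuming the vocals stem produced above exists on disk;
# the pipeline splits long audio into 30 s chunks, so a full song works in one call:
#   lyrics_text = transcribe("./out/mdx_extra_q/test/vocals.wav", "transcribe")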
import re
def format_lyrics(text):
    # Insert a newline before every capital letter that is not at the start of the string
    formatted_text = re.sub(r'(?<!^)([A-Z])', r'\n\1', text)
    # Remove any leading whitespace on each line
    formatted_text = re.sub(r'^[ \t]+', '', formatted_text, flags=re.MULTILINE)
    return formatted_text
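
# Illustrative example (made-up input): the newline before each capital letter
# roughly recovers lyric line breaks from Whisper's flat transcript:
#   format_lyrics("Hello world How are you")  # -> "Hello world \nHow are you"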
def infer(audio_input):
    # STEP 1 | Split the vocals out of the song/audio file
    splt_result = split_process(audio_input, "vocals")
    print(splt_result)

    # STEP 2 | Transcribe the isolated vocals
    # TO-DO: handle errors when the transcription backend is overloaded
    whisper_result = transcribe(
        splt_result,   # str (filepath or URL to file) in 'inputs' Audio component
        "transcribe",  # str in 'Task' Radio component
    )
    print(whisper_result)
    #return whisper_result[0] # if using JAX

    # STEP 3 | Break the raw transcript into lyric lines
    lyrics = format_lyrics(whisper_result)
    print(lyrics)

    return splt_result, lyrics
css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
            <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
            >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                    Song To Lyrics
                </h1>
            </div>
            <p style="margin-bottom: 10px; font-size: 94%">
                Upload the audio file of your favorite song to get its lyrics! <br />
                Under the hood, we split the vocals track out of the audio file, then send the vocals to Whisper.
            </p>
        </div>""")

        song_in = gr.Audio(label="Song input", type="numpy", sources="upload")
        getlyrics_btn = gr.Button("Get Lyrics!")
        vocals_out = gr.Audio(label="Vocals Only")
        lyrics_res = gr.Textbox(label="Lyrics")

        getlyrics_btn.click(fn=infer, inputs=[song_in], outputs=[vocals_out, lyrics_res])

demo.queue().launch()