# Music-To-Lyrics / app.py
import gradio as gr
from gradio_client import Client
import torch
import os
from scipy.io.wavfile import write
hf_token = os.environ.get('HF_TOKEN')
#splt_client = Client("https://fffiloni-splittrack2musicgen.hf.space/")
def split_process(audio, chosen_out_track):
    # audio arrives as a (sample_rate, numpy_data) tuple from the gr.Audio component
    os.makedirs("out", exist_ok=True)
    write('test.wav', audio[0], audio[1])
    # Run Demucs source separation on the uploaded track
    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
    #return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
    if chosen_out_track == "vocals":
        return "./out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"

from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile

MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
    token=hf_token,
)
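
# Optional (untested in this Space): when running on GPU, passing
# torch_dtype=torch.float16 to pipeline() is a common way to cut memory use and
# speed up Whisper inference without changing the rest of the code.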

#@spaces.GPU
def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    # With return_timestamps=True the pipeline also produces timestamped "chunks";
    # only the plain "text" field is kept here.
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text

import re

def format_lyrics(text):
    # Use regex to find parts that start with a capital letter and insert a newline
    formatted_text = re.sub(r'(?<!^)([A-Z])', r'\n\1', text)
    # Remove any leading whitespace on each line
    formatted_text = re.sub(r'^[ \t]+', '', formatted_text, flags=re.MULTILINE)
    return formatted_text
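
# Example of the heuristic above: "Hello darkness my old friend I've come to talk"
# comes back as two lines, with the break inserted just before the capital "I".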

def infer(audio_input):
    # STEP 1 | Split vocals from the song/audio file
    splt_result = split_process(audio_input, "vocals")
    print(splt_result)

    # STEP 2 | Transcribe
    # TO-DO : handling errors if JAX demo queue is full
    whisper_result = transcribe(
        splt_result,   # str (filepath or URL to file) in 'inputs' Audio component
        "transcribe",  # str in 'Task' Radio component
    )
    print(whisper_result)
    #return whisper_result[0] # if using JAX

    # STEP 3 | Reformat the raw transcription into line-broken lyrics
    lyrics = format_lyrics(whisper_result)
    print(lyrics)

    return splt_result, lyrics
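
# infer() returns (vocals_filepath, lyrics_text), matching the (vocals_out, lyrics_res)
# output components wired to the button click below.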
css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
            <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
            >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                    Song To Lyrics
                </h1>
            </div>
            <p style="margin-bottom: 10px; font-size: 94%">
                Send the audio file of your favorite song, and get the lyrics! <br />
                Under the hood, we split out the vocals track from the audio file, then send the vocals to Whisper.
            </p>
        </div>""")

        song_in = gr.Audio(label="Song input", type="numpy", sources="upload")
        getlyrics_btn = gr.Button("Get Lyrics!")
        vocals_out = gr.Audio(label="Vocals Only")
        lyrics_res = gr.Textbox(label="Lyrics")

        getlyrics_btn.click(fn=infer, inputs=[song_in], outputs=[vocals_out, lyrics_res])

demo.queue().launch()
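
# To run locally (assuming gradio, torch, transformers, demucs and ffmpeg are
# available; HF_TOKEN may be set in the environment for authenticated downloads):
#   python app.py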