import gradio as gr
from gradio_client import Client
import torch
import os
from scipy.io.wavfile import write

hf_token = os.environ.get('HF_TOKEN')

#splt_client = Client("https://fffiloni-splittrack2musicgen.hf.space/")
def split_process(audio, chosen_out_track):
    os.makedirs("out", exist_ok=True)
    # gr.Audio with type="numpy" yields a (sample_rate, data) tuple
    write('test.wav', audio[0], audio[1])
    # Separate stems with Demucs (mdx_extra_q model, 4 parallel jobs)
    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
    #return "./out/mdx_extra_q/test/vocals.wav", "./out/mdx_extra_q/test/bass.wav", "./out/mdx_extra_q/test/drums.wav", "./out/mdx_extra_q/test/other.wav"
    if chosen_out_track == "vocals":
        return "./out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile

MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

device = 0 if torch.cuda.is_available() else "cpu"  # 0 = first CUDA GPU

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
    token=hf_token,
)
#@spaces.GPU
def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    # Run Whisper over the file in 30 s chunks; keep only the plain text
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text
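
# Usage sketch (the file path below is hypothetical, shown for illustration only):
#
#   text = transcribe("./out/mdx_extra_q/test/vocals.wav", "transcribe")
#   # Passing task="translate" instead would have Whisper translate the lyrics to English.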
import re

def format_lyrics(text):
    # Use regex to find parts that start with a capital letter and insert a newline
    formatted_text = re.sub(r'(?<!^)([A-Z])', r'\n\1', text)
    # Remove any leading whitespace on each line
    formatted_text = re.sub(r'^[ \t]+', '', formatted_text, flags=re.MULTILINE)
    return formatted_text
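
# Worked example (illustrative only): the first substitution inserts a newline
# before every capital letter except one at the very start of the string, which
# approximates line breaks in sung lyrics:
#
#   format_lyrics("Hello darkness My old friend")
#   # -> "Hello darkness \nMy old friend"   (note the space left before the \n)
#
# The second substitution then strips any leading spaces/tabs from each line.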
def infer(audio_input):
    # STEP 1 | Split the vocals from the song/audio file
    splt_result = split_process(audio_input, "vocals")
    print(splt_result)

    # STEP 2 | Transcribe the isolated vocals
    # TO-DO: handle errors if the JAX demo queue is full
    whisper_result = transcribe(
        splt_result,   # str (filepath or URL to file) in 'inputs' Audio component
        "transcribe",  # str in 'Task' Radio component
    )
    print(whisper_result)
    #return whisper_result[0]  # if using JAX

    lyrics = format_lyrics(whisper_result)
    print(lyrics)
    return splt_result, lyrics
css = """ | |
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;} | |
""" | |
with gr.Blocks(css=css) as demo: | |
with gr.Column(elem_id="col-container"): | |
gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;"> | |
<div | |
style=" | |
display: inline-flex; | |
align-items: center; | |
gap: 0.8rem; | |
font-size: 1.75rem; | |
" | |
> | |
<h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;"> | |
Song To Lyrics | |
</h1> | |
</div> | |
<p style="margin-bottom: 10px; font-size: 94%"> | |
Send the audio file of your favorite song, and get the lyrics ! <br /> | |
Under the hood, we split and get the vocals track from the audio file, then send the vocals to Whisper. | |
</p> | |
</div>""") | |
song_in = gr.Audio(label="Song input", type="numpy", sources="upload") | |
getlyrics_btn = gr.Button("Get Lyrics !") | |
vocals_out = gr.Audio(label="Vocals Only") | |
lyrics_res = gr.Textbox(label="Lyrics") | |
getlyrics_btn.click(fn=infer, inputs=[song_in], outputs=[vocals_out, lyrics_res]) | |
demo.queue().launch() | |