# Music-To-Lyrics / app.py
import gradio as gr
from gradio_client import Client
import torch
import os
from scipy.io.wavfile import write
hf_token = os.environ.get('HF_TOKEN')
#splt_client = Client("https://fffiloni-splittrack2musicgen.hf.space/")
def split_process(audio, chosen_out_track):
    # audio arrives as a (sample_rate, numpy_data) tuple from the gr.Audio component
    os.makedirs("out", exist_ok=True)
    write('test.wav', audio[0], audio[1])
    # Run Demucs source separation on the uploaded track
    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
    #return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
    if chosen_out_track == "vocals":
        return "./out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"

from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile

MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
    token=hf_token,
)
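
# Optional (untested in this Space): when running on GPU, passing
# torch_dtype=torch.float16 to pipeline() is a common way to cut memory use and
# speed up Whisper inference without changing the rest of the code.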

#@spaces.GPU
def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    # With return_timestamps=True the pipeline also produces timestamped "chunks";
    # only the plain "text" field is kept here.
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text

import re

def format_lyrics(text):
    # Use regex to find parts that start with a capital letter and insert a newline
    formatted_text = re.sub(r'(?<!^)([A-Z])', r'\n\1', text)
    # Remove any leading whitespace on each line
    formatted_text = re.sub(r'^[ \t]+', '', formatted_text, flags=re.MULTILINE)
    return formatted_text
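
# Example of the heuristic above: "Hello darkness my old friend I've come to talk"
# comes back as two lines, with the break inserted just before the capital "I".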

def infer(audio_input):
    # STEP 1 | Split vocals from the song/audio file
    splt_result = split_process(audio_input, "vocals")
    print(splt_result)

    # STEP 2 | Transcribe
    # TO-DO : handling errors if JAX demo queue is full
    whisper_result = transcribe(
        splt_result,   # str (filepath or URL to file) in 'inputs' Audio component
        "transcribe",  # str in 'Task' Radio component
    )
    print(whisper_result)
    #return whisper_result[0] # if using JAX

    # STEP 3 | Reformat the raw transcription into line-broken lyrics
    lyrics = format_lyrics(whisper_result)
    print(lyrics)

    return splt_result, lyrics
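
# infer() returns (vocals_filepath, lyrics_text), matching the (vocals_out, lyrics_res)
# output components wired to the button click below.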
css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
            <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
            >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                    Song To Lyrics
                </h1>
            </div>
            <p style="margin-bottom: 10px; font-size: 94%">
                Send the audio file of your favorite song, and get the lyrics! <br />
                Under the hood, we split out the vocals track from the audio file, then send the vocals to Whisper.
            </p>
        </div>""")

        song_in = gr.Audio(label="Song input", type="numpy", sources="upload")
        getlyrics_btn = gr.Button("Get Lyrics!")
        vocals_out = gr.Audio(label="Vocals Only")
        lyrics_res = gr.Textbox(label="Lyrics")

        getlyrics_btn.click(fn=infer, inputs=[song_in], outputs=[vocals_out, lyrics_res])

demo.queue().launch()
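
# To run locally (assuming gradio, torch, transformers, demucs and ffmpeg are
# available; HF_TOKEN may be set in the environment for authenticated downloads):
#   python app.py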