import os
import re

import gradio as gr
import torch
from gradio_client import Client
from scipy.io.wavfile import write
from transformers import pipeline

hf_token = os.environ.get('HF_TOKEN')
#splt_client = Client("https://fffiloni-splittrack2musicgen.hf.space/")
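
# Song To Lyrics: upload a song, split out the vocals with Demucs, transcribe
# the vocals with Whisper, and reflow the transcript into lyric-style lines.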

def split_process(audio, chosen_out_track):
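    # `audio` arrives as a (sample_rate, numpy_array) tuple from the
    # numpy-type gr.Audio input; write it to test.wav for the Demucs CLI.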
    os.makedirs("out", exist_ok=True)
    write('test.wav', audio[0], audio[1])
    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
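    # Demucs writes one file per stem under out/<model>/<track-name>/, i.e.
    # out/mdx_extra_q/test/{vocals,bass,drums,other}.wav for test.wav.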
    #return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
    if chosen_out_track == "vocals":
        return "/out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"


MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8

device = 0 if torch.cuda.is_available() else "cpu"

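# Whisper ASR pipeline; chunk_length_s=30 enables chunked long-form
# transcription, so audio longer than 30 s is processed in windows.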
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
    token=hf_token
)


#@spaces.GPU
def transcribe(inputs, task):
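    # `inputs` is an audio filepath (here, the separated vocals stem); `task`
    # is "transcribe" or "translate", forwarded to Whisper via generate_kwargs.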
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text


def format_lyrics(text):
    # Use regex to find parts that start with a capital letter and insert a newline
    formatted_text = re.sub(r'(?<!^)([A-Z])', r'\n\1', text)
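    # Heuristic: assumes each lyric line starts with a capital letter; any
    # mid-string capital (names, "I") will also start a new line.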

    # Remove any leading whitespace on each line
    formatted_text = re.sub(r'^[ \t]+', '', formatted_text, flags=re.MULTILINE)

    return formatted_text

def infer(audio_input):

    # STEP 1 | Split vocals from the song/audio file
    splt_result = split_process(audio_input, "vocals")
    print(splt_result)

    # STEP 2 | Transcribe the vocals with Whisper
    whisper_result = transcribe(
        splt_result,   # filepath to the separated vocals track
        "transcribe",  # Whisper task ("transcribe" or "translate")
    )
    print(whisper_result)

    #return whisper_result[0] # if using JAX

    lyrics = format_lyrics(whisper_result)

    print(lyrics)

    return splt_result, lyrics

css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
                <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
                >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                    Song To Lyrics
                </h1>
                </div>
                <p style="margin-bottom: 10px; font-size: 94%">
                Send the audio file of your favorite song, and get the lyrics! <br />
                Under the hood, we split the vocals track out of the audio file with Demucs, then send the vocals to Whisper for transcription.
                </p>
            </div>""")
        song_in = gr.Audio(label="Song input", type="numpy", sources=["upload"])
        getlyrics_btn = gr.Button("Get Lyrics!")
        vocals_out = gr.Audio(label="Vocals Only")
        lyrics_res = gr.Textbox(label="Lyrics")

    getlyrics_btn.click(fn=infer, inputs=[song_in], outputs=[vocals_out, lyrics_res])

demo.queue().launch()