"""Gradio demo that transcribes microphone recordings with OpenAI Whisper."""

import os
import subprocess
import sys

# Whisper is installed at start-up, so this must run before `import whisper`.
# An argument list (no shell) is used, and `sys.executable -m pip` guarantees
# the package lands in the interpreter that is running this script.
# The call stays best-effort (check=False): if the install fails and Whisper is
# not already present, the `import whisper` below raises ImportError.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "git+https://github.com/openai/whisper.git",
    ],
    check=False,
)

import gradio as gr  # noqa: E402  (must follow the install step above)
import whisper  # noqa: E402

from share_btn import community_icon_html, loading_icon_html, share_js  # noqa: E402

# whisper model specification
model = whisper.load_model("tiny")


def inference(audio):
    """Transcribe one recorded audio clip to text.

    Args:
        audio: Path of the recorded audio file, as supplied by the
            ``gr.Audio(type="filepath")`` input, or ``None`` when the user
            clicked "Transcribe" without recording anything.

    Returns:
        A 4-tuple matching the ``outputs`` wired to the button click: the
        transcribed text followed by three ``gr.update`` values controlling
        the visibility of the community icon, loading icon and share button.
    """
    # Nothing was recorded: return an empty transcription and keep the share
    # widgets hidden instead of crashing inside whisper.load_audio(None).
    if audio is None:
        return (
            "",
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    # load audio data
    samples = whisper.load_audio(audio)
    # ensure sample is in correct format for inference (pads or trims the
    # clip; the page text tells users audio is cut after around 30 secs)
    samples = whisper.pad_or_trim(samples)

    # generate a log-mel spectrogram of the audio data
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    # No explicit model.detect_language() call here: its result was never
    # used, and whisper.decode detects the language itself when
    # DecodingOptions.language is left unset.
    # NOTE(review): fp16 is disabled presumably for CPU inference - confirm.
    options = whisper.DecodingOptions(fp16=False)

    # transcribe speech to text
    result = whisper.decode(model, mel, options)

    # print audio data as text
    print(result.text)
    return (
        result.text,
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )


# Page styling. In the `.gr-button:focus` rule the ring width is written as
# `calc(3px + var(--tw-ring-offset-width))`; without the `+` operator the
# calc() expression is invalid CSS and the focus ring is dropped.
css = """
        .gradio-container {
            font-family: 'IBM Plex Sans', sans-serif;
        }
        .gr-button {
            color: white;
            border-color: black;
            background: black;
        }
        input[type='range'] {
            accent-color: black;
        }
        .dark input[type='range'] {
            accent-color: #dfdfdf;
        }
        .container {
            max-width: 730px;
            margin: auto;
            padding-top: 1.5rem;
        }
        .details:hover {
            text-decoration: underline;
        }
        .gr-button {
            white-space: nowrap;
        }
        .gr-button:focus {
            border-color: rgb(147 197 253 / var(--tw-border-opacity));
            outline: none;
            box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
            --tw-border-opacity: 1;
            --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
            --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
            --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
            --tw-ring-opacity: .5;
        }
        .footer {
            margin-bottom: 45px;
            margin-top: 35px;
            text-align: center;
            border-bottom: 1px solid #e5e5e5;
        }
        .footer>p {
            font-size: .8rem;
            display: inline-block;
            padding: 0 10px;
            transform: translateY(10px);
            background: white;
        }
        .dark .footer {
            border-color: #303030;
        }
        .dark .footer>p {
            background: #0b0f19;
        }
        .prompt h4{
            margin: 1.25em 0 .25em 0;
            font-weight: bold;
            font-size: 115%;
        }
        .animate-spin {
            animation: spin 1s linear infinite;
        }
        @keyframes spin {
            from {
                transform: rotate(0deg);
            }
            to {
                transform: rotate(360deg);
            }
        }
        #share-btn-container {
            display: flex;
            margin-top: 1.5rem !important;
            padding-left: 0.5rem !important;
            padding-right: 0.5rem !important;
            background-color: #000000;
            justify-content: center;
            align-items: center;
            border-radius: 9999px !important;
            width: 13rem;
        }
        #share-btn {
            all: initial;
            color: #ffffff;font-weight: 600;
            cursor:pointer;
            font-family: 'IBM Plex Sans', sans-serif;
            margin-left: 0.5rem !important;
            padding-top: 0.25rem !important;
            padding-bottom: 0.25rem !important;
        }
        #share-btn * {
            all: unset;
        }
"""

block = gr.Blocks(css=css)


with block:
    # Page header / description.
    gr.HTML(
        """

Whisper

Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. This demo cuts audio after around 30 secs.

You can skip the queue by using google colab for the space:

"""
    )
    with gr.Group():
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                # get audio from microphone
                audio = gr.Audio(
                    label="Input Audio",
                    show_label=False,
                    source="microphone",
                    type="filepath",
                )

                btn = gr.Button("Transcribe")

        # Transcription result.
        text = gr.Textbox(show_label=False, elem_id="result-textarea")

        # Share controls start hidden; `inference` reveals them once a
        # transcription has been produced.
        with gr.Group(elem_id="share-btn-container"):
            community_icon = gr.HTML(community_icon_html, visible=False)
            loading_icon = gr.HTML(loading_icon_html, visible=False)
            share_button = gr.Button(
                "Share to community", elem_id="share-btn", visible=False
            )

        # The order of `outputs` must match the tuple returned by `inference`.
        btn.click(
            inference,
            inputs=[audio],
            outputs=[text, community_icon, loading_icon, share_button],
        )
        # Sharing runs entirely client-side through the JS snippet.
        share_button.click(None, [], [], _js=share_js)

        gr.HTML(''' ''')

block.launch()