"""Password-protected Gradio app for speech transcription.

On import the module validates its configuration (environment variables) and
loads a Hugging Face ``automatic-speech-recognition`` pipeline. When run as a
script it builds the Gradio UI and launches it.
"""

import hmac
import logging
import os
import time
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
from transformers import pipeline

# Create a logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check that all required variables are set (and non-empty). An empty
# PASSWORD in particular must be rejected, otherwise a blank password field
# would grant access.
# NOTE(review): HF_TOKEN is not read directly in this file; presumably it is
# consumed by the Hugging Face libraries when loading the model - confirm.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    if not os.environ.get(required_variable):
        error_message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(error_message)
        raise ValueError(error_message)

# Create the transcription pipeline.
model_name = os.environ["MODEL_NAME"]
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info("Loading model %s with device %s...", model_name, device)
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device=device,
)
logger.info("Model loaded successfully.")


def main():
    """Build the Gradio interface and start serving it."""
    interface = create_interface()
    interface.launch()


def create_interface():
    """Create the Gradio interface for the transcription service.

    Returns:
        The ``gr.Blocks`` object containing the password field, the audio
        input, the transcribe button and the output/status text boxes, with
        ``transcribe_audio`` wired to both the button click and audio changes.
    """
    # The UI is a block of Gradio components.
    with gr.Blocks() as interface:
        # Title.
        gr.Markdown("# Whisper Speech Transcription")

        # Row for the password input.
        with gr.Row():
            with gr.Column(scale=2):
                password_input = gr.Textbox(
                    label="Enter Password",
                    placeholder="Enter the password to access the transcription service",
                    type="password",
                )

        # Row for audio input.
        with gr.Row():
            with gr.Column(scale=2):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="numpy",
                    label="Record or Upload Audio",
                )

        # Row for the transcription button.
        with gr.Row():
            transcribe_button = gr.Button("Transcribe", variant="primary")

        # Row for the transcription output.
        with gr.Row():
            output_text = gr.Textbox(
                label="Transcription Output",
                placeholder="Transcription will appear here...",
                lines=5,
            )

        # Status message for transcription time.
        status_text = gr.Textbox(
            label="Status",
            placeholder="Transcription status will appear here...",
            lines=1,
            interactive=False,
        )

        # Set up the transcribe button click event.
        transcribe_button.click(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )

        # Also transcribe when audio is recorded/uploaded.
        audio_input.change(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )

    return interface


def _password_is_valid(password: Optional[str]) -> bool:
    """Return True when *password* matches the PASSWORD environment variable."""
    expected = os.environ.get("PASSWORD")
    # Never authenticate against a missing/empty secret or a missing input.
    if not expected or password is None:
        return False
    # Constant-time comparison; bytes are used because compare_digest only
    # accepts ASCII-only str arguments.
    return hmac.compare_digest(
        password.encode("utf-8"), expected.encode("utf-8")
    )


def transcribe_audio(
    audio: Optional[Tuple[int, np.ndarray]],
    password: Optional[str] = None,
) -> Tuple[str, str]:
    """Transcribe audio after checking the password.

    Args:
        audio: ``(sampling_rate, samples)`` pair as delivered by the
            ``gr.Audio(type="numpy")`` component, or ``None`` when no audio
            is present. Multi-channel input is expected with channels on
            axis 1.
        password: Password entered by the user.

    Returns:
        A ``(text, status)`` pair: the transcription (or an error message)
        and a status line with timing information (empty on error).
    """
    # If the password is wrong, return an error message.
    if not _password_is_valid(password):
        return "Incorrect password. Please try again.", ""

    # If there is no audio, return an error message.
    if audio is None:
        return "No audio detected. Please record some audio.", ""

    # Start measuring the time.
    start_time = time.time()

    # Unpack the audio.
    sr, y = audio

    # A zero-length recording carries nothing to transcribe and would make
    # the peak computation below fail on an empty array.
    if y.size == 0:
        return "No audio detected. Please record some audio.", ""

    # Convert to mono if stereo by averaging the channels.
    if y.ndim > 1:
        logger.debug("Converting %s channels to mono", y.shape[1])
        y = y.mean(axis=1)

    # Normalize audio: scale by the peak absolute value (astype returns a
    # copy, so the caller's array is not modified).
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:  # Avoid division by zero on pure silence.
        y /= max_abs

    audio_time = len(y) / sr
    logger.info(
        "Processing audio: %sHz, %s samples (~%.2fs)", sr, len(y), audio_time
    )

    # Run transcription in 30 s chunks with a 6 s / 0 s stride.
    result = transcriber(
        {"sampling_rate": sr, "raw": y},
        chunk_length_s=30,
        stride_length_s=[6, 0],
    )
    logger.info("Transcription completed.")

    # Calculate elapsed time.
    elapsed_time = time.time() - start_time
    status_string = (
        f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s "
        f"of audio with model {model_name}."
    )

    return result["text"], status_string


# Entrypoint: only launch the app when executed as a script, so importing
# this module does not start a server.
if __name__ == "__main__":
    main()