bofenghuang commited on
Commit
a356f8e
·
1 Parent(s): 2c9c9de

merge interfaces

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. requirements.txt +1 -0
  3. run_demo.py +77 -0
app.py CHANGED
@@ -1 +1 @@
1
- run_demo_microphone.py
 
1
+ run_demo.py
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  transformers
2
  torch
 
3
  pyctcdecode
4
  pypi-kenlm
 
1
  transformers
2
  torch
3
+ torchaudio
4
  pyctcdecode
5
  pypi-kenlm
run_demo.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import warnings

import gradio as gr
import torchaudio
from transformers import pipeline
from transformers.utils.logging import disable_progress_bar

# Target sample rate expected by the wav2vec2 model.
SAMPLE_RATE = 16_000

# Quiet down third-party noise: suppress warnings and the transformers
# download progress bars.
warnings.filterwarnings("ignore")
disable_progress_bar()

# Structured, timestamped logging for the demo process.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Load the French ASR model once at startup; reused for every request.
pipe = pipeline(model="bhuang/asr-wav2vec2-french")
logger.info("ASR pipeline has been initialized")
24
+
25
+
26
def process_audio_file(audio_file):
    """Load *audio_file* and return a 1-D mono waveform at ``SAMPLE_RATE``.

    Args:
        audio_file: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        A 1-D ``torch.Tensor`` of samples at 16 kHz.
    """
    waveform, sample_rate = torchaudio.load(audio_file)

    # Downmix to mono by averaging the channel dimension. The previous
    # squeeze(axis=0) only removed a singleton channel dim, so a stereo
    # recording stayed 2-D (2, n) and reached the pipeline un-mixed; for
    # mono input mean(dim=0) yields the same values as squeeze did.
    waveform = waveform.mean(dim=0)

    # Resample to the model's expected rate if needed.
    if sample_rate != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)
        waveform = resampler(waveform)

    return waveform
36
+
37
+
38
def transcribe(microphone_audio_file, uploaded_audio_file):
    """Transcribe audio from the microphone or an uploaded file.

    When both sources are provided the microphone recording wins and a
    warning is prepended to the returned transcription. When neither is
    provided an error message is returned instead.
    """
    # Guard: nothing to transcribe at all.
    if microphone_audio_file is None and uploaded_audio_file is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    notice = ""
    if microphone_audio_file is not None and uploaded_audio_file is not None:
        # Both inputs set: prefer the microphone, warn about the upload.
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # Microphone takes precedence whenever it is available.
    selected_file = microphone_audio_file if microphone_audio_file is not None else uploaded_audio_file

    waveform = process_audio_file(selected_file)

    # chunking variant kept for reference:
    # text = pipe(waveform, chunk_length_s=30, stride_length_s=5)["text"]
    transcription = pipe(waveform)["text"]
    logger.info(f"Transcription for {selected_file}: {transcription}")

    return notice + transcription
60
+
61
+
62
# Two optional audio sources: a live microphone recording and a file upload.
# Both hand the demo a filesystem path (type="filepath").
microphone_input = gr.Audio(source="microphone", type="filepath", label="Record something...", optional=True)
upload_input = gr.Audio(source="upload", type="filepath", label="Upload some audio file...", optional=True)

iface = gr.Interface(
    fn=transcribe,
    inputs=[microphone_input, upload_input],
    outputs="text",
    layout="horizontal",
    title="Speech-to-Text in French",
    description="Realtime demo for French automatic speech recognition.",
    allow_flagging="never",
)

# Queue requests so long transcriptions don't time out the HTTP connection.
iface.launch(enable_queue=True)