"""
Description:
This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK.

Dependencies:
All the necessary dependencies are listed in requirements.txt.

Usage:
The demo can be run locally by installing the dependencies in a Python virtual environment, or it can be run in a Hugging Face Space.

Author: Lorenzo Concina
Date: 4/6/2025
"""
import os
import torch
import librosa as lb
import gradio as gr
from transformers import AutoProcessor, pipeline


def load_fama(model_id, output_lang):
    processor = AutoProcessor.from_pretrained(model_id)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # FAMA selects the output language through a forced <lang:xx> token
    lang_tag = "<lang:{}>".format(output_lang)
    lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)

    generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id}

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device=device,
        return_timestamps=False,
        generate_kwargs=generate_kwargs,
        chunk_length_s=60,
        stride_length_s=1
    )
    return pipe
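
# Minimal sketch of using load_fama() outside Gradio (hedged: "sample.wav" is a
# hypothetical local file; the transformers ASR pipeline also accepts raw audio arrays):
#
#   pipe = load_fama("FBK-MT/fama-small", "it")
#   print(pipe("sample.wav")["text"])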

def load_audio_file(audio_path):
    # Load and resample the audio to 16 kHz mono, as expected by the models
    y, sr = lb.load(audio_path, sr=16000, mono=True)
    return y


def transcribe(audio, task_type, model_id, output_lang):
    """
    Function called by the Gradio interface. It runs model inference on an audio sample.
    """
    # task_type is passed by the UI but not used here: the selected model and
    # output language already determine the behaviour.
    # Cache pipelines per (model, language) pair so each one is loaded only once.
    cache_key = (model_id, output_lang)
    if cache_key not in model_cache:
        model_cache[cache_key] = load_fama(model_id, output_lang)

    # Renamed from "pipeline" to avoid shadowing the transformers factory imported above
    pipe = model_cache[cache_key]

    if isinstance(audio, str) and os.path.isfile(audio):
        # Gradio passes the recording/upload as a file path
        utterance = load_audio_file(audio)
        result = pipe(utterance)
    else:
        result = pipe(audio)
    return result["text"]
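
# Example call, as wired to the Gradio button below (a sketch: "sample.wav" is a
# hypothetical file path, and the output language must be one of language_choices):
#
#   text = transcribe("sample.wav", "ASR", "FBK-MT/fama-small-asr", "en")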


def update_model_options(task_type):
    # ASR-only checkpoints cannot translate, so hide them when the ST task is selected
    if task_type == "ST":
        return gr.update(choices=["FBK-MT/fama-small", "FBK-MT/fama-medium"], value="FBK-MT/fama-small")
    else:
        return gr.update(choices=[
            "FBK-MT/fama-small",
            "FBK-MT/fama-medium",
            "FBK-MT/fama-small-asr",
            "FBK-MT/fama-medium-asr"
        ], value="FBK-MT/fama-small")


# Output languages supported by the demo
language_choices = ["en", "it"]

# Cache of loaded pipelines, keyed by (model_id, output_lang)
model_cache = {}


if __name__ == "__main__":

    with gr.Blocks() as iface:
        gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo powered by FAMA models, developed at FBK. \
More information about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""")

        with gr.Row():
            audio_input = gr.Audio(type="filepath", label="Upload or record audio")
            task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")

        model_input = gr.Radio(choices=[
            "FBK-MT/fama-small",
            "FBK-MT/fama-medium",
            "FBK-MT/fama-small-asr",
            "FBK-MT/fama-medium-asr"
        ], value="FBK-MT/fama-small", label="Select a FAMA model")

        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Output language")

        output = gr.Textbox(label="Transcription")

        # Keep the model choices consistent with the selected task
        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=model_input)

        transcribe_btn = gr.Button("Transcribe")
        transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)

    iface.launch()
|