from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from transformers.pipelines.audio_utils import ffmpeg_read
from huggingface_hub import login
import gradio as gr
import spaces
import torch
import os
# Access token and model repository ID are read from environment variables (e.g. Space secrets).
HF_TOKEN = os.getenv("HF")
MODEL_ID_HF = os.getenv("MODEL_ID_HF")

# Prefer GPU + float16 when available; otherwise fall back to CPU + float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = MODEL_ID_HF

# Load the fine-tuned Whisper checkpoint and build a chunked long-form ASR pipeline.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, token=HF_TOKEN, torch_dtype=torch_dtype, use_safetensors=True).to(device)
processor = AutoProcessor.from_pretrained(model_id, token=HF_TOKEN)
pipe = pipeline(
    "automatic-speech-recognition", model=model, tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor, token=HF_TOKEN, max_new_tokens=128,
    chunk_length_s=15, batch_size=8, torch_dtype=torch_dtype, device=device)
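
# Quick local sanity check of the pipeline (a commented-out sketch; "sample.wav" is a
# hypothetical local file, not part of this Space):
# print(pipe("sample.wav", generate_kwargs={"task": "transcribe", "language": "sq"})["text"])
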
@spaces.GPU
def transcribe(inputs, task="transcribe"):
    # The Gradio interfaces below only pass the audio path, so `task` needs a default
    # value; otherwise the call would fail with a missing-argument error.
    if inputs is None:
        raise gr.Error(
            "No audio file submitted! Please upload or record an audio file before submitting your request.")
    text = pipe(inputs, generate_kwargs={"task": task, "language": "sq"},
                return_timestamps=True)["text"]
    return text
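
# Example call (a sketch; "recording.wav" is a hypothetical local file):
# transcribe("recording.wav")  # -> transcribed Albanian text
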
demo = gr.Blocks()

# Both tabs share the same title and description, so define the description once.
DESCRIPTION = (
    "This fine-tuned Whisper model provides reliable transcription for Albanian audio, "
    "whether from a microphone or an uploaded file. Key details about this project:"
    "\n\n- Fine-tuned on more than 400 hours of carefully curated Albanian audio data."
    f"\n- Hosted on Hugging Face. Repository: [{model_id}](https://huggingface.co/{model_id})."
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
    ],
    outputs=gr.Textbox(label="Transcription", lines=8),
    title="Whisper Large V3 Turbo Shqip: Transcribe Audio",
    description=DESCRIPTION,
    allow_flagging="never",
)

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
    ],
    outputs=gr.Textbox(label="Transcription", lines=8),
    title="Whisper Large V3 Turbo Shqip: Transcribe Audio",
    description=DESCRIPTION,
    allow_flagging="never",
)
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])

demo.launch()
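
# For local testing outside a Hugging Face Space, one could instead expose a public link
# (a sketch, not part of the deployed app): demo.launch(share=True)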