from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from transformers import pipeline
import torch
import soundfile as sf
import io
import numpy as np

app = FastAPI()

# Initialize the Whisper pipeline once at startup so every request reuses it
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
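
# Optional helper (a minimal sketch, assuming scipy is installed): instead of
# rejecting non-16 kHz uploads in the endpoint below, the audio could be
# resampled to Whisper's expected 16 kHz rate with polyphase filtering.
from math import gcd
from scipy.signal import resample_poly

def resample_to_16k(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
    # Reduce the up/down factors by their GCD to keep the filter cheap
    g = gcd(16000, sample_rate)
    return resample_poly(audio_data, 16000 // g, sample_rate // g)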
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), language: str = Form(...)):
    try:
        # Validate language
        valid_languages = {"english": "en", "urdu": "ur", "arabic": "ar"}
        if language.lower() not in valid_languages:
            raise HTTPException(status_code=400, detail="Invalid language. Use 'english', 'urdu', or 'arabic'.")

        # Read the uploaded audio into memory and decode it
        audio_bytes = await audio.read()
        audio_file = io.BytesIO(audio_bytes)
        audio_data, sample_rate = sf.read(audio_file)

        # Downmix to mono and require Whisper's 16 kHz input rate
        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)
        if sample_rate != 16000:
            raise HTTPException(status_code=400, detail="Audio must be 16kHz.")

        # Transcribe in the requested language
        result = pipe(
            audio_data,
            generate_kwargs={"language": valid_languages[language.lower()], "task": "transcribe"},
            return_timestamps=False,
        )
        return {"text": result["text"]}
    except HTTPException:
        # Let the deliberate 4xx responses above pass through instead of
        # being re-wrapped as 500s by the handler below
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Transcription error: {str(e)}")
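
# To serve the API locally (a minimal sketch, assuming uvicorn is installed;
# running `uvicorn main:app` from the shell works equally well):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example client call (hypothetical file name and port, assuming the requests
# library is available; sample.wav should be a 16 kHz WAV file):
#
#   import requests
#   with open("sample.wav", "rb") as f:
#       r = requests.post(
#           "http://localhost:8000/transcribe",
#           files={"audio": ("sample.wav", f, "audio/wav")},
#           data={"language": "urdu"},
#       )
#   print(r.json())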