# Add custom Whisper-large-v3 API with language param
# (commit c4b6df5, uploaded by v-e-n-o-m)
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from transformers import pipeline
import torch
import soundfile as sf
import io
import numpy as np
app = FastAPI()

# Probe for CUDA once and derive both the device and the weight precision
# from it: half precision on GPU, full precision on CPU.
if torch.cuda.is_available():
    _device, _dtype = "cuda", torch.float16
else:
    _device, _dtype = "cpu", torch.float32

# The Whisper pipeline is built at import time so every request reuses the
# same loaded model.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=_dtype,
    device=_device,
)
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), language: str = Form(...)):
    """Transcribe an uploaded audio file with Whisper in the requested language.

    Args:
        audio: Uploaded audio file. It is decoded with ``soundfile`` and must
            be sampled at 16 kHz; multi-channel audio is averaged to mono.
        language: One of "english", "urdu" or "arabic" (case-insensitive).

    Returns:
        A dict of the form ``{"text": <transcription>}``.

    Raises:
        HTTPException: 400 for an unsupported language, an empty upload, or a
            sample rate other than 16 kHz; 500 for any other failure while
            decoding or transcribing the audio.
    """
    valid_languages = {"english": "en", "urdu": "ur", "arabic": "ar"}
    try:
        # Validate language (single lookup; None means unsupported).
        language_code = valid_languages.get(language.lower())
        if language_code is None:
            raise HTTPException(
                status_code=400,
                detail="Invalid language. Use 'english', 'urdu', or 'arabic'.",
            )

        # Read audio; an empty body can never decode, so reject it up front
        # as a client error instead of letting the decoder fail.
        audio_bytes = await audio.read()
        if not audio_bytes:
            raise HTTPException(status_code=400, detail="Audio file is empty.")
        audio_data, sample_rate = sf.read(io.BytesIO(audio_bytes))

        # Ensure mono, 16kHz: collapse channels by averaging, and refuse any
        # other sample rate because no resampling is done here.
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        if sample_rate != 16000:
            raise HTTPException(status_code=400, detail="Audio must be 16kHz.")

        # Transcribe with the requested language.
        # NOTE(review): pipe() is a synchronous call inside an async handler,
        # so it occupies the event loop for the duration of inference.
        result = pipe(
            audio_data,
            generate_kwargs={"language": language_code, "task": "transcribe"},
            return_timestamps=False,
        )
        return {"text": result["text"]}
    except HTTPException:
        # Deliberate 4xx responses raised above must reach the client as-is;
        # without this clause the generic handler below would turn them
        # into 500s.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Transcription error: {str(e)}"
        ) from e