from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from transformers import pipeline
import torch
import soundfile as sf
import io
import numpy as np

app = FastAPI()

# Initialize the Whisper pipeline once at startup so every request reuses the loaded model
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device="cuda" if torch.cuda.is_available() else "cpu",
)


@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), language: str = Form(...)):
    # Validate the language before touching the audio payload
    valid_languages = {"english": "en", "urdu": "ur", "arabic": "ar"}
    if language.lower() not in valid_languages:
        raise HTTPException(
            status_code=400,
            detail="Invalid language. Use 'english', 'urdu', or 'arabic'.",
        )

    try:
        # Read the uploaded file into memory and decode it with soundfile
        audio_bytes = await audio.read()
        audio_data, sample_rate = sf.read(io.BytesIO(audio_bytes))

        # Downmix multi-channel audio to mono
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)

        # Whisper's feature extractor expects 16 kHz input
        if sample_rate != 16000:
            raise HTTPException(status_code=400, detail="Audio must be 16kHz.")

        # Transcribe in the requested language (cast to float32; soundfile
        # returns float64 by default)
        result = pipe(
            audio_data.astype(np.float32),
            generate_kwargs={
                "language": valid_languages[language.lower()],
                "task": "transcribe",
            },
            return_timestamps=False,
        )
        return {"text": result["text"]}
    except HTTPException:
        # Re-raise client errors (e.g. wrong sample rate) instead of masking them as 500s
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Transcription error: {e}")
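
# --- Usage sketch ---
# Assumptions (not from the original code): the file is saved as main.py and a
# mono 16 kHz WAV named sample_16k.wav exists locally; adjust both to your setup.
#
# Run the server:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Send a transcription request with curl (multipart form, matching the
# File/Form parameters above):
#   curl -X POST http://localhost:8000/transcribe \
#        -F "audio=@sample_16k.wav" \
#        -F "language=urdu"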