"""Quran transcription API.

A small FastAPI service that accepts an MP3 upload, converts it to 16 kHz mono
WAV with ffmpeg, and transcribes it with the ``tarteel-ai/whisper-base-ar-quran``
Whisper model running on CPU.

The model and the ASR pipeline are loaded at import time, so importing this
module is slow and requires network or a populated ``./cache`` directory.
"""
import asyncio
import logging
import os
import subprocess
import sys
import tempfile
import time
from contextlib import contextmanager

# The Hugging Face libraries resolve their cache locations when they are first
# imported, so HF_HOME has to be in the environment *before* `transformers` is
# imported below. Setting it afterwards (as this file used to) can be ignored.
os.makedirs("./cache", exist_ok=True)
os.environ["HF_HOME"] = "./cache"

import soundfile as sf  # noqa: E402
import torch  # noqa: E402
import transformers  # noqa: E402
import uvicorn  # noqa: E402
from fastapi import BackgroundTasks, FastAPI, File, HTTPException, UploadFile  # noqa: E402
from transformers import (  # noqa: E402
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline,
)

# Create logs directory
os.makedirs("./logs", exist_ok=True)

# Configure logging: everything goes both to ./logs/app.log and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("./logs/app.log"),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)

# Log system information
logger.info(f"Python version: {sys.version}")
logger.info(f"Transformers version: {transformers.__version__}")
logger.info(f"Torch version: {torch.__version__}")
logger.info("Set HF_HOME to ./cache")

# Maximum file size (5MB)
MAX_FILE_SIZE = 5 * 1024 * 1024

# Maximum audio duration (in seconds)
MAX_DURATION = 20

# Initialize FastAPI app
app = FastAPI(title="Quran Transcription API")


@app.get("/health", status_code=200)
async def health_check():
    """Report liveness and whether the module-level ``model`` is set."""
    logger.info("Health check requested")
    return {"status": "healthy", "model_loaded": model is not None}


@app.get("/debug")
async def debug():
    """Return basic runtime diagnostics (CUDA, model/pipeline state, config)."""
    logger.info("Debug endpoint requested")
    return {
        "cuda_available": torch.cuda.is_available(),
        "model_loaded": model is not None,
        "pipeline_initialized": asr is not None,
        "cache_dir": os.getenv("HF_HOME"),
        "port": os.getenv("PORT", "7860"),
    }


# Load model and processor. This runs at import time, outside of any HTTP
# request, so a failure is reported as a plain RuntimeError (an HTTPException
# has no request to answer here) and aborts startup.
try:
    model_id = "tarteel-ai/whisper-base-ar-quran"
    logger.info(f"Loading processor for model: {model_id}")
    processor = WhisperProcessor.from_pretrained(model_id)
    logger.info(f"Loading model: {model_id}")
    model = WhisperForConditionalGeneration.from_pretrained(model_id)
    model.generation_config.no_timestamps_token_id = (
        processor.tokenizer.convert_tokens_to_ids("<|notimestamps|>")
    )
except Exception as e:
    logger.exception(f"Failed to load model: {e}")
    raise RuntimeError("Model loading failed") from e

# Initialize ASR pipeline
try:
    logger.info("Initializing ASR pipeline")
    asr = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        device=-1,  # Force CPU
    )
except Exception as e:
    logger.exception(f"Failed to initialize ASR pipeline: {e}")
    raise RuntimeError("Pipeline initialization failed") from e


@contextmanager
def temporary_files():
    """Provide a temporary MP3 file and a sibling WAV path, cleaned up on exit.

    Yields:
        A ``(temp_mp3, temp_wav_path)`` tuple: an open, writable
        ``NamedTemporaryFile`` with an ``.mp3`` suffix, and the path of a
        ``.wav`` file next to it (the WAV file itself is not created here).

    On exit the MP3 handle is closed if the caller has not already done so,
    and both files are removed if they exist. Cleanup failures are logged,
    never raised.
    """
    temp_mp3 = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    # Swap only the extension; str.replace would also rewrite ".mp3" if it
    # happened to appear in a directory component of the temp path.
    temp_wav_path = os.path.splitext(temp_mp3.name)[0] + ".wav"
    try:
        yield temp_mp3, temp_wav_path
    finally:
        # close() is a no-op if the caller already closed the file; this
        # guards against leaking the handle when the caller fails early.
        temp_mp3.close()
        for path in (temp_mp3.name, temp_wav_path):
            try:
                os.unlink(path)
                logger.debug(f"Deleted temporary file: {path}")
            except FileNotFoundError:
                # Nothing to clean up (e.g. conversion never produced a WAV).
                pass
            except OSError as e:
                logger.warning(f"Failed to delete temporary file {path}: {str(e)}")


async def transcribe_with_timeout(audio, timeout=3000):
    """Run the ASR pipeline in a worker thread, bounded by ``timeout`` seconds.

    Args:
        audio: Input passed straight to the ``asr`` pipeline.
        timeout: Maximum time to wait for the result, in seconds.

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        HTTPException: 504 if the transcription does not finish in time.
    """
    loop = asyncio.get_running_loop()

    def _transcribe():
        started = time.perf_counter()
        text = asr(audio, return_timestamps=False)["text"]
        logger.info(f"Transcription took {time.perf_counter() - started:.2f} seconds")
        return text

    try:
        # NOTE: on timeout only the wait is abandoned; the executor thread
        # cannot be interrupted and keeps running until the pipeline returns.
        return await asyncio.wait_for(
            loop.run_in_executor(None, _transcribe),
            timeout=timeout,
        )
    except asyncio.TimeoutError:
        logger.error(f"Transcription timed out after {timeout} seconds")
        raise HTTPException(
            status_code=504,
            detail=f"Transcription timed out after {timeout} seconds",
        )


def check_audio_duration(audio_path, max_duration=20):
    """Measure an audio file's duration with ffprobe and compare it to a limit.

    Args:
        audio_path: Path of the audio file to probe.
        max_duration: Maximum allowed duration, in seconds.

    Returns:
        ``(is_valid, duration)``. ``is_valid`` is False when the duration
        exceeds ``max_duration`` or when probing fails; in the failure case
        ``duration`` is 0.
    """
    try:
        probe = subprocess.run(
            [
                "ffprobe", "-v", "error",
                "-show_entries", "format=duration",
                "-of", "default=noprint_wrappers=1:nokey=1",
                audio_path,
            ],
            capture_output=True,
            text=True,
            check=True,
            timeout=30,  # never let a stuck ffprobe hang the request
        )
        duration = float(probe.stdout.strip())
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # ffprobe failed / timed out / is missing, or printed a non-number.
        logger.error(f"Failed to check audio duration: {str(e)}")
        return False, 0

    logger.info(f"Audio duration: {duration:.2f} seconds")
    if duration > max_duration:
        logger.warning(
            f"Audio exceeds maximum duration: {duration:.2f} > {max_duration} seconds"
        )
        return False, duration
    return True, duration


@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded MP3 file.

    The upload is validated (extension, size, duration), converted to 16 kHz
    mono WAV with ffmpeg, and passed to the ASR pipeline.

    Returns:
        ``{"transcription": <text>}``.

    Raises:
        HTTPException: 400 for a missing/non-MP3 filename, an empty upload or
            audio whose duration cannot be determined; 413 when the file or
            audio is too large/long; 504 on conversion or transcription
            timeout; 500 for other failures.
    """
    logger.info(f"Received file: {file.filename}")

    # Check file extension (filename may be absent on some uploads).
    if not file.filename or not file.filename.lower().endswith(".mp3"):
        logger.error(f"Invalid file type: {file.filename}. Only MP3 is supported")
        raise HTTPException(status_code=400, detail="Only MP3 files are supported")

    try:
        mp3_data = await file.read()
    except Exception as e:
        logger.error(f"Failed to read MP3 file: {str(e)}")
        raise HTTPException(status_code=500, detail="Failed to read audio file") from e

    # Validation happens outside the try block above so that these
    # HTTPExceptions reach the client instead of being rewritten as a 500.
    file_size = len(mp3_data)
    logger.info(f"File size: {file_size / 1024:.2f} KB")
    if file_size == 0:
        logger.error("Uploaded file is empty")
        raise HTTPException(status_code=400, detail="Uploaded file is empty")
    if file_size > MAX_FILE_SIZE:
        logger.error(
            f"File too large: {file_size / (1024 * 1024):.2f} MB > {MAX_FILE_SIZE / (1024 * 1024)} MB"
        )
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size is {MAX_FILE_SIZE / (1024 * 1024)} MB",
        )

    with temporary_files() as (temp_mp3, temp_wav_path):
        try:
            # Save MP3 to temp file; close it so ffprobe/ffmpeg see all bytes.
            temp_mp3.write(mp3_data)
            temp_mp3.close()
            logger.info(f"Saved MP3 to temporary file: {temp_mp3.name}")

            # Check audio duration
            is_valid_duration, duration = check_audio_duration(temp_mp3.name, MAX_DURATION)
            if not is_valid_duration:
                if duration <= 0:
                    # Probing failed; "too long (0.00 seconds)" would be a lie.
                    raise HTTPException(
                        status_code=400,
                        detail="Could not determine audio duration; the file may not be valid MP3 audio",
                    )
                raise HTTPException(
                    status_code=413,
                    detail=f"Audio too long ({duration:.2f} seconds). Maximum duration is {MAX_DURATION} seconds",
                )

            # Convert MP3 to WAV
            logger.info(f"Converting MP3 to WAV: {temp_wav_path}")
            conversion = subprocess.run(
                ["ffmpeg", "-i", temp_mp3.name, "-ar", "16000", "-ac", "1", "-y", temp_wav_path],
                check=True,
                capture_output=True,
                text=True,
                timeout=30,
            )
            logger.debug(f"ffmpeg output: {conversion.stdout}")
        except subprocess.CalledProcessError as e:
            logger.error(f"ffmpeg conversion failed: {e.stderr}")
            raise HTTPException(status_code=500, detail="Audio conversion failed") from e
        except subprocess.TimeoutExpired as e:
            logger.error(f"ffmpeg conversion timed out after {e.timeout} seconds")
            raise HTTPException(status_code=504, detail="Audio conversion timed out") from e
        except HTTPException:
            raise  # Re-raise HTTPExceptions
        except Exception as e:
            logger.error(f"Unexpected error during conversion: {str(e)}")
            raise HTTPException(
                status_code=500, detail="Unexpected error during conversion"
            ) from e

        try:
            # Read WAV file
            audio, sample_rate = sf.read(temp_wav_path)
        except Exception as e:
            logger.error(f"Failed to read WAV file: {str(e)}")
            raise HTTPException(
                status_code=500, detail="Failed to read converted audio"
            ) from e

        logger.info(f"Read WAV file: {temp_wav_path}, sample rate: {sample_rate}")
        if sample_rate != 16000:
            logger.error(f"Invalid sample rate: {sample_rate}. Expected 16000 Hz")
            raise HTTPException(status_code=400, detail="Converted audio is not 16 kHz")

        try:
            # Transcribe with a 60 second timeout
            logger.info("Starting transcription")
            transcription = await transcribe_with_timeout(audio, timeout=60)
            logger.info(f"Transcription completed: {transcription}")
            return {"transcription": transcription}
        except HTTPException:
            raise  # Re-raise HTTPExceptions
        except Exception as e:
            logger.error(f"Transcription failed: {str(e)}")
            raise HTTPException(status_code=500, detail="Transcription failed") from e


@app.on_event("startup")
async def startup_event():
    """Log CUDA availability and allocated CUDA memory once the app starts."""
    logger.info("Application fully started")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    logger.info(
        f"Memory allocated: {torch.cuda.memory_allocated() if torch.cuda.is_available() else 'N/A'}"
    )


if __name__ == "__main__":
    port = int(os.getenv("PORT", "7860"))  # Use PORT env var or default to 7860
    logger.info(f"Starting Uvicorn server on port {port}")
    # NOTE(review): the "app:app" import string assumes this file is importable
    # as module `app` (i.e. named app.py) — confirm against the deployment.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=port,
        log_level="info",
        workers=1,  # Single worker to avoid resource issues
    )