- Dockerfile +27 -0
- app.py +91 -0
- requirements.txt +10 -0
Dockerfile
ADDED
@@ -0,0 +1,27 @@
+# Use a lightweight Python base image (PyTorch is installed via requirements.txt)
+FROM python:3.10-slim
+
+# Install ffmpeg, libsndfile, and other system dependencies
+RUN apt-get update && apt-get install -y ffmpeg libsndfile1 && apt-get clean
+
+# Set working directory
+WORKDIR /app
+
+# Create a cache directory and set permissions
+RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+
+# Set environment variable for the Hugging Face cache
+ENV HF_HOME=/app/cache
+
+# Copy requirements and install dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application code
+COPY . .
+
+# Expose the API port
+EXPOSE 7860
+
+# Command to run the application
+CMD ["python", "app.py"]
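Once the image is built and started locally (for example, `docker build -t whisper-urdu-api .` then `docker run -p 7860:7860 whisper-urdu-api`, where the image name is arbitrary), the endpoint can be exercised from any HTTP client. A minimal client sketch using the `requests` library; the localhost URL and the `sample.wav` filename are placeholders, not part of this commit:

import requests  # client-side dependency, not part of the Space's requirements.txt

# Hypothetical client call: POST a WAV file to the /transcribe endpoint.
url = "http://localhost:7860/transcribe"  # placeholder; use the Space URL once deployed
with open("sample.wav", "rb") as f:  # "sample.wav" is a placeholder test file
    response = requests.post(url, files={"file": ("sample.wav", f, "audio/wav")})
print(response.json())  # {"transcription": "..."} on success, {"error": "..."} otherwise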
app.py
ADDED
@@ -0,0 +1,91 @@
+from fastapi import FastAPI, File, UploadFile
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import torch
+import io
+import soundfile as sf
+import numpy as np
+import torchaudio
+import logging
+import timeout_decorator
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+
+# Load the model and processor
+model_name = "ihanif/whisper-medium-urdu"
+try:
+    logger.info(f"Loading processor for {model_name}")
+    processor = WhisperProcessor.from_pretrained(model_name, language="urdu", task="transcribe")
+    logger.info(f"Loading model for {model_name}")
+    model = WhisperForConditionalGeneration.from_pretrained(model_name)  # Default to float32
+except Exception as e:
+    logger.error(f"Error loading model or processor: {str(e)}")
+    raise
+
+# Set Urdu language and task
+model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ur", task="transcribe")
+logger.info("Set forced_decoder_ids for Urdu transcription")
+
+# Move model to CPU (free Spaces don't have a GPU)
+device = "cpu"
+model.to(device)
+logger.info(f"Model loaded and moved to {device}")
+
+@app.post("/transcribe")
+async def transcribe_audio(file: UploadFile = File(...)):
+    try:
+        # Read audio file
+        logger.info("Reading audio file")
+        audio_data, sample_rate = sf.read(io.BytesIO(await file.read()))
+
+        # Ensure audio is mono
+        if len(audio_data.shape) > 1:
+            audio_data = np.mean(audio_data, axis=1)  # Convert to mono
+
+        # Resample to 16 kHz if necessary (Whisper expects 16 kHz input)
+        target_sample_rate = 16000
+        if sample_rate != target_sample_rate:
+            logger.info(f"Resampling audio from {sample_rate} Hz to {target_sample_rate} Hz")
+            audio_tensor = torch.from_numpy(audio_data).float()
+            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
+            audio_tensor = resampler(audio_tensor)
+            audio_data = audio_tensor.numpy()
+            sample_rate = target_sample_rate
+
+        # Trim silence
+        logger.info("Trimming silence")
+        audio_tensor = torch.from_numpy(audio_data).float()
+        vad = torchaudio.transforms.Vad(sample_rate=sample_rate)
+        audio_tensor = vad(audio_tensor)
+        audio_data = audio_tensor.numpy()
+
+        # Process audio input
+        logger.info("Processing audio input")
+        inputs = processor(audio_data, sampling_rate=sample_rate, return_tensors="pt")
+        input_features = inputs.input_features.to(device)
+
+        # Generate transcription with timeout
+        logger.info("Generating transcription")
+        @timeout_decorator.timeout(60, timeout_exception=TimeoutError)  # 60-second timeout
+        def generate_transcription():
+            with torch.no_grad():
+                generated_ids = model.generate(input_features, max_new_tokens=225)  # Limit output length
+            return generated_ids
+
+        generated_ids = generate_transcription()
+        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        return {"transcription": transcription}
+    except TimeoutError:
+        logger.error("Transcription timed out after 60 seconds")
+        return {"error": "Transcription took too long. Try a shorter audio file or a faster model."}
+    except Exception as e:
+        logger.error(f"Error during transcription: {str(e)}")
+        return {"error": str(e)}
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)
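For a quick check without Docker or a running server, FastAPI's TestClient can drive the route in-process. A sketch under two assumptions: `httpx` is installed (TestClient needs it, and requirements.txt below does not pin it), and importing the module triggers the model load, which downloads the whisper-medium checkpoint (roughly 3 GB) on first run. The `sample.wav` filename is again a placeholder:

from fastapi.testclient import TestClient  # requires httpx, not pinned in requirements.txt
import app as transcriber  # importing app.py loads the model (large download on first run)

client = TestClient(transcriber.app)
with open("sample.wav", "rb") as f:  # "sample.wav" is a placeholder test file
    response = client.post("/transcribe", files={"file": ("sample.wav", f, "audio/wav")})
print(response.json())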
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+transformers==4.38.2
+torch==2.0.1
+fastapi==0.103.0
+uvicorn==0.23.2
+pydantic==2.3.0
+soundfile==0.12.1
+python-multipart==0.0.9
+numpy==1.26.4
+timeout-decorator==0.5.0
+torchaudio==2.0.2
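The torch/torchaudio pins (2.0.1/2.0.2) are a matched pair, and numpy stays below 2.0, which torch 2.0.x builds expect. A quick sanity-check sketch that the pins resolved as expected inside the image:

import fastapi, torch, torchaudio, transformers

# Print the installed versions; torch may report a "+cpu" or "+cu" build suffix.
print(transformers.__version__)  # expected: 4.38.2
print(torch.__version__)         # expected: 2.0.1 (possibly with a build suffix)
print(torchaudio.__version__)    # expected: 2.0.2
print(fastapi.__version__)       # expected: 0.103.0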