- Dockerfile +27 -0
- app.py +91 -0
- requirements.txt +10 -0
Dockerfile
ADDED
@@ -0,0 +1,27 @@
+# Use a lightweight Python base image (PyTorch is installed via requirements.txt)
+FROM python:3.10-slim
+
+# Install ffmpeg, libsndfile, and other system dependencies
+RUN apt-get update && apt-get install -y ffmpeg libsndfile1 && apt-get clean
+
+# Set working directory
+WORKDIR /app
+
+# Create a cache directory and set permissions
+RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+
+# Set environment variable for the Hugging Face cache
+ENV HF_HOME=/app/cache
+
+# Copy requirements and install dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application code
+COPY . .
+
+# Expose the API port
+EXPOSE 7860
+
+# Command to run the application
+CMD ["python", "app.py"]
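Once the image is built and started locally (for example, `docker build -t whisper-urdu-api .` then `docker run -p 7860:7860 whisper-urdu-api`, where the image name is arbitrary), the endpoint can be exercised from any HTTP client. A minimal client sketch using the `requests` library; the localhost URL and the `sample.wav` filename are placeholders, not part of this commit:

import requests  # client-side dependency, not part of the Space's requirements.txt

# Hypothetical client call: POST a WAV file to the /transcribe endpoint.
url = "http://localhost:7860/transcribe"  # placeholder; use the Space URL once deployed
with open("sample.wav", "rb") as f:  # "sample.wav" is a placeholder test file
    response = requests.post(url, files={"file": ("sample.wav", f, "audio/wav")})
print(response.json())  # {"transcription": "..."} on success, {"error": "..."} otherwise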
app.py
ADDED
@@ -0,0 +1,91 @@
+from fastapi import FastAPI, File, UploadFile
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import torch
+import io
+import soundfile as sf
+import numpy as np
+import torchaudio
+import logging
+import timeout_decorator
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+
+# Load the model and processor
+model_name = "ihanif/whisper-medium-urdu"
+try:
+    logger.info(f"Loading processor for {model_name}")
+    processor = WhisperProcessor.from_pretrained(model_name, language="urdu", task="transcribe")
+    logger.info(f"Loading model for {model_name}")
+    model = WhisperForConditionalGeneration.from_pretrained(model_name)  # Default to float32
+except Exception as e:
+    logger.error(f"Error loading model or processor: {str(e)}")
+    raise
+
+# Set Urdu language and task
+model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ur", task="transcribe")
+logger.info("Set forced_decoder_ids for Urdu transcription")
+
+# Move model to CPU (free Spaces don't have a GPU)
+device = "cpu"
+model.to(device)
+logger.info(f"Model loaded and moved to {device}")
+
+@app.post("/transcribe")
+async def transcribe_audio(file: UploadFile = File(...)):
+    try:
+        # Read audio file
+        logger.info("Reading audio file")
+        audio_data, sample_rate = sf.read(io.BytesIO(await file.read()))
+
+        # Ensure audio is mono
+        if len(audio_data.shape) > 1:
+            audio_data = np.mean(audio_data, axis=1)  # Convert to mono
+
+        # Resample to 16 kHz if necessary (Whisper expects 16 kHz input)
+        target_sample_rate = 16000
+        if sample_rate != target_sample_rate:
+            logger.info(f"Resampling audio from {sample_rate} Hz to {target_sample_rate} Hz")
+            audio_tensor = torch.from_numpy(audio_data).float()
+            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
+            audio_tensor = resampler(audio_tensor)
+            audio_data = audio_tensor.numpy()
+            sample_rate = target_sample_rate
+
+        # Trim silence
+        logger.info("Trimming silence")
+        audio_tensor = torch.from_numpy(audio_data).float()
+        vad = torchaudio.transforms.Vad(sample_rate=sample_rate)
+        audio_tensor = vad(audio_tensor)
+        audio_data = audio_tensor.numpy()
+
+        # Process audio input
+        logger.info("Processing audio input")
+        inputs = processor(audio_data, sampling_rate=sample_rate, return_tensors="pt")
+        input_features = inputs.input_features.to(device)
+
+        # Generate transcription with timeout
+        logger.info("Generating transcription")
+        @timeout_decorator.timeout(60, timeout_exception=TimeoutError)  # 60-second timeout
+        def generate_transcription():
+            with torch.no_grad():
+                generated_ids = model.generate(input_features, max_new_tokens=225)  # Limit output length
+            return generated_ids
+
+        generated_ids = generate_transcription()
+        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        return {"transcription": transcription}
+    except TimeoutError:
+        logger.error("Transcription timed out after 60 seconds")
+        return {"error": "Transcription took too long. Try a shorter audio file or a faster model."}
+    except Exception as e:
+        logger.error(f"Error during transcription: {str(e)}")
+        return {"error": str(e)}
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)
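For a quick check without Docker or a running server, FastAPI's TestClient can drive the route in-process. A sketch under two assumptions: `httpx` is installed (TestClient needs it, and requirements.txt below does not pin it), and importing the module triggers the model load, which downloads the whisper-medium checkpoint (roughly 3 GB) on first run. The `sample.wav` filename is again a placeholder:

from fastapi.testclient import TestClient  # requires httpx, not pinned in requirements.txt
import app as transcriber  # importing app.py loads the model (large download on first run)

client = TestClient(transcriber.app)
with open("sample.wav", "rb") as f:  # "sample.wav" is a placeholder test file
    response = client.post("/transcribe", files={"file": ("sample.wav", f, "audio/wav")})
print(response.json())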
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+transformers==4.38.2
+torch==2.0.1
+fastapi==0.103.0
+uvicorn==0.23.2
+pydantic==2.3.0
+soundfile==0.12.1
+python-multipart==0.0.9
+numpy==1.26.4
+timeout-decorator==0.5.0
+torchaudio==2.0.2
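The torch/torchaudio pins (2.0.1/2.0.2) are a matched pair, and numpy stays below 2.0, which torch 2.0.x builds expect. A quick sanity-check sketch that the pins resolved as expected inside the image:

import fastapi, torch, torchaudio, transformers

# Print the installed versions; torch may report a "+cpu" or "+cu" build suffix.
print(transformers.__version__)  # expected: 4.38.2
print(torch.__version__)         # expected: 2.0.1 (possibly with a build suffix)
print(torchaudio.__version__)    # expected: 2.0.2
print(fastapi.__version__)       # expected: 0.103.0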