v-e-n-o-m committed on
Commit
1bec6ca
·
1 Parent(s): cf10aa2
Files changed (3)
  1. Dockerfile +27 -0
  2. app.py +91 -0
  3. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ # Use a lightweight Python base image (PyTorch is installed via pip below)
+ FROM python:3.10-slim
+
+ # Install ffmpeg, libsndfile, and other system dependencies
+ RUN apt-get update && apt-get install -y ffmpeg libsndfile1 && apt-get clean
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Create a cache directory and open up its permissions
+ RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+
+ # Point the Hugging Face cache at the writable directory
+ ENV HF_HOME=/app/cache
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the application code
+ COPY . .
+
+ # Expose the API port
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["python", "app.py"]
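For local verification (not part of this commit), the image could be built and started with something like `docker build -t whisper-urdu-api .` followed by `docker run -p 7860:7860 whisper-urdu-api`; the `whisper-urdu-api` tag is a placeholder. On Hugging Face Spaces the build and run happen automatically, and 7860 is the port Spaces expects the app to listen on.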
app.py ADDED
@@ -0,0 +1,91 @@
+ from fastapi import FastAPI, File, UploadFile
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import torch
+ import io
+ import soundfile as sf
+ import numpy as np
+ import torchaudio
+ import logging
+ import timeout_decorator
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI()
+
+ # Load the model and processor
+ model_name = "ihanif/whisper-medium-urdu"
+ try:
+     logger.info(f"Loading processor for {model_name}")
+     processor = WhisperProcessor.from_pretrained(model_name, language="urdu", task="transcribe")
+     logger.info(f"Loading model for {model_name}")
+     model = WhisperForConditionalGeneration.from_pretrained(model_name)  # Default float32 weights
+ except Exception as e:
+     logger.error(f"Error loading model or processor: {str(e)}")
+     raise
+
+ # Force Urdu transcription regardless of the detected language
+ model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ur", task="transcribe")
+ logger.info("Set forced_decoder_ids for Urdu transcription")
+
+ # Run on CPU (free Spaces don't provide a GPU)
+ device = "cpu"
+ model.to(device)
+ logger.info(f"Model loaded and moved to {device}")
+
+ @app.post("/transcribe")
+ async def transcribe_audio(file: UploadFile = File(...)):
+     try:
+         # Read the uploaded audio file into memory
+         logger.info("Reading audio file")
+         audio_data, sample_rate = sf.read(io.BytesIO(await file.read()))
+
+         # Downmix multi-channel audio to mono
+         if len(audio_data.shape) > 1:
+             audio_data = np.mean(audio_data, axis=1)
+
+         # Resample to the 16 kHz rate Whisper expects
+         target_sample_rate = 16000
+         if sample_rate != target_sample_rate:
+             logger.info(f"Resampling audio from {sample_rate} Hz to {target_sample_rate} Hz")
+             audio_tensor = torch.from_numpy(audio_data).float()
+             resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
+             audio_tensor = resampler(audio_tensor)
+             audio_data = audio_tensor.numpy()
+             sample_rate = target_sample_rate
+
+         # Trim leading silence with torchaudio's VAD
+         logger.info("Trimming silence")
+         audio_tensor = torch.from_numpy(audio_data).float()
+         vad = torchaudio.transforms.Vad(sample_rate=sample_rate)
+         audio_tensor = vad(audio_tensor)
+         audio_data = audio_tensor.numpy()
+
+         # Convert the waveform to log-mel input features
+         logger.info("Processing audio input")
+         inputs = processor(audio_data, sampling_rate=sample_rate, return_tensors="pt")
+         input_features = inputs.input_features.to(device)
+
+         # Generate under a timeout (timeout_decorator is signal-based by default, so this must run in the main thread)
+         logger.info("Generating transcription")
+         @timeout_decorator.timeout(60, timeout_exception=TimeoutError)  # 60-second timeout
+         def generate_transcription():
+             with torch.no_grad():
+                 generated_ids = model.generate(input_features, max_new_tokens=225)  # Cap output length
+             return generated_ids
+
+         generated_ids = generate_transcription()
+         transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+         return {"transcription": transcription}
+     except TimeoutError:
+         logger.error("Transcription timed out after 60 seconds")
+         return {"error": "Transcription took too long. Try a shorter audio file or a faster model."}
+     except Exception as e:
+         logger.error(f"Error during transcription: {str(e)}")
+         return {"error": str(e)}
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
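A minimal client sketch for exercising the endpoint once the container is up; it is not part of the commit, and the `sample.wav` path and `localhost:7860` address are assumptions for a local test:

import requests

# Hypothetical smoke test: POST a local audio file to the /transcribe endpoint
with open("sample.wav", "rb") as f:  # assumed test file
    response = requests.post(
        "http://localhost:7860/transcribe",
        files={"file": ("sample.wav", f, "audio/wav")},
    )
print(response.json())  # {"transcription": "..."} on success, {"error": "..."} otherwise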
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ transformers==4.38.2
+ torch==2.0.1
+ fastapi==0.103.0
+ uvicorn==0.23.2
+ pydantic==2.3.0
+ soundfile==0.12.1
+ python-multipart==0.0.9
+ numpy==1.26.4
+ timeout-decorator==0.5.0
+ torchaudio==2.0.2
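The pins are mutually consistent: torchaudio 2.0.2 is the release paired with torch 2.0.1, and numpy stays on the 1.x line that these torch builds expect.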