v-e-n-o-m committed
Commit a34fbfb · 1 Parent(s): 83b3a6a
Files changed (4)
  1. .gitattributes +2 -35
  2. Dockerfile +21 -18
  3. app.py +43 -31
  4. requirements.txt +7 -7
.gitattributes CHANGED
@@ -1,35 +1,2 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.webm filter=lfs
+ *.wav filter=lfs
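Note on the two new rules: both carry only the filter=lfs attribute. The usual way such lines are generated (assuming the git-lfs CLI is installed) is

    git lfs track "*.webm"
    git lfs track "*.wav"

which appends the full form, e.g. *.webm filter=lfs diff=lfs merge=lfs -text. The filter attribute alone is what routes matching files through LFS on commit and checkout; diff=lfs, merge=lfs, and -text only control how git diffs, merges, and line-ending-normalizes them, so the hand-written short form still stores the media in LFS.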
Dockerfile CHANGED
@@ -1,23 +1,26 @@
  FROM python:3.10-slim
 
- # Set environment variables early
- ENV TRANSFORMERS_CACHE=/app/cache
- ENV HF_HOME=/app/cache
- ENV PYTHONUNBUFFERED=1
-
- WORKDIR /app
-
- # Install dependencies
- RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
-
- # Create cache directory with permissions
- RUN mkdir -p /app/cache && chmod -R 777 /app/cache
-
- COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir \
+     fastapi==0.115.2 \
+     uvicorn==0.32.0 \
+     transformers==4.46.0 \
+     torch==2.5.0 \
+     python-multipart==0.0.12 \
+     soundfile==0.12.1 \
+     numpy==1.26.4 \
+     pydub==0.25.1
 
+ # Copy app
  COPY app.py .
 
- EXPOSE 8000
+ # Expose port
+ EXPOSE 7860
 
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
+ # Run (uvicorn takes a module:attribute target, so "app:app", not "app.py:app")
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
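A quick local smoke test of the image (the tag whisper-api is a hypothetical name; any tag works):

    docker build -t whisper-api .
    docker run -p 7860:7860 whisper-api

Port 7860 is consistent with the default app port for Hugging Face Docker Spaces, which is presumably why EXPOSE and CMD moved from 8000 to 7860.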
app.py CHANGED
@@ -1,50 +1,62 @@
- from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+ from fastapi import FastAPI, File, UploadFile, Form
  from transformers import pipeline
  import soundfile as sf
  import io
  import numpy as np
- import logging
  import torch
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
+ from pydub import AudioSegment
+ import tempfile
+ import os
 
  app = FastAPI()
 
- try:
-     logger.info("Loading Whisper-large-v3...")
-     pipe = pipeline(
-         "automatic-speech-recognition",
-         model="openai/whisper-large-v3",
-         torch_dtype=torch.float16,
-         device="cpu",
-         model_kwargs={"cache_dir": "/app/cache"},
-     )
-     logger.info("Model loaded successfully")
- except Exception as e:
-     logger.error(f"Failed to load model: {str(e)}")
-     raise
+ print("Loading Whisper-large-v3...")
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model="openai/whisper-large-v3",  # Try "openai/whisper-medium" if memory crashes
+     torch_dtype=torch.float16,
+     device="cuda" if torch.cuda.is_available() else "cpu",
+ )
+ print("Model loaded successfully")
 
  @app.post("/transcribe")
  async def transcribe(audio: UploadFile = File(...), language: str = Form(...)):
      try:
-         valid_languages = {"english": "en", "urdu": "ur", "arabic": "ar"}
+         valid_languages = {"en": "en", "ur": "ur", "ar": "ar"}
          if language.lower() not in valid_languages:
-             raise HTTPException(status_code=400, detail="Invalid language. Use 'english', 'urdu', or 'arabic'.")
+             return {"error": "Invalid language. Use 'en', 'ur', or 'ar'"}
+
+         # Read audio bytes
          audio_bytes = await audio.read()
-         audio_file = io.BytesIO(audio_bytes)
-         audio_data, sample_rate = sf.read(audio_file)
+
+         # Convert WebM to WAV
+         with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_webm:
+             temp_webm.write(audio_bytes)
+             temp_webm_path = temp_webm.name
+
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+             temp_wav_path = temp_wav.name
+
+         audio_segment = AudioSegment.from_file(temp_webm_path)
+         audio_segment = audio_segment.set_frame_rate(16000).set_channels(1)
+         audio_segment.export(temp_wav_path, format="wav")
+
+         # Read WAV
+         audio_data, sample_rate = sf.read(temp_wav_path)
          if len(audio_data.shape) > 1:
              audio_data = np.mean(audio_data, axis=1)
+
          if sample_rate != 16000:
-             raise HTTPException(status_code=400, detail="Audio must be 16kHz.")
-         logger.info(f"Transcribing with language: {language}")
-         result = pipe(
-             audio_data,
-             generate_kwargs={"language": valid_languages[language.lower()], "task": "transcribe"},
-             return_timestamps=False,
-         )
+             return {"error": "Converted audio is not 16kHz"}
+
+         # Transcribe
+         result = pipe(audio_data, generate_kwargs={"language": language.lower(), "task": "transcribe"})
          return {"text": result["text"]}
      except Exception as e:
-         logger.error(f"Transcription error: {str(e)}")
-         raise HTTPException(status_code=500, detail=str(e))
+         return {"error": f"Audio processing failed: {str(e)}"}
+     finally:
+         # Clean up
+         if 'temp_webm_path' in locals():
+             os.unlink(temp_webm_path)
+         if 'temp_wav_path' in locals():
+             os.unlink(temp_wav_path)
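A minimal client sketch for the endpoint above, assuming the server is reachable on localhost:7860 and sample.webm is a hypothetical browser recording (the requests library is an extra dependency, not pinned in requirements.txt):

    import requests

    # POST a WebM recording plus one of the accepted language codes ("en", "ur", "ar")
    with open("sample.webm", "rb") as f:
        response = requests.post(
            "http://localhost:7860/transcribe",
            files={"audio": ("sample.webm", f, "audio/webm")},
            data={"language": "en"},
        )

    # Prints {"text": "..."} on success, {"error": "..."} on failure
    # (the handler returns error dicts rather than HTTP error status codes)
    print(response.json())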
requirements.txt CHANGED
@@ -1,8 +1,8 @@
- fastapi==0.103.2
- uvicorn==0.23.2
- transformers==4.38.2
- torch==2.0.1
+ fastapi==0.115.2
+ uvicorn==0.32.0
+ transformers==4.46.0
+ torch==2.5.0
+ python-multipart==0.0.12
  soundfile==0.12.1
- numpy==1.24.3
- librosa==0.10.1
- python-multipart==0.0.9
+ numpy==1.26.4
+ pydub==0.25.1
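The pins here mirror the versions installed directly in the Dockerfile's RUN pip install step, so for a local (non-Docker) run,

    pip install -r requirements.txt

should reproduce the image's Python environment; ffmpeg still has to be installed separately for pydub to decode WebM input.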