WajeehAzeemX committed
Commit 33f6d66 · 1 Parent(s): 754c22b

using int8 quantized model

Files changed (3):
  1. app.py +14 -55
  2. app_backup.py +75 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,34 +1,12 @@
 from fastapi import FastAPI, Request, HTTPException
-import torch
-import torchaudio
-from transformers import AutoProcessor, pipeline
 import io
-from pydub import AudioSegment
-from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
-import numpy as np
+import time
+from faster_whisper import WhisperModel
 import uvicorn
-app = FastAPI()
-
-# Device configuration
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(device)
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
-# Load the model and processor
-model_id = "WajeehAzeemX/whisper-small-ar2_onnx"
-model = ORTModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-)
-processor = AutoProcessor.from_pretrained(model_id)
-
 
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    torch_dtype=torch_dtype,
-)
+app = FastAPI()
 
+model = WhisperModel("WajeehAzeemX/faster-whisper-smallar2-int8", device="cpu", compute_type="int8")
+
 @app.post("/transcribe/")
 async def transcribe_audio(request: Request):
@@ -38,37 +16,18 @@ async def transcribe_audio(request: Request):
 
         # Convert binary data to a file-like object
         audio_file = io.BytesIO(audio_data)
-
-        # Load the audio file using pydub
-        try:
-            audio_segment = AudioSegment.from_file(audio_file, format="wav")
-        except Exception as e:
-            raise HTTPException(status_code=400, detail=f"Error loading audio file: {str(e)}")
 
-        # Convert to mono if the audio is stereo (multi-channel)
-        if audio_segment.channels > 1:
-            audio_segment = audio_segment.set_channels(1)
+        # Start timing the transcription
+        start_time = time.time()
 
-        # Resample the audio to 16kHz
-        target_sample_rate = 16000
-        if audio_segment.frame_rate != target_sample_rate:
-            audio_segment = audio_segment.set_frame_rate(target_sample_rate)
+        # Transcribe the audio
+        segments, info = model.transcribe(audio_file)
+        transcription = " ".join([segment.text for segment in segments])
 
-        # Convert audio to numpy array
-        audio_array = np.array(audio_segment.get_array_of_samples())
-        if audio_segment.sample_width == 2:
-            audio_array = audio_array.astype(np.float32) / 32768.0
-        else:
-            raise HTTPException(status_code=400, detail="Unsupported sample width")
+        # Calculate time taken
+        time_taken = time.time() - start_time
 
-        # Convert to the format expected by the model
-        inputs = processor(audio_array, sampling_rate=target_sample_rate, return_tensors="pt")
-        inputs = inputs.to(device)
-
-        # Get the transcription result
-        result = pipe(audio_array)
-        transcription = result["text"]
-
-        return {"transcription": transcription}
+        return {"transcription": transcription, "time_taken": time_taken}
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+        raise HTTPException(status_code=500, detail=str(e))
+
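For reference, a minimal client sketch for the new endpoint (a hedged example, not part of the commit: the host, port, and filename are assumptions). The handler reads the raw request body, so the audio file is posted as bytes rather than as multipart form data:

# Hypothetical client for the /transcribe/ endpoint above.
# Host, port, and sample.wav are assumptions for illustration.
import requests

with open("sample.wav", "rb") as f:
    audio_bytes = f.read()

resp = requests.post("http://localhost:8000/transcribe/", data=audio_bytes)
resp.raise_for_status()
payload = resp.json()
print(payload["transcription"], payload["time_taken"])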
app_backup.py ADDED
@@ -0,0 +1,75 @@
+
+from fastapi import FastAPI, Request, HTTPException
+import torch
+import torchaudio
+from transformers import AutoProcessor, pipeline
+import io
+from pydub import AudioSegment
+from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
+import numpy as np
+import uvicorn
+app = FastAPI()
+
+# Device configuration
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(device)
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# Load the model and processor
+model_id = "WajeehAzeemX/whisper-small-ar2_onnx"
+model = ORTModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+)
+processor = AutoProcessor.from_pretrained(model_id)
+
+
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    torch_dtype=torch_dtype,
+)
+
+@app.post("/transcribe/")
+async def transcribe_audio(request: Request):
+    try:
+        # Read binary data from the request
+        audio_data = await request.body()
+
+        # Convert binary data to a file-like object
+        audio_file = io.BytesIO(audio_data)
+
+        # Load the audio file using pydub
+        try:
+            audio_segment = AudioSegment.from_file(audio_file, format="wav")
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Error loading audio file: {str(e)}")
+
+        # Convert to mono if the audio is stereo (multi-channel)
+        if audio_segment.channels > 1:
+            audio_segment = audio_segment.set_channels(1)
+
+        # Resample the audio to 16kHz
+        target_sample_rate = 16000
+        if audio_segment.frame_rate != target_sample_rate:
+            audio_segment = audio_segment.set_frame_rate(target_sample_rate)
+
+        # Convert audio to numpy array
+        audio_array = np.array(audio_segment.get_array_of_samples())
+        if audio_segment.sample_width == 2:
+            audio_array = audio_array.astype(np.float32) / 32768.0
+        else:
+            raise HTTPException(status_code=400, detail="Unsupported sample width")
+
+        # Convert to the format expected by the model
+        inputs = processor(audio_array, sampling_rate=target_sample_rate, return_tensors="pt")
+        inputs = inputs.to(device)
+
+        # Get the transcription result
+        result = pipe(audio_array)
+        transcription = result["text"]
+
+        return {"transcription": transcription}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
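Note that both versions import uvicorn but never call it, so the server is presumably started externally. A minimal launch sketch (the "app:app" module path and the port are assumptions; the commit does not show how the app is served):

# Hypothetical launcher; "app:app" and port 7860 are assumptions.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860)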
requirements.txt CHANGED
@@ -9,4 +9,5 @@ pydub
 numpy
 onnx
 optimum
-onnxruntime
+onnxruntime
+faster_whisper
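The new faster_whisper dependency loads CTranslate2 checkpoints such as the int8 model referenced in app.py. A hedged sketch of how such a checkpoint is typically produced from a Transformers Whisper model (the source model ID and output directory are assumptions; the commit does not include the conversion step):

# Hypothetical conversion to the int8 CTranslate2 format that
# faster-whisper consumes; model ID and output path are assumptions.
from ctranslate2.converters import TransformersConverter

converter = TransformersConverter("WajeehAzeemX/whisper-small-ar2")
converter.convert("faster-whisper-smallar2-int8", quantization="int8")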