hamza2923 committed on
Commit
8a199a7
·
verified ·
1 Parent(s): 7b109f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -25
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from flask import Flask, request, jsonify, Response # Add Response here
2
  from faster_whisper import WhisperModel
3
  import torch
4
  import io
@@ -8,14 +8,17 @@ from threading import Semaphore
8
  import os
9
  from werkzeug.utils import secure_filename
10
  import tempfile
 
11
 
12
  app = Flask(__name__)
13
 
14
  # Configuration
15
- MAX_CONCURRENT_REQUESTS = 2 # Adjust based on your server capacity
16
- MAX_AUDIO_DURATION = 60 * 30 # 30 minutes maximum audio duration (adjust as needed)
17
  TEMPORARY_FOLDER = tempfile.gettempdir()
18
- ALLOWED_EXTENSIONS = {'mp3', 'wav', 'ogg', 'm4a', 'flac'}
 
 
19
 
20
  # Device check for faster-whisper
21
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -23,12 +26,12 @@ compute_type = "float16" if device == "cuda" else "int8"
23
  print(f"Using device: {device} with compute_type: {compute_type}")
24
 
25
  # Faster Whisper setup with optimized parameters for long audio
26
- beamsize = 2 # Slightly larger beam size can help with long-form accuracy
27
  wmodel = WhisperModel(
28
  "guillaumekln/faster-whisper-small",
29
  device=device,
30
  compute_type=compute_type,
31
- download_root="./model_cache" # Cache model to avoid re-downloading
32
  )
33
 
34
  # Concurrency control
@@ -39,13 +42,27 @@ def allowed_file(filename):
39
  return '.' in filename and \
40
  filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
41
 
42
- def cleanup_temp_files(file_path):
43
  """Ensure temporary files are deleted after processing"""
 
 
 
 
 
 
 
 
 
44
  try:
45
- if os.path.exists(file_path):
46
- os.remove(file_path)
 
 
 
 
 
47
  except Exception as e:
48
- print(f"Error cleaning up temp file {file_path}: {str(e)}")
49
 
50
  @app.route("/health", methods=["GET"])
51
  def health_check():
@@ -56,7 +73,8 @@ def health_check():
56
  'device': device,
57
  'compute_type': compute_type,
58
  'active_requests': active_requests,
59
- 'max_duration_supported': MAX_AUDIO_DURATION
 
60
  })
61
 
62
  @app.route("/status/busy", methods=["GET"])
@@ -70,7 +88,7 @@ def server_busy():
70
  })
71
 
72
  @app.route("/whisper_transcribe", methods=["POST"])
73
- def whisper_transcribe():
74
  global active_requests
75
 
76
  if not request_semaphore.acquire(blocking=False):
@@ -79,38 +97,50 @@ def whisper_transcribe():
79
  active_requests += 1
80
  start_time = time.time()
81
  temp_file_path = None
 
82
 
83
  try:
84
- if 'audio' not in request.files:
85
  return jsonify({'error': 'No file provided'}), 400
86
 
87
- audio_file = request.files['audio']
88
- if not (audio_file and allowed_file(audio_file.filename)):
89
- return jsonify({'error': 'Invalid file format'}), 400
90
 
91
- temp_file_path = os.path.join(TEMPORARY_FOLDER, secure_filename(audio_file.filename))
92
- audio_file.save(temp_file_path)
 
 
 
 
 
 
 
 
 
 
93
 
 
94
  segments, _ = wmodel.transcribe(
95
- temp_file_path,
96
  beam_size=beamsize,
97
  vad_filter=True,
98
- without_timestamps=True, # Ensure timestamps are not included
99
  compression_ratio_threshold=2.4,
100
  word_timestamps=False
101
  )
102
 
103
  full_text = " ".join(segment.text for segment in segments)
104
- return jsonify({'transcription': full_text}), 200
105
-
106
-
 
107
 
108
  except Exception as e:
109
  return jsonify({'error': str(e)}), 500
110
 
111
  finally:
112
- if temp_file_path:
113
- cleanup_temp_files(temp_file_path)
114
  active_requests -= 1
115
  request_semaphore.release()
116
  print(f"Processed in {time.time()-start_time:.2f}s (Active: {active_requests})")
 
1
+ from flask import Flask, request, jsonify, Response
2
  from faster_whisper import WhisperModel
3
  import torch
4
  import io
 
8
  import os
9
  from werkzeug.utils import secure_filename
10
  import tempfile
11
+ from moviepy.editor import VideoFileClip # Added for video processing
12
 
13
  app = Flask(__name__)
14
 
15
  # Configuration
16
+ MAX_CONCURRENT_REQUESTS = 2 # Adjust based on server capacity
17
+ MAX_FILE_DURATION = 60 * 30 # 30 minutes maximum duration (adjust as needed)
18
  TEMPORARY_FOLDER = tempfile.gettempdir()
19
+ ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'ogg', 'm4a', 'flac', 'aac', 'wma', 'opus', 'aiff'}
20
+ ALLOWED_VIDEO_EXTENSIONS = {'mp4', 'avi', 'mov', 'mkv', 'webm', 'flv', 'wmv', 'mpeg', 'mpg', '3gp'}
21
+ ALLOWED_EXTENSIONS = ALLOWED_AUDIO_EXTENSIONS.union(ALLOWED_VIDEO_EXTENSIONS)
22
 
23
  # Device check for faster-whisper
24
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
26
  print(f"Using device: {device} with compute_type: {compute_type}")
27
 
28
  # Faster Whisper setup with optimized parameters for long audio
29
+ beamsize = 2
30
  wmodel = WhisperModel(
31
  "guillaumekln/faster-whisper-small",
32
  device=device,
33
  compute_type=compute_type,
34
+ download_root="./model_cache"
35
  )
36
 
37
  # Concurrency control
 
42
  return '.' in filename and \
43
  filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
44
 
45
+ def cleanup_temp_files(*file_paths):
46
  """Ensure temporary files are deleted after processing"""
47
+ for file_path in file_paths:
48
+ try:
49
+ if file_path and os.path.exists(file_path):
50
+ os.remove(file_path)
51
+ except Exception as e:
52
+ print(f"Error cleaning up temp file {file_path}: {str(e)}")
53
+
54
+ def extract_audio_from_video(video_path, output_audio_path):
55
+ """Extract audio from a video file and save it as a temporary audio file"""
56
  try:
57
+ video = VideoFileClip(video_path)
58
+ if video.duration > MAX_FILE_DURATION:
59
+ video.close()
60
+ raise ValueError(f"Video duration exceeds {MAX_FILE_DURATION} seconds")
61
+ video.audio.write_audiofile(output_audio_path)
62
+ video.close()
63
+ return output_audio_path
64
  except Exception as e:
65
+ raise Exception(f"Failed to extract audio from video: {str(e)}")
66
 
67
  @app.route("/health", methods=["GET"])
68
  def health_check():
 
73
  'device': device,
74
  'compute_type': compute_type,
75
  'active_requests': active_requests,
76
+ 'max_duration_supported': MAX_FILE_DURATION,
77
+ 'supported_formats': list(ALLOWED_EXTENSIONS)
78
  })
79
 
80
  @app.route("/status/busy", methods=["GET"])
 
88
  })
89
 
90
  @app.route("/whisper_transcribe", methods=["POST"])
91
+ def transcribe():
92
  global active_requests
93
 
94
  if not request_semaphore.acquire(blocking=False):
 
97
  active_requests += 1
98
  start_time = time.time()
99
  temp_file_path = None
100
+ temp_audio_path = None
101
 
102
  try:
103
+ if 'file' not in request.files:
104
  return jsonify({'error': 'No file provided'}), 400
105
 
106
+ file = request.files['file']
107
+ if not (file and allowed_file(file.filename)):
108
+ return jsonify({'error': f'Invalid file format. Supported: {", ".join(ALLOWED_EXTENSIONS)}'}), 400
109
 
110
+ # Save uploaded file to temporary location
111
+ temp_file_path = os.path.join(TEMPORARY_FOLDER, secure_filename(file.filename))
112
+ file.save(temp_file_path)
113
+
114
+ # Check if file is a video and extract audio if necessary
115
+ file_extension = file.filename.rsplit('.', 1)[1].lower()
116
+ if file_extension in ALLOWED_VIDEO_EXTENSIONS:
117
+ temp_audio_path = os.path.join(TEMPORARY_FOLDER, f"temp_audio_{int(time.time())}.wav")
118
+ extract_audio_from_video(temp_file_path, temp_audio_path)
119
+ transcription_file = temp_audio_path
120
+ else:
121
+ transcription_file = temp_file_path
122
 
123
+ # Transcribe the audio file
124
  segments, _ = wmodel.transcribe(
125
+ transcription_file,
126
  beam_size=beamsize,
127
  vad_filter=True,
128
+ without_timestamps=True,
129
  compression_ratio_threshold=2.4,
130
  word_timestamps=False
131
  )
132
 
133
  full_text = " ".join(segment.text for segment in segments)
134
+ return jsonify({
135
+ 'transcription': full_text,
136
+ 'file_type': 'video' if file_extension in ALLOWED_VIDEO_EXTENSIONS else 'audio'
137
+ }), 200
138
 
139
  except Exception as e:
140
  return jsonify({'error': str(e)}), 500
141
 
142
  finally:
143
+ cleanup_temp_files(temp_file_path, temp_audio_path)
 
144
  active_requests -= 1
145
  request_semaphore.release()
146
  print(f"Processed in {time.time()-start_time:.2f}s (Active: {active_requests})")