import base64
import logging
import time
from tempfile import NamedTemporaryFile
import torch
# Use faster-whisper instead of the original whisper
from faster_whisper import WhisperModel
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class EndpointHandler:
def __init__(self, path=""):
# The 'path' argument is now where you specify the model repository
model_path = "meirk/whisper-large-v3-turbo-ct2-copy"
logger.info(f"Loading CTranslate2 model from: {model_path}")
# Check CUDA availability
if torch.cuda.is_available():
self.device = "cuda"
# CTranslate2 has its own compute type specification
self.compute_type = "float16"
logger.info(f"CUDA available: {torch.cuda.get_device_name(0)}. Using compute_type: {self.compute_type}")
else:
self.device = "cpu"
self.compute_type = "float32" # Or "int8" for CPU
logger.info(f"CUDA not available, using CPU with compute_type: {self.compute_type}")
# Load the model using WhisperModel from faster-whisper
# This will download the model from Hugging Face Hub on the first run
self.model = WhisperModel(model_path, device=self.device, compute_type=self.compute_type)
logger.info(f"Model loaded on {self.device}")

    def __call__(self, data):
        try:
            start_time = time.time()

            # Get parameters
            params = data.get("parameters", {})

            # Get audio data
            audio_b64 = data.get("inputs", None)
            if not audio_b64:
                return {"error": "Missing 'inputs' field"}

            # Decode and process audio
            audio_bytes = base64.b64decode(audio_b64)
            audio_size_mb = len(audio_bytes) / (1024 * 1024)
            logger.info(f"Processing {audio_size_mb:.2f} MB of audio on {self.device}")

            # Save to a temp file and transcribe while the file still exists
            with NamedTemporaryFile(delete=True) as tmp:
                tmp.write(audio_bytes)
                tmp.flush()

                logger.info("Starting transcription...")
                # Transcribe using faster-whisper (its parameter names differ
                # slightly from openai-whisper's)
                segments_generator, info = self.model.transcribe(
                    tmp.name,
                    language=params.get("language", "he"),
                    task=params.get("task", "transcribe"),
                    beam_size=params.get("beam_size", 5),
                    temperature=params.get("temperature", 0),
                    word_timestamps=params.get("word_timestamps", False),
                    initial_prompt=params.get("initial_prompt", None),
                    # These thresholds help prevent hallucination loops
                    no_speech_threshold=0.6,
                    log_prob_threshold=-1.0,
                    condition_on_previous_text=False,  # Often better left False to avoid loops
                )

                # 1. Format segments and assemble full text.
                # The generator is consumed here, inside the `with` block, so
                # the temp file is still on disk while decoding runs.
                segments = []
                full_text = ""
                for seg in segments_generator:
                    full_text += seg.text
                    segment_data = {
                        "text": seg.text.strip(),
                        "start": round(seg.start, 2),
                        "end": round(seg.end, 2)
                    }
                    if params.get("word_timestamps", False) and seg.words:
                        segment_data["words"] = [
                            {
                                "word": w.word,
                                "start": round(w.start, 2),
                                "end": round(w.end, 2),
                                "probability": round(w.probability, 3)
                            } for w in seg.words
                        ]
                    segments.append(segment_data)

                # Detected language and actual audio duration are in the `info` object
                detected_language = info.language
                audio_duration = info.duration

            processing_time = time.time() - start_time
            # 2. Calculate speed ratio correctly
            speed_ratio = audio_duration / processing_time if processing_time > 0 else 0

            logger.info(f"Detected language '{detected_language}' with probability {info.language_probability:.2f}")
            logger.info(f"Audio duration: {audio_duration:.2f}s")
            logger.info(f"Completed in {processing_time:.1f}s ({speed_ratio:.1f}x realtime)")

            return {
                "text": full_text,
                "chunks": segments,
                "language": detected_language,
                "processing_time": round(processing_time, 2),
                "speed_ratio": round(speed_ratio, 2),
                "segment_count": len(segments),
                "device_used": self.device
            }

        except Exception as e:
            logger.error("Error: %s", str(e), exc_info=True)
            return {
                "error": str(e),
                "error_type": type(e).__name__,
                "device_used": getattr(self, 'device', 'unknown')
            }