import base64
import logging
import time
from tempfile import NamedTemporaryFile
import torch
# Use faster-whisper instead of the original whisper
from faster_whisper import WhisperModel
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class EndpointHandler:
def __init__(self, path=""):
# The 'path' argument is now where you specify the model repository
model_path = "meirk/whisper-large-v3-turbo-ct2-copy"
logger.info(f"Loading CTranslate2 model from: {model_path}")
# Check CUDA availability
if torch.cuda.is_available():
self.device = "cuda"
# CTranslate2 has its own compute type specification
self.compute_type = "float16"
logger.info(f"CUDA available: {torch.cuda.get_device_name(0)}. Using compute_type: {self.compute_type}")
else:
self.device = "cpu"
self.compute_type = "float32" # Or "int8" for CPU
logger.info(f"CUDA not available, using CPU with compute_type: {self.compute_type}")
# Load the model using WhisperModel from faster-whisper
# This will download the model from Hugging Face Hub on the first run
self.model = WhisperModel(model_path, device=self.device, compute_type=self.compute_type)
logger.info(f"Model loaded on {self.device}")

    def __call__(self, data):
        try:
            start_time = time.time()

            # Get parameters
            params = data.get("parameters", {})

            # Get audio data
            audio_b64 = data.get("inputs", None)
            if not audio_b64:
                return {"error": "Missing 'inputs' field"}

            # Decode and process audio
            audio_bytes = base64.b64decode(audio_b64)
            audio_size_mb = len(audio_bytes) / (1024 * 1024)
            logger.info(f"Processing {audio_size_mb:.2f} MB of audio on {self.device}")

            # Save to a temp file and transcribe while the file still exists
            with NamedTemporaryFile(delete=True) as tmp:
                tmp.write(audio_bytes)
                tmp.flush()

                logger.info("Starting transcription...")
                # Transcribe using faster-whisper (its parameter names differ
                # slightly from openai-whisper's)
                segments_generator, info = self.model.transcribe(
                    tmp.name,
                    language=params.get("language", "he"),
                    task=params.get("task", "transcribe"),
                    beam_size=params.get("beam_size", 5),
                    temperature=params.get("temperature", 0),
                    word_timestamps=params.get("word_timestamps", False),
                    initial_prompt=params.get("initial_prompt", None),
                    # These thresholds help prevent hallucination loops
                    no_speech_threshold=0.6,
                    log_prob_threshold=-1.0,
                    condition_on_previous_text=False,  # Often better left False to avoid loops
                )

                # 1. Format segments and assemble full text.
                # The generator is consumed here, inside the `with` block, so
                # the temp file is still on disk while decoding runs.
                segments = []
                full_text = ""
                for seg in segments_generator:
                    full_text += seg.text
                    segment_data = {
                        "text": seg.text.strip(),
                        "start": round(seg.start, 2),
                        "end": round(seg.end, 2)
                    }
                    if params.get("word_timestamps", False) and seg.words:
                        segment_data["words"] = [
                            {
                                "word": w.word,
                                "start": round(w.start, 2),
                                "end": round(w.end, 2),
                                "probability": round(w.probability, 3)
                            } for w in seg.words
                        ]
                    segments.append(segment_data)

                # Detected language and actual audio duration are in the `info` object
                detected_language = info.language
                audio_duration = info.duration

            processing_time = time.time() - start_time
            # 2. Calculate speed ratio correctly
            speed_ratio = audio_duration / processing_time if processing_time > 0 else 0

            logger.info(f"Detected language '{detected_language}' with probability {info.language_probability:.2f}")
            logger.info(f"Audio duration: {audio_duration:.2f}s")
            logger.info(f"Completed in {processing_time:.1f}s ({speed_ratio:.1f}x realtime)")

            return {
                "text": full_text,
                "chunks": segments,
                "language": detected_language,
                "processing_time": round(processing_time, 2),
                "speed_ratio": round(speed_ratio, 2),
                "segment_count": len(segments),
                "device_used": self.device
            }

        except Exception as e:
            logger.error("Error: %s", str(e), exc_info=True)
            return {
                "error": str(e),
                "error_type": type(e).__name__,
                "device_used": getattr(self, 'device', 'unknown')
            }