handler / handler.py

Update handler.py

31fdbaa verified 2 months ago

5.88 kB

	import os
	import io
	import base64
	import torch
	import numpy as np
	from transformers import BarkModel, BarkProcessor
	from typing import Dict, List, Any

	class EndpointHandler:
	def __init__(self, path=""):
	"""
	Initialize the handler for Bark text-to-speech model.
	Args:
	path (str, optional): Path to the model directory. Defaults to "".
	"""
	self.path = path
	self.model = None
	self.processor = None
	self.device = "cuda" if torch.cuda.is_available() else "cpu"
	self.initialized = False

	def setup(self, **kwargs):
	"""
	Load the model and processor.
	Args:
	**kwargs: Additional arguments.
	"""
	# Load model from the local directory
	self.model = BarkModel.from_pretrained(self.path)
	self.model.to(self.device)

	# Load processor
	self.processor = BarkProcessor.from_pretrained(self.path)

	self.initialized = True
	print(f"Bark model loaded on {self.device}")

	def preprocess(self, request: Dict) -> Dict:
	"""
	Process the input request before inference.
	Args:
	request (Dict): The request data containing text to convert to speech.
	Returns:
	Dict: Processed inputs for the model.
	"""
	if not self.initialized:
	self.setup()

	inputs = {}

	# Get text from the request
	if "inputs" in request:
	if isinstance(request["inputs"], str):
	# Single text input
	inputs["text"] = request["inputs"]
	elif isinstance(request["inputs"], list):
	# List of text inputs
	inputs["text"] = request["inputs"][0] # Take the first text

	# Get optional parameters
	params = request.get("parameters", {})

	# Speaker ID/voice preset
	if "speaker_id" in params:
	inputs["speaker_id"] = params["speaker_id"]
	elif "voice_preset" in params:
	inputs["voice_preset"] = params["voice_preset"]

	# Other generation parameters
	if "temperature" in params:
	inputs["temperature"] = params.get("temperature", 0.7)

	return inputs

	def inference(self, inputs: Dict) -> Dict:
	"""
	Run model inference on the processed inputs.
	Args:
	inputs (Dict): Processed inputs for the model.
	Returns:
	Dict: Model outputs.
	"""
	text = inputs.get("text", "")
	if not text:
	return {"error": "No text provided for speech generation"}

	# Extract optional parameters
	speaker_id = inputs.get("speaker_id", None)
	voice_preset = inputs.get("voice_preset", None)
	temperature = inputs.get("temperature", 0.7)

	# Prepare inputs for the model
	input_ids = self.processor(text).to(self.device)

	# Generate speech
	with torch.no_grad():
	if speaker_id:
	# Use speaker_id if provided
	speech_output = self.model.generate(
	input_ids=input_ids,
	speaker_id=speaker_id,
	temperature=temperature
	)
	elif voice_preset:
	# Use voice_preset if provided
	speech_output = self.model.generate(
	input_ids=input_ids,
	voice_preset=voice_preset,
	temperature=temperature
	)
	else:
	# Use default settings
	speech_output = self.model.generate(
	input_ids=input_ids,
	temperature=temperature
	)

	# Convert to numpy array
	audio_array = speech_output.cpu().numpy().squeeze()

	return {"audio_array": audio_array, "sample_rate": self.model.generation_config.sample_rate}

	def postprocess(self, inference_output: Dict) -> Dict:
	"""
	Process the model outputs after inference.
	Args:
	inference_output (Dict): Model outputs.
	Returns:
	Dict: Processed outputs ready for the response.
	"""
	if "error" in inference_output:
	return {"error": inference_output["error"]}

	audio_array = inference_output.get("audio_array")
	sample_rate = inference_output.get("sample_rate", 24000)

	# Convert audio array to WAV format
	try:
	import scipy.io.wavfile as wav
	audio_buffer = io.BytesIO()
	wav.write(audio_buffer, sample_rate, audio_array)
	audio_buffer.seek(0)
	audio_data = audio_buffer.read()

	# Encode audio data to base64
	audio_base64 = base64.b64encode(audio_data).decode("utf-8")

	return {
	"audio": audio_base64,
	"sample_rate": sample_rate,
	"format": "wav"
	}
	except Exception as e:
	return {"error": f"Error converting audio: {str(e)}"}

	def __call__(self, data: Dict) -> Dict:
	"""
	Main entry point for the handler.
	Args:
	data (Dict): Request data.
	Returns:
	Dict: Response data.
	"""
	# Ensure the model is initialized
	if not self.initialized:
	self.setup()

	# Process the request
	try:
	inputs = self.preprocess(data)
	outputs = self.inference(inputs)
	response = self.postprocess(outputs)
	return response
	except Exception as e:
	return {"error": f"Error processing request: {str(e)}"}