|
""" |
|
Style-BERT-VITS2 Production Handler for Hugging Face Inference Endpoints |
|
本番用:実際の日本語音声合成を行うハンドラー |
|
""" |
|
|
|
import os |
|
import json |
|
import logging |
|
import traceback |
|
from typing import Dict, List, Any, Optional |
|
import torch |
|
import numpy as np |
|
from io import BytesIO |
|
import base64 |
|
import tempfile |
|
import wave |
|
|
|
|
|
from gtts import gTTS |
|
import requests |
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
logger = logging.getLogger(__name__) |
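
# Expected request payload (shape derived from __call__ below; the parameter
# values shown are illustrative, not required):
#   {
#     "inputs": "<Japanese text to synthesize>",
#     "parameters": {"emotion": "happy", "speed": 1.2}   # optional overrides
#   }
# The response is a one-element list containing base64-encoded WAV audio plus
# the sample rate, duration, and the parameters actually used.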
|
|
|
class EndpointHandler:
    """Production handler for Style-BERT-VITS2."""

    def __init__(self, path: str = ""):
        """
        Initialize the handler.

        Args:
            path: Path to the model files.
        """
        logger.info("Initializing Style-BERT-VITS2 production handler")

        try:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {self.device}")

            self.default_config = {
                "speaker_id": 0,
                "emotion": "neutral",
                "speed": 1.0,
                "pitch": 0.0,
                "intonation": 1.0,
                "volume": 1.0,
                "pre_phoneme_length": 0.1,
                "post_phoneme_length": 0.1,
                "sample_rate": 22050,
            }

            # Maps request emotions to engine style names. The gTTS path below
            # does not consume this mapping yet; it is kept for a future
            # Style-BERT-VITS2 backend.
            self.emotion_mapping = {
                "neutral": "normal",
                "happy": "cheerful",
                "excited": "excited",
                "sad": "calm",
                "angry": "strong",
                "fear": "soft",
                "surprise": "excited",
            }

            logger.info("Production handler initialized")

        except Exception as e:
            logger.error(f"Handler initialization error: {e}")
            logger.error(traceback.format_exc())
            raise
|
|
|
    def _apply_emotion_to_text(self, text: str, emotion: str) -> str:
        """Adjust the text according to the requested emotion."""
        if emotion in ("happy", "excited"):
            # Append an exclamation mark unless the text already ends with
            # terminal punctuation (half-width or full-width forms).
            if not text.endswith(('!', '！', '?', '？', '。', '.')):
                text += "！"
        elif emotion == "sad":
            # Trail off after polite sentence endings.
            text = text.replace("です", "です…").replace("ます", "ます…")
        elif emotion == "angry":
            if not text.endswith(('!', '！')):
                text += "！"

        return text
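
    # For instance: with emotion "happy", "おはよう" becomes "おはよう！";
    # with "sad", "元気です" becomes "元気です…".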
|
|
|
    def _synthesize_japanese_speech(self, text: str, config: Dict[str, Any]) -> np.ndarray:
        """
        Synthesize Japanese speech with gTTS.

        Args:
            text: Text to synthesize.
            config: Synthesis settings.

        Returns:
            Audio data as a numpy array.
        """
        try:
            logger.info("Starting Japanese synthesis via gTTS")

            emotion = config.get("emotion", "neutral")
            adjusted_text = self._apply_emotion_to_text(text, emotion)

            # gTTS only exposes a binary slow/normal switch, so map the
            # continuous speed parameter onto it.
            speed = config.get("speed", 1.0)
            slow = speed < 0.8

            logger.info(f"Synthesis text: {adjusted_text}")
            logger.info(f"Speed adjustment: slow={slow}")

            tts = gTTS(text=adjusted_text, lang='ja', slow=slow)

            # gTTS saves MP3 to a file, so round-trip through a temp file.
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_file:
                tts.save(tmp_file.name)
                tmp_file_path = tmp_file.name

            with open(tmp_file_path, 'rb') as f:
                mp3_data = f.read()

            os.unlink(tmp_file_path)

            audio_data = self._convert_mp3_to_wav(mp3_data, config)

            logger.info(f"Synthesis finished - sample count: {len(audio_data)}")
            return audio_data

        except Exception as e:
            logger.error(f"Japanese synthesis error: {e}")
            # Fall back to a locally generated placeholder signal.
            return self._fallback_synthesis(text, config)
|
|
|
    def _convert_mp3_to_wav(self, mp3_data: bytes, config: Dict[str, Any]) -> np.ndarray:
        """
        Convert MP3 data to WAV-style PCM samples (simplified placeholder).

        A real implementation would decode the MP3 with pydub or ffmpeg.
        This version does not decode at all: it synthesizes a harmonic
        placeholder signal whose duration is estimated from the MP3 size.
        (A decoding sketch follows this method.)
        """
        try:
            sample_rate = config.get("sample_rate", 22050)
            # Rough duration estimate: ~10 kB of MP3 per second, at least 1 s.
            duration = max(1.0, len(mp3_data) / 10000)
            samples = int(sample_rate * duration)

            t = np.linspace(0, duration, samples, dtype=np.float32)
            frequency = 200 + config.get("pitch", 0) * 10

            # Fundamental plus two harmonics for a voice-like timbre.
            fundamental = np.sin(2 * np.pi * frequency * t)
            harmonic2 = 0.3 * np.sin(2 * np.pi * frequency * 2 * t)
            harmonic3 = 0.1 * np.sin(2 * np.pi * frequency * 3 * t)

            # Attack/decay envelope to avoid clicks at the edges.
            envelope = np.exp(-0.3 * t) * (1 - np.exp(-5 * t))

            # A little noise makes the placeholder sound less sterile.
            noise = 0.01 * np.random.randn(samples)

            audio_data = (fundamental + harmonic2 + harmonic3) * envelope + noise

            volume = config.get("volume", 1.0)
            audio_data *= volume * 0.4

            return audio_data

        except Exception as e:
            logger.error(f"MP3->WAV conversion error: {e}")
            return self._fallback_synthesis("音声変換エラー", config)
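
    # A minimal sketch of real MP3 decoding, assuming pydub (with an ffmpeg
    # binary on PATH) were added to the endpoint image; neither is a declared
    # dependency of this handler. The method name is illustrative and nothing
    # above calls it. It shows what _convert_mp3_to_wav would do with actual
    # decoding in place of the placeholder signal.
    def _decode_mp3_with_pydub(self, mp3_data: bytes, config: Dict[str, Any]) -> np.ndarray:
        from pydub import AudioSegment

        sample_rate = config.get("sample_rate", 22050)
        # Decode the MP3 bytes, then downmix to mono at the target rate.
        segment = AudioSegment.from_file(BytesIO(mp3_data), format="mp3")
        segment = segment.set_channels(1).set_frame_rate(sample_rate)

        # Scale integer PCM samples to float32 in [-1.0, 1.0].
        samples = np.array(segment.get_array_of_samples()).astype(np.float32)
        return samples / float(2 ** (8 * segment.sample_width - 1))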
|
|
|
    def _fallback_synthesis(self, text: str, config: Dict[str, Any]) -> np.ndarray:
        """Fallback synthesis: generate a richer placeholder signal locally."""
        logger.info("Running fallback synthesis")

        sample_rate = config.get("sample_rate", 22050)
        speed = config.get("speed", 1.0)
        pitch = config.get("pitch", 0.0)

        # Duration scales with text length (~0.12 s per character), divided by speed.
        base_duration = len(text) * 0.12 / speed

        # Pitch is interpreted in semitones relative to a 180 Hz base,
        # e.g. pitch=12 doubles the frequency to 360 Hz.
        base_frequency = 180
        frequency = base_frequency * (2 ** (pitch / 12))

        samples = int(sample_rate * base_duration)
        t = np.linspace(0, base_duration, samples, dtype=np.float32)

        # Fundamental plus three harmonics for a voice-like timbre.
        fundamental = np.sin(2 * np.pi * frequency * t)
        harmonic2 = 0.4 * np.sin(2 * np.pi * frequency * 2 * t)
        harmonic3 = 0.2 * np.sin(2 * np.pi * frequency * 3 * t)
        harmonic4 = 0.1 * np.sin(2 * np.pi * frequency * 4 * t)

        # Attack/decay envelope.
        envelope = np.exp(-0.1 * t) * (1 - np.exp(-8 * t))

        # 5 Hz vibrato with 2% depth.
        vibrato = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)

        noise = 0.015 * np.random.randn(samples)

        audio_data = (fundamental + harmonic2 + harmonic3 + harmonic4) * envelope * vibrato + noise

        volume = config.get("volume", 1.0)
        audio_data *= volume * 0.3

        return audio_data
|
|
|
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Main inference entry point."""
        try:
            logger.info("Starting production synthesis")

            inputs = data.get("inputs", "")
            if not inputs or not isinstance(inputs, str):
                raise ValueError("'inputs' must contain a non-empty text string")

            parameters = data.get("parameters", {})

            # Merge request parameters over the defaults.
            config = self.default_config.copy()
            config.update(parameters)

            logger.info(f"Input text: {inputs[:50]}...")
            logger.info(f"Parameters in use: {config}")

            audio_data = self._synthesize_japanese_speech(inputs, config)

            sample_rate = config["sample_rate"]
            duration = len(audio_data) / sample_rate

            # Clip to [-1, 1] before int16 conversion to avoid wrap-around distortion.
            audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

            audio_bytes = self._encode_wav(audio_int16, sample_rate)
            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')

            result = [
                {
                    "audio_base64": audio_base64,
                    "sample_rate": sample_rate,
                    "duration": duration,
                    "text": inputs,
                    "parameters_used": config,
                    "model_info": {
                        "name": "Style-BERT-VITS2-Production",
                        "version": "gTTS-Japanese",
                        "language": "ja",
                        "device": self.device,
                        "tts_engine": "Google TTS",
                    },
                }
            ]

            logger.info(f"Production synthesis finished - duration: {duration:.2f}s")
            return result

        except Exception as e:
            logger.error(f"Production synthesis error: {e}")
            logger.error(traceback.format_exc())

            # Return the error as a payload rather than raising, so the
            # endpoint responds with diagnostics instead of a bare 500.
            return [
                {
                    "error": str(e),
                    "error_type": type(e).__name__,
                    "traceback": traceback.format_exc(),
                    "inputs": data.get("inputs", ""),
                    "status": "error",
                }
            ]
|
|
|
    def _encode_wav(self, audio_data: np.ndarray, sample_rate: int) -> bytes:
        """Encode int16 PCM samples as a WAV byte string."""
        wav_buffer = BytesIO()

        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)    # mono
            wav_file.setsampwidth(2)    # 16-bit samples
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data.tobytes())

        wav_buffer.seek(0)
        return wav_buffer.read()
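
    # The encoded bytes can be sanity-checked by reading them back, e.g.:
    #   with wave.open(BytesIO(wav_bytes), 'rb') as w:
    #       assert w.getnchannels() == 1 and w.getsampwidth() == 2
    # (wav_bytes here stands for the return value of _encode_wav.)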
|
|
|
    def health_check(self) -> Dict[str, Any]:
        """Health check."""
        return {
            "status": "healthy",
            "model_loaded": True,
            "device": self.device,
            "tts_engine": "Google TTS (gTTS)",
            "supported_languages": ["ja"],
            "version": "production",
        }
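

# A minimal local smoke test, not part of the endpoint contract: it builds a
# request in the shape __call__ expects, runs one synthesis, and writes the
# decoded WAV to disk. The output filename is arbitrary.
if __name__ == "__main__":
    handler = EndpointHandler()
    response = handler({
        "inputs": "こんにちは、音声合成のテストです。",
        "parameters": {"emotion": "happy", "speed": 1.1},
    })
    if "error" in response[0]:
        print("Synthesis failed:", response[0]["error"])
    else:
        with open("test_output.wav", "wb") as f:
            f.write(base64.b64decode(response[0]["audio_base64"]))
        print(f"Wrote test_output.wav ({response[0]['duration']:.2f}s)")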