import json
import logging
import os
import tempfile

import numpy as np
import requests
import soundfile as sf
from fastapi import BackgroundTasks, FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from kokoro import KPipeline
from pydantic import BaseModel
from starlette.requests import Request

from prompts import map_prompts_to_params

OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Map ISO 639-1 language codes to Kokoro lang codes and default voices.
language_map = {"en": "a", "es": "e"}
speaker_map = {"en": "af_heart", "es": "em_santa"}


def cleanup_temp_file(file_path: str):
    """Clean up the temporary file after the response has been sent."""
    try:
        os.unlink(file_path)
    except OSError:
        pass


def text_to_audio_chunks(text, voice="af_heart", language="a"):
    """Synthesize `text` with Kokoro and collect the generated audio chunks."""
    pipeline = KPipeline(lang_code=language)
    generator = pipeline(text, voice=voice)
    audios = [audio for (gs, ps, audio) in generator]
    return audios
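
# A usage sketch (illustrative values, not from this repo): Kokoro yields one
# audio chunk per text segment, at a 24 kHz sample rate.
#
#   chunks = text_to_audio_chunks("Hello there!", voice="af_heart", language="a")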


def concat_chunks(audios, samplerate=24000, silence_dur=0.3):
    """Join audio chunks into one array, separated by short silences."""
    # Convert PyTorch tensors to NumPy arrays.
    audio_arrays = [
        audio.numpy() if hasattr(audio, "numpy") else audio for audio in audios
    ]
    if not audio_arrays:
        return np.array([])  # Return an empty array if there are no chunks.
    silence = np.zeros(int(samplerate * silence_dur), dtype=audio_arrays[0].dtype)
    # Interleave a silence gap after every chunk except the last.
    chunks = sum([[chunk, silence] for chunk in audio_arrays[:-1]], []) + [
        audio_arrays[-1]
    ]
    return np.concatenate(chunks)
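
# A minimal sketch of the length arithmetic, assuming two synthetic chunks:
# the default 0.3 s gap adds int(24000 * 0.3) = 7,200 samples between them.
#
#   a = np.ones(24000, dtype=np.float32)  # 1.0 s
#   b = np.ones(12000, dtype=np.float32)  # 0.5 s
#   out = concat_chunks([a, b])
#   assert len(out) == 24000 + 7200 + 12000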


def get_audio(text: str, language: str):
    """Synthesize `text` into a temporary WAV file and return its path."""
    voice = speaker_map.get(language, "af_heart")
    language = language_map.get(language, "a")
    audios = text_to_audio_chunks(text, voice=voice, language=language)
    final_audio = concat_chunks(audios)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    sf.write(tmp.name, final_audio, 24000)  # Kokoro audio is 24 kHz.
    tmp.close()
    return tmp.name


def generate_text(prompt: str):
    """Send `prompt` to OpenRouter and return (answer text, full response JSON)."""
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
            # Optional OpenRouter headers identifying the calling app.
            "HTTP-Referer": "Emotions API",
            "X-Title": "Emotions API",
        },
        data=json.dumps(
            {
                "model": "google/gemma-3n-e4b-it:free",
                "temperature": 0.0,
                "max_tokens": 2048,
                "top_p": 0.99,
                "messages": [{"role": "user", "content": prompt}],
            }
        ),
    )
    response_json = response.json()
    answer = response_json["choices"][0]["message"]["content"]
    return answer, response_json
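
# The indexing above assumes the OpenAI-compatible chat schema that
# OpenRouter returns, roughly:
#
#   {"choices": [{"message": {"role": "assistant", "content": "..."}}], ...}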


def generate_audio(text: str, language: str) -> FileResponse:
    """Stream synthesized speech as WAV and delete the temp file afterwards."""
    audio_path = get_audio(text, language)
    background_tasks = BackgroundTasks()
    background_tasks.add_task(cleanup_temp_file, audio_path)
    return FileResponse(
        path=audio_path,
        media_type="audio/wav",
        filename="generated_audio.wav",
        background=background_tasks,  # Runs only after the response is sent.
    )


class InputLoadT2A(BaseModel):
    text: str
    language: str


class InputLoadP2T(BaseModel):
    text: str


class ResponseLoadP2T(BaseModel):
    text: str


app = FastAPI()

# Fully permissive CORS so any front-end origin can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/health")
def health_check():
    return {"server": "running"}


@app.post("/genaudio/")
async def receive(input_load: InputLoadT2A, request: Request) -> FileResponse:
    return generate_audio(input_load.text, input_load.language)


@app.post("/gentext/")
async def gen_text(input_load: InputLoadP2T, request: Request) -> ResponseLoadP2T:
    text, _ = generate_text(input_load.text)
    return ResponseLoadP2T(text=text)


@app.post("/genemotion/")
async def gen_emotion(input_load: InputLoadT2A, request: Request) -> FileResponse:
    text, _ = generate_text(input_load.text)
    return generate_audio(text, input_load.language)


@app.post("/genemotionfast/")
async def gen_emotion_fast(input_load: InputLoadT2A, request: Request) -> FileResponse:
    logger.info(
        "Received request at FASTgen for text: %s, language: %s",
        input_load.text,
        input_load.language,
    )
    return get_preloaded_audio(input_load.text, input_load.language)
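
# Example requests against a local server (host/port are assumptions, not
# pinned by this file):
#
#   curl -X POST http://localhost:8000/gentext/ \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Say something cheerful"}'
#
#   curl -X POST http://localhost:8000/genaudio/ \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Hello!", "language": "en"}' --output out.wav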


def get_preloaded_audio(prompt: str, language: str) -> FileResponse:
    """Serve a pre-rendered MP3 for a known prompt, skipping synthesis entirely."""
    # Raises KeyError if the prompt is not one of the preloaded ones.
    request_params = map_prompts_to_params[prompt]
    audio_file = f"preloaded_audio/audio_{request_params['emotion']}_{language}.mp3"
    return FileResponse(
        path=audio_file,
        media_type="audio/mpeg",  # Standard MIME type for MP3.
        filename=f"audio_{request_params['emotion']}_{language}.mp3",
    )
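
# A minimal local entry point, assuming uvicorn and port 8000 (this repo may
# launch the app differently, e.g. via `uvicorn main:app`):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)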