Add endpoints for text generation and full prompt2audio
server.py
CHANGED
@@ -1,19 +1,19 @@
+import json
 import logging
+import os
+import tempfile
 
+import numpy as np
+import requests
+import soundfile as sf
+from fastapi import BackgroundTasks, FastAPI
 from fastapi.responses import FileResponse
-from fastapi import BackgroundTasks
-from starlette.requests import Request
 from kokoro import KPipeline
+from pydantic import BaseModel
+from starlette.requests import Request
+
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 
-random.seed(42)
 
 logging.basicConfig()
 logger = logging.getLogger(__name__)
@@ -29,7 +29,7 @@ def cleanup_temp_file(file_path: str):
     try:
         os.unlink(file_path)
     except OSError:
-        pass
+        pass
 
 
 def text_to_audio_chunks(text, voice="af_heart", language="a"):
@@ -41,14 +41,18 @@ def text_to_audio_chunks(text, voice="af_heart", language="a"):
 
 def concat_chunks(audios, samplerate=24000, silence_dur=0.3):
     # Convert PyTorch tensors to NumPy arrays
-    audio_arrays = [
+    audio_arrays = [
+        audio.numpy() if hasattr(audio, "numpy") else audio for audio in audios
+    ]
+
     if not audio_arrays:
         return np.array([])  # Return empty array if no audio chunks
+
     silence = np.zeros(int(samplerate * silence_dur), dtype=audio_arrays[0].dtype)
     # Insert silence between all but last
-    chunks = sum([[chunk, silence] for chunk in audio_arrays[:-1]], []) + [
+    chunks = sum([[chunk, silence] for chunk in audio_arrays[:-1]], []) + [
+        audio_arrays[-1]
+    ]
     return np.concatenate(chunks)
 
 
@@ -60,14 +64,60 @@ def get_audio(text: str, language: str):
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
     sf.write(tmp.name, final_audio, 24000)
     tmp.close()
-    return tmp.name
+    return tmp.name
+
+
+def generate_text(prompt: str):
+    response = requests.post(
+        url="https://openrouter.ai/api/v1/chat/completions",
+        headers={
+            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+            "Content-Type": "application/json",
+            "HTTP-Referer": "Emotions API",
+            "X-Title": "Emotions API",
+        },
+        data=json.dumps(
+            {
+                "model": "google/gemma-3n-e4b-it:free",
+                "temperature": 0.0,
+                "max_tokens": 2048,
+                "top_p": 0.99,
+                "messages": [{"role": "user", "content": prompt}],
+            }
+        ),
+    )
+    response_json = response.json()
+    answer = response_json["choices"][0]["message"]["content"]
+    return answer, response_json
+
+
+def generate_audio(text: str, language: str) -> FileResponse:
+    audio_path = get_audio(text, language)
+
+    background_tasks = BackgroundTasks()
+    background_tasks.add_task(cleanup_temp_file, audio_path)
 
+    return FileResponse(
+        path=audio_path,
+        media_type="audio/wav",
+        filename="generated_audio.wav",
+        background=background_tasks,
+    )
 
+
+class InputLoadT2A(BaseModel):
     text: str
     language: str
 
 
+class InputLoadP2T(BaseModel):
+    text: str
+
+
+class ResponseLoadP2T(BaseModel):
+    text: str
+
+
 app = FastAPI()
 
 
@@ -76,17 +126,18 @@ def health_check():
     return {"server": "running"}
 
 
-@app.post("/
-async def receive(input_load:
-    background_tasks = BackgroundTasks()
-    background_tasks.add_task(cleanup_temp_file, audio_path)
+@app.post("/genaudio/")
+async def receive(input_load: InputLoadT2A, request: Request) -> FileResponse:
+    return generate_audio(input_load.text, input_load.language)
 
 
+@app.post("/gentext/")
+async def gen_text(input_load: InputLoadP2T, request: Request) -> ResponseLoadP2T:
+    text, _ = generate_text(input_load.text)
+    return ResponseLoadP2T(text=text)
+
 
+@app.post("/genemotion/")
+async def gen_emotion(input_load: InputLoadT2A, request: Request) -> FileResponse:
+    text, _ = generate_text(input_load.text)
+    return generate_audio(text, input_load.language)
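
For reference, a minimal client sketch exercising the three routes added in this commit. It assumes the server is running locally (e.g. started with uvicorn on port 8000) and that OPENROUTER_API_KEY is exported in the server's environment; the base URL, prompts, and output filenames below are placeholders, not part of the diff.

# Illustrative only: base URL, prompts, and output paths are assumptions.
import requests

BASE = "http://localhost:8000"  # assumed local dev server

# /gentext/: prompt in, generated text out (InputLoadP2T -> ResponseLoadP2T)
resp = requests.post(f"{BASE}/gentext/", json={"text": "Write one cheerful sentence."})
print(resp.json()["text"])

# /genaudio/: text in, WAV bytes out (InputLoadT2A -> FileResponse)
resp = requests.post(f"{BASE}/genaudio/", json={"text": "Hello there!", "language": "a"})
with open("genaudio.wav", "wb") as f:
    f.write(resp.content)

# /genemotion/: full prompt2audio -- generate text with the LLM, then synthesize it
resp = requests.post(
    f"{BASE}/genemotion/",
    json={"text": "Write one cheerful sentence.", "language": "a"},
)
with open("genemotion.wav", "wb") as f:
    f.write(resp.content)

Note that OPENROUTER_API_KEY is read with os.getenv at module import, so it must be set before the server process starts; otherwise /gentext/ and /genemotion/ will send "Bearer None" as the Authorization header.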