Spaces:
Running
Running
Upload 3 files
Browse files- app.py +315 -0
- modal/app.py +56 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import httpx
|
4 |
+
import json
|
5 |
+
from typing import List, Tuple, Dict
|
6 |
+
from dataclasses import dataclass
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
|
10 |
+
import base64
|
11 |
+
from mistralai import Mistral
|
12 |
+
|
13 |
+
from scrapling.fetchers import Fetcher
|
14 |
+
from newspaper import Article
|
15 |
+
from trafilatura import extract
|
16 |
+
|
17 |
+
import wave
|
18 |
+
import time
|
19 |
+
import asyncio
|
20 |
+
import uuid
|
21 |
+
|
22 |
+
# The Mistral API key is read from the environment. Indexing os.environ (rather
# than .get) means a missing MISTRAL_API_KEY fails at import time with KeyError.
api_key = os.environ["MISTRAL_API_KEY"]
# Shared Mistral client used by the OCR and chat-completion helpers in this file.
client = Mistral(api_key=api_key)
|
24 |
+
|
25 |
+
|
26 |
+
def get_text_from_document(document_url: str) -> str:
    """Run Mistral OCR on the document at *document_url* and return its text.

    Every page of the OCR response contributes a ``--- Page N ---`` header
    (numbered from 1), the page's markdown, and a trailing blank line; the
    pieces are concatenated in page order.
    """
    response = client.ocr.process(
        model="mistral-ocr-latest",
        document={"type": "document_url", "document_url": document_url},
        include_image_base64=False,
    )
    return "".join(
        f"--- Page {number} ---\n{page.markdown}\n\n"
        for number, page in enumerate(response.pages, start=1)
    )
|
38 |
+
|
39 |
+
|
40 |
+
def get_text_from_link(link: str) -> str | None:
    """Extract the readable text of the web page at *link*.

    Two extractors are tried in order:

    1. Fetch the page with Scrapling's ``Fetcher`` and run trafilatura's
       ``extract`` (with metadata) on the HTML.
    2. If that raises or yields nothing, download and parse the page with
       newspaper's ``Article`` and prepend a small metadata header.

    Failures are printed rather than raised so the fallback can run.

    Returns:
        The extracted text, or ``None`` when both extractors fail. Callers
        must check for ``None`` before using the result.
    """
    try:
        page = Fetcher.get(link, stealthy_headers=True, follow_redirects=True)
        content = extract(page.html_content, with_metadata=True)
        if content:
            return content
    except Exception as e:
        # Best-effort: report and fall through to the newspaper fallback.
        print(f"Trafilatura extraction failed for {link}: {e}")

    try:
        article = Article(link)
        article.download()
        article.parse()

        header = [f"#Title: {article.title}\n"]
        if article.authors:
            header.append(f"Authors: {', '.join(article.authors)}\n")
        if article.publish_date:
            header.append(f"Published: {article.publish_date}\n")
        # NOTE(review): newspaper appears to fill `keywords` and `summary` only
        # after `article.nlp()` is called, which this code never does, so these
        # two branches are probably never taken. Confirm before relying on them.
        if article.keywords:
            header.append(f"Keywords: {', '.join(article.keywords)}\n")
        if article.summary:
            header.append(f"Summary: {article.summary}\n\n")

        return "".join(header) + article.text
    except Exception as e:
        print(f"Newspaper extraction failed for {link}: {e}")
        return None
|
68 |
+
|
69 |
+
|
70 |
+
def just_text(text: str) -> str:
    """Return *text* unchanged after checking that it is not empty.

    Raises:
        ValueError: If *text* is falsy (for example ``""`` or ``None``).
    """
    if text:
        return text
    raise ValueError("Input text cannot be empty")
|
74 |
+
|
75 |
+
|
76 |
+
def build_prompt(text: str) -> str:
    """Build the LLM prompt: the source text, the instructions, then a JSON template.

    The three parts are joined with single newlines. The template shows the
    expected output shape: a ``conversation`` list of ``speaker``/``text``
    entries for the hosts "Olivia" and "Brian".
    """
    # NOTE(review): the literals below reproduce the source as seen, with no
    # leading whitespace on any line; the original file may have indented them.
    json_template = "\n".join(
        [
            "{",
            '"conversation": [',
            '{"speaker": "Olivia", "text": ""},',
            '{"speaker": "Brian", "text": ""}',
            "]",
            "}",
        ]
    )
    instructions = "\n".join(
        [
            "",
            "Turn the text above into a casual podcast conversation between two hosts.",
            "",
            "- Use a relaxed, informal tone like you're chatting with a friend",
            "- Include natural conversation fillers like 'you know', 'I mean', 'like'",
            "- Feel free to go off on brief relevant tangents or share quick personal takes",
            "- Keep the back-and-forth flowing naturally",
            "- Cover the key points but maintain a conversational style",
            "- Aim for about 1 minute of casual discussion.",
            "",
            "Output in this JSON format:",
        ]
    )
    return "\n".join((text, instructions, json_template))
|
95 |
+
|
96 |
+
|
97 |
+
def extract_conversation(text: str) -> Dict:
    """Ask the Mistral chat model to turn *text* into a two-host conversation.

    The prompt from ``build_prompt`` is sent with ``response_format`` set to
    ``json_object``. The call is attempted up to three times; an attempt
    fails if the API call raises, the reply cannot be parsed as JSON, or the
    parsed object has no ``"conversation"`` key.

    Args:
        text: Source material to convert.

    Returns:
        The parsed JSON object, guaranteed to contain a ``"conversation"`` key.

    Raises:
        RuntimeError: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    prompt = build_prompt(text)
    max_retries = 3
    # Fallback matcher for a JSON object embedded in surrounding text. It only
    # understands one level of nested braces and does not know about braces
    # inside string values, so it is used only when direct parsing fails.
    pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
    last_error = None

    for _ in range(max_retries):
        try:
            chat_completion = client.chat.complete(
                model="codestral-latest",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                response_format={
                    "type": "json_object",
                },
            )
            content = chat_completion.choices[0].message.content

            try:
                # JSON mode should yield a bare JSON document; parse it whole
                # so deeply nested or brace-containing text is handled.
                result = json.loads(content)
            except (TypeError, ValueError):
                json_match = re.search(pattern, content or "")
                if not json_match:
                    raise ValueError("No valid JSON found in response")
                result = json.loads(json_match.group())

            if not isinstance(result, dict) or "conversation" not in result:
                raise ValueError(
                    "Response JSON missing 'conversation' key after all retries"
                )

            return result

        except Exception as e:
            # Remember the failure and retry; only the last one is reported.
            last_error = e

    raise RuntimeError(
        f"Failed to extract conversation after {max_retries} attempts: {last_error}"
    ) from last_error
|
146 |
+
|
147 |
+
|
148 |
+
async def generate_audio(text: str, voice: str, file_out_path: str) -> str:
    """Synthesize *text* with the remote Kokoro endpoint and save the audio.

    The text and voice are sent as query parameters of a body-less POST
    request, and the response body is written verbatim to *file_out_path*.

    Args:
        text: Text to speak.
        voice: Kokoro voice identifier (for example ``"af_heart"``).
        file_out_path: Destination path for the returned audio bytes.

    Returns:
        *file_out_path*, for convenience when gathering many calls.

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
        httpx.HTTPError: On timeouts or transport failures.
    """
    url = "https://eswardivi--kokoro-api-kokoro-generate.modal.run/"

    querystring = {"text": text, "voice": voice}
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Connection": "keep-alive",
    }
    # Named http_client so the module-level Mistral `client` is not shadowed.
    async with httpx.AsyncClient() as http_client:
        response = await http_client.post(
            url, headers=headers, params=querystring, timeout=90.0
        )
        # Fail here instead of saving an error body to disk as if it were
        # audio, which would only surface later as an opaque merge error.
        response.raise_for_status()
        audio_data = response.content

    with open(file_out_path, "wb") as f:
        f.write(audio_data)

    return file_out_path
|
169 |
+
|
170 |
+
|
171 |
+
def merge_audio_files(audio_files: List[str]) -> str:
    """Concatenate WAV files into one new WAV file and delete the inputs.

    The output takes its parameters (channels, sample width, frame rate)
    from the first input; every input is assumed to share them. The merged
    file gets a random UUID name in the current working directory.

    Args:
        audio_files: Paths of the WAV files to join, in playback order.

    Returns:
        Path of the merged ``.wav`` file.

    Raises:
        ValueError: If *audio_files* is empty.
        wave.Error: If an input is not a readable WAV file.
    """
    if not audio_files:
        raise ValueError("No audio files to merge")

    merged_file = f"{uuid.uuid4()}.wav"

    with wave.open(audio_files[0], "rb") as first_wav:
        params = first_wav.getparams()

    # The context manager closes the writer (and fixes up the frame count in
    # the header) even if reading one of the inputs fails part-way through.
    with wave.open(merged_file, "wb") as merged_audio:
        merged_audio.setparams(params)
        for audio_file in audio_files:
            with wave.open(audio_file, "rb") as wav_file:
                merged_audio.writeframes(wav_file.readframes(wav_file.getnframes()))

    # Delete the inputs only after the merge succeeded, so a failure does not
    # destroy the source segments.
    for audio_file in audio_files:
        os.remove(audio_file)

    return merged_file
|
188 |
+
|
189 |
+
|
190 |
+
async def wake_up_api():
    """Ping the Kokoro wake-up endpoint and print whether it answered with 200.

    The request waits up to 90 seconds. Nothing is returned; the outcome is
    only reported on stdout.
    """
    wake_url = "https://eswardivi--kokoro-api-kokoro-wake-up.modal.run/"
    async with httpx.AsyncClient() as http_client:
        reply = await http_client.get(wake_url, timeout=90.0)
    is_awake = reply.status_code == 200
    print("API is awake" if is_awake else "API is not awake Yet")
|
198 |
+
|
199 |
+
|
200 |
+
def generate_podcast(input_type: str, input: str):
    """
    Generate a podcast-style conversation from various input types.

    This function takes content from a document URL, webpage link, or raw text and
    converts it into a natural-sounding podcast dialogue between two hosts. The conversation
    is then synthesized into audio using text-to-speech.

    Args:
        input_type (str): The type of input to process. Must be one of:
            - "Document": URL to a document (PDF, etc.) to extract text from
            - "Link": URL to a webpage to scrape content from
            - "Text": Raw text input to convert directly

        input (str): The actual input content matching the specified input_type:
            - For "Document": Document URL (e.g. "https://example.com/doc.pdf")
            - For "Link": Webpage URL (e.g. "https://example.com/article")
            - For "Text": Plain text content

    Returns:
        str: Path to the generated audio file (.wav format) containing the synthesized
            podcast conversation.

    Raises:
        ValueError: If input_type is not supported, if the input text cannot be
            extracted or is empty, or if the conversation contains no spoken text
        RuntimeError: If conversation extraction fails after maximum retries
    """
    # Voice per host; any other speaker name falls back to the "Brian" voice.
    voices = {"Olivia": "af_heart", "Brian": "am_fenrir"}
    default_voice = "am_fenrir"
    # Number of text-to-speech requests issued concurrently.
    batch_size = 8

    async def async_process():
        await wake_up_api()
        start_time = time.time()

        if input_type == "Document":
            text = get_text_from_document(input)
        elif input_type == "Link":
            text = get_text_from_link(input)
        elif input_type == "Text":
            text = input
        else:
            raise ValueError(f"Unsupported input type: {input_type!r}")
        # Applies to every input type: link extraction returns None on failure
        # and OCR may produce no text at all.
        if not text:
            raise ValueError("Input text cannot be empty")

        text_time = time.time()
        print(f"Text Extracted ({text_time - start_time:.2f}s)")

        conversation = extract_conversation(text)
        conversation_time = time.time()
        print(f"Conversation Extracted ({conversation_time - text_time:.2f}s)")

        # Drop turns without any text: the speech endpoint rejects empty input.
        turns = [
            turn
            for turn in conversation["conversation"]
            if isinstance(turn, dict) and str(turn.get("text") or "").strip()
        ]
        if not turns:
            raise ValueError("Conversation contains no spoken text")

        # A per-request id keeps segment files of concurrent requests apart.
        run_id = uuid.uuid4().hex
        audio_files = []
        for i in range(0, len(turns), batch_size):
            batch_tasks = [
                generate_audio(
                    turn["text"],
                    voices.get(turn.get("speaker"), default_voice),
                    f"output_{run_id}_{j}.wav",
                )
                for j, turn in enumerate(turns[i : i + batch_size], start=i)
            ]
            # gather() returns results in task order, so the paths stay in
            # conversation order.
            audio_files.extend(await asyncio.gather(*batch_tasks))

        audio_time = time.time()
        print(f"Audio Generated ({audio_time - conversation_time:.2f}s)")

        merged_audio = merge_audio_files(audio_files)
        merge_time = time.time()
        print(f"Merged Audio Generated ({merge_time - audio_time:.2f}s)")
        print(f"Total Time: {merge_time - start_time:.2f}s")

        return merged_audio

    return asyncio.run(async_process())
|
282 |
+
|
283 |
+
|
284 |
+
# Gradio UI: input controls in the left column, the generated audio on the right.
with gr.Blocks(title="Podcast Generator") as demo:
    gr.Markdown(
        """
# 🎙️ Podcast Generator
Generate engaging podcast conversations from documents, links, or text input.
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Dropdown(
                choices=["Document", "Link", "Text"],
                label="Input Type",
                value="Document",
                interactive=True,
            )
            input_text = gr.Textbox(
                label="Input", placeholder="Enter Document URL, Link or Text", lines=5
            )
            generate_btn = gr.Button("Generate Podcast 🎧", variant="primary")

        with gr.Column(scale=1):
            output_audio = gr.Audio(label="Generated Podcast")

    # Exposed under the API name "generate".
    generate_btn.click(
        fn=generate_podcast,
        inputs=[input_type, input_text],
        outputs=output_audio,
        api_name="generate",
    )

# Launch only when run as a script, so importing this module (for tests or
# tooling) does not start a server as a side effect.
if __name__ == "__main__":
    demo.launch(mcp_server=True)
|
modal/app.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import modal
|
2 |
+
import io
|
3 |
+
|
4 |
+
|
5 |
+
# Container image for the speech service: Debian slim with Python 3.12,
# espeak-ng from apt, the Python packages listed below from pip, and the small
# English spaCy model downloaded at image build time.
image = (
    modal.Image.debian_slim(python_version="3.12")
    .apt_install("espeak-ng")
    .pip_install("kokoro>=0.9.4", "soundfile", "fastapi[standard]", "spacy==3.8.0")
    .run_commands("python -m spacy download en_core_web_sm")
)
app = modal.App("kokoro-api", image=image)
|
11 |
+
|
12 |
+
with image.imports():
|
13 |
+
import os
|
14 |
+
from kokoro import KPipeline
|
15 |
+
from IPython.display import display, Audio
|
16 |
+
import soundfile as sf
|
17 |
+
import torch
|
18 |
+
from fastapi.responses import StreamingResponse, Response, FileResponse
|
19 |
+
import numpy as np
|
20 |
+
import uuid
|
21 |
+
|
22 |
+
|
23 |
+
@app.cls(gpu="t4", scaledown_window=60 * 2, enable_memory_snapshot=True)
@modal.concurrent(max_inputs=30)
class kokoro:
    """Modal class exposing Kokoro text-to-speech over two HTTP endpoints.

    ``generate`` (POST) synthesizes speech and returns WAV bytes; ``wake_up``
    (GET) is a trivial endpoint that always answers 200.
    """

    @modal.enter()
    def load(self):
        """Create the Kokoro pipeline in Modal's enter hook, before requests are served."""
        # NOTE(review): lang_code "a" presumably selects American English;
        # confirm against the Kokoro documentation.
        self.pipeline = KPipeline(lang_code="a")

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate(self, text: str, voice: str = "af_heart"):
        """Synthesize *text* with *voice* and return it as a WAV response.

        Args:
            text: Text to speak; must contain non-whitespace characters.
            voice: Kokoro voice identifier.

        Returns:
            A 200 ``audio/wav`` response with the audio written at 24000 Hz,
            400 when *text* is empty or whitespace, or 422 when the pipeline
            yields no audio.
        """
        if not text.strip():
            return Response(content="Text is empty", status_code=400)

        # The pipeline yields 3-tuples; only the third element (audio) is used.
        audio_chunks = [audio for _, _, audio in self.pipeline(text, voice)]
        if not audio_chunks:
            # np.concatenate raises on an empty list; answer explicitly instead.
            return Response(
                content="No audio was generated for the given text", status_code=422
            )

        audio_combined = np.concatenate(audio_chunks)

        # Encode in memory: writing one uuid-named file per request to the
        # container disk, never deleted, would grow without bound.
        audio_bytes = io.BytesIO()
        sf.write(audio_bytes, audio_combined, 24000, format="WAV")

        file_name = f"{uuid.uuid4()}.wav"
        return Response(
            content=audio_bytes.getvalue(),
            media_type="audio/wav",
            # Keeps the download filename the file-based response used to send.
            headers={"Content-Disposition": f'attachment; filename="{file_name}"'},
        )

    @modal.fastapi_endpoint(docs=True, method="GET")
    def wake_up(self):
        """Return a fixed 200 response; callers use it to warm the service up."""
        return Response(content="Kokoro is awake", status_code=200)
|
54 |
+
|
55 |
+
|
56 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
httpx
|
2 |
+
mistralai==1.8.1
|
3 |
+
modal==1.0.3
|
4 |
+
gradio[mcp]==5.33.0
|
5 |
+
newspaper3k
|
6 |
+
trafilatura
|
7 |
+
scrapling
|
8 |
+
# wave is part of the Python standard library; no PyPI requirement is needed
|