eswardivi committed on
Commit
8f362a9
·
verified ·
1 Parent(s): faaec6f

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +315 -0
  2. modal/app.py +56 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import base64
import json
import os
import re
import tempfile
import time
import uuid
import wave
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import gradio as gr
import httpx
from mistralai import Mistral
from newspaper import Article
from scrapling.fetchers import Fetcher
from trafilatura import extract
21
+
22
# The Mistral API key is read from the environment; a missing
# MISTRAL_API_KEY raises KeyError here, while the module is being loaded.
api_key = os.environ["MISTRAL_API_KEY"]
# Shared Mistral client used by the OCR and chat-completion helpers in this file.
client = Mistral(api_key=api_key)
24
+
25
+
26
def get_text_from_document(document_url: str) -> str:
    """Run Mistral OCR on a document URL and return all pages as one string.

    Each page's markdown is preceded by a ``--- Page N ---`` header (pages
    are numbered from 1) and followed by a blank line.

    Args:
        document_url: URL of the document handed to the OCR endpoint.

    Returns:
        The concatenated per-page text; an empty string when the OCR
        response contains no pages.
    """
    ocr_result = client.ocr.process(
        model="mistral-ocr-latest",
        document={"type": "document_url", "document_url": document_url},
        include_image_base64=False,
    )
    return "".join(
        f"--- Page {number} ---\n{page.markdown}\n\n"
        for number, page in enumerate(ocr_result.pages, start=1)
    )
38
+
39
+
40
def get_text_from_link(link: str) -> Optional[str]:
    """Fetch a web page and return its main text, or ``None`` on failure.

    Extraction is attempted with trafilatura first, on HTML fetched through
    scrapling's ``Fetcher``. If that raises or yields nothing, newspaper is
    used as a fallback and the article text is prefixed with whatever
    metadata the parsed article exposes.

    Args:
        link: URL of the page to scrape.

    Returns:
        The extracted text, or ``None`` when both extractors fail. Callers
        must treat ``None`` as "no text available".
    """
    # First choice: trafilatura. Any failure is logged and falls through to
    # the newspaper fallback instead of aborting.
    try:
        page = Fetcher.get(link, stealthy_headers=True, follow_redirects=True)
        content = extract(page.html_content, with_metadata=True)
    except Exception as e:
        print(f"Trafilatura extraction failed for {link}: {e}")
        content = None
    if content:
        return content

    # Fallback: newspaper. This is best effort as well; failure means None.
    try:
        article = Article(link)
        article.download()
        article.parse()

        metadata_text = f"#Title: {article.title}\n"
        if article.authors:
            metadata_text += f"Authors: {', '.join(article.authors)}\n"
        if article.publish_date:
            metadata_text += f"Published: {article.publish_date}\n"
        # NOTE(review): newspaper appears to populate `keywords` and `summary`
        # only after `article.nlp()`, which is never called here - confirm
        # whether these two branches can ever fire.
        if article.keywords:
            metadata_text += f"Keywords: {', '.join(article.keywords)}\n"
        if article.summary:
            metadata_text += f"Summary: {article.summary}\n\n"

        return metadata_text + article.text
    except Exception as e:
        print(f"Newspaper extraction failed for {link}: {e}")
        return None
68
+
69
+
70
def just_text(text: str) -> str:
    """Validate raw text input and return it unchanged.

    Args:
        text: User-supplied text.

    Returns:
        ``text`` exactly as given (surrounding whitespace is preserved).

    Raises:
        ValueError: If ``text`` is ``None``, empty, or contains only
            whitespace.
    """
    # Whitespace-only input carries no content, so treat it as empty.
    if not text or not text.strip():
        raise ValueError("Input text cannot be empty")
    return text
74
+
75
+
76
def build_prompt(text: str) -> str:
    """Compose the LLM prompt that turns ``text`` into a podcast script.

    The result is the source text, then the conversion instructions, then
    an example of the expected JSON shape, each separated by one newline.

    Args:
        text: Source material the hosts should discuss.

    Returns:
        The full prompt string.
    """
    json_shape = "\n".join(
        (
            "{",
            '"conversation": [',
            '{"speaker": "Olivia", "text": ""},',
            '{"speaker": "Brian", "text": ""}',
            "]",
            "}",
        )
    )
    instructions = "\n".join(
        (
            "",
            "Turn the text above into a casual podcast conversation between two hosts.",
            "",
            "- Use a relaxed, informal tone like you're chatting with a friend",
            "- Include natural conversation fillers like 'you know', 'I mean', 'like'",
            "- Feel free to go off on brief relevant tangents or share quick personal takes",
            "- Keep the back-and-forth flowing naturally",
            "- Cover the key points but maintain a conversational style",
            "- Aim for about 1 minute of casual discussion.",
            "",
            "Output in this JSON format:",
        )
    )
    return f"{text}\n{instructions}\n{json_shape}"
95
+
96
+
97
def _parse_conversation(content: str) -> Dict:
    """Decode a model reply and return it if it holds a usable conversation.

    Raises:
        ValueError: If no JSON object can be decoded, or the object lacks a
            non-empty ``conversation`` list of ``speaker``/``text`` entries.
    """
    try:
        # JSON mode is requested, so the reply should already be pure JSON.
        result = json.loads(content)
    except (TypeError, ValueError):
        # Fallback for replies that wrap the JSON in extra text. The pattern
        # only understands one level of nested braces, which is why it is
        # not the primary path.
        pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
        json_match = re.search(pattern, content or "")
        if not json_match:
            raise ValueError("No valid JSON found in response")
        result = json.loads(json_match.group())

    if not isinstance(result, dict) or "conversation" not in result:
        raise ValueError("Response JSON missing 'conversation' key")

    turns = result["conversation"]
    if not isinstance(turns, list) or not turns:
        raise ValueError("'conversation' must be a non-empty list")
    for turn in turns:
        if not isinstance(turn, dict) or "speaker" not in turn or "text" not in turn:
            raise ValueError("Conversation entries need 'speaker' and 'text' keys")
    return result


def extract_conversation(text: str) -> Dict:
    """Ask the chat model to rewrite ``text`` as a two-host conversation.

    The prompt comes from ``build_prompt``. The request is attempted up to
    three times; a reply that cannot be decoded or does not have the
    expected shape counts as a failed attempt and triggers a retry.

    Args:
        text: Source material for the conversation.

    Returns:
        The decoded reply: a dict whose ``"conversation"`` key holds a
        non-empty list of dicts with ``"speaker"`` and ``"text"`` keys.

    Raises:
        RuntimeError: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    prompt = build_prompt(text)
    max_retries = 3
    last_error = None

    for _ in range(max_retries):
        try:
            chat_completion = client.chat.complete(
                model="codestral-latest",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                response_format={
                    "type": "json_object",
                },
            )
            return _parse_conversation(chat_completion.choices[0].message.content)
        except Exception as e:  # API and parsing failures are both retried
            last_error = e

    raise RuntimeError(
        f"Failed to extract conversation after {max_retries} attempts: {last_error}"
    ) from last_error
146
+
147
+
148
async def generate_audio(text: str, voice: str, file_out_path: str) -> str:
    """Synthesize ``text`` with the Kokoro endpoint and save the audio to disk.

    Args:
        text: Line of dialogue to speak.
        voice: Voice identifier passed through to the endpoint.
        file_out_path: Where the returned audio bytes are written.

    Returns:
        ``file_out_path``, once the file has been written.

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx
            status. Nothing is written in that case, so an error message is
            never mistaken for audio data later on.
        httpx.HTTPError: On timeouts and transport failures.
    """
    url = "https://eswardivi--kokoro-api-kokoro-generate.modal.run/"
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Connection": "keep-alive",
    }
    # Named `http_client` so it does not shadow the module-level Mistral `client`.
    async with httpx.AsyncClient() as http_client:
        response = await http_client.post(
            url,
            headers=headers,
            params={"text": text, "voice": voice},
            timeout=90.0,
        )
    response.raise_for_status()

    with open(file_out_path, "wb") as f:
        f.write(response.content)

    return file_out_path
169
+
170
+
171
def merge_audio_files(audio_files: List[str]) -> str:
    """Concatenate WAV files, in order, into one new WAV file.

    The output takes its parameters (channels, sample width, frame rate)
    from the first input and is written to the current working directory
    under a random UUID-based name. Each input file is deleted once its
    frames have been appended.

    Args:
        audio_files: Paths of the WAV files to join, in playback order.

    Returns:
        Path of the merged ``.wav`` file.

    Raises:
        ValueError: If ``audio_files`` is empty.
        wave.Error: If an input is not a readable WAV file. A partially
            written output file is removed before the error propagates.
    """
    if not audio_files:
        raise ValueError("No audio files to merge")

    merged_file = f"{uuid.uuid4()}.wav"

    with wave.open(audio_files[0], "rb") as first_wav:
        params = first_wav.getparams()

    try:
        # The context manager closes the writer, which finalizes the WAV
        # header, even when reading one of the inputs fails.
        with wave.open(merged_file, "wb") as merged_audio:
            merged_audio.setparams(params)
            for audio_file in audio_files:
                with wave.open(audio_file, "rb") as wav_file:
                    merged_audio.writeframes(
                        wav_file.readframes(wav_file.getnframes())
                    )
                os.remove(audio_file)
    except Exception:
        # Do not leave a truncated output file behind.
        if os.path.exists(merged_file):
            os.remove(merged_file)
        raise

    return merged_file
188
+
189
+
190
async def wake_up_api():
    """Ping the Kokoro wake-up endpoint and print whether it answered 200."""
    wake_url = "https://eswardivi--kokoro-api-kokoro-wake-up.modal.run/"
    async with httpx.AsyncClient() as http:
        reply = await http.get(wake_url, timeout=90.0)
    is_awake = reply.status_code == 200
    print("API is awake" if is_awake else "API is not awake Yet")
198
+
199
+
200
def generate_podcast(input_type: str, input: str):
    """
    Generate a podcast-style conversation from various input types.

    This function takes content from a document URL, webpage link, or raw text and
    converts it into a natural-sounding podcast dialogue between two hosts. The conversation
    is then synthesized into audio using text-to-speech.

    Args:
        input_type (str): The type of input to process. Must be one of:
            - "Document": URL to a document (PDF, etc.) to extract text from
            - "Link": URL to a webpage to scrape content from
            - "Text": Raw text input to convert directly

        input (str): The actual input content matching the specified input_type:
            - For "Document": Document URL (e.g. "https://example.com/doc.pdf")
            - For "Link": Webpage URL (e.g. "https://example.com/article")
            - For "Text": Plain text content

    Returns:
        str: Path to the generated audio file (.wav format) containing the synthesized
            podcast conversation.

    Raises:
        ValueError: If input_type is not one of the supported values, or the input
            text cannot be extracted or is empty
        RuntimeError: If conversation extraction fails after maximum retries, or the
            conversation contains no spoken lines
    """
    # NOTE: the parameter name `input` shadows the builtin, but it is part of
    # the signature wired into the UI/API, so it is kept as is.

    # Speaker name -> TTS voice; unknown speakers get the fallback voice.
    voices = {"Olivia": "af_heart", "Brian": "am_fenrir"}
    fallback_voice = "am_fenrir"
    # Number of TTS requests issued concurrently.
    batch_size = 8

    async def async_process():
        await wake_up_api()
        start_time = time.time()

        if input_type == "Document":
            text = get_text_from_document(input)
        elif input_type == "Link":
            text = get_text_from_link(input)
        elif input_type == "Text":
            text = input
        else:
            # Without this branch `text` would be unbound below.
            raise ValueError(f"Unsupported input type: {input_type!r}")
        if not text:
            raise ValueError("Input text cannot be empty")

        text_time = time.time()
        print(f"Text Extracted ({text_time - start_time:.2f}s)")

        conversation = extract_conversation(text)
        conversation_time = time.time()
        print(f"Conversation Extracted ({conversation_time - text_time:.2f}s)")

        # Lines without any text cannot be synthesized, so skip them.
        messages = [
            message
            for message in conversation["conversation"]
            if str(message.get("text") or "").strip()
        ]
        if not messages:
            raise RuntimeError("Conversation contains no spoken lines")

        # Per-request scratch directory: concurrent requests must not
        # overwrite each other's intermediate clips.
        with tempfile.TemporaryDirectory() as work_dir:
            audio_files = []
            for i in range(0, len(messages), batch_size):
                batch_tasks = [
                    generate_audio(
                        message["text"],
                        voices.get(message.get("speaker"), fallback_voice),
                        # The clips are merged with the `wave` module, so
                        # they are named .wav rather than .mp3.
                        os.path.join(work_dir, f"output_{j}.wav"),
                    )
                    for j, message in enumerate(messages[i : i + batch_size], start=i)
                ]
                # gather() preserves order, so the paths stay in dialogue order.
                audio_files.extend(await asyncio.gather(*batch_tasks))

            audio_time = time.time()
            print(f"Audio Generated ({audio_time - conversation_time:.2f}s)")

            merged_audio = merge_audio_files(audio_files)

        merge_time = time.time()
        print(f"Merged Audio Generated ({merge_time - audio_time:.2f}s)")
        print(f"Total Time: {merge_time - start_time:.2f}s")

        return merged_audio

    return asyncio.run(async_process())
282
+
283
+
284
# Gradio front end: an input-type selector, an input box and a button on the
# left; the generated audio on the right.
with gr.Blocks(title="Podcast Generator") as demo:
    gr.Markdown(
        """
# 🎙️ Podcast Generator
Generate engaging podcast conversations from documents, links, or text input.
"""
    )

    with gr.Row():
        # Left column: input controls.
        with gr.Column(scale=1):
            input_type = gr.Dropdown(
                choices=["Document", "Link", "Text"],
                label="Input Type",
                value="Document",
                interactive=True,
            )
            input_text = gr.Textbox(
                label="Input", placeholder="Enter Document URL, Link or Text", lines=5
            )
            generate_btn = gr.Button("Generate Podcast 🎧", variant="primary")

        # Right column: output player.
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="Generated Podcast")

    # The button runs generate_podcast(input_type, input_text) and feeds the
    # returned file path to the audio component.
    generate_btn.click(
        fn=generate_podcast,
        inputs=[input_type, input_text],
        outputs=output_audio,
        api_name="generate",
    )

# Starts the app whenever this module is executed.
# NOTE(review): mcp_server=True presumably also exposes the API as an MCP
# tool (requirements pin gradio[mcp]) - confirm.
demo.launch(mcp_server=True)
modal/app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import modal
2
+ import io
3
+
4
+
5
# Container image for the TTS service: Debian slim with Python 3.12 and the
# espeak-ng system package.
image = modal.Image.debian_slim(python_version="3.12").apt_install("espeak-ng")
# Python packages installed into the image.
image = image.pip_install(
    "kokoro>=0.9.4", "soundfile", "fastapi[standard]", "spacy==3.8.0"
)
# Download the small English spaCy model as an image build step.
image = image.run_commands("python -m spacy download en_core_web_sm")
# Modal app named "kokoro-api", built on the image defined above.
app = modal.App("kokoro-api", image=image)
11
+
12
+ with image.imports():
13
+ import os
14
+ from kokoro import KPipeline
15
+ from IPython.display import display, Audio
16
+ import soundfile as sf
17
+ import torch
18
+ from fastapi.responses import StreamingResponse, Response, FileResponse
19
+ import numpy as np
20
+ import uuid
21
+
22
+
23
@app.cls(gpu="t4", scaledown_window=60 * 2, enable_memory_snapshot=True)
@modal.concurrent(max_inputs=30)
class kokoro:
    """Modal class exposing Kokoro text-to-speech over two web endpoints.

    NOTE(review): the lowercase class name is kept on purpose - the client
    calls URLs containing "kokoro-api-kokoro-generate", which presumably
    derive from the app and class names; confirm before renaming.
    """

    @modal.enter()
    def load(self):
        """Create the Kokoro pipeline once when the container starts."""
        self.pipeline = KPipeline(lang_code="a")

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate(self, text: str, voice: str = "af_heart"):
        """Synthesize ``text`` and return it as a WAV response.

        Args:
            text: Text to speak.
            voice: Kokoro voice identifier.

        Returns:
            A 200 response with ``audio/wav`` content sampled at 24 kHz, or
            a 400 response when the text is blank or produces no audio.
        """
        if not text.strip():
            return Response(content="Text is empty", status_code=400)

        # The pipeline yields 3-tuples; only the audio element is needed.
        chunks = [audio for _, _, audio in self.pipeline(text, voice)]
        if not chunks:
            # np.concatenate() raises on an empty list, so answer explicitly.
            return Response(content="No audio generated", status_code=400)

        audio_combined = np.concatenate(chunks)

        # Encode in memory instead of writing a uniquely named file per
        # request that would never be deleted from the container's disk.
        audio_bytes = io.BytesIO()
        sf.write(audio_bytes, audio_combined, 24000, format="WAV")

        file_name = f"{uuid.uuid4()}.wav"
        return Response(
            content=audio_bytes.getvalue(),
            media_type="audio/wav",
            headers={"Content-Disposition": f'attachment; filename="{file_name}"'},
        )

    @modal.fastapi_endpoint(docs=True, method="GET")
    def wake_up(self):
        """Cheap endpoint clients can call to make sure a container is up."""
        return Response(content="Kokoro is awake", status_code=200)
54
+
55
+
56
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ httpx
2
+ mistralai==1.8.1
3
+ modal==1.0.3
4
+ gradio[mcp]==5.33.0
5
+ newspaper3k
6
+ trafilatura
7
+ scrapling
8
+ wave