Spaces:
Running
Running
Upload 2 files
Browse files- app.py +656 -0
- requirements.txt +10 -0
app.py
ADDED
@@ -0,0 +1,656 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import google.generativeai as genai
|
3 |
+
from gtts import gTTS
|
4 |
+
import pyttsx3
|
5 |
+
from pathlib import Path
|
6 |
+
import tempfile
|
7 |
+
import os
|
8 |
+
from uuid import uuid4
|
9 |
+
import time
|
10 |
+
import asyncio
|
11 |
+
import edge_tts
|
12 |
+
import numpy as np
|
13 |
+
import soundfile as sf
|
14 |
+
import re
|
15 |
+
|
16 |
+
# Voice configurations for different speaker counts: each entry is an
# ordered list of {name, voice, gender} records mapping a display name to
# a Microsoft Edge neural voice.
def _speakers(*entries):
    """Expand (name, voice, gender) tuples into speaker config dicts."""
    return [{"name": name, "voice": voice, "gender": gender}
            for name, voice, gender in entries]


VOICE_CONFIGS = {
    "2_speakers": _speakers(
        ("Alex", "en-US-AriaNeural", "female"),
        ("Brian", "en-US-GuyNeural", "male"),
    ),
    "3_speakers": _speakers(
        ("Sarah", "en-US-JennyNeural", "female"),
        ("Mike", "en-US-BrandonNeural", "male"),
        ("Emma", "en-US-AriaNeural", "female"),
    ),
    "4_speakers": _speakers(
        ("Sarah", "en-US-JennyNeural", "female"),
        ("Mike", "en-US-BrandonNeural", "male"),
        ("Emma", "en-US-AriaNeural", "female"),
        ("David", "en-US-GuyNeural", "male"),
    ),
}
|
34 |
+
|
35 |
+
# Module-level Gemini model handle; populated by init_gemini().
client = None


def init_gemini(api_key):
    """Configure the Gemini SDK and create a model handle.

    On success the module-level ``client`` is replaced with a
    GenerativeModel instance. Always returns a human-readable status
    string for display in the UI.
    """
    global client
    if not api_key:
        return "ℹ️ Add Gemini API key for better summaries"
    try:
        genai.configure(api_key=api_key)
        client = genai.GenerativeModel('gemini-2.0-flash')
    except Exception as e:
        return f"❌ Gemini API error: {str(e)}"
    return "✅ Gemini API connected successfully!"
|
49 |
+
|
50 |
+
def generate_with_gtts(text, filename):
    """Synthesize *text* to *filename* using Google's gTTS (online).

    Returns (filename, None) on success or (None, error_message) on
    failure.
    """
    try:
        gTTS(text=text, lang='en', slow=False).save(filename)
    except Exception as e:
        return None, f"gTTS Error: {str(e)}"
    return filename, None
|
58 |
+
|
59 |
+
async def generate_with_edge_tts(text, voice, filename):
    """Synthesize *text* to *filename* with the given Edge neural voice.

    Returns (filename, None) on success or (None, error_message) on
    failure.
    """
    try:
        await edge_tts.Communicate(text, voice).save(filename)
    except Exception as e:
        return None, f"Edge TTS Error: {str(e)}"
    return filename, None
|
67 |
+
|
68 |
+
def combine_audio_files(audio_files, output_filename):
    """Concatenate audio files into one, with a 0.5 s pause between clips.

    All clips are resampled to the sample rate of the first readable file.
    Missing files are skipped. Returns (output_filename, None) on success
    or (None, error_message) on failure.

    Fixes vs. the previous version:
    - the inserted silence now matches the channel layout of the
      surrounding clip (a 1-D pause next to 2-D multi-channel data would
      make ``np.concatenate`` raise);
    - no stray pause is appended after the final clip.
    """
    try:
        from scipy.signal import resample

        segments = []
        sample_rate = None

        for audio_file in audio_files:
            if not os.path.exists(audio_file):
                continue
            data, sr = sf.read(audio_file)
            if sample_rate is None:
                sample_rate = sr
            elif sr != sample_rate:
                # Resample along the time axis to the reference rate.
                data = resample(data, int(len(data) * sample_rate / sr))

            if segments:
                # 0.5 second of silence with the same channel layout as
                # the clip that follows it.
                pause_shape = (int(sample_rate * 0.5),) + data.shape[1:]
                segments.append(np.zeros(pause_shape, dtype=data.dtype))
            segments.append(data)

        if not segments:
            return None, "No audio files to combine"
        sf.write(output_filename, np.concatenate(segments), sample_rate)
        return output_filename, None
    except Exception as e:
        return None, f"Audio combination error: {str(e)}"
|
97 |
+
|
98 |
+
async def generate_multi_speaker_audio(script_parts, speaker_count, output_filename):
    """Render each (text, speaker_index) part with its speaker's Edge TTS
    voice, then stitch the clips into *output_filename*.

    Returns (output_filename, None) on success or (None, error_message).

    Fixes vs. the previous version: bare ``except:`` clauses narrowed to
    ``OSError``, and intermediate clips are written to the system temp
    directory instead of the current working directory.
    """
    def _cleanup(paths):
        # Best-effort removal of intermediate clips.
        for path in paths:
            try:
                os.unlink(path)
            except OSError:
                pass

    try:
        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        audio_files = []

        for i, (speaker_text, speaker_idx) in enumerate(script_parts):
            voice = voice_config[speaker_idx]["voice"]
            temp_filename = os.path.join(
                tempfile.gettempdir(),
                f"temp_speaker_{i}_{uuid4().hex[:8]}.wav",
            )

            result, error = await generate_with_edge_tts(speaker_text, voice, temp_filename)
            if not result:
                _cleanup(audio_files)
                return None, f"Error generating voice {i+1}: {error}"
            audio_files.append(temp_filename)

        # Stitch the per-speaker clips together, then drop the temps.
        final_file, error = combine_audio_files(audio_files, output_filename)
        _cleanup(audio_files)
        return final_file, error
    except Exception as e:
        return None, f"Multi-speaker generation error: {str(e)}"
|
133 |
+
|
134 |
+
def generate_with_pyttsx3(text, filename):
    """Synthesize *text* to *filename* using the local system TTS engine.

    Returns (filename, None) on success or (None, error_message) on
    failure.
    """
    try:
        engine = pyttsx3.init()

        # Slightly faster speech, slightly below maximum volume.
        engine.setProperty('rate', 180)
        engine.setProperty('volume', 0.9)

        # Prefer a female/Zira voice when the system offers one.
        preferred = next(
            (v for v in engine.getProperty('voices')
             if 'female' in v.name.lower() or 'zira' in v.name.lower()),
            None,
        )
        if preferred is not None:
            engine.setProperty('voice', preferred.id)

        engine.save_to_file(text, filename)
        engine.runAndWait()
        return filename, None
    except Exception as e:
        return None, f"pyttsx3 Error: {str(e)}"
|
155 |
+
|
156 |
+
def generate_podcast_script(text, speaker_count, use_gemini):
    """Produce a podcast script from raw *text*.

    When Gemini is enabled and configured, asks the model to rewrite the
    text as a conversation between named hosts (lines labelled
    "Name: ..."). Otherwise — including on any API failure — returns the
    raw text truncated to 2000 characters.
    """
    # Plain-text fallback used whenever the AI path is unavailable.
    fallback = text[:2000] + ("..." if len(text) > 2000 else "")

    if not (use_gemini and client):
        return fallback

    try:
        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        speaker_names = [config["name"] for config in voice_config]

        prompt = f"""Create an engaging podcast conversation between {speaker_count} hosts: {', '.join(speaker_names)}.

Transform this text into a natural conversation where each speaker contributes meaningfully.

Guidelines:
- Make it sound like a real podcast discussion
- Each speaker should have distinct perspectives and speaking styles
- Include natural transitions and interactions
- Keep it under 2500 characters total
- Use speaker names clearly (e.g., "Sarah: Hello everyone...")
- Make it conversational and engaging

Original text: {text[:3000]}

Format the output with clear speaker labels like:
Speaker1: [text]
Speaker2: [text]
etc."""

        response = client.generate_content(prompt)
        return response.text
    except Exception:
        # Bug fix: the previous version returned the error message itself,
        # which downstream code then parsed and synthesized as the spoken
        # "script". Fall back to the plain truncated text instead.
        return fallback
|
190 |
+
|
191 |
+
def parse_script_for_speakers(script, speaker_count):
    """Split *script* into a list of (text, speaker_index) chunks.

    Lines beginning with "<Name>:" (case-insensitive, names taken from
    VOICE_CONFIGS) switch the active speaker; unlabelled lines are
    appended to the current speaker's running text. If no labels are
    found, sentences are distributed roughly evenly across the speakers.
    On any error the whole script is assigned to speaker 0.
    """
    try:
        names = [cfg["name"] for cfg in VOICE_CONFIGS[f"{speaker_count}_speakers"]]

        parts = []
        active = 0
        buffer = ""

        for raw_line in script.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue

            matched = False
            for idx, name in enumerate(names):
                prefix = f"{name.lower()}:"
                if stripped.lower().startswith(prefix):
                    # Flush the previous speaker's accumulated text.
                    if buffer.strip():
                        parts.append((buffer.strip(), active))
                    active = idx
                    buffer = stripped[len(prefix):].strip()
                    matched = True
                    break

            if not matched:
                buffer += " " + stripped

        # Flush whatever the final speaker said.
        if buffer.strip():
            parts.append((buffer.strip(), active))

        # No labels at all: split the text into sentences and hand each
        # speaker an even share, the last one taking the remainder.
        if not parts and script.strip():
            sentences = script.split('. ')
            per_speaker = max(1, len(sentences) // speaker_count)

            for idx in range(speaker_count):
                start = idx * per_speaker
                if idx == speaker_count - 1:
                    chunk = sentences[start:]
                else:
                    chunk = sentences[start:start + per_speaker]

                if chunk:
                    chunk_text = '. '.join(chunk)
                    if not chunk_text.endswith('.'):
                        chunk_text += '.'
                    parts.append((chunk_text, idx))

        return parts
    except Exception:
        # Defensive fallback: treat everything as a single-speaker script.
        return [(script, 0)]
|
250 |
+
|
251 |
+
def create_podcast(text, use_gemini, tts_engine, speaker_count, progress=gr.Progress()):
    """Generate podcast audio from raw text with the selected engine.

    Returns a 3-tuple ``(audio_bytes | None, status_message, script)``.

    Fixes vs. the previous version: the hand-rolled event-loop setup is
    replaced with ``asyncio.run`` and the bare ``except:`` on temp-file
    cleanup is narrowed to ``OSError``.
    """
    progress(0.1, "Starting processing...")

    if not text.strip():
        return None, "❌ Please enter some text first!", ""

    # Step 1: turn the raw text into a (possibly multi-speaker) script.
    progress(0.3, "Generating podcast script...")
    podcast_script = generate_podcast_script(text, speaker_count, use_gemini)

    progress(0.5, "Parsing script for speakers...")
    script_parts = parse_script_for_speakers(podcast_script, speaker_count)

    progress(0.6, "Generating audio with multiple voices...")

    # Step 2: synthesize audio into a temp file.
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_filename = tmp_file.name

        if tts_engine == "Multi-Speaker (Edge TTS - Best Quality)" and speaker_count > 1:
            # Edge TTS is async; run it to completion on a fresh loop.
            audio_file, error = asyncio.run(
                generate_multi_speaker_audio(script_parts, speaker_count, temp_filename)
            )
        elif tts_engine == "gTTS (Online - Single Voice)":
            # Single voice: flatten the script into one narration.
            # NOTE(review): gTTS emits MP3 data even though the temp file
            # carries a .wav suffix — most players cope, but confirm.
            full_text = " ".join(part[0] for part in script_parts)
            audio_file, error = generate_with_gtts(full_text, temp_filename)
        else:
            # Offline fallback via the system TTS engine.
            full_text = " ".join(part[0] for part in script_parts)
            audio_file, error = generate_with_pyttsx3(full_text, temp_filename)

        if error:
            return None, f"❌ {error}", ""

        progress(0.9, "Finalizing...")

        # Read the generated audio back as bytes for the caller.
        with open(audio_file, 'rb') as f:
            audio_data = f.read()

        # Clean up the temp file; ignore filesystem races only.
        try:
            os.unlink(audio_file)
        except OSError:
            pass

        progress(1.0, "Complete!")

        return audio_data, "✅ Podcast generated successfully!", podcast_script

    except Exception as e:
        return None, f"❌ Audio generation failed: {str(e)}", ""
|
315 |
+
|
316 |
+
# Custom CSS for the Gradio layout.
# Fix: the .status-message / .status-success / .status-error / .status-info
# rules were previously declared twice with conflicting properties; they
# are merged here into a single set that preserves the effective
# (cascade-resolved) styling of the original — the later shorthand
# background/color declarations win, while the earlier 1px border plus the
# later border-left override combine.
css = """
.gradio-container {
    max-width: 900px !important;
    margin: 0 auto !important;
}
.container {
    padding: 20px;
}
.header {
    text-align: center;
    margin-bottom: 30px;
}
.header h1 {
    color: #2563eb;
    font-size: 2.5em;
    margin-bottom: 10px;
}
.header p {
    color: #6b7280;
    font-size: 1.1em;
}
.section {
    background: white;
    padding: 25px;
    border-radius: 12px;
    margin-bottom: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
}
.section h2 {
    color: #374151;
    margin-bottom: 15px;
    font-size: 1.4em;
}
.input-text {
    min-height: 200px;
    resize: vertical;
}
.output-audio {
    text-align: center;
}
.output-script {
    background: #f8fafc;
    padding: 20px;
    border-radius: 8px;
    border-left: 4px solid #2563eb;
    max-height: 300px;
    overflow-y: auto;
}

.speaker-info {
    background: linear-gradient(135deg, #ffeaa7 0%, #fab1a0 100%);
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
    border: 1px solid #fdcb6e;
    font-weight: bold;
}

.status-message {
    padding: 15px;
    border-radius: 8px;
    font-weight: bold;
    margin: 10px 0;
}

.status-success {
    border: 1px solid #c3e6cb;
    background: #dcfce7;
    color: #166534;
    border-left: 4px solid #22c55e;
}

.status-error {
    border: 1px solid #f5c6cb;
    background: #fee2e2;
    color: #991b1b;
    border-left: 4px solid #ef4444;
}

.status-info {
    border: 1px solid #99d3ff;
    background: #dbeafe;
    color: #1e40af;
    border-left: 4px solid #3b82f6;
}
.instructions {
    background: #f0f9ff;
    padding: 20px;
    border-radius: 8px;
    border-left: 4px solid #0ea5e9;
}
.instructions h3 {
    color: #0369a1;
    margin-bottom: 10px;
}
.btn-generate {
    background: linear-gradient(135deg, #2563eb, #1d4ed8) !important;
    color: white !important;
    font-weight: bold !important;
    padding: 12px 24px !important;
    border-radius: 8px !important;
}
.btn-generate:hover {
    background: linear-gradient(135deg, #1d4ed8, #1e40af) !important;
}
"""
|
440 |
+
|
441 |
+
# ---------------------------------------------------------------------------
# Gradio interface: layout, event wiring, and application entry point.
# Fixes vs. the previous version: the unused enumerate() index in
# get_speaker_info is dropped, and generate_podcast_wrapper returns
# explicit tuples instead of incrementally building a list.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_classes="container"):
        # Page header
        with gr.Column(elem_classes="header"):
            gr.Markdown("# 🎙️ Blog to Podcast Converter")
            gr.Markdown("Transform your text into engaging podcast audio using AI")

        # API configuration
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 🔑 API Configuration")
            api_key = gr.Textbox(
                label="Gemini API Key (Optional)",
                type="password",
                placeholder="Enter your Google Gemini API key for better summaries...",
                info="Get a free key from https://aistudio.google.com/"
            )
            api_status = gr.Textbox(
                label="API Status",
                interactive=False,
                value="ℹ️ Add Gemini API key for AI-powered summaries"
            )
            # Re-validate the key every time it changes.
            api_key.change(init_gemini, inputs=api_key, outputs=api_status)

        # Text input
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 📝 Input Text")
            input_text = gr.Textbox(
                label="Paste your blog post or article text",
                placeholder="Enter your text here... (2000+ characters works best)",
                lines=8,
                elem_classes="input-text"
            )

        # Podcast configuration
        with gr.Column(elem_classes="section"):
            gr.Markdown("## ⚙️ Podcast Configuration")

            with gr.Row():
                speaker_count = gr.Radio(
                    label="Number of Speakers",
                    choices=[1, 2, 3, 4],
                    value=2,
                    info="Choose how many voices/speakers for your podcast"
                )

                use_gemini = gr.Checkbox(
                    label="Use AI for better summaries",
                    value=True,
                    info="Requires valid Gemini API key above"
                )

            tts_engine = gr.Radio(
                label="Voice Engine",
                choices=[
                    "Multi-Speaker (Edge TTS - Best Quality)",
                    "gTTS (Online - Single Voice)",
                    "pyttsx3 (Offline - Single Voice)"
                ],
                value="Multi-Speaker (Edge TTS - Best Quality)",
                info="Edge TTS provides realistic multi-speaker conversations"
            )

        # Generate button
        generate_btn = gr.Button(
            "🎙️ Generate Podcast",
            elem_classes="btn-generate",
            size="lg"
        )

        # Output area (player, download, speaker roster, script)
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 🎧 Generated Podcast")

            status_msg = gr.HTML(
                value="<div class='status-message status-info'>Ready to generate podcast...</div>"
            )

            with gr.Row():
                audio_output = gr.Audio(
                    label="Generated Podcast",
                    type="filepath",
                    visible=False
                )
                download_btn = gr.DownloadButton(
                    "⬇️ Download Podcast",
                    visible=False,
                    variant="secondary"
                )

            speaker_info = gr.HTML(
                value="",
                visible=False
            )

            script_output = gr.Textbox(
                label="Podcast Script",
                visible=False,
                lines=8,
                elem_classes="output-script"
            )

        # Usage instructions
        with gr.Column(elem_classes="instructions"):
            gr.Markdown("### ℹ️ How to Use")
            gr.Markdown("""
1. **Optional**: Enter your Gemini API key for AI-powered conversation generation
2. **Paste your text** in the input box (articles, blogs, etc.)
3. **Choose number of speakers** (1-4) for different conversation styles
4. **Select voice engine**:
   - Multi-Speaker Edge TTS (best quality, realistic voices)
   - gTTS (single voice, good quality)
   - pyttsx3 (offline, system voice)
5. **Click Generate Podcast** and wait for processing
6. **Listen and download** your podcast!

**Speaker Configurations**:
- **1 Speaker**: Solo narration
- **2 Speakers**: Host conversation (Alex & Brian)
- **3 Speakers**: Panel discussion (Sarah, Mike & Emma)
- **4 Speakers**: Full roundtable (Sarah, Mike, Emma & David)

**Tips**:
- For best results, use 500-3000 characters of text
- Multi-speaker works best with Gemini AI enabled
- Edge TTS provides the most realistic conversations
""")

    def get_speaker_info(speaker_count):
        """Return an HTML summary of the voices used for *speaker_count*."""
        if speaker_count == 1:
            return "<div class='speaker-info'><b>Single Speaker Mode</b><br/>Solo narration with one voice</div>"

        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        speakers_html = "<div class='speaker-info'><b>Speakers in this podcast:</b><br/>"
        for config in voice_config:
            speakers_html += f"🎤 <b>{config['name']}</b> ({config['gender']} voice)<br/>"
        speakers_html += "</div>"
        return speakers_html

    def update_status(message, success=True):
        """Wrap *message* in a styled status <div> for the status_msg HTML."""
        status_class = "status-success" if success else "status-error"
        if "Ready" in message or "ℹ️" in message:
            status_class = "status-info"
        return f"<div class='status-message {status_class}'>{message}</div>"

    def generate_podcast_wrapper(text, use_gemini, tts_engine, speaker_count, progress=gr.Progress()):
        """Run create_podcast() and map its result onto the UI components.

        Returns (status_html, audio_path, download_path, speaker_html,
        script_text) matching the outputs list wired to generate_btn.
        """
        audio_data, message, script = create_podcast(text, use_gemini, tts_engine, speaker_count, progress)
        status_html = update_status(message, success=audio_data is not None)

        if audio_data is None:
            return status_html, None, None, "", script

        # Persist the audio so both the player and the download button can
        # reference the same file.
        filename = f"podcast_{speaker_count}speakers_{uuid4().hex[:8]}.wav"
        filepath = os.path.join(tempfile.gettempdir(), filename)
        with open(filepath, 'wb') as f:
            f.write(audio_data)

        return status_html, filepath, filepath, get_speaker_info(speaker_count), script

    # Generate on click.
    generate_btn.click(
        fn=generate_podcast_wrapper,
        inputs=[input_text, use_gemini, tts_engine, speaker_count],
        outputs=[status_msg, audio_output, download_btn, speaker_info, script_output]
    )

    # Refresh the speaker roster preview when the count changes.
    speaker_count.change(
        fn=get_speaker_info,
        inputs=speaker_count,
        outputs=speaker_info
    )

    def toggle_visibility(audio_data):
        """Show the result components only once audio is available."""
        has_audio = audio_data is not None
        return (
            gr.Audio(visible=has_audio),
            gr.DownloadButton(visible=has_audio),
            gr.HTML(visible=has_audio),
            gr.Textbox(visible=has_audio)
        )

    # Reveal/hide the result components whenever the audio value changes.
    audio_output.change(
        fn=toggle_visibility,
        inputs=audio_output,
        outputs=[audio_output, download_btn, speaker_info, script_output]
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
gtts
|
3 |
+
pyttsx3
|
4 |
+
requests
|
5 |
+
# NOTE: "uuid" removed — it is part of the Python standard library; the PyPI package of that name is an obsolete Python 2 shim and can shadow the stdlib module.
|
6 |
+
google-generativeai
|
7 |
+
edge-tts
|
8 |
+
soundfile
|
9 |
+
numpy
|
10 |
+
scipy
|