Spaces:
Running
Running
Upload 2 files
Browse files- app.py +656 -0
- requirements.txt +10 -0
app.py
ADDED
@@ -0,0 +1,656 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import google.generativeai as genai
|
3 |
+
from gtts import gTTS
|
4 |
+
import pyttsx3
|
5 |
+
from pathlib import Path
|
6 |
+
import tempfile
|
7 |
+
import os
|
8 |
+
from uuid import uuid4
|
9 |
+
import time
|
10 |
+
import asyncio
|
11 |
+
import edge_tts
|
12 |
+
import numpy as np
|
13 |
+
import soundfile as sf
|
14 |
+
import re
|
15 |
+
|
16 |
+
# Voice configurations for different speaker counts: each entry is an
# ordered list of {name, voice, gender} records mapping a display name to
# a Microsoft Edge neural voice.
def _speakers(*entries):
    """Expand (name, voice, gender) tuples into speaker config dicts."""
    return [{"name": name, "voice": voice, "gender": gender}
            for name, voice, gender in entries]


VOICE_CONFIGS = {
    "2_speakers": _speakers(
        ("Alex", "en-US-AriaNeural", "female"),
        ("Brian", "en-US-GuyNeural", "male"),
    ),
    "3_speakers": _speakers(
        ("Sarah", "en-US-JennyNeural", "female"),
        ("Mike", "en-US-BrandonNeural", "male"),
        ("Emma", "en-US-AriaNeural", "female"),
    ),
    "4_speakers": _speakers(
        ("Sarah", "en-US-JennyNeural", "female"),
        ("Mike", "en-US-BrandonNeural", "male"),
        ("Emma", "en-US-AriaNeural", "female"),
        ("David", "en-US-GuyNeural", "male"),
    ),
}
|
34 |
+
|
35 |
+
# Module-level Gemini model handle; populated by init_gemini().
client = None


def init_gemini(api_key):
    """Configure the Gemini SDK and create a model handle.

    On success the module-level ``client`` is replaced with a
    GenerativeModel instance. Always returns a human-readable status
    string for display in the UI.
    """
    global client
    if not api_key:
        return "ℹ️ Add Gemini API key for better summaries"
    try:
        genai.configure(api_key=api_key)
        client = genai.GenerativeModel('gemini-2.0-flash')
    except Exception as e:
        return f"❌ Gemini API error: {str(e)}"
    return "✅ Gemini API connected successfully!"
|
49 |
+
|
50 |
+
def generate_with_gtts(text, filename):
    """Synthesize *text* to *filename* using Google's gTTS (online).

    Returns (filename, None) on success or (None, error_message) on
    failure.
    """
    try:
        gTTS(text=text, lang='en', slow=False).save(filename)
    except Exception as e:
        return None, f"gTTS Error: {str(e)}"
    return filename, None
|
58 |
+
|
59 |
+
async def generate_with_edge_tts(text, voice, filename):
    """Synthesize *text* to *filename* with the given Edge neural voice.

    Returns (filename, None) on success or (None, error_message) on
    failure.
    """
    try:
        await edge_tts.Communicate(text, voice).save(filename)
    except Exception as e:
        return None, f"Edge TTS Error: {str(e)}"
    return filename, None
|
67 |
+
|
68 |
+
def combine_audio_files(audio_files, output_filename):
    """Concatenate audio files into one, with a 0.5 s pause between clips.

    All clips are resampled to the sample rate of the first readable file.
    Missing files are skipped. Returns (output_filename, None) on success
    or (None, error_message) on failure.

    Fixes vs. the previous version:
    - the inserted silence now matches the channel layout of the
      surrounding clip (a 1-D pause next to 2-D multi-channel data would
      make ``np.concatenate`` raise);
    - no stray pause is appended after the final clip.
    """
    try:
        from scipy.signal import resample

        segments = []
        sample_rate = None

        for audio_file in audio_files:
            if not os.path.exists(audio_file):
                continue
            data, sr = sf.read(audio_file)
            if sample_rate is None:
                sample_rate = sr
            elif sr != sample_rate:
                # Resample along the time axis to the reference rate.
                data = resample(data, int(len(data) * sample_rate / sr))

            if segments:
                # 0.5 second of silence with the same channel layout as
                # the clip that follows it.
                pause_shape = (int(sample_rate * 0.5),) + data.shape[1:]
                segments.append(np.zeros(pause_shape, dtype=data.dtype))
            segments.append(data)

        if not segments:
            return None, "No audio files to combine"
        sf.write(output_filename, np.concatenate(segments), sample_rate)
        return output_filename, None
    except Exception as e:
        return None, f"Audio combination error: {str(e)}"
|
97 |
+
|
98 |
+
async def generate_multi_speaker_audio(script_parts, speaker_count, output_filename):
    """Render each (text, speaker_index) part with its speaker's Edge TTS
    voice, then stitch the clips into *output_filename*.

    Returns (output_filename, None) on success or (None, error_message).

    Fixes vs. the previous version: bare ``except:`` clauses narrowed to
    ``OSError``, and intermediate clips are written to the system temp
    directory instead of the current working directory.
    """
    def _cleanup(paths):
        # Best-effort removal of intermediate clips.
        for path in paths:
            try:
                os.unlink(path)
            except OSError:
                pass

    try:
        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        audio_files = []

        for i, (speaker_text, speaker_idx) in enumerate(script_parts):
            voice = voice_config[speaker_idx]["voice"]
            temp_filename = os.path.join(
                tempfile.gettempdir(),
                f"temp_speaker_{i}_{uuid4().hex[:8]}.wav",
            )

            result, error = await generate_with_edge_tts(speaker_text, voice, temp_filename)
            if not result:
                _cleanup(audio_files)
                return None, f"Error generating voice {i+1}: {error}"
            audio_files.append(temp_filename)

        # Stitch the per-speaker clips together, then drop the temps.
        final_file, error = combine_audio_files(audio_files, output_filename)
        _cleanup(audio_files)
        return final_file, error
    except Exception as e:
        return None, f"Multi-speaker generation error: {str(e)}"
|
133 |
+
|
134 |
+
def generate_with_pyttsx3(text, filename):
    """Synthesize *text* to *filename* using the local system TTS engine.

    Returns (filename, None) on success or (None, error_message) on
    failure.
    """
    try:
        engine = pyttsx3.init()

        # Slightly faster speech, slightly below maximum volume.
        engine.setProperty('rate', 180)
        engine.setProperty('volume', 0.9)

        # Prefer a female/Zira voice when the system offers one.
        preferred = next(
            (v for v in engine.getProperty('voices')
             if 'female' in v.name.lower() or 'zira' in v.name.lower()),
            None,
        )
        if preferred is not None:
            engine.setProperty('voice', preferred.id)

        engine.save_to_file(text, filename)
        engine.runAndWait()
        return filename, None
    except Exception as e:
        return None, f"pyttsx3 Error: {str(e)}"
|
155 |
+
|
156 |
+
def generate_podcast_script(text, speaker_count, use_gemini):
    """Produce a podcast script from raw *text*.

    When Gemini is enabled and configured, asks the model to rewrite the
    text as a conversation between named hosts (lines labelled
    "Name: ..."). Otherwise — including on any API failure — returns the
    raw text truncated to 2000 characters.
    """
    # Plain-text fallback used whenever the AI path is unavailable.
    fallback = text[:2000] + ("..." if len(text) > 2000 else "")

    if not (use_gemini and client):
        return fallback

    try:
        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        speaker_names = [config["name"] for config in voice_config]

        prompt = f"""Create an engaging podcast conversation between {speaker_count} hosts: {', '.join(speaker_names)}.

Transform this text into a natural conversation where each speaker contributes meaningfully.

Guidelines:
- Make it sound like a real podcast discussion
- Each speaker should have distinct perspectives and speaking styles
- Include natural transitions and interactions
- Keep it under 2500 characters total
- Use speaker names clearly (e.g., "Sarah: Hello everyone...")
- Make it conversational and engaging

Original text: {text[:3000]}

Format the output with clear speaker labels like:
Speaker1: [text]
Speaker2: [text]
etc."""

        response = client.generate_content(prompt)
        return response.text
    except Exception:
        # Bug fix: the previous version returned the error message itself,
        # which downstream code then parsed and synthesized as the spoken
        # "script". Fall back to the plain truncated text instead.
        return fallback
|
190 |
+
|
191 |
+
def parse_script_for_speakers(script, speaker_count):
    """Split *script* into a list of (text, speaker_index) chunks.

    Lines beginning with "<Name>:" (case-insensitive, names taken from
    VOICE_CONFIGS) switch the active speaker; unlabelled lines are
    appended to the current speaker's running text. If no labels are
    found, sentences are distributed roughly evenly across the speakers.
    On any error the whole script is assigned to speaker 0.
    """
    try:
        names = [cfg["name"] for cfg in VOICE_CONFIGS[f"{speaker_count}_speakers"]]

        parts = []
        active = 0
        buffer = ""

        for raw_line in script.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue

            matched = False
            for idx, name in enumerate(names):
                prefix = f"{name.lower()}:"
                if stripped.lower().startswith(prefix):
                    # Flush the previous speaker's accumulated text.
                    if buffer.strip():
                        parts.append((buffer.strip(), active))
                    active = idx
                    buffer = stripped[len(prefix):].strip()
                    matched = True
                    break

            if not matched:
                buffer += " " + stripped

        # Flush whatever the final speaker said.
        if buffer.strip():
            parts.append((buffer.strip(), active))

        # No labels at all: split the text into sentences and hand each
        # speaker an even share, the last one taking the remainder.
        if not parts and script.strip():
            sentences = script.split('. ')
            per_speaker = max(1, len(sentences) // speaker_count)

            for idx in range(speaker_count):
                start = idx * per_speaker
                if idx == speaker_count - 1:
                    chunk = sentences[start:]
                else:
                    chunk = sentences[start:start + per_speaker]

                if chunk:
                    chunk_text = '. '.join(chunk)
                    if not chunk_text.endswith('.'):
                        chunk_text += '.'
                    parts.append((chunk_text, idx))

        return parts
    except Exception:
        # Defensive fallback: treat everything as a single-speaker script.
        return [(script, 0)]
|
250 |
+
|
251 |
+
def create_podcast(text, use_gemini, tts_engine, speaker_count, progress=gr.Progress()):
    """Generate podcast audio from raw text with the selected engine.

    Returns a 3-tuple ``(audio_bytes | None, status_message, script)``.

    Fixes vs. the previous version: the hand-rolled event-loop setup is
    replaced with ``asyncio.run`` and the bare ``except:`` on temp-file
    cleanup is narrowed to ``OSError``.
    """
    progress(0.1, "Starting processing...")

    if not text.strip():
        return None, "❌ Please enter some text first!", ""

    # Step 1: turn the raw text into a (possibly multi-speaker) script.
    progress(0.3, "Generating podcast script...")
    podcast_script = generate_podcast_script(text, speaker_count, use_gemini)

    progress(0.5, "Parsing script for speakers...")
    script_parts = parse_script_for_speakers(podcast_script, speaker_count)

    progress(0.6, "Generating audio with multiple voices...")

    # Step 2: synthesize audio into a temp file.
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_filename = tmp_file.name

        if tts_engine == "Multi-Speaker (Edge TTS - Best Quality)" and speaker_count > 1:
            # Edge TTS is async; run it to completion on a fresh loop.
            audio_file, error = asyncio.run(
                generate_multi_speaker_audio(script_parts, speaker_count, temp_filename)
            )
        elif tts_engine == "gTTS (Online - Single Voice)":
            # Single voice: flatten the script into one narration.
            # NOTE(review): gTTS emits MP3 data even though the temp file
            # carries a .wav suffix — most players cope, but confirm.
            full_text = " ".join(part[0] for part in script_parts)
            audio_file, error = generate_with_gtts(full_text, temp_filename)
        else:
            # Offline fallback via the system TTS engine.
            full_text = " ".join(part[0] for part in script_parts)
            audio_file, error = generate_with_pyttsx3(full_text, temp_filename)

        if error:
            return None, f"❌ {error}", ""

        progress(0.9, "Finalizing...")

        # Read the generated audio back as bytes for the caller.
        with open(audio_file, 'rb') as f:
            audio_data = f.read()

        # Clean up the temp file; ignore filesystem races only.
        try:
            os.unlink(audio_file)
        except OSError:
            pass

        progress(1.0, "Complete!")

        return audio_data, "✅ Podcast generated successfully!", podcast_script

    except Exception as e:
        return None, f"❌ Audio generation failed: {str(e)}", ""
|
315 |
+
|
316 |
+
# Custom CSS for the Gradio layout.
# Fix: the .status-message / .status-success / .status-error / .status-info
# rules were previously declared twice with conflicting properties; they
# are merged here into a single set that preserves the effective
# (cascade-resolved) styling of the original — the later shorthand
# background/color declarations win, while the earlier 1px border plus the
# later border-left override combine.
css = """
.gradio-container {
    max-width: 900px !important;
    margin: 0 auto !important;
}
.container {
    padding: 20px;
}
.header {
    text-align: center;
    margin-bottom: 30px;
}
.header h1 {
    color: #2563eb;
    font-size: 2.5em;
    margin-bottom: 10px;
}
.header p {
    color: #6b7280;
    font-size: 1.1em;
}
.section {
    background: white;
    padding: 25px;
    border-radius: 12px;
    margin-bottom: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
}
.section h2 {
    color: #374151;
    margin-bottom: 15px;
    font-size: 1.4em;
}
.input-text {
    min-height: 200px;
    resize: vertical;
}
.output-audio {
    text-align: center;
}
.output-script {
    background: #f8fafc;
    padding: 20px;
    border-radius: 8px;
    border-left: 4px solid #2563eb;
    max-height: 300px;
    overflow-y: auto;
}

.speaker-info {
    background: linear-gradient(135deg, #ffeaa7 0%, #fab1a0 100%);
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
    border: 1px solid #fdcb6e;
    font-weight: bold;
}

.status-message {
    padding: 15px;
    border-radius: 8px;
    font-weight: bold;
    margin: 10px 0;
}

.status-success {
    border: 1px solid #c3e6cb;
    background: #dcfce7;
    color: #166534;
    border-left: 4px solid #22c55e;
}

.status-error {
    border: 1px solid #f5c6cb;
    background: #fee2e2;
    color: #991b1b;
    border-left: 4px solid #ef4444;
}

.status-info {
    border: 1px solid #99d3ff;
    background: #dbeafe;
    color: #1e40af;
    border-left: 4px solid #3b82f6;
}
.instructions {
    background: #f0f9ff;
    padding: 20px;
    border-radius: 8px;
    border-left: 4px solid #0ea5e9;
}
.instructions h3 {
    color: #0369a1;
    margin-bottom: 10px;
}
.btn-generate {
    background: linear-gradient(135deg, #2563eb, #1d4ed8) !important;
    color: white !important;
    font-weight: bold !important;
    padding: 12px 24px !important;
    border-radius: 8px !important;
}
.btn-generate:hover {
    background: linear-gradient(135deg, #1d4ed8, #1e40af) !important;
}
"""
|
440 |
+
|
441 |
+
# ---------------------------------------------------------------------------
# Gradio interface: layout, event wiring, and application entry point.
# Fixes vs. the previous version: the unused enumerate() index in
# get_speaker_info is dropped, and generate_podcast_wrapper returns
# explicit tuples instead of incrementally building a list.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_classes="container"):
        # Page header
        with gr.Column(elem_classes="header"):
            gr.Markdown("# 🎙️ Blog to Podcast Converter")
            gr.Markdown("Transform your text into engaging podcast audio using AI")

        # API configuration
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 🔑 API Configuration")
            api_key = gr.Textbox(
                label="Gemini API Key (Optional)",
                type="password",
                placeholder="Enter your Google Gemini API key for better summaries...",
                info="Get a free key from https://aistudio.google.com/"
            )
            api_status = gr.Textbox(
                label="API Status",
                interactive=False,
                value="ℹ️ Add Gemini API key for AI-powered summaries"
            )
            # Re-validate the key every time it changes.
            api_key.change(init_gemini, inputs=api_key, outputs=api_status)

        # Text input
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 📝 Input Text")
            input_text = gr.Textbox(
                label="Paste your blog post or article text",
                placeholder="Enter your text here... (2000+ characters works best)",
                lines=8,
                elem_classes="input-text"
            )

        # Podcast configuration
        with gr.Column(elem_classes="section"):
            gr.Markdown("## ⚙️ Podcast Configuration")

            with gr.Row():
                speaker_count = gr.Radio(
                    label="Number of Speakers",
                    choices=[1, 2, 3, 4],
                    value=2,
                    info="Choose how many voices/speakers for your podcast"
                )

                use_gemini = gr.Checkbox(
                    label="Use AI for better summaries",
                    value=True,
                    info="Requires valid Gemini API key above"
                )

            tts_engine = gr.Radio(
                label="Voice Engine",
                choices=[
                    "Multi-Speaker (Edge TTS - Best Quality)",
                    "gTTS (Online - Single Voice)",
                    "pyttsx3 (Offline - Single Voice)"
                ],
                value="Multi-Speaker (Edge TTS - Best Quality)",
                info="Edge TTS provides realistic multi-speaker conversations"
            )

        # Generate button
        generate_btn = gr.Button(
            "🎙️ Generate Podcast",
            elem_classes="btn-generate",
            size="lg"
        )

        # Output area (player, download, speaker roster, script)
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 🎧 Generated Podcast")

            status_msg = gr.HTML(
                value="<div class='status-message status-info'>Ready to generate podcast...</div>"
            )

            with gr.Row():
                audio_output = gr.Audio(
                    label="Generated Podcast",
                    type="filepath",
                    visible=False
                )
                download_btn = gr.DownloadButton(
                    "⬇️ Download Podcast",
                    visible=False,
                    variant="secondary"
                )

            speaker_info = gr.HTML(
                value="",
                visible=False
            )

            script_output = gr.Textbox(
                label="Podcast Script",
                visible=False,
                lines=8,
                elem_classes="output-script"
            )

        # Usage instructions
        with gr.Column(elem_classes="instructions"):
            gr.Markdown("### ℹ️ How to Use")
            gr.Markdown("""
1. **Optional**: Enter your Gemini API key for AI-powered conversation generation
2. **Paste your text** in the input box (articles, blogs, etc.)
3. **Choose number of speakers** (1-4) for different conversation styles
4. **Select voice engine**:
   - Multi-Speaker Edge TTS (best quality, realistic voices)
   - gTTS (single voice, good quality)
   - pyttsx3 (offline, system voice)
5. **Click Generate Podcast** and wait for processing
6. **Listen and download** your podcast!

**Speaker Configurations**:
- **1 Speaker**: Solo narration
- **2 Speakers**: Host conversation (Alex & Brian)
- **3 Speakers**: Panel discussion (Sarah, Mike & Emma)
- **4 Speakers**: Full roundtable (Sarah, Mike, Emma & David)

**Tips**:
- For best results, use 500-3000 characters of text
- Multi-speaker works best with Gemini AI enabled
- Edge TTS provides the most realistic conversations
""")

    def get_speaker_info(speaker_count):
        """Return an HTML summary of the voices used for *speaker_count*."""
        if speaker_count == 1:
            return "<div class='speaker-info'><b>Single Speaker Mode</b><br/>Solo narration with one voice</div>"

        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        speakers_html = "<div class='speaker-info'><b>Speakers in this podcast:</b><br/>"
        for config in voice_config:
            speakers_html += f"🎤 <b>{config['name']}</b> ({config['gender']} voice)<br/>"
        speakers_html += "</div>"
        return speakers_html

    def update_status(message, success=True):
        """Wrap *message* in a styled status <div> for the status_msg HTML."""
        status_class = "status-success" if success else "status-error"
        if "Ready" in message or "ℹ️" in message:
            status_class = "status-info"
        return f"<div class='status-message {status_class}'>{message}</div>"

    def generate_podcast_wrapper(text, use_gemini, tts_engine, speaker_count, progress=gr.Progress()):
        """Run create_podcast() and map its result onto the UI components.

        Returns (status_html, audio_path, download_path, speaker_html,
        script_text) matching the outputs list wired to generate_btn.
        """
        audio_data, message, script = create_podcast(text, use_gemini, tts_engine, speaker_count, progress)
        status_html = update_status(message, success=audio_data is not None)

        if audio_data is None:
            return status_html, None, None, "", script

        # Persist the audio so both the player and the download button can
        # reference the same file.
        filename = f"podcast_{speaker_count}speakers_{uuid4().hex[:8]}.wav"
        filepath = os.path.join(tempfile.gettempdir(), filename)
        with open(filepath, 'wb') as f:
            f.write(audio_data)

        return status_html, filepath, filepath, get_speaker_info(speaker_count), script

    # Generate on click.
    generate_btn.click(
        fn=generate_podcast_wrapper,
        inputs=[input_text, use_gemini, tts_engine, speaker_count],
        outputs=[status_msg, audio_output, download_btn, speaker_info, script_output]
    )

    # Refresh the speaker roster preview when the count changes.
    speaker_count.change(
        fn=get_speaker_info,
        inputs=speaker_count,
        outputs=speaker_info
    )

    def toggle_visibility(audio_data):
        """Show the result components only once audio is available."""
        has_audio = audio_data is not None
        return (
            gr.Audio(visible=has_audio),
            gr.DownloadButton(visible=has_audio),
            gr.HTML(visible=has_audio),
            gr.Textbox(visible=has_audio)
        )

    # Reveal/hide the result components whenever the audio value changes.
    audio_output.change(
        fn=toggle_visibility,
        inputs=audio_output,
        outputs=[audio_output, download_btn, speaker_info, script_output]
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
gtts
|
3 |
+
pyttsx3
|
4 |
+
requests
|
5 |
+
# NOTE: "uuid" removed — it is part of the Python standard library; the PyPI package of that name is an obsolete Python 2 shim and can shadow the stdlib module.
|
6 |
+
google-generativeai
|
7 |
+
edge-tts
|
8 |
+
soundfile
|
9 |
+
numpy
|
10 |
+
scipy
|