podcastgen

Sleeping

App Files Files Community

Rausda6 committed on May 22

Commit

4c6c365

verified ·

1 Parent(s): 132e1a9

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -38

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ import mimetypes
 from typing import List
 from PyPDF2 import PdfReader
 # Define model name clearly
 MODEL_NAME = "unsloth/gemma-3-1b-pt"
@@ -39,7 +40,19 @@ class PodcastGenerator:
     async def generate_script(self, prompt: str, language: str, api_key: str, file_obj=None, progress=None):
         example = """
-{...}
         """
         if language == "Auto Detect":
             language_instruction = "- The podcast MUST be in the same language as the user input."
@@ -47,11 +60,18 @@ class PodcastGenerator:
             language_instruction = f"- The podcast MUST be in {language} language"
         system_prompt = f"""
-You are a professional podcast generator...
 {language_instruction}
 Follow this example structure:
 {example}
 """
         if prompt and file_obj:
             user_prompt = f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
         elif prompt:
@@ -59,10 +79,11 @@ Follow this example structure:
         else:
             user_prompt = "Please generate a podcast script based on the uploaded file."
         if file_obj:
             file_size = getattr(file_obj, 'size', os.path.getsize(file_obj.name))
             if file_size > MAX_FILE_SIZE_BYTES:
-                raise Exception("File size exceeds limit.")
             ext = os.path.splitext(file_obj.name)[1].lower()
             if ext == '.pdf':
                 reader = PdfReader(file_obj)
@@ -73,54 +94,147 @@ Follow this example structure:
             user_prompt += f"\n\n―― FILE CONTENT ――\n{text}"
         prompt_text = system_prompt + "\n" + user_prompt
         try:
-            if progress: progress(0.3, "Generating podcast script...")
             def hf_generate(p):
                 inputs = tokenizer(p, return_tensors="pt").to(model.device)
-                outs = model.generate(**inputs, max_new_tokens=1024, do_sample=True, temperature=1.0)
                 return tokenizer.decode(outs[0], skip_special_tokens=True)
-            generated_text = await asyncio.wait_for(asyncio.to_thread(hf_generate, prompt_text), timeout=60)
         except asyncio.TimeoutError:
-            raise Exception("Script generation timed out.")
         except Exception as e:
-            raise Exception(f"Failed to generate script: {e}")
-        if progress: progress(0.4, "Script generated successfully!")
-        return json.loads(generated_text)
-    # ... TTS and combine_audio_files methods unchanged ...
-async def process_input(input_text, input_file, language, speaker1, speaker2, api_key="", progress=None):
-    # Implementation unchanged
-    ...
 # Gradio UI
 with gr.Blocks(title="PodcastGen 🎙️") as demo:
     gr.Markdown("""
-    # PodcastGen 🎙️
-    Generate a 2-speaker podcast from text or PDF!
-    """
-    )
     with gr.Row():
         with gr.Column():
-            input_text = gr.Textbox(...)
-            input_file = gr.File(...)
         with gr.Column():
-            language = gr.Dropdown(...)
-            speaker1 = gr.Dropdown(...)
-            speaker2 = gr.Dropdown(...)
-            api_key = gr.Textbox(...)
     generate_btn = gr.Button("Generate Podcast 🎙️", variant="primary")
-    output_audio = gr.Audio(...)
-    # Bind async function directly
-    generate_btn.click(
-        fn=process_input,
-        inputs=[input_text, input_file, language, speaker1, speaker2, api_key],
-        outputs=output_audio,
-        show_progress=True
-    )
-demo.queue()
-demo.launch(server_name="0.0.0.0", share=True, debug=True)

 from typing import List
 from PyPDF2 import PdfReader
+from pydub import AudioSegment
 # Define model name clearly
 MODEL_NAME = "unsloth/gemma-3-1b-pt"
     async def generate_script(self, prompt: str, language: str, api_key: str, file_obj=None, progress=None):
         example = """
+{
+    "topic": "AGI",
+    "podcast": [
+        {"speaker": 2, "line": "So, AGI, huh? Seems like everyone's talking about it these days."},
+        {"speaker": 1, "line": "Yeah, it's definitely having a moment, isn't it?"},
+        {"speaker": 2, "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"},
+        {"speaker": 1, "line": "It's easy to get lost in the noise, for sure."},
+        {"speaker": 2, "line": "Exactly. So how about we try to cut through some of that, shall we?"},
+        {"speaker": 1, "line": "Sounds like a plan."},
+        {"speaker": 2, "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."},
+        {"speaker": 1, "line": "Peace."}
+    ]
+}
         """
         if language == "Auto Detect":
             language_instruction = "- The podcast MUST be in the same language as the user input."
             language_instruction = f"- The podcast MUST be in {language} language"
         system_prompt = f"""
+You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
 {language_instruction}
+- The podcast should have 2 speakers.
+- The podcast should be long.
+- Do not use names for the speakers.
+- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
+- The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
+- The script must be in JSON format.
 Follow this example structure:
 {example}
 """
+        # Build the user prompt
         if prompt and file_obj:
             user_prompt = f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
         elif prompt:
         else:
             user_prompt = "Please generate a podcast script based on the uploaded file."
+        # Handle file content
         if file_obj:
             file_size = getattr(file_obj, 'size', os.path.getsize(file_obj.name))
             if file_size > MAX_FILE_SIZE_BYTES:
+                raise Exception(f"File size exceeds the {MAX_FILE_SIZE_MB}MB limit. Please upload a smaller file.")
             ext = os.path.splitext(file_obj.name)[1].lower()
             if ext == '.pdf':
                 reader = PdfReader(file_obj)
             user_prompt += f"\n\n―― FILE CONTENT ――\n{text}"
         prompt_text = system_prompt + "\n" + user_prompt
         try:
+            if progress:
+                progress(0.3, "Generating podcast script...")
             def hf_generate(p):
                 inputs = tokenizer(p, return_tensors="pt").to(model.device)
+                outs = model.generate(
+                    **inputs,
+                    max_new_tokens=1024,
+                    do_sample=True,
+                    temperature=1.0
+                )
                 return tokenizer.decode(outs[0], skip_special_tokens=True)
+            generated_text = await asyncio.wait_for(
+                asyncio.to_thread(hf_generate, prompt_text),
+                timeout=60
+            )
         except asyncio.TimeoutError:
+            raise Exception("The script generation request timed out. Please try again later.")
         except Exception as e:
+            raise Exception(f"Failed to generate podcast script: {e}")
+        if progress:
+            progress(0.4, "Script generated successfully!")
+        return json.loads(generated_text)
+    async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
+        """Synthesize one script line to a temporary audio file and return its path.
+
+        Args:
+            text: The line of dialogue to speak.
+            speaker: Speaker number from the script; 1 selects ``speaker1``,
+                any other value selects ``speaker2``.
+            speaker1: Voice identifier passed to edge_tts for speaker 1.
+            speaker2: Voice identifier passed to edge_tts for the other speaker.
+
+        Returns:
+            Path of the newly written temporary file.
+
+        Raises:
+            Exception: If synthesis does not finish within 30 seconds.
+            Any exception raised by edge_tts is re-raised unchanged.
+        """
+        voice = speaker1 if speaker == 1 else speaker2
+        speech = edge_tts.Communicate(text, voice)
+        # NOTE(review): the file is named .wav; confirm the encoding edge_tts actually writes.
+        temp_filename = f"temp_{uuid.uuid4()}.wav"
+        try:
+            await asyncio.wait_for(speech.save(temp_filename), timeout=30)
+        except BaseException as exc:
+            # One cleanup path for failures, timeouts and task cancellation, so a
+            # partially written file is never left behind.
+            if os.path.exists(temp_filename):
+                os.remove(temp_filename)
+            if isinstance(exc, asyncio.TimeoutError):
+                raise Exception("Text-to-speech generation timed out. Please try with a shorter text.") from exc
+            # Bare raise keeps the original exception and traceback intact.
+            raise
+        return temp_filename
+    async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
+        """Concatenate the given audio segments into one WAV file and return its path.
+
+        The input segment files are deleted whether or not combining succeeds.
+
+        Args:
+            audio_files: Paths of the segments, in playback order.
+            progress: Optional callable invoked as ``progress(fraction, message)``.
+
+        Returns:
+            Path of the exported ``output_<uuid>.wav`` file.
+        """
+        if progress:
+            progress(0.9, "Combining audio files...")
+        combined_audio = AudioSegment.empty()
+        try:
+            for audio_file in audio_files:
+                combined_audio += AudioSegment.from_file(audio_file)
+        finally:
+            # Delete every segment here rather than one by one inside the loop,
+            # so a decode failure on one segment does not leak the remaining ones.
+            for audio_file in audio_files:
+                if os.path.exists(audio_file):
+                    os.remove(audio_file)
+        output_filename = f"output_{uuid.uuid4()}.wav"
+        combined_audio.export(output_filename, format="wav")
+        if progress:
+            progress(1.0, "Podcast generated successfully!")
+        return output_filename
+    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
+        """Run the whole podcast pipeline under a 600-second overall deadline.
+
+        Delegates to ``_generate_podcast_internal`` and returns the path it
+        produces. A timeout is reported with a dedicated message; any other
+        failure is re-raised as ``Exception`` prefixed with
+        "Error generating podcast:".
+        """
+        try:
+            if progress:
+                progress(0.1, "Starting podcast generation...")
+            pipeline = self._generate_podcast_internal(
+                input_text, language, speaker1, speaker2, api_key, file_obj, progress
+            )
+            audio_path = await asyncio.wait_for(pipeline, timeout=600)
+        except asyncio.TimeoutError:
+            raise Exception("The podcast generation process timed out. Please try with shorter text or try again later.")
+        except Exception as e:
+            raise Exception(f"Error generating podcast: {str(e)}")
+        return audio_path
+    async def _generate_podcast_internal(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
+        """Generate the script, synthesize every line, and combine the audio.
+
+        Script lines are synthesized concurrently in batches of 10. If any
+        synthesis in a batch fails, every temporary file produced so far is
+        deleted before the error is raised, including files from the failing
+        batch itself.
+
+        Returns:
+            Path of the combined audio file from ``combine_audio_files``.
+
+        Raises:
+            Exception: Prefixed with "Error in batch TTS generation:" when a
+                batch fails.
+        """
+        if progress:
+            progress(0.2, "Generating podcast script...")
+        podcast_json = await self.generate_script(input_text, language, api_key, file_obj, progress)
+        if progress:
+            progress(0.5, "Converting text to speech...")
+        audio_files = []
+        script_lines = podcast_json['podcast']
+        total_lines = len(script_lines)
+        batch_size = 10
+        for batch_start in range(0, total_lines, batch_size):
+            batch_end = min(batch_start + batch_size, total_lines)
+            batch = script_lines[batch_start:batch_end]
+            tts_tasks = [self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in batch]
+            try:
+                batch_results = await asyncio.gather(*tts_tasks, return_exceptions=True)
+                # Track every file this batch produced BEFORE looking for failures,
+                # so the cleanup below also removes the successful siblings of a
+                # failed task. gather() preserves task order, so playback order holds.
+                audio_files.extend(
+                    result for result in batch_results if not isinstance(result, BaseException)
+                )
+                for result in batch_results:
+                    # BaseException also covers CancelledError results.
+                    if isinstance(result, BaseException):
+                        raise Exception(f"Error generating speech: {str(result)}")
+                if progress:
+                    progress(0.5 + (0.4 * (batch_end / total_lines)), f"Processed {batch_end}/{total_lines} speech segments...")
+            except Exception as e:
+                # Single cleanup point for any failure in this batch.
+                for file in audio_files:
+                    if os.path.exists(file):
+                        os.remove(file)
+                raise Exception(f"Error in batch TTS generation: {str(e)}")
+        combined = await self.combine_audio_files(audio_files, progress)
+        return combined
+async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "", progress=None) -> str:
+    """UI entry point: map the chosen voices and generate the podcast.
+
+    Args:
+        input_text: Topic or source text typed by the user.
+        input_file: Optional uploaded file, forwarded to the generator.
+        language: Language selection, forwarded to the generator.
+        speaker1: Display name of the first voice; must be a key of ``voice_names``.
+        speaker2: Display name of the second voice; must be a key of ``voice_names``.
+        api_key: Optional key; a placeholder is substituted when empty.
+        progress: Optional callable invoked as ``progress(fraction, message)``.
+
+    Returns:
+        Path of the generated audio file.
+
+    Raises:
+        KeyError: If a speaker display name is not in ``voice_names``.
+        Exception: With a user-facing message for rate-limit, timeout, or
+            other failures.
+    """
+    start_time = time.time()
+    voice_names = {
+        "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
+        "Ava - English (United States)": "en-US-AvaMultilingualNeural",
+        "Brian - English (United States)": "en-US-BrianMultilingualNeural",
+        "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
+        "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
+        "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
+        "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
+        "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural"
+    }
+    speaker1 = voice_names[speaker1]
+    speaker2 = voice_names[speaker2]
+    try:
+        if progress:
+            progress(0.05, "Processing input...")
+        if not api_key:
+            # Placeholder so an empty key is never forwarded. The former
+            # "No API key provided" check right after this assignment could
+            # never fire and has been removed.
+            # NOTE(review): the visible generation path does not appear to use api_key - confirm.
+            api_key = "saf"
+        generator = PodcastGenerator()
+        output = await generator.generate_podcast(
+            input_text, language, speaker1, speaker2, api_key, input_file, progress
+        )
+        print(f"Total podcast generation time: {time.time() - start_time:.2f} seconds")
+        return output
+    except Exception as e:
+        msg = str(e)
+        lowered = msg.lower()
+        if "rate limit" in lowered:
+            raise Exception("Rate limit exceeded. Please try again later or use your own API key.") from e
+        # The pipeline's own timeout messages say "timed out", which the
+        # plain "timeout" substring check never matched.
+        elif "timeout" in lowered or "timed out" in lowered:
+            raise Exception("The request timed out... Please try with shorter text.") from e
+        else:
+            raise Exception(f"Error: {msg}") from e
 # Gradio UI
 with gr.Blocks(title="PodcastGen 🎙️") as demo:
     gr.Markdown("""
+# PodcastGen 🎙️
+Generate a 2-speaker podcast from text or PDF!
+""" )
     with gr.Row():
         with gr.Column():
+            input_text = gr.Textbox(label="Input Text", lines=10, placeholder="Enter podcast topic or paste text here...", elem_id="input_text")
+            input_file = gr.File(label="Or Upload a PDF or TXT file", file_types=[".pdf", ".txt"] )
         with gr.Column():
+            language = gr.Dropdown(label="Podcast Language", choices=["Auto Detect","English","German","French","Spanish","Italian","Dutch","Portuguese","Russian","Chinese","Japanese","Korean","Other" ], value="Auto Detect")
+            speaker1 = gr.Dropdown(label="Speaker 1 Voice", choices=["Andrew - English (United States)","Ava - English (United States)","Brian - English (United States)","Emma - English (United States)","Florian - German (Germany)","Seraphina - German (Germany)","Remy - French (France)","Vivienne - French (France)" ], value="Andrew - English (United States)")
+            speaker2 = gr.Dropdown(label="Speaker 2 Voice", choices=["Andrew - English (United States)","Ava - English (United States)","Brian - English (United States)","Emma - English (United States)","Florian - German (Germany)","Seraphina - German (Germany)","Remy - French (France)","Vivienne - French (France)" ], value="Ava - English (United States)")
+            api_key = gr.Textbox(label="Gemini API Key (Optional)", type="password", placeholder="Needed only if you're getting rate limited.")
     generate_btn = gr.Button("Generate Podcast 🎙️", variant="primary")
+    output_audio = gr.Audio(label="Generated Podcast", type="filepath", format="wav", elem_id="output_audio")
+    generate_btn.click(fn=process_input, inputs=[input_text, input_file, language, speaker1, speaker2, api_key], outputs=output_audio, show_progress=True)
+# Enable request queuing, then start the server on all interfaces.
+# These run at module level, so they must not be indented.
+demo.queue()
+demo.launch(server_name="0.0.0.0", share=True, debug=True)