podcastgen

Sleeping

App Files Files Community

Rausda6 committed on May 21

Commit

4211f84

verified ·

1 Parent(s): 428399e

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -184

app.py CHANGED Viewed

@@ -10,211 +10,133 @@ import uuid
 import json
 from typing import List, Dict
-from transformers import pipeline
 import torch
-# Configuration: local model path or remote repo ID and HF token secret
-LOCAL_MODEL_PATH = os.getenv("GEMMA_MODEL_PATH")  # e.g. "./models/gemma-2-l6b"
-REMOTE_MODEL_ID = "meta-llama/Llama-3.1-8B"
-HF_TOKEN = os.getenv("Tokentest")  # your secret token name
-# Determine model source and auth
-if LOCAL_MODEL_PATH and os.path.isdir(LOCAL_MODEL_PATH):
-    model_source = LOCAL_MODEL_PATH
-    auth_token = None
-else:
-    model_source = REMOTE_MODEL_ID
-    auth_token = HF_TOKEN or os.getenv("HUGGINGFACE_HUB_TOKEN")
-# Initialize Gemma text-generation pipeline
-_device = 0 if torch.cuda.is_available() else -1
-pipeline_kwargs = {
-    "model": model_source,
-    "device": _device,
-    "torch_dtype": "auto"
-}
-if auth_token:
-    pipeline_kwargs["use_auth_token"] = auth_token
-text_generator = pipeline(
-    "text-generation",
-    **pipeline_kwargs
 )
-# Constants
-MAX_FILE_SIZE_MB = 20
-MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
 class PodcastGenerator:
     def __init__(self):
         pass
     async def generate_script(self, prompt: str, language: str, file_obj=None, progress=None) -> Dict:
-        example = """
-        {
-          "topic": "AGI",
-          "podcast": [ ... ]
-        }
-        """
-        if language == "Auto Detect":
-            language_instruction = "- The podcast MUST be in the same language as the user input."
-        else:
-            language_instruction = f"- The podcast MUST be in {language} language"
-        system_prompt = f"""
-        You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
-        {language_instruction}
-        - The podcast should have 2 speakers.
-        - The podcast should be long.
-        - Do not use names for the speakers.
-        - The podcast should be interesting, lively, and engaging, and hook the listener from the start.
-        - The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
-        - The script must be in JSON format.
-        Follow this example structure:
-        {example}
-        """
         if prompt and file_obj:
-            user_prompt = f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
         elif prompt:
-            user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
         else:
-            user_prompt = "Please generate a podcast script based on the uploaded file."
         full_prompt = system_prompt + "\n\n" + user_prompt
         loop = asyncio.get_event_loop()
-        result = await loop.run_in_executor(
-            None,
-            lambda: text_generator(full_prompt, max_new_tokens=512, do_sample=True)
-        )
-        gen_text = result[0]["generated_text"]
-        return json.loads(gen_text)
     async def _read_file_bytes(self, file_obj) -> bytes:
-        if hasattr(file_obj, 'size'):
-            file_size = file_obj.size
-        else:
-            file_size = os.path.getsize(file_obj.name)
-        if file_size > MAX_FILE_SIZE_BYTES:
-            raise Exception(f"File size exceeds the {MAX_FILE_SIZE_MB}MB limit. Please upload a smaller file.")
-        if hasattr(file_obj, 'read'):
-            return file_obj.read()
-        else:
-            async with aiofiles.open(file_obj.name, 'rb') as f:
-                return await f.read()
-    def _get_mime_type(self, filename: str) -> str:
-        ext = os.path.splitext(filename)[1].lower()
-        if ext == '.pdf':
-            return "application/pdf"
-        elif ext == '.txt':
-            return "text/plain"
-        mime_type, _ = mimetypes.guess_type(filename)
-        return mime_type or "application/octet-stream"
-    async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
-        voice = speaker1 if speaker == 1 else speaker2
-        speech = edge_tts.Communicate(text, voice)
-        temp_filename = f"temp_{uuid.uuid4()}.wav"
-        try:
-            await asyncio.wait_for(speech.save(temp_filename), timeout=30)
-            return temp_filename
-        except Exception:
-            if os.path.exists(temp_filename):
-                os.remove(temp_filename)
-            raise
-    async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
-        if progress:
-            progress(0.9, "Combining audio files...")
-        combined_audio = AudioSegment.empty()
-        for audio_file in audio_files:
-            combined_audio += AudioSegment.from_file(audio_file)
-            os.remove(audio_file)
-        output_filename = f"output_{uuid.uuid4()}.wav"
-        combined_audio.export(output_filename, format="wav")
-        if progress:
-            progress(1.0, "Podcast generated successfully!")
-        return output_filename
-    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, file_obj=None, progress=None) -> str:
-        return await asyncio.wait_for(
-            self._generate_podcast_internal(input_text, language, speaker1, speaker2, file_obj, progress),
-            timeout=600
-        )
-    async def _generate_podcast_internal(self, input_text: str, language: str, speaker1: str, speaker2: str, file_obj=None, progress=None) -> str:
-        if progress:
-            progress(0.2, "Generating podcast script...")
-        podcast_json = await self.generate_script(input_text, language, file_obj, progress)
-        if progress:
-            progress(0.5, "Converting text to speech...")
         audio_files = []
-        total_lines = len(podcast_json['podcast'])
-        batch_size = 10
-        for batch_start in range(0, total_lines, batch_size):
-            batch = podcast_json['podcast'][batch_start:batch_start+batch_size]
-            tasks = [self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in batch]
-            results = await asyncio.gather(*tasks)
-            audio_files.extend(results)
-            if progress:
-                progress(0.5 + 0.4 * ((batch_start+len(batch)) / total_lines), f"Processed {batch_start+len(batch)}/{total_lines} segments...")
-        combined = await self.combine_audio_files(audio_files, progress)
-        return combined
-async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, progress=None) -> str:
-    generator = PodcastGenerator()
-    return await generator.generate_podcast(input_text, language, speaker1, speaker2, input_file, progress)
 # Gradio UI
-def generate_podcast_gradio(input_text, input_file, language, speaker1, speaker2, progress=gr.Progress()):
-    def progress_callback(val, msg):
-        progress(val, msg)
-    result = asyncio.run(process_input(
-        input_text,
-        input_file,
-        language,
-        speaker1,
-        speaker2,
-        progress_callback
-    ))
-    return result
-def main():
-    language_options = ["Auto Detect", "English", "German", "French"]
-    voice_options = [
-        "Andrew - English (United States)",
-        "Ava - English (United States)",
-        "Brian - English (United States)",
-        "Emma - English (United States)",
-        "Florian - German (Germany)",
-        "Seraphina - German (Germany)",
-        "Remy - French (France)",
-        "Vivienne - French (France)"
-    ]
-    with gr.Blocks(title="PodcastGen 🎙️") as demo:
-        gr.Markdown("# PodcastGen 🎙️")
-        gr.Markdown("Generate a 2-speaker podcast from text input or documents!")
-        with gr.Row():
-            input_text = gr.Textbox(label="Input Text", lines=10)
-            input_file = gr.File(label="Or Upload a PDF or TXT file", file_types=[".pdf", ".txt"])
-        with gr.Row():
-            language = gr.Dropdown(label="Language", choices=language_options, value="Auto Detect")
-            speaker1 = gr.Dropdown(label="Speaker 1 Voice", choices=voice_options, value="Andrew - English (United States)")
-            speaker2 = gr.Dropdown(label="Speaker 2 Voice", choices=voice_options, value="Ava - English (United States)")
-        generate_btn = gr.Button("Generate Podcast", variant="primary")
-        output_audio = gr.Audio(label="Generated Podcast", type="filepath", format="wav")
-        generate_btn.click(
-            fn=generate_podcast_gradio,
-            inputs=[input_text, input_file, language, speaker1, speaker2],
-            outputs=[output_audio]
         )
     demo.launch()
-if __name__ == "__main__":
-    main()

 import json
 from typing import List, Dict
+# Model imports
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
+# Configuration
+# Use this MODEL_ID, adjust if you have a local path instead
+MODEL_ID = os.getenv("GEMMA_MODEL_PATH", "tabularisai/german-gemma-3-1b-it")
+# Hugging Face token secret (optional, for gated/private models)
+HF_TOKEN = os.getenv("Tokentest")
+# Load tokenizer and model
+print(f"Loading model {MODEL_ID}...")
+tokenizer = AutoTokenizer.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    use_auth_token=HF_TOKEN
 )
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    use_auth_token=HF_TOKEN,
+    torch_dtype=(torch.bfloat16 if torch.cuda.is_available() else torch.float32),
+    device_map="auto"
+).eval()
+# Optional: set up a simple stopping criteria on <end_of_turn> token
+PAD = tokenizer.pad_token_id or tokenizer.eos_token_id
+EOT = tokenizer.convert_tokens_to_ids('<end_of_turn>')
 class PodcastGenerator:
+    MAX_FILE_MB = 20
+    MAX_FILE_BYTES = MAX_FILE_MB * 1024 * 1024
     def __init__(self):
         pass
     async def generate_script(self, prompt: str, language: str, file_obj=None, progress=None) -> Dict:
+        example = '{"topic": "AGI", "podcast": [ ... ] }'
+        lang_inst = (
+            "- The podcast MUST be in the same language as the user input."
+            if language == "Auto Detect"
+            else f"- The podcast MUST be in {language} language"
+        )
+        system_prompt = (
+            "You are a professional podcast generator. Your task is to generate a professional podcast script..."
+            f"\n{lang_inst}\n- The podcast should have 2 speakers.\n- The podcast should be long."
+            "\n- Do not use names for the speakers.\n- The podcast should be interesting, lively, and engaging..."
+            "\n- The script must be in JSON format. Follow this example structure:" + example
+        )
         if prompt and file_obj:
+            user_prompt = f"Generate podcast script based on file and prompt:\n{prompt}"
         elif prompt:
+            user_prompt = f"Generate podcast script based on prompt:\n{prompt}"
         else:
+            user_prompt = "Generate podcast script based on uploaded file."
         full_prompt = system_prompt + "\n\n" + user_prompt
+        # sync generation in executor
+        def gen_sync():
+            inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
+            # add stopping criteria
+            stop_crit = StoppingCriteriaList([StoppingCriteria(max_length=512)])
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                pad_token_id=PAD,
+                eos_token_id=EOT
+            )
+            return tokenizer.decode(outputs[0], skip_special_tokens=True)
         loop = asyncio.get_event_loop()
+        text = await loop.run_in_executor(None, gen_sync)
+        return json.loads(text)
     async def _read_file_bytes(self, file_obj) -> bytes:
+        size = getattr(file_obj, 'size', os.path.getsize(file_obj.name))
+        if size > self.MAX_FILE_BYTES:
+            raise Exception(f"File > {self.MAX_FILE_MB}MB")
+        return file_obj.read() if hasattr(file_obj, 'read') else await aiofiles.open(file_obj.name, 'rb').read()
+    async def tts_generate(self, text: str, speaker: int, s1: str, s2: str) -> str:
+        voice = s1 if speaker == 1 else s2
+        speech = edge_tts.Communicate(text, voice)
+        fname = f"tmp_{uuid.uuid4()}.wav"
+        await speech.save(fname)
+        return fname
+    async def combine_audio_files(self, files: List[str], progress=None) -> str:
+        combined = AudioSegment.empty()
+        for f in files:
+            combined += AudioSegment.from_file(f)
+            os.remove(f)
+        out = f"out_{uuid.uuid4()}.wav"
+        combined.export(out, format="wav")
+        return out
+    async def generate_podcast(self, text: str, lang: str, sp1: str, sp2: str, file_obj=None, progress=None) -> str:
+        pj = await self.generate_script(text, lang, file_obj, progress)
+        parts = pj['podcast']
         audio_files = []
+        for seg in parts:
+            audio_files.append(await self.tts_generate(seg['line'], seg['speaker'], sp1, sp2))
+        return await self.combine_audio_files(audio_files)
 # Gradio UI
+def run_app():
+    langs = ["Auto Detect","German","English","French"]
+    voices = ["Florian - German (Germany)", "Andrew - English (US)"]
+    gen = PodcastGenerator()
+    with gr.Blocks() as demo:
+        inp = gr.Textbox(label="Input Text")
+        file_u = gr.File(label="Upload PDF/TXT")
+        lang_dd = gr.Dropdown(langs, value="Auto Detect", label="Language")
+        sp1 = gr.Dropdown(voices, value=voices[0], label="Speaker 1")
+        sp2 = gr.Dropdown(voices, value=voices[1], label="Speaker 2")
+        out = gr.Audio(label="Podcast", type="filepath")
+        btn = gr.Button("Generate")
+        btn.click(
+            lambda t,f,l,a,b: asyncio.run(gen.generate_podcast(t,l,a,b,f)),
+            inputs=[inp, file_u, lang_dd, sp1, sp2],
+            outputs=[out]
         )
     demo.launch()
+if __name__ == '__main__':
+    run_app()