import gradio as gr
from pydub import AudioSegment
import json
import uuid
import edge_tts
import asyncio
import aiofiles
import os
import time
import mimetypes
from typing import List, Dict

# NEW – Hugging Face Transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# NEW – external model id
MODEL_ID = "tabularisai/german-gemma-3-1b-it"

# Constants
MAX_FILE_SIZE_MB = 20
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes


class PodcastGenerator:
    """Generate two-speaker podcast scripts with a local causal language model."""

    def __init__(self):
        """Load the tokenizer and model identified by ``MODEL_ID``.

        The model is loaded in bfloat16 when CUDA is available and in float32
        otherwise, placed with ``device_map="auto"``, and switched to
        evaluation mode.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        self.model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
        ).eval()

    async def generate_script(
        self,
        prompt: str,
        language: str,
        api_key: str,
        file_obj=None,
        progress=None,
    ) -> Dict:
        """Generate a podcast script and return it as parsed JSON.

        Args:
            prompt: Free-form user text the podcast should be based on. May be
                empty when only a file is supplied.
            language: Target language name, or ``"Auto Detect"`` to ask the
                model to follow the language of the user input.
            api_key: Not used by this method; kept so existing callers that
                pass it keep working.
            file_obj: Optional uploaded file (file-like object or path). It is
                read so the size limit is enforced.
            progress: Optional callable invoked as ``progress(fraction, text)``
                to report status.

        Returns:
            The decoded JSON document produced by the model.

        Raises:
            Exception: If the file exceeds the size limit or generation fails.
            json.JSONDecodeError: If no JSON document can be parsed from the
                model output.
        """
        example = """
{
    "topic": "AGI",
    "podcast": [
        {
            "speaker": 2,
            "line": "So, AGI, huh? Seems like everyone's talking about it these days."
        },
        {
            "speaker": 1,
            "line": "Yeah, it's definitely having a moment, isn't it?"
        }
    ]
}
        """

        if language == "Auto Detect":
            language_instruction = (
                "- The podcast MUST be in the same language as the user input."
            )
        else:
            language_instruction = f"- The podcast MUST be in {language} language"

        system_prompt = f"""
You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
{language_instruction}
- The podcast should have 2 speakers.
- The podcast should be long.
- Do not use names for the speakers.
- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
- The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
- The script must be in JSON format.
Follow this example structure:
{example}
"""

        if prompt and file_obj:
            user_prompt = (
                f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
            )
        elif prompt:
            user_prompt = (
                f"Please generate a podcast script based on the following user input:\n{prompt}"
            )
        else:
            user_prompt = "Please generate a podcast script based on the uploaded file."

        # If a file is provided we still read it for completeness (not required for HF generation).
        # NOTE(review): the bytes are discarded, so the file only goes through
        # the size check here; its content never reaches the model.
        if file_obj:
            _ = await self._read_file_bytes(file_obj)

        if progress:
            progress(0.3, "Generating podcast script...")

        inputs = self.tokenizer(
            f"{system_prompt}\n\n{user_prompt}", return_tensors="pt"
        ).to(self.model.device)
        # generate() returns the prompt tokens followed by the new tokens for a
        # causal LM; remember where the prompt ends so only the completion is
        # decoded and parsed.
        prompt_length = inputs["input_ids"].shape[-1]

        try:
            output = self.model.generate(**inputs, max_new_tokens=2048, temperature=1.0)
            response_text = self.tokenizer.decode(
                output[0][prompt_length:], skip_special_tokens=True
            )
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Failed to generate podcast script: {e}") from e

        print(f"Generated podcast script:\n{response_text}")

        if progress:
            progress(0.4, "Script generated successfully!")

        return self._extract_json(response_text)

    @staticmethod
    def _extract_json(text: str) -> Dict:
        """Parse *text* as JSON, tolerating prose or code fences around it.

        A strict ``json.loads`` is attempted first. If that fails, the text is
        scanned for the first ``{`` from which a complete JSON object can be
        decoded.

        Raises:
            json.JSONDecodeError: The error from the strict parse, when no
                JSON object can be recovered from *text*.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError as strict_error:
            decoder = json.JSONDecoder()
            start = text.find("{")
            while start != -1:
                try:
                    parsed, _ = decoder.raw_decode(text, start)
                except json.JSONDecodeError:
                    # Not a decodable object here; try the next opening brace.
                    start = text.find("{", start + 1)
                else:
                    return parsed
            raise strict_error

    async def _read_file_bytes(self, file_obj) -> bytes:
        """Return the content of *file_obj* after enforcing the size limit.

        Args:
            file_obj: A filesystem path (``str`` or ``os.PathLike``), or an
                object exposing ``size`` and/or ``read`` and/or ``name``.

        Returns:
            Whatever ``file_obj.read()`` returns when the object is readable,
            otherwise the bytes read from the path.

        Raises:
            Exception: If the file is larger than ``MAX_FILE_SIZE_MB``.
        """
        is_path = isinstance(file_obj, (str, os.PathLike))

        if hasattr(file_obj, "size"):
            file_size = file_obj.size
        else:
            file_size = os.path.getsize(file_obj if is_path else file_obj.name)

        if file_size > MAX_FILE_SIZE_BYTES:
            raise Exception(
                f"File size exceeds the {MAX_FILE_SIZE_MB}MB limit. Please upload a smaller file."
            )

        if hasattr(file_obj, "read"):
            return file_obj.read()

        async with aiofiles.open(file_obj if is_path else file_obj.name, "rb") as f:
            return await f.read()

    @staticmethod
    def _get_mime_type(filename: str) -> str:
        """Return the MIME type for *filename* based on its extension.

        ``.pdf`` and ``.txt`` are mapped explicitly; any other extension is
        looked up with ``mimetypes.guess_type`` and falls back to
        ``application/octet-stream``.
        """
        ext = os.path.splitext(filename)[1].lower()
        if ext == ".pdf":
            return "application/pdf"
        elif ext == ".txt":
            return "text/plain"
        else:
            mime_type, _ = mimetypes.guess_type(filename)
            return mime_type or "application/octet-stream"