import gradio as gr import base64 import io import os from openai import OpenAI import PyPDF2 import speech_recognition as sr import tempfile from pydub import AudioSegment from typing import List, Tuple, Optional class MultimodalChatbot: def __init__(self, api_key: str): self.client = OpenAI( base_url="https://openrouter.ai/api/v1", api_key=api_key, ) self.model = "google/gemma-3n-e2b-it:free" self.conversation_history = [] def extract_pdf_text(self, pdf_file) -> str: """Extract text from PDF file""" try: if hasattr(pdf_file, 'name'): pdf_path = pdf_file.name else: pdf_path = pdf_file text = "" with open(pdf_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) for page_num, page in enumerate(pdf_reader.pages): page_text = page.extract_text() if page_text.strip(): text += f"Page {page_num + 1}:\n{page_text}\n\n" return text.strip() if text.strip() else "No text could be extracted from this PDF." except Exception as e: return f"Error extracting PDF: {str(e)}" def convert_audio_to_wav(self, audio_file) -> str: """Convert audio file to WAV format for speech recognition""" try: if hasattr(audio_file, 'name'): audio_path = audio_file.name else: audio_path = audio_file file_ext = os.path.splitext(audio_path)[1].lower() if file_ext == '.wav': return audio_path audio = AudioSegment.from_file(audio_path) wav_path = tempfile.mktemp(suffix='.wav') audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"]) return wav_path except Exception as e: raise Exception(f"Error converting audio: {str(e)}") def transcribe_audio(self, audio_file) -> str: """Transcribe audio file to text""" try: recognizer = sr.Recognizer() wav_path = self.convert_audio_to_wav(audio_file) with sr.AudioFile(wav_path) as source: recognizer.adjust_for_ambient_noise(source, duration=0.2) audio_data = recognizer.record(source) try: text = recognizer.recognize_google(audio_data) return text except sr.UnknownValueError: return "Could not understand the audio. Please try with clearer audio." except sr.RequestError as e: try: text = recognizer.recognize_sphinx(audio_data) return text except: return f"Speech recognition service error: {str(e)}" except Exception as e: return f"Error transcribing audio: {str(e)}" def create_multimodal_message(self, text_input: str = "", pdf_file=None, audio_file=None) -> dict: """Create a multimodal message for the API""" content_parts = [] processing_info = [] if text_input: content_parts.append({"type": "text", "text": text_input}) if pdf_file is not None: pdf_text = self.extract_pdf_text(pdf_file) content_parts.append({ "type": "text", "text": f"PDF Content:\n{pdf_text}" }) processing_info.append("šŸ“„ PDF processed") if audio_file is not None: audio_text = self.transcribe_audio(audio_file) content_parts.append({ "type": "text", "text": f"Audio Transcription:\n{audio_text}" }) processing_info.append("šŸŽ¤ Audio transcribed") return {"role": "user", "content": content_parts}, processing_info def chat(self, text_input: str = "", pdf_file=None, audio_file=None, history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]: """Main chat function""" if history is None: history = [] try: user_message_parts = [] if text_input: user_message_parts.append(f"Text: {text_input}") if pdf_file: user_message_parts.append("šŸ“„ PDF uploaded") if audio_file: user_message_parts.append("šŸŽ¤ Audio uploaded") user_display = " | ".join(user_message_parts) user_message, processing_info = self.create_multimodal_message( text_input, pdf_file, audio_file ) if processing_info: user_display += f"\n{' | '.join(processing_info)}" messages = [user_message] completion = self.client.chat.completions.create( extra_headers={ "HTTP-Referer": "https://multimodal-chatbot.local", "X-Title": "Multimodal Chatbot", }, model=self.model, messages=messages, max_tokens=2048, temperature=0.7 ) bot_response = completion.choices[0].message.content history.append((user_display, bot_response)) return history, "" except Exception as e: error_msg = f"Error: {str(e)}" history.append((user_display if 'user_display' in locals() else "Error in input", error_msg)) return history, "" def create_interface(): """Create the Gradio interface""" with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # šŸ¤– Multimodal Chatbot with Gemma 3n This chatbot can process multiple types of input: - **Text**: Regular text messages - **PDF**: Extract and analyze document content - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC) **Setup**: Enter your OpenRouter API key below to get started """) with gr.Row(): with gr.Column(): api_key_input = gr.Textbox( label="šŸ”‘ OpenRouter API Key", placeholder="Enter your OpenRouter API key here...", type="password", info="Your API key is not stored and only used for this session" ) api_status = gr.Textbox( label="Connection Status", value="āŒ API Key not provided", interactive=False ) with gr.Tabs(): with gr.TabItem("šŸ’¬ Text Chat"): with gr.Row(): with gr.Column(scale=1): text_input = gr.Textbox( label="šŸ’¬ Text Input", placeholder="Type your message here...", lines=5 ) text_submit_btn = gr.Button("šŸš€ Send", variant="primary", size="lg", interactive=False) text_clear_btn = gr.Button("šŸ—‘ļø Clear", variant="secondary") with gr.Column(scale=2): text_chatbot = gr.Chatbot( label="Text Chat History", height=600, bubble_full_width=False, show_copy_button=True ) with gr.TabItem("šŸ“„ PDF Chat"): with gr.Row(): with gr.Column(scale=1): pdf_input = gr.File( label="šŸ“„ PDF Upload", file_types=[".pdf"], type="filepath" ) pdf_text_input = gr.Textbox( label="šŸ’¬ Question about PDF", placeholder="Ask something about the PDF...", lines=3 ) pdf_submit_btn = gr.Button("šŸš€ Send", variant="primary", size="lg", interactive=False) pdf_clear_btn = gr.Button("šŸ—‘ļø Clear", variant="secondary") with gr.Column(scale=2): pdf_chatbot = gr.Chatbot( label="PDF Chat History", height=600, bubble_full_width=False, show_copy_button=True ) with gr.TabItem("šŸŽ¤ Audio Chat"): with gr.Row(): with gr.Column(scale=1): audio_input = gr.File( label="šŸŽ¤ Audio Upload", file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"], type="filepath" ) audio_text_input = gr.Textbox( label="šŸ’¬ Question about Audio", placeholder="Ask something about the audio...", lines=3 ) audio_submit_btn = gr.Button("šŸš€ Send", variant="primary", size="lg", interactive=False) audio_clear_btn = gr.Button("šŸ—‘ļø Clear", variant="secondary") with gr.Column(scale=2): audio_chatbot = gr.Chatbot( label="Audio Chat History", height=600, bubble_full_width=False, show_copy_button=True ) with gr.TabItem("🌟 Combined Chat"): with gr.Row(): with gr.Column(scale=1): combined_text_input = gr.Textbox( label="šŸ’¬ Text Input", placeholder="Type your message here...", lines=3 ) combined_pdf_input = gr.File( label="šŸ“„ PDF Upload", file_types=[".pdf"], type="filepath" ) combined_audio_input = gr.File( label="šŸŽ¤ Audio Upload", file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"], type="filepath" ) combined_submit_btn = gr.Button("šŸš€ Send All", variant="primary", size="lg", interactive=False) combined_clear_btn = gr.Button("šŸ—‘ļø Clear All", variant="secondary") with gr.Column(scale=2): combined_chatbot = gr.Chatbot( label="Combined Chat History", height=600, bubble_full_width=False, show_copy_button=True ) def validate_api_key(api_key): if not api_key or len(api_key.strip()) == 0: return "āŒ API Key not provided", *[gr.update(interactive=False) for _ in range(4)] try: test_client = OpenAI( base_url="https://openrouter.ai/api/v1", api_key=api_key.strip(), ) return "āœ… API Key validated successfully", *[gr.update(interactive=True) for _ in range(4)] except Exception as e: return f"āŒ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(4)] def process_text_input(api_key, text, history): if not api_key or len(api_key.strip()) == 0: if history is None: history = [] history.append(("Error", "āŒ Please provide a valid API key first")) return history, "" chatbot = MultimodalChatbot(api_key.strip()) return chatbot.chat(text_input=text, history=history) def process_pdf_input(api_key, pdf, text, history): if not api_key or len(api_key.strip()) == 0: if history is None: history = [] history.append(("Error", "āŒ Please provide a valid API key first")) return history, "" chatbot = MultimodalChatbot(api_key.strip()) return chatbot.chat(text_input=text, pdf_file=pdf, history=history) def process_audio_input(api_key, audio, text, history): if not api_key or len(api_key.strip()) == 0: if history is None: history = [] history.append(("Error", "āŒ Please provide a valid API key first")) return history, "" chatbot = MultimodalChatbot(api_key.strip()) return chatbot.chat(text_input=text, audio_file=audio, history=history) def process_combined_input(api_key, text, pdf, audio, history): if not api_key or len(api_key.strip()) == 0: if history is None: history = [] history.append(("Error", "āŒ Please provide a valid API key first")) return history, "" chatbot = MultimodalChatbot(api_key.strip()) return chatbot.chat(text, pdf, audio, history) def clear_chat(): return [], "" def clear_all_inputs(): return [], "", None, None api_key_input.change( validate_api_key, inputs=[api_key_input], outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn, combined_submit_btn] ) text_submit_btn.click( process_text_input, inputs=[api_key_input, text_input, text_chatbot], outputs=[text_chatbot, text_input] ) text_input.submit( process_text_input, inputs=[api_key_input, text_input, text_chatbot], outputs=[text_chatbot, text_input] ) text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input]) pdf_submit_btn.click( process_pdf_input, inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot], outputs=[pdf_chatbot, pdf_text_input] ) pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input]) audio_submit_btn.click( process_audio_input, inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot], outputs=[audio_chatbot, audio_text_input] ) audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input]) combined_submit_btn.click( process_combined_input, inputs=[api_key_input, combined_text_input, combined_pdf_input, combined_audio_input, combined_chatbot], outputs=[combined_chatbot, combined_text_input] ) combined_clear_btn.click(clear_all_inputs, outputs=[combined_chatbot, combined_text_input, combined_pdf_input, combined_audio_input]) gr.Markdown(""" ### šŸŽÆ How to Use Each Tab: **šŸ’¬ Text Chat**: Simple text conversations with the AI **šŸ“„ PDF Chat**: Upload a PDF and ask questions about its content **šŸŽ¤ Audio Chat**: Upload audio files for transcription and analysis - Supports: WAV, MP3, M4A, FLAC, OGG formats - Best results with clear speech and minimal background noise **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis ### šŸ”‘ Getting an API Key: 1. Go to [OpenRouter.ai](https://openrouter.ai) 2. Sign up for an account 3. Navigate to the API Keys section 4. Create a new API key 5. Copy and paste it in the field above ### āš ļø Current Limitations: - Audio transcription requires internet connection for best results - Large files may take longer to process """) return demo if __name__ == "__main__": required_packages = [ "gradio", "openai", "PyPDF2", "SpeechRecognition", "pydub" ] print("šŸš€ Multimodal Chatbot with Gemma 3n") print("=" * 50) print("Required packages:", ", ".join(required_packages)) print("\nšŸ“¦ To install: pip install " + " ".join(required_packages)) print("\nšŸŽ¤ For audio processing, you may also need:") print(" - ffmpeg (for audio conversion)") print(" - sudo apt-get install espeak espeak-data libespeak1 libespeak-dev (for offline speech recognition)") print("\nšŸ”‘ Get your API key from: https://openrouter.ai") print("šŸ’” Enter your API key in the web interface when it loads") demo = create_interface() demo.launch( share=True )