Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,7 +1,7 @@
 """
-DOLPHIN PDF Document AI -
-Optimized for
-Features: AI-generated alt text for accessibility using Gemma 3n
+DOLPHIN PDF Document AI - Local Gemma 3n Version
+Optimized for powerful GPU deployment with local models
+Features: AI-generated alt text for accessibility using local Gemma 3n
 """

 import gradio as gr
@@ -10,18 +10,16 @@ import markdown
 import cv2
 import numpy as np
 from PIL import Image
-from transformers import AutoProcessor, VisionEncoderDecoderModel,
+from transformers import AutoProcessor, VisionEncoderDecoderModel, AutoModelForImageTextToText
 import torch
 try:
     from sentence_transformers import SentenceTransformer
     import numpy as np
     from sklearn.metrics.pairwise import cosine_similarity
-    import google.generativeai as genai
-    from google.generativeai import types
     RAG_DEPENDENCIES_AVAILABLE = True
 except ImportError as e:
     print(f"RAG dependencies not available: {e}")
-    print("Please install: pip install sentence-transformers scikit-learn
+    print("Please install: pip install sentence-transformers scikit-learn")
     RAG_DEPENDENCIES_AVAILABLE = False
     SentenceTransformer = None
 import os
@@ -43,7 +41,7 @@ except ImportError:

 class DOLPHIN:
     def __init__(self, model_id_or_path):
-        """Initialize the Hugging Face model optimized for
+        """Initialize the Hugging Face model optimized for powerful GPU"""
         self.processor = AutoProcessor.from_pretrained(model_id_or_path)
         self.model = VisionEncoderDecoderModel.from_pretrained(
             model_id_or_path,
@@ -93,7 +91,7 @@ class DOLPHIN:
             decoder_input_ids=batch_prompt_ids,
             decoder_attention_mask=batch_attention_mask,
             min_length=1,
-            max_length=
+            max_length=2048,
             pad_token_id=self.tokenizer.pad_token_id,
             eos_token_id=self.tokenizer.eos_token_id,
             use_cache=True,
@@ -117,6 +115,139 @@ class DOLPHIN:
         return results


+class Gemma3nModel:
+    def __init__(self, model_id="google/gemma-3n-E4B-it"):
+        """Initialize the Gemma 3n model for text generation and image description"""
+        self.model_id = model_id
+        self.processor = AutoProcessor.from_pretrained(model_id)
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            model_id,
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.model.eval()
+        print(f"✅ Gemma 3n loaded (Device: {self.model.device}, DType: {self.model.dtype})")
+
+    def generate_alt_text(self, pil_image):
+        """Generate alt text for an image using local Gemma 3n"""
+        try:
+            # Ensure image is in RGB mode
+            if pil_image.mode != 'RGB':
+                pil_image = pil_image.convert('RGB')
+
+            # Create a detailed prompt for alt text generation
+            prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
+
+Focus on:
+- Main subject or content of the image
+- Important details, text, or data shown
+- Layout and structure if relevant (charts, diagrams, tables)
+- Context that would help someone understand the image's purpose
+
+Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""
+
+            # Prepare the message format
+            message = {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": pil_image},
+                    {"type": "text", "text": prompt}
+                ]
+            }
+
+            # Apply chat template and generate
+            input_ids = self.processor.apply_chat_template(
+                [message],
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+            )
+            input_len = input_ids["input_ids"].shape[-1]
+
+            input_ids = input_ids.to(self.model.device, dtype=self.model.dtype)
+            outputs = self.model.generate(
+                **input_ids,
+                max_new_tokens=256,
+                disable_compile=True,
+                do_sample=False,
+                temperature=0.1
+            )
+
+            text = self.processor.batch_decode(
+                outputs[:, input_len:],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True
+            )
+
+            alt_text = text[0].strip()
+
+            # Clean up the alt text
+            alt_text = alt_text.replace('\n', ' ').replace('\r', ' ')
+            # Remove common prefixes if they appear
+            prefixes_to_remove = ["This image shows", "The image shows", "This shows", "The figure shows"]
+            for prefix in prefixes_to_remove:
+                if alt_text.startswith(prefix):
+                    alt_text = alt_text[len(prefix):].strip()
+                    break
+
+            return alt_text if alt_text else "Image description unavailable"
+
+        except Exception as e:
+            print(f"❌ Error generating alt text: {e}")
+            import traceback
+            traceback.print_exc()
+            return "Image description unavailable"
+
+    def chat(self, prompt, history=None):
+        """Chat functionality using Gemma 3n for text-only conversations"""
+        try:
+            # Create message format
+            message = {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt}
+                ]
+            }
+
+            # If history exists, include it
+            conversation = history if history else []
+            conversation.append(message)
+
+            # Apply chat template and generate
+            input_ids = self.processor.apply_chat_template(
+                conversation,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+            )
+            input_len = input_ids["input_ids"].shape[-1]
+
+            input_ids = input_ids.to(self.model.device, dtype=self.model.dtype)
+            outputs = self.model.generate(
+                **input_ids,
+                max_new_tokens=1024,
+                disable_compile=True,
+                do_sample=True,
+                temperature=0.7
+            )
+
+            text = self.processor.batch_decode(
+                outputs[:, input_len:],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True
+            )
+
+            return text[0].strip()
+
+        except Exception as e:
+            print(f"❌ Error in chat: {e}")
+            import traceback
+            traceback.print_exc()
+            return f"Error generating response: {str(e)}"
+
+
 def convert_pdf_to_images_gradio(pdf_file):
     """Convert uploaded PDF file to list of PIL Images"""
     try:
@@ -170,7 +301,7 @@ def process_pdf_document(pdf_file, model, progress=gr.Progress()):
             padded_image,
             dims,
             model,
-            max_batch_size=
+            max_batch_size=4
         )

         try:
@@ -199,8 +330,8 @@ def process_pdf_document(pdf_file, model, progress=gr.Progress()):
         return error_msg, "error"


-def process_elements_optimized(layout_results, padded_image, dims, model, max_batch_size=
-    """Optimized element processing for
+def process_elements_optimized(layout_results, padded_image, dims, model, max_batch_size=4):
+    """Optimized element processing for powerful GPU"""
     layout_results = parse_layout_string(layout_results)

     text_elements = []
@@ -221,8 +352,8 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
             pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
             pil_crop = crop_margin(pil_crop)

-            # Generate alt text for accessibility
-            alt_text =
+            # Generate alt text for accessibility using local Gemma 3n
+            alt_text = gemma_model.generate_alt_text(pil_crop)

             buffered = io.BytesIO()
             pil_crop.save(buffered, format="PNG")
@@ -274,8 +405,8 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
     return recognition_results


-def process_element_batch_optimized(elements, model, prompt, max_batch_size=
-    """Process elements in
+def process_element_batch_optimized(elements, model, prompt, max_batch_size=4):
+    """Process elements in batches for powerful GPU"""
     results = []
     batch_size = min(len(elements), max_batch_size)

@@ -316,7 +447,7 @@ def generate_fallback_markdown(recognition_results):
     return markdown_content


-# Initialize
+# Initialize models
 model_path = "./hf_model"
 if not os.path.exists(model_path):
     model_path = "ByteDance/DOLPHIN"
@@ -324,164 +455,30 @@ if not os.path.exists(model_path):
 # Model paths and configuration
 model_path = "./hf_model" if os.path.exists("./hf_model") else "ByteDance/DOLPHIN"
 hf_token = os.getenv('HF_TOKEN')
+gemma_model_id = "google/gemma-3n-E4B-it"

-#
-
+# Initialize models
+print("Loading DOLPHIN model...")
+dolphin_model = DOLPHIN(model_path)
+print(f"✅ DOLPHIN model loaded (Device: {dolphin_model.device})")

-
+print("Loading Gemma 3n model...")
+gemma_model = Gemma3nModel(gemma_model_id)
+
+model_status = "✅ Both models loaded successfully"
+
+# Initialize embedding model
 if RAG_DEPENDENCIES_AVAILABLE:
     try:
         print("Loading embedding model for RAG...")
         embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
         print("✅ Embedding model loaded successfully (CPU)")
-
-        # Initialize Gemini API
-        gemini_api_key = os.getenv('GEMINI_API_KEY')
-        if gemini_api_key:
-            genai.configure(api_key=gemini_api_key)
-            gemini_client = True  # Just mark as configured
-            print("✅ Gemini API configured successfully")
-        else:
-            print("❌ GEMINI_API_KEY not found in environment")
-            gemini_client = None
     except Exception as e:
-        print(f"❌ Error loading
-        import traceback
-        traceback.print_exc()
+        print(f"❌ Error loading embedding model: {e}")
         embedding_model = None
-        gemini_client = None
 else:
     print("❌ RAG dependencies not available")
     embedding_model = None
-    gemini_client = None
-
-# Model management functions
-def load_dolphin_model():
-    """Load DOLPHIN model for PDF processing"""
-    global dolphin_model, current_model
-
-    if current_model == "dolphin":
-        return dolphin_model
-
-    # No need to unload chatbot model (using API now)
-
-    try:
-        print("Loading DOLPHIN model...")
-        dolphin_model = DOLPHIN(model_path)
-        current_model = "dolphin"
-        print(f"✅ DOLPHIN model loaded (Device: {dolphin_model.device})")
-        return dolphin_model
-    except Exception as e:
-        print(f"❌ Error loading DOLPHIN model: {e}")
-        return None
-
-def unload_dolphin_model():
-    """Unload DOLPHIN model to free memory"""
-    global dolphin_model, current_model
-
-    if dolphin_model is not None:
-        print("Unloading DOLPHIN model...")
-        del dolphin_model
-        dolphin_model = None
-        if current_model == "dolphin":
-            current_model = None
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        print("✅ DOLPHIN model unloaded")
-
-def initialize_gemini_client():
-    """Initialize Gemini API client"""
-    global gemini_client
-
-    if gemini_client is not None:
-        return gemini_client
-
-    try:
-        gemini_api_key = os.getenv('GEMINI_API_KEY')
-        if not gemini_api_key:
-            print("❌ GEMINI_API_KEY not found in environment")
-            return None
-
-        print("Initializing Gemini API client...")
-        gemini_client = genai.configure(api_key=gemini_api_key)
-        print("✅ Gemini API client ready for gemma-3n-e4b-it")
-        return gemini_client
-    except Exception as e:
-        print(f"❌ Error initializing Gemini client: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
-
-
-def generate_alt_text_for_image(pil_image):
-    """Generate alt text for an image using Gemma 3n model via Google AI API"""
-    try:
-        # Initialize Gemini client
-        client = initialize_gemini_client()
-        if client is None:
-            print("❌ Gemini client not initialized for alt text generation")
-            return "Image description unavailable"
-
-        # Debug: Check image format and properties
-        print(f"🔍 Image format: {pil_image.format}, mode: {pil_image.mode}, size: {pil_image.size}")
-
-        # Ensure image is in RGB mode
-        if pil_image.mode != 'RGB':
-            print(f"Converting image from {pil_image.mode} to RGB")
-            pil_image = pil_image.convert('RGB')
-
-        # Convert PIL image to bytes
-        buffered = io.BytesIO()
-        pil_image.save(buffered, format="JPEG")
-        image_bytes = buffered.getvalue()
-
-        print(f"🔍 Generating alt text for image with Gemma 3n...")
-
-        # Create a detailed prompt for alt text generation
-        prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
-
-Focus on:
-- Main subject or content of the image
-- Important details, text, or data shown
-- Layout and structure if relevant (charts, diagrams, tables)
-- Context that would help someone understand the image's purpose
-
-Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""
-
-        # Use the Google AI API client with proper format
-        response = genai.GenerativeModel('gemma-3n-e4b-it').generate_content([
-            types.Part.from_bytes(
-                data=image_bytes,
-                mime_type='image/jpeg',
-            ),
-            prompt
-        ])
-
-        print(f"📡 API response received: {type(response)}")
-
-        if hasattr(response, 'text') and response.text:
-            alt_text = response.text.strip()
-            print(f"✅ Alt text generated: {alt_text[:100]}...")
-        else:
-            print(f"❌ No text in response. Response: {response}")
-            return "Image description unavailable"
-
-        # Clean up the alt text
-        alt_text = alt_text.replace('\n', ' ').replace('\r', ' ')
-        # Remove common prefixes if they appear
-        prefixes_to_remove = ["This image shows", "The image shows", "This shows", "The figure shows"]
-        for prefix in prefixes_to_remove:
-            if alt_text.startswith(prefix):
-                alt_text = alt_text[len(prefix):].strip()
-                break
-
-        return alt_text if alt_text else "Image description unavailable"
-
-    except Exception as e:
-        print(f"❌ Error generating alt text: {e}")
-        import traceback
-        traceback.print_exc()
-        return "Image description unavailable"


 # Global state for managing tabs
@@ -490,14 +487,9 @@ show_results_tab = False
 document_chunks = []
 document_embeddings = None

-# Global model state
-dolphin_model = None
-gemini_client = None
-current_model = None  # Track which model is currently loaded
-

 def chunk_document(text, chunk_size=1024, overlap=100):
-    """Split document into overlapping chunks for RAG
+    """Split document into overlapping chunks for RAG"""
     words = text.split()
     chunks = []

@@ -554,16 +546,9 @@ def process_uploaded_pdf(pdf_file, progress=gr.Progress()):
         return "❌ No PDF uploaded", gr.Tabs(visible=False)

     try:
-        # Load DOLPHIN model for PDF processing
-        progress(0.1, desc="Loading DOLPHIN model...")
-        dolphin = load_dolphin_model()
-
-        if dolphin is None:
-            return "❌ Failed to load DOLPHIN model", gr.Tabs(visible=False)
-
         # Process PDF
-        progress(0.
-        combined_markdown, status = process_pdf_document(pdf_file,
+        progress(0.1, desc="Processing PDF...")
+        combined_markdown, status = process_pdf_document(pdf_file, dolphin_model, progress)

         if status == "processing_complete":
             processed_markdown = combined_markdown
@@ -574,9 +559,6 @@ def process_uploaded_pdf(pdf_file, progress=gr.Progress()):
             document_embeddings = create_embeddings(document_chunks)
             print(f"Created {len(document_chunks)} chunks")

-            # Keep DOLPHIN model loaded for GPU usage
-            progress(0.95, desc="Preparing chatbot...")
-
             show_results_tab = True
             progress(1.0, desc="PDF processed successfully!")
             return "✅ PDF processed successfully! Chatbot is ready in the Chat tab.", gr.Tabs(visible=True)
@@ -604,15 +586,16 @@ def clear_all():
     document_chunks = []
     document_embeddings = None

-    #
-
+    # Clear GPU cache
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()

     return None, "", gr.Tabs(visible=False)


 # Create Gradio interface
 with gr.Blocks(
-    title="DOLPHIN PDF AI",
+    title="DOLPHIN PDF AI - Local Gemma 3n",
     theme=gr.themes.Soft(),
     css="""
     @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
@@ -662,16 +645,15 @@ with gr.Blocks(
|
|
662 |
# Home Tab
|
663 |
with gr.TabItem("π Home", id="home"):
|
664 |
embedding_status = "β
RAG ready" if embedding_model else "β RAG not loaded"
|
665 |
-
gemini_status = "β
Gemini API ready" if gemini_client else "β Gemini API not configured"
|
666 |
-
current_status = f"Currently loaded: {current_model or 'None'}"
|
667 |
gr.Markdown(
|
668 |
-
"# Scholar Express -
|
669 |
-
"### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by
|
670 |
f"**System:** {model_status}\n"
|
671 |
f"**RAG System:** {embedding_status}\n"
|
672 |
-
f"**
|
|
|
673 |
f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
|
674 |
-
f"**
|
675 |
)
|
676 |
|
677 |
with gr.Column(elem_classes="upload-container"):
|
@@ -742,7 +724,7 @@ with gr.Blocks(
                 send_btn = gr.Button("Send", variant="primary", scale=1)

             gr.Markdown(
-                "*Ask questions about your processed document. The AI uses RAG (Retrieval-Augmented Generation) with
+                "*Ask questions about your processed document. The AI uses RAG (Retrieval-Augmented Generation) with local Gemma 3n to find relevant sections and provide accurate answers.*",
                 elem_id="chat-notice"
             )

@@ -771,7 +753,7 @@
         outputs=[chat_tab]
     )

-    # Chatbot functionality with
+    # Chatbot functionality with local Gemma 3n
    def chatbot_response(message, history):
         if not message.strip():
             return history
@@ -780,26 +762,20 @@
             return history + [[message, "❌ Please process a PDF document first before asking questions."]]

         try:
-            #
-            client = initialize_gemini_client()
-
-            if client is None:
-                return history + [[message, "❌ Failed to initialize Gemini client. Please check your GEMINI_API_KEY."]]
-
-            # Use RAG to get relevant chunks from markdown (balanced for performance vs quota)
+            # Use RAG to get relevant chunks from markdown
             if document_chunks and len(document_chunks) > 0:
                 relevant_chunks = retrieve_relevant_chunks(message, document_chunks, document_embeddings, top_k=3)
                 context = "\n\n".join(relevant_chunks)
-                # Smart truncation: aim for ~
-                if len(context) >
+                # Smart truncation: aim for ~6000 chars for local model
+                if len(context) > 6000:
                     # Try to cut at sentence boundaries
-                    sentences = context[:
-                    context = '.'.join(sentences[:-1]) + '...' if len(sentences) > 1 else context[:
+                    sentences = context[:6000].split('.')
+                    context = '.'.join(sentences[:-1]) + '...' if len(sentences) > 1 else context[:6000] + '...'
             else:
                 # Fallback to truncated document if RAG fails
-                context = processed_markdown[:
+                context = processed_markdown[:6000] + "..." if len(processed_markdown) > 6000 else processed_markdown

-            # Create prompt for
+            # Create prompt for Gemma 3n
             prompt = f"""You are a helpful assistant that answers questions about documents. Use the provided context to answer questions accurately and concisely.

 Context from the document:
@@ -809,26 +785,9 @@ Question: {message}

 Please provide a clear and helpful answer based on the context provided."""

-            # Generate response using
-
-
-
-            for attempt in range(max_retries):
-                try:
-                    response = genai.GenerativeModel('gemma-3n-e4b-it').generate_content(prompt)
-                    response_text = response.text if hasattr(response, 'text') else str(response)
-                    return history + [[message, response_text]]
-                except Exception as api_error:
-                    if "429" in str(api_error) and attempt < max_retries - 1:
-                        # Rate limit hit, wait and retry
-                        time.sleep(3)
-                        continue
-                    else:
-                        # Other error or final attempt failed
-                        if "429" in str(api_error):
-                            return history + [[message, "❌ API quota exceeded. Please wait a moment and try again, or check your Gemini API billing."]]
-                        else:
-                            raise api_error
+            # Generate response using local Gemma 3n
+            response_text = gemma_model.chat(prompt)
+            return history + [[message, response_text]]

         except Exception as e:
             error_msg = f"❌ Error generating response: {str(e)}"
@@ -863,7 +822,7 @@ if __name__ == "__main__":
         server_port=7860,
         share=False,
         show_error=True,
-        max_threads=
+        max_threads=4,
         inbrowser=False,
         quiet=True
     )
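A minimal usage sketch of the two local models this commit wires together (not part of the commit): it assumes the DOLPHIN and Gemma3nModel classes from the diff above are available in the current module, and "figure_crop.png" is only a placeholder path.

# Illustrative only; assumes DOLPHIN and Gemma3nModel from app.py are in scope.
from PIL import Image

dolphin = DOLPHIN("ByteDance/DOLPHIN")            # layout analysis + text recognition
gemma = Gemma3nModel("google/gemma-3n-E4B-it")    # local alt text + chat model

crop = Image.open("figure_crop.png")              # a figure cropped from a PDF page
print(gemma.generate_alt_text(crop))              # replaces the old Gemini API call

print(gemma.chat("Summarize the main contribution of the paper."))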