import spaces
import os
import json
import requests
import torch
import gradio as gr
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
from pdf2image import convert_from_path
from PyPDF2 import PdfReader

# Load the multimodal model
model_id = "miike-ai/r1-11b-vision"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


# File download function (for remote images or PDFs)
def download_file(url, save_dir="downloads"):
    os.makedirs(save_dir, exist_ok=True)
    local_filename = os.path.join(save_dir, url.split("/")[-1])
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(local_filename, "wb") as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
        return local_filename
    return None


# Extracts text and images from a PDF
def extract_pdf_content(pdf_path):
    extracted_text = []
    images = convert_from_path(pdf_path)[:1]  # Keep only the first page as an image
    pdf_reader = PdfReader(pdf_path)
    for page in pdf_reader.pages:
        text = page.extract_text()
        if text:
            extracted_text.append(text)
    return " ".join(extracted_text), images


# Core multimodal processing function
@spaces.GPU
def multimodal_chat(text_prompt, file_input=None):
    conversation = []
    images = []
    extracted_text = ""
    file_path = None

    # Handle file input (if any); accept both file objects and plain paths
    if file_input:
        file_path = file_input.name if hasattr(file_input, "name") else file_input
        if isinstance(file_path, str) and file_path.startswith("http"):
            file_path = download_file(file_path)
        if file_path.lower().endswith(".pdf"):
            extracted_text, images = extract_pdf_content(file_path)
        elif file_path.lower().endswith((".png", ".jpg", ".jpeg", ".webp")):
            images.append(Image.open(file_path))

    # Prepare the user message: prompt text, any extracted PDF text, and an image slot
    user_message = {"role": "user", "content": [{"type": "text", "text": text_prompt}]}
    if extracted_text:
        user_message["content"].append({"type": "text", "text": extracted_text})
    if images:
        user_message["content"].insert(0, {"type": "image"})
    conversation.append(user_message)

    # Apply the chat template and process the input
    input_text = processor.apply_chat_template(conversation, add_generation_prompt=True)
    if images:
        inputs = processor(
            images=images, text=[input_text], add_special_tokens=True, return_tensors="pt"
        ).to(model.device)
    else:
        inputs = processor(
            text=[input_text], add_special_tokens=True, return_tensors="pt"
        ).to(model.device)

    # Generate a response; decode only the newly generated tokens so the
    # prompt is not echoed back in the output
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=8192)
    generated_tokens = output[0][inputs["input_ids"].shape[-1]:]
    response_text = processor.decode(generated_tokens, skip_special_tokens=True)

    # Format JSON response
    response_json = {
        "user_input": text_prompt,
        "file_path": file_path,
        "response": response_text,
    }
    return json.dumps(response_json, indent=4)


# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Multimodal AI Chatbot")
    gr.Markdown("Type a message and optionally upload an **image or PDF** to chat with the AI.")

    text_input = gr.Textbox(label="Enter your question")
    file_input = gr.File(label="Upload an image/PDF (or enter URL)", type="filepath", interactive=True)
    chat_button = gr.Button("Submit")
    output_json = gr.Textbox(label="Response (JSON Output)", interactive=False)

    chat_button.click(multimodal_chat, inputs=[text_input, file_input], outputs=output_json)

# Run the Gradio app
demo.launch()