from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import gradio as gr
import spaces

# Initialize model and processor once at import time (reused across requests).
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16
).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)

# OCR instruction for the vision model. Kept as a single module-level constant:
# the original code duplicated this entire literal a second time inside the
# post-processing `str.replace` call, which is now unnecessary (see below).
PROMPT = (
    "Output ONLY the raw text as it appears in the image, nothing else."
    "You have an image containing both handwritten and printed text in French and/or English, and also punctuation and underscores.\n"
    "Your task: transcribe EXACTLY all visible text, preserving all characters, accents, punctuation, spacing, and line breaks.\n"
    "Include tables and forms clearly if present.\n"
    "Do NOT add any explanations, comments, summaries, or extra text.\n"
    "Check the output first to not duplicate results."
    "Preserve the original reading order, including line breaks and the natural layout of tables or forms. Output the text exactly as it appears visually, maintaining the structure."
    "Don't indicate blank space."
    "Don't separate handwritten and printed text."
    "DO NOT confuse between '.' a point and '|' a border."
    "Extract only the raw text and do not add any comment."
    "Extract only the data available."
)


@spaces.GPU
def extract_text(image):
    """Transcribe all visible text (handwritten and printed) from an image.

    Args:
        image: Path to an image file, as supplied by the Gradio
            ``gr.Image(type="filepath")`` input.

    Returns:
        The model's transcription of the image as a plain string.
    """
    # Normalize to RGB — the model expects 3-channel input.
    image = Image.open(image).convert("RGB")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": PROMPT},
                {"type": "image"},
            ],
        }
    ]

    # Render the chat template and build the multimodal model inputs.
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=250)

    # BUG FIX: decode only the newly generated tokens. The previous version
    # decoded the whole sequence and then stripped the prompt by searching for
    # "assistant" and string-replacing a duplicated copy of the full prompt —
    # and its `result.replace("user", "")` also deleted the word "user"
    # anywhere it legitimately appeared in the transcribed text.
    prompt_len = inputs["input_ids"].shape[-1]
    result = processor.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    ).strip()

    print(result)  # NOTE(review): debug logging; remove for production.
    return result


# Gradio UI: single image in, extracted text out.
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="Handwritten Text Extractor",
    description="Upload an image containing handwritten text to extract its content.",
)

# Launch the app
demo.launch(debug=True)