import gradio as gr
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from util.vision_util import process_vision_info
import json
from PIL import Image

# Single identifier handed to both from_pretrained() calls so the model and
# its processor always come from the same checkpoint.
MODEL_ID = "datamoon/qwen2-vl-iranian-idcard-ocr"

# Vision-language model; device_map="auto" leaves device placement to the library.
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto")

# Matching processor, configured to pad on the left.
processor = AutoProcessor.from_pretrained(MODEL_ID, padding_side="left")

# Keys the prompt asks the model to emit; a result is only accepted when
# every one of them is present.
_REQUIRED_FIELDS = (
    "national_id", "first_name", "last_name",
    "date_of_birth", "father_name", "expiry_date",
)


def _parse_id_card_json(generated_text):
    """Extract and validate the JSON object embedded in the model's reply.

    The span from the first ``{`` to the last ``}`` is parsed, so text the
    model adds around the object (for example a Markdown code fence) is
    tolerated.

    Args:
        generated_text: Decoded model output.

    Returns:
        The parsed dict, containing at least every key in ``_REQUIRED_FIELDS``.

    Raises:
        ValueError: If no brace-delimited span exists, the span is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass), or
            a required field is missing.
    """
    json_start = generated_text.find("{")
    json_end = generated_text.rfind("}") + 1
    # find() yields -1 and rfind() + 1 yields 0 when a brace is absent; report
    # that explicitly instead of handing json.loads a meaningless slice.
    if json_start == -1 or json_end <= json_start:
        raise ValueError("No JSON object found in model output")

    result = json.loads(generated_text[json_start:json_end])

    for field in _REQUIRED_FIELDS:
        if field not in result:
            raise ValueError(f"Missing field: {field}")
    return result


def process_id_card(image_path):
    """Run the OCR model on an ID card image and return the extracted fields.

    Args:
        image_path: Path of the image to read. ``None`` or an empty string
            (no image supplied) is rejected up front.

    Returns:
        On success, the dict parsed from the model's JSON reply (it contains
        every key in ``_REQUIRED_FIELDS``). On failure, a dict with an
        ``"error"`` message; when the model replied but the reply could not
        be parsed, the dict also carries the text under ``"raw_output"``.
        This function does not raise: any exception is converted into an
        error dict so the UI can display it.
    """
    # Fail fast with a readable message rather than letting the vision
    # pipeline surface an internal exception for a missing image.
    if not image_path:
        return {"error": "No image provided. Please upload an ID card image."}

    try:
        # Single-turn chat message: the image followed by the instruction.
        # NOTE: the instruction text (including its embedded indentation) is
        # part of the model input and is intentionally kept verbatim.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": """From this image which is a persian national id card, 
                return a JSON object with these exact fields:
                {
                    "national_id": "...",
                    "first_name": "...",
                    "last_name": "...",
                    "date_of_birth": "...",
                    "father_name": "...",
                    "expiry_date": "..."
                }
                Return ONLY the JSON object, nothing else."""}
            ]
        }]

        # Resolve the image referenced in the message; the second return
        # value is not needed here.
        image_inputs, _ = process_vision_info(messages)

        # Render the chat template to a prompt string ending with the
        # generation prompt.
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = processor(
            text=text,
            images=image_inputs,
            return_tensors="pt",
        ).to(model.device)

        # do_sample=False selects deterministic (non-sampling) decoding.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False
        )

        # Decode only the tokens that come after the prompt.
        prompt_length = inputs.input_ids.shape[1]
        generated_text = processor.batch_decode(
            generated_ids[:, prompt_length:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        try:
            return _parse_id_card_json(generated_text)
        except ValueError as e:  # also covers json.JSONDecodeError
            return {"error": f"Could not parse model output: {e}", "raw_output": generated_text}

    except Exception as e:
        # UI boundary: report any failure as data instead of raising.
        return {"error": str(e)}

# UI components: one image upload (delivered to the handler as a file path)
# and one JSON panel for the result.
image_input = gr.Image(type="filepath", label="Upload ID Card Image")
json_output = gr.JSON(label="Extracted Information")

# Wire the components to process_id_card; flagging is turned off.
iface = gr.Interface(
    fn=process_id_card,
    inputs=image_input,
    outputs=json_output,
    title="Persian ID Card Reader",
    description="""Upload an image of an Iranian national ID card to extract information.
    The system will return: national_id, first_name, last_name, date_of_birth, father_name, and expiry_date.""",
    examples=[["examples/id1.png"]],
    allow_flagging="never",
)

# Start the server bound to 0.0.0.0, with sharing and debug both disabled.
iface.launch(server_name="0.0.0.0", share=False, debug=False)