"""Gradio app that extracts structured fields from Iranian national ID cards.

A fine-tuned Qwen2-VL checkpoint is prompted to return a JSON object; the
output is parsed, checked for the expected keys and shown in a JSON widget.
"""

import json

import gradio as gr
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

from util.vision_util import process_vision_info

# Hugging Face model id shared by the model weights and the processor.
_MODEL_ID = "datamoon/qwen2-vl-iranian-idcard-ocr"

# Keys that must be present in the JSON object returned by the model.
_REQUIRED_FIELDS = (
    "national_id",
    "first_name",
    "last_name",
    "date_of_birth",
    "father_name",
    "expiry_date",
)

# Instruction sent to the model alongside the image.
_PROMPT = (
    "From this image which is a persian national id card, "
    "return a JSON object with these exact fields: "
    '{ "national_id": "...", "first_name": "...", "last_name": "...", '
    '"date_of_birth": "...", "father_name": "...", "expiry_date": "..." } '
    "Return ONLY the JSON object, nothing else."
)

# Load model and processor once at import time so every request reuses them.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    _MODEL_ID,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(
    _MODEL_ID,
    padding_side="left",
)


def _extract_fields(generated_text: str) -> dict:
    """Parse the JSON object embedded in *generated_text* and validate it.

    The span from the first ``{`` to the last ``}`` is treated as the JSON
    payload, so any text the model emits around the object is ignored.

    Raises:
        ValueError: if no brace-delimited span exists, the span is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass), or
            a required field is missing.
    """
    start = generated_text.find("{")
    end = generated_text.rfind("}")
    # Without this check a missing brace yields an empty slice and a cryptic
    # "Expecting value" decode error instead of a meaningful message.
    if start == -1 or end < start:
        raise ValueError("No JSON object found in model output")

    result = json.loads(generated_text[start:end + 1])

    for field in _REQUIRED_FIELDS:
        if field not in result:
            raise ValueError(f"Missing field: {field}")
    return result


def process_id_card(image_path) -> dict:
    """Run the OCR model on an ID card image and return the extracted fields.

    Args:
        image_path: Filesystem path of the uploaded image, as supplied by the
            ``gr.Image(type="filepath")`` input. A falsy value (no upload)
            is reported as an error instead of being sent to the model.

    Returns:
        On success, the parsed dict containing every key in
        ``_REQUIRED_FIELDS``. On failure, a dict with an ``"error"`` message;
        when the model ran but its output could not be parsed, the dict also
        carries the unparsed text under ``"raw_output"``.
    """
    if not image_path:
        return {"error": "No image provided"}

    try:
        # Prepare the message with image and instruction.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": _PROMPT},
            ],
        }]

        # Only the first value returned by the helper is used here.
        image_inputs, _ = process_vision_info(messages)

        # Render the chat template to text; tokenization happens below.
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = processor(
            text=text,
            images=image_inputs,
            return_tensors="pt",
        ).to(model.device)

        # Greedy decoding (no sampling), capped at 256 new tokens.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
        )

        # Slice off the prompt tokens so only the generated continuation
        # is decoded.
        prompt_length = inputs.input_ids.shape[1]
        generated_text = processor.batch_decode(
            generated_ids[:, prompt_length:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        try:
            return _extract_fields(generated_text)
        except ValueError as e:
            # Includes json.JSONDecodeError; keep the raw text for debugging.
            return {
                "error": f"Could not parse model output: {str(e)}",
                "raw_output": generated_text,
            }

    except Exception as e:
        # UI boundary: surface any failure in the JSON widget rather than
        # letting the request crash.
        return {"error": str(e)}


# Create Gradio interface
iface = gr.Interface(
    fn=process_id_card,
    inputs=gr.Image(type="filepath", label="Upload ID Card Image"),
    outputs=gr.JSON(label="Extracted Information"),
    title="Persian ID Card Reader",
    description=(
        "Upload an image of an Iranian national ID card to extract information. "
        "The system will return: national_id, first_name, last_name, "
        "date_of_birth, father_name, and expiry_date."
    ),
    examples=[
        ["examples/id1.png"],
    ],
    allow_flagging="never",
)

# Launch the app bound to 0.0.0.0, without a public share link or debug mode.
# NOTE(review): an earlier comment mentioned launching "with queue", but no
# queue() call is made here — confirm whether one is wanted.
iface.launch(
    server_name="0.0.0.0",
    share=False,
    debug=False,
)