from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import gradio as gr
import spaces

# Initialize model and processor once at import time (reused across requests).
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16
).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)

# OCR instruction for the vision model. Kept as a single module-level constant:
# the original code duplicated this entire literal a second time inside the
# post-processing `str.replace` call, which is now unnecessary (see below).
PROMPT = (
    "Output ONLY the raw text as it appears in the image, nothing else."
    "You have an image containing both handwritten and printed text in French and/or English, and also punctuation and underscores.\n"
    "Your task: transcribe EXACTLY all visible text, preserving all characters, accents, punctuation, spacing, and line breaks.\n"
    "Include tables and forms clearly if present.\n"
    "Do NOT add any explanations, comments, summaries, or extra text.\n"
    "Check the output first to not duplicate results."
    "Preserve the original reading order, including line breaks and the natural layout of tables or forms. Output the text exactly as it appears visually, maintaining the structure."
    "Don't indicate blank space."
    "Don't separate handwritten and printed text."
    "DO NOT confuse between '.' a point and '|' a border."
    "Extract only the raw text and do not add any comment."
    "Extract only the data available."
)


@spaces.GPU
def extract_text(image):
    """Transcribe all visible text (handwritten and printed) from an image.

    Args:
        image: Path to an image file, as supplied by the Gradio
            ``gr.Image(type="filepath")`` input.

    Returns:
        The model's transcription of the image as a plain string.
    """
    # Normalize to RGB — the model expects 3-channel input.
    image = Image.open(image).convert("RGB")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": PROMPT},
                {"type": "image"},
            ],
        }
    ]

    # Render the chat template and build the multimodal model inputs.
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=250)

    # BUG FIX: decode only the newly generated tokens. The previous version
    # decoded the whole sequence and then stripped the prompt by searching for
    # "assistant" and string-replacing a duplicated copy of the full prompt —
    # and its `result.replace("user", "")` also deleted the word "user"
    # anywhere it legitimately appeared in the transcribed text.
    prompt_len = inputs["input_ids"].shape[-1]
    result = processor.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    ).strip()

    print(result)  # NOTE(review): debug logging; remove for production.
    return result


# Gradio UI: single image in, extracted text out.
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="Handwritten Text Extractor",
    description="Upload an image containing handwritten text to extract its content.",
)

# Launch the app
demo.launch(debug=True)