# Hugging Face Space: handwritten/printed text extraction (OCR) with
# Llama-3.2-11B-Vision-Instruct, served through a Gradio interface.
# (Runs on ZeroGPU-class Spaces hardware.)
import torch
import gradio as gr
import spaces
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Initialize model and processor once at module load so every request
# reuses the same weights instead of reloading them per call.
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"

# bfloat16 halves memory versus float32 with negligible quality loss for
# inference; the model is placed on the default CUDA device.
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,
).to("cuda")

# The processor bundles the tokenizer and the image preprocessor for this
# checkpoint (handles chat templating and pixel normalization).
processor = AutoProcessor.from_pretrained(ckpt)
def extract_text(image):
    """Transcribe all visible text from an image with Llama-3.2-Vision.

    Args:
        image: Filesystem path to the uploaded image (Gradio passes a
            filepath string because the input component uses type="filepath").

    Returns:
        str: The raw transcribed text produced by the model, stripped of
        surrounding whitespace.
    """
    # NOTE(review): `spaces` is imported at module level but this function is
    # not decorated with @spaces.GPU — on a ZeroGPU Space the decorator is
    # normally required; confirm against the Space's hardware configuration.

    # The vision encoder expects 3-channel input; uploads may be RGBA,
    # palette, or grayscale, so normalize to RGB first.
    img = Image.open(image).convert("RGB")

    prompt = (
        "Output ONLY the raw text as it appears in the image, nothing else."
        "You have an image containing both handwritten and printed text in French and/or English, and also punctuation and underscores.\n"
        "Your task: transcribe EXACTLY all visible text, preserving all characters, accents, punctuation, spacing, and line breaks.\n"
        "Include tables and forms clearly if present.\n"
        "Do NOT add any explanations, comments, summaries, or extra text.\n"
        "Check the output first to not duplicate results."
        "Preserve the original reading order, including line breaks and the natural layout of tables or forms. Output the text exactly as it appears visually, maintaining the structure."
        "Don't indicate blank space."
        "Don't separate handwritten and printed text."
        "DO NOT confuse between '.' a point and '|' a border."
        "Extract only the raw text and do not add any comment."
        "Extract only the data available."
    )

    # Single-turn chat message pairing the instruction with one image slot.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image"},
            ],
        }
    ]

    # Render the chat template (with the assistant header appended) and
    # tokenize text + pixels together on the model's device.
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=texts, images=[img], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=250)

    # Decode ONLY the newly generated tokens by slicing off the prompt
    # portion of the sequence. The previous implementation decoded the full
    # sequence and then stripped the prompt with exact string replacement —
    # which duplicated the entire prompt text in the code, and its
    # `.replace("user", "")` pass deleted every literal "user" from the
    # transcription itself, corrupting legitimate output.
    prompt_len = inputs["input_ids"].shape[-1]
    result = processor.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return result.strip()
# Gradio UI: one image upload in, one transcription textbox out.
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="Handwritten Text Extractor",
    description="Upload an image containing handwritten text to extract its content.",
)

# Launch the app; debug=True surfaces server-side tracebacks in the console,
# which is useful on Spaces where logs are the only diagnostic channel.
demo.launch(debug=True)