from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import gradio as gr
import spaces
# Initialize model and processor
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,
).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)
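# @spaces.GPU requests a GPU for each call when running on Hugging Face ZeroGPU hardware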
@spaces.GPU
def extract_text(image):
    # Open the uploaded image file and convert it to RGB
    image = Image.open(image).convert("RGB")
    # Transcription prompt: raw text only, preserve layout, no commentary
    prompt = (
        "Output ONLY the raw text as it appears in the image, nothing else.\n"
        "You have an image containing both handwritten and printed text in French and/or English, and also punctuation and underscores.\n"
        "Your task: transcribe EXACTLY all visible text, preserving all characters, accents, punctuation, spacing, and line breaks.\n"
        "Include tables and forms clearly if present.\n"
        "Do NOT add any explanations, comments, summaries, or extra text.\n"
        "Check the output to avoid duplicating results.\n"
        "Preserve the original reading order, including line breaks and the natural layout of tables or forms. Output the text exactly as it appears visually, maintaining the structure.\n"
        "Do not indicate blank space.\n"
        "Do not separate handwritten and printed text.\n"
        "DO NOT confuse '.' (a period) with '|' (a table border).\n"
        "Extract only the raw text and do not add any comment.\n"
        "Extract only the data available."
    )
    # Create the message structure: one user turn with the prompt and the image
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image"},
            ],
        }
    ]
    # Process input
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")
    # Generate output
    outputs = model.generate(**inputs, max_new_tokens=250)
    result = processor.decode(outputs[0], skip_special_tokens=True)
    print(result)
    # Clean up the output: keep only the text after the "assistant" marker
    if "assistant" in result.lower():
        result = result[result.lower().find("assistant") + len("assistant"):].strip()
    # Remove any remaining conversation markers and echoed prompt text
    result = result.replace("user", "").replace(prompt, "").strip()
    print(result)
    return result
# Create Gradio interface
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="Handwritten Text Extractor",
    description="Upload an image containing handwritten text to extract its content.",
)
# Launch the app
demo.launch(debug=True)