Spaces:

Emeritus-21
/

handwritten-text-recognition

Runtime error

App Files Files

xet

Community

Emeritus-21 committed on Aug 21

Commit

61e3d24

verified ·

1 Parent(s): 5e48658

Create app.py

Browse files

Files changed (1) hide show

app.py +316 -0

app.py ADDED Viewed

	@@ -0,0 +1,316 @@

+# app.py — HTR Space (full) with downloads (PDF/DOCX/MP3) + webcam support (Gradio 4.x)
+import os
+import time
+from threading import Thread
+import gradio as gr
+import spaces
+from PIL import Image
+import torch
+from transformers import (
+    AutoProcessor,
+    AutoModelForImageTextToText,
+    Qwen2_5_VLForConditionalGeneration,
+    TextIteratorStreamer,
+)
+# ---------------------------
+# Models
+# ---------------------------
+# Registry of selectable models: each key is a display label, each value is a
+# (Hugging Face repo id, transformers model class) pair used to load that model.
+MODEL_PATHS = {
+    "Model 1 (Complex handwrittings )": (
+        "prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it",
+        Qwen2_5_VLForConditionalGeneration,
+    ),
+    "Model 2 (simple and scanned handwritting )": (
+        "nanonets/Nanonets-OCR-s",
+        Qwen2_5_VLForConditionalGeneration,
+    ),
+    "Model 3 (structured handwritting)": (
+        "Emeritus-21/Finetuned-full-HTR-model",
+        AutoModelForImageTextToText,
+    ),
+}
+# Default cap on the number of tokens generated per request.
+MAX_NEW_TOKENS_DEFAULT = 512
+# Target device string: "cuda" when torch reports CUDA is available, otherwise "cpu".
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# ---------------------------
+# Preload models at startup
+# ---------------------------
+_loaded_processors = {}
+_loaded_models = {}
+print("🚀 Preloading models into GPU/CPU memory...")
+for name, (repo_id, cls) in MODEL_PATHS.items():
+    try:
+        print(f"Loading {name} ...")
+        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
+        model = cls.from_pretrained(
+            repo_id,
+            trust_remote_code=True,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            low_cpu_mem_usage=True,
+        ).to(device).eval()
+        _loaded_processors[name] = processor
+        _loaded_models[name] = model
+        print(f"✅ {name} ready.")
+    except Exception as e:
+        print(f"⚠️ Failed to load {name}: {e}")
+# ---------------------------
+# Warmup (GPU)
+# ---------------------------
+@spaces.GPU
+def warmup():
+    try:
+        default_model_choice = list(MODEL_PATHS.keys())[0]
+        processor = _loaded_processors[default_model_choice]
+        model = _loaded_models[default_model_choice]
+        tokenizer = getattr(processor, "tokenizer", None)
+        messages = [{"role": "user", "content": [{"type": "text", "text": "Warmup."}]}]
+        if tokenizer and hasattr(tokenizer, "apply_chat_template"):
+            chat_prompt = tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+        else:
+            chat_prompt = "Warmup."
+        inputs = processor(
+            text=[chat_prompt],
+            images=None,
+            return_tensors="pt"
+        ).to(device)
+        with torch.inference_mode():
+            _ = model.generate(**inputs, max_new_tokens=1)
+        return f"GPU warm and {default_model_choice} ready."
+    except Exception as e:
+        return f"Warmup skipped: {e}"
+# ---------------------------
+# OCR Function (RAW ONLY)
+# ---------------------------
+@spaces.GPU
+def ocr_image(
+    image: Image.Image,
+    model_choice: str,
+    query: str = None,
+    max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT,
+    temperature: float = 0.1,
+    top_p: float = 1.0,
+    top_k: int = 0,
+    repetition_penalty: float = 1.0,
+):
+    if image is None:
+        yield "Please upload or capture an image."
+        return
+    if model_choice not in _loaded_models:
+        yield f"Invalid model: {model_choice}"
+        return
+    processor = _loaded_processors[model_choice]
+    model = _loaded_models[model_choice]
+    tokenizer = getattr(processor, "tokenizer", None)
+    if query and query.strip():
+        prompt = query.strip()
+    else:
+        prompt = (
+            "You are a professional Handwritten OCR system.\n"
+            "TASK: Read the handwritten image and transcribe the text EXACTLY as written.\n"
+            "- Preserve original structure and line breaks.\n"
+            "- Keep spacing, bullet points, numbering, and indentation.\n"
+            "- Render tables as Markdown tables if present.\n"
+            "- Do NOT autocorrect spelling or grammar.\n"
+            "- Do NOT merge lines.\n"
+            "Return RAW transcription only."
+        )
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+    # Build chat prompt (prefer tokenizer chat template if available)
+    if tokenizer and hasattr(tokenizer, "apply_chat_template"):
+        chat_prompt = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+    else:
+        # fallback: just use plain prompt
+        chat_prompt = prompt
+    # Processor packs both text + image for VLMs
+    inputs = processor(
+        text=[chat_prompt],
+        images=[image],
+        return_tensors="pt"
+    ).to(device)
+    # Use tokenizer (if present) in streamer for correct detokenization
+    streamer = TextIteratorStreamer(
+        tokenizer if tokenizer is not None else None,
+        skip_prompt=True,
+        skip_special_tokens=True,
+    )
+    generation_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=False,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty,
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        new_text = new_text.replace("<|im_end|>", "")
+        buffer += new_text
+        # small sleep to smooth streaming
+        time.sleep(0.01)
+        yield buffer
+# ---------------------------
+# Export Helpers
+# ---------------------------
+from reportlab.platypus import SimpleDocTemplate, Paragraph
+from reportlab.lib.styles import getSampleStyleSheet
+from docx import Document
+from gtts import gTTS
+def _safe_text(text: str) -> str:
+    return (text or "").strip()
+def save_as_pdf(text):
+    text = _safe_text(text)
+    if not text:
+        return None
+    filepath = "output.pdf"
+    doc = SimpleDocTemplate(filepath)
+    styles = getSampleStyleSheet()
+    flowables = [Paragraph(t, styles["Normal"]) for t in text.splitlines() if t != ""]
+    if not flowables:
+        flowables = [Paragraph(" ", styles["Normal"])]
+    doc.build(flowables)
+    return filepath
+def save_as_word(text):
+    text = _safe_text(text)
+    if not text:
+        return None
+    filepath = "output.docx"
+    doc = Document()
+    for line in text.splitlines():
+        doc.add_paragraph(line)
+    doc.save(filepath)
+    return filepath
+def save_as_audio(text):
+    text = _safe_text(text)
+    if not text:
+        return None
+    filepath = "output.mp3"
+    # NOTE: gTTS uses an online service; Spaces must have outbound internet enabled.
+    tts = gTTS(text)
+    tts.save(filepath)
+    return filepath
+# ---------------------------
+# Gradio Interface
+# ---------------------------
+# Build the UI: a model picker plus one tab holding the image input, generation
+# options, the streamed text output and the three export buttons.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## ✍🏾 wilson Handwritten OCR ")
+    # Choices are the MODEL_PATHS labels; the first one is preselected.
+    model_choice = gr.Radio(
+        choices=list(MODEL_PATHS.keys()),
+        value=list(MODEL_PATHS.keys())[0],
+        label="Select OCR Model",
+    )
+    with gr.Tab("🖼 Image Inference"):
+        query_input = gr.Textbox(
+            label="Custom Prompt (optional)",
+            placeholder="Leave empty for RAW structured output",
+        )
+        # Gradio 4.x: use `sources` instead of deprecated `source`/`tool`
+        # This enables both Upload and Webcam capture. On mobile, users can switch front/back camera
+        # via the browser UI (programmatic 'back' forcing isn't supported across all browsers).
+        image_input = gr.Image(
+            type="pil",
+            label="Upload / Capture Handwritten Image",
+            sources=["upload", "webcam"],
+        )
+        # Generation controls; their values are passed to ocr_image in the click handler below.
+        with gr.Accordion("⚙️ Advanced Options", open=False):
+            max_new_tokens = gr.Slider(1, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=1, label="Max new tokens")
+            temperature = gr.Slider(0.1, 2.0, value=0.1, step=0.05, label="Temperature")
+            top_p = gr.Slider(0.05, 1.0, value=1.0, step=0.05, label="Top-p (nucleus)")
+            top_k = gr.Slider(0, 1000, value=0, step=1, label="Top-k")
+            repetition_penalty = gr.Slider(0.8, 2.0, value=1.0, step=0.05, label="Repetition penalty")
+        with gr.Row():
+            extract_btn = gr.Button("📤 Extract RAW Text", variant="primary")
+            clear_btn = gr.Button("🧹 Clear")
+        # Receives the text yielded by ocr_image and is the input to every export button.
+        raw_output = gr.Textbox(
+            label="📜 RAW Structured Output (exact as written)",
+            lines=18,
+            show_copy_button=True,
+        )
+        with gr.Row():
+            pdf_btn = gr.Button("⬇️ Download as PDF")
+            word_btn = gr.Button("⬇️ Download as Word")
+            audio_btn = gr.Button("🔊 Download as Audio")
+        # One file component per export format; each is filled by its button's handler.
+        pdf_file = gr.File(label="PDF File")
+        word_file = gr.File(label="Word File")
+        audio_file = gr.File(label="Audio File")
+        # The inputs list is positional: it must stay in the order of ocr_image's parameters.
+        extract_btn.click(
+            fn=ocr_image,
+            inputs=[
+                image_input,
+                model_choice,
+                query_input,
+                max_new_tokens,
+                temperature,
+                top_p,
+                top_k,
+                repetition_penalty,
+            ],
+            outputs=[raw_output],
+            api_name="ocr_image",
+        )
+        pdf_btn.click(fn=save_as_pdf, inputs=[raw_output], outputs=[pdf_file])
+        word_btn.click(fn=save_as_word, inputs=[raw_output], outputs=[word_file])
+        audio_btn.click(fn=save_as_audio, inputs=[raw_output], outputs=[audio_file])
+        # Reset text, image, prompt and sliders to their initial values. The tuple order
+        # must match `outputs`. The three file components are not reset here.
+        clear_btn.click(
+            fn=lambda: ("", None, "", MAX_NEW_TOKENS_DEFAULT, 0.1, 1.0, 0, 1.0),
+            outputs=[raw_output, image_input, query_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        )
+if __name__ == "__main__":
+    # queue helps with GPU models; SSR off avoids hydration mismatches on Spaces
+    demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)