Spaces:

Emeritus-21
/

handwritten-text-recognition

Running on Zero

App Files Files Community

Emeritus-21 committed on Aug 22

Commit

cd9abb6

verified ·

1 Parent(s): b49181f

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -89

app.py CHANGED Viewed

@@ -1,7 +1,4 @@
-# app.py — HTR Space (GPU-only, no webcam, mobile-ready)
 import os
-from threading import Thread
 import gradio as gr
 from PIL import Image
 import torch
@@ -9,9 +6,13 @@ from transformers import AutoProcessor, AutoModelForImageTextToText, Qwen2_5_VLF
 from reportlab.platypus import SimpleDocTemplate, Paragraph
 from reportlab.lib.styles import getSampleStyleSheet
 from docx import Document
 MAX_NEW_TOKENS_DEFAULT = 512
-DEVICE = "cuda"  # GPU-only
 # ---------------------------
 # Models config
@@ -44,12 +45,20 @@ def load_model(name):
     return processor, model
 # ---------------------------
-# Helpers
 # ---------------------------
-def _default_prompt(query: str | None) -> str:
-    if query and query.strip():
-        return query.strip()
-    return (
         "You are a professional Handwritten OCR system.\n"
         "TASK: Read the handwritten image and transcribe the text EXACTLY as written.\n"
         "- Preserve original structure and line breaks.\n"
@@ -60,50 +69,7 @@ def _default_prompt(query: str | None) -> str:
         "Return RAW transcription only."
     )
-def _build_inputs_plain(processor, image: Image.Image, prompt: str):
-    return processor(text=[prompt], images=[image], return_tensors="pt").to(DEVICE)
-def _decode_text(model, processor, tokenizer, output_ids):
-    text = ""
-    try:
-        if hasattr(processor, "batch_decode"):
-            text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-            return text
-    except Exception:
-        pass
-    try:
-        if tokenizer is not None:
-            text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
-            return text
-    except Exception:
-        pass
-    try:
-        model_tok = getattr(model, "tokenizer", None)
-        if model_tok is not None:
-            text = model_tok.batch_decode(output_ids, skip_special_tokens=True)[0]
-            return text
-    except Exception:
-        pass
-    return str(output_ids)
-# ---------------------------
-# GPU OCR function
-# ---------------------------
-from spaces import GPU
-@GPU
-def ocr_image_gpu(image: Image.Image, model_choice: str, query: str = None,
-                  max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT, temperature: float = 0.1,
-                  top_p: float = 1.0, top_k: int = 0, repetition_penalty: float = 1.0):
-    if image is None:
-        return "Please upload an image."
-    if model_choice not in MODEL_PATHS:
-        return f"Invalid model: {model_choice}"
-    processor, model = load_model(model_choice)
-    prompt = _default_prompt(query)
-    batch = _build_inputs_plain(processor, image, prompt)
     with torch.inference_mode():
         output_ids = model.generate(
@@ -116,11 +82,17 @@ def ocr_image_gpu(image: Image.Image, model_choice: str, query: str = None,
             repetition_penalty=repetition_penalty,
         )
-    decoded = _decode_text(model, processor, None, output_ids)
-    return decoded.replace("<|im_end|>", "").strip()
 # ---------------------------
-# Export functions
 # ---------------------------
 def _safe_text(text: str) -> str:
     return (text or "").strip()
@@ -153,15 +125,10 @@ def save_as_audio(text):
     text = _safe_text(text)
     if not text:
         return None
-    try:
-        from gTTS import gTTS
-        filepath = "output.mp3"
-        tts = gTTS(text)
-        tts.save(filepath)
-        return filepath
-    except Exception as e:
-        print(f"gTTS failed: {e}")
-        return None
 # ---------------------------
 # Gradio UI
@@ -180,46 +147,45 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         placeholder="Leave empty for RAW structured output",
     )
-    image_input = gr.Image(type="pil", label="Upload Image")
-    # Advanced Options
-    with gr.Accordion("⚙️ Advanced Options", open=False):
-        max_new_tokens = gr.Slider(1, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=1, label="Max new tokens")
-        temperature = gr.Slider(0.1, 2.0, value=0.1, step=0.05, label="Temperature")
-        top_p = gr.Slider(0.05, 1.0, value=1.0, step=0.05, label="Top-p (nucleus)")
-        top_k = gr.Slider(0, 1000, value=0, step=1, label="Top-k")
-        repetition_penalty = gr.Slider(0.8, 2.0, value=1.0, step=0.05, label="Repetition penalty")
-    # ✅ Extract Button ABOVE output
     extract_btn = gr.Button("📤 Extract RAW Text", variant="primary")
     raw_output = gr.Textbox(
         label="📜 RAW Structured Output (exact as written)",
         lines=18,
         show_copy_button=True,
     )
-    pdf_btn = gr.Button("⬇️ Download as PDF")
-    word_btn = gr.Button("⬇️ Download as Word")
-    audio_btn = gr.Button("🔊 Download as Audio")
-    # ---------------------------
-    # Button Callbacks
-    # ---------------------------
-    def on_extract(uploaded, model, query, max_tokens, temp, top_p, top_k, rep):
-        return ocr_image_gpu(uploaded, model, query, max_tokens, temp, top_p, top_k, rep)
     extract_btn.click(
-        fn=on_extract,
         inputs=[image_input, model_choice, query_input,
                 max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output]
     )
-    pdf_btn.click(fn=save_as_pdf, inputs=[raw_output], outputs=[pdf_btn])
-    word_btn.click(fn=save_as_word, inputs=[raw_output], outputs=[word_btn])
-    audio_btn.click(fn=save_as_audio, inputs=[raw_output], outputs=[audio_btn])
     clear_btn = gr.Button("🧹 Clear")
     clear_btn.click(
         fn=lambda: ("", None, "", MAX_NEW_TOKENS_DEFAULT, 0.1, 1.0, 0, 1.0),

 import os
 import gradio as gr
 from PIL import Image
 import torch
 from reportlab.platypus import SimpleDocTemplate, Paragraph
 from reportlab.lib.styles import getSampleStyleSheet
 from docx import Document
+from gtts import gTTS
+# ---------------------------
+# Device & constants
+# ---------------------------
+DEVICE = "cuda"  # Force GPU usage
 MAX_NEW_TOKENS_DEFAULT = 512
 # ---------------------------
 # Models config
     return processor, model
 # ---------------------------
+# OCR function (GPU ready)
 # ---------------------------
+@gr.utils.space_decorator  # Spaces decorator to detect GPU
+def ocr_image_gpu(image: Image.Image, model_choice: str, query: str = None,
+                  max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT, temperature: float = 0.1,
+                  top_p: float = 1.0, top_k: int = 0, repetition_penalty: float = 1.0):
+    if image is None:
+        return "Please upload an image."
+    if model_choice not in MODEL_PATHS:
+        return f"Invalid model: {model_choice}"
+    processor, model = load_model(model_choice)
+    prompt = query.strip() if query and query.strip() else (
         "You are a professional Handwritten OCR system.\n"
         "TASK: Read the handwritten image and transcribe the text EXACTLY as written.\n"
         "- Preserve original structure and line breaks.\n"
         "Return RAW transcription only."
     )
+    batch = processor(text=[prompt], images=[image], return_tensors="pt").to(DEVICE)
     with torch.inference_mode():
         output_ids = model.generate(
             repetition_penalty=repetition_penalty,
         )
+    # decode safely
+    text = ""
+    if hasattr(processor, "batch_decode"):
+        text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
+    elif hasattr(model, "tokenizer") and model.tokenizer is not None:
+        text = model.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
+    text = text.replace("<|im_end|>", "").strip()
+    return text
 # ---------------------------
+# Export helpers
 # ---------------------------
 def _safe_text(text: str) -> str:
     return (text or "").strip()
     text = _safe_text(text)
     if not text:
         return None
+    filepath = "output.mp3"
+    tts = gTTS(text)
+    tts.save(filepath)
+    return filepath
 # ---------------------------
 # Gradio UI
         placeholder="Leave empty for RAW structured output",
     )
+    image_input = gr.Image(type="pil", label="Upload Image (desktop/mobile)")
+    # Buttons first
     extract_btn = gr.Button("📤 Extract RAW Text", variant="primary")
     raw_output = gr.Textbox(
         label="📜 RAW Structured Output (exact as written)",
         lines=18,
         show_copy_button=True,
     )
+    pdf_file = gr.File(label="PDF File")
+    word_file = gr.File(label="Word File")
+    audio_file = gr.File(label="Audio File")
+    with gr.Accordion("⚙️ Advanced Options", open=False):
+        max_new_tokens = gr.Slider(1, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=1, label="Max new tokens")
+        temperature = gr.Slider(0.1, 2.0, value=0.1, step=0.05, label="Temperature")
+        top_p = gr.Slider(0.05, 1.0, value=1.0, step=0.05, label="Top-p (nucleus)")
+        top_k = gr.Slider(0, 1000, value=0, step=1, label="Top-k")
+        repetition_penalty = gr.Slider(0.8, 2.0, value=1.0, step=0.05, label="Repetition penalty")
+    # Extract text
     extract_btn.click(
+        fn=ocr_image_gpu,
         inputs=[image_input, model_choice, query_input,
                 max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output]
     )
+    # Export buttons
+    pdf_btn = gr.Button("⬇️ Download as PDF")
+    pdf_btn.click(fn=save_as_pdf, inputs=[raw_output], outputs=[pdf_file])
+    word_btn = gr.Button("⬇️ Download as Word")
+    word_btn.click(fn=save_as_word, inputs=[raw_output], outputs=[word_file])
+    audio_btn = gr.Button("🔊 Download as Audio")
+    audio_btn.click(fn=save_as_audio, inputs=[raw_output], outputs=[audio_file])
+    # Clear button
     clear_btn = gr.Button("🧹 Clear")
     clear_btn.click(
         fn=lambda: ("", None, "", MAX_NEW_TOKENS_DEFAULT, 0.1, 1.0, 0, 1.0),