Spaces:

Emeritus-21
/

handwritten-text-recognition

Running on Zero

App Files

xet

Community

Emeritus-21 committed on Aug 22

Commit

b49181f

verified ·

1 Parent(s): 37e07da

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -26

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py — HTR Space (GPU-ready, single-model load, mobile-ready)
 import os
 from threading import Thread
@@ -9,10 +9,9 @@ from transformers import AutoProcessor, AutoModelForImageTextToText, Qwen2_5_VLF
 from reportlab.platypus import SimpleDocTemplate, Paragraph
 from reportlab.lib.styles import getSampleStyleSheet
 from docx import Document
-from spaces import GPU  # <-- required for Spaces GPU
 MAX_NEW_TOKENS_DEFAULT = 512
-DEVICE = "cuda"
 # ---------------------------
 # Models config
@@ -88,14 +87,16 @@ def _decode_text(model, processor, tokenizer, output_ids):
     return str(output_ids)
 # ---------------------------
-# OCR function for GPU
 # ---------------------------
 @GPU
 def ocr_image_gpu(image: Image.Image, model_choice: str, query: str = None,
                   max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT, temperature: float = 0.1,
                   top_p: float = 1.0, top_k: int = 0, repetition_penalty: float = 1.0):
     if image is None:
-        return "Please upload or capture an image."
     if model_choice not in MODEL_PATHS:
         return f"Invalid model: {model_choice}"
@@ -179,9 +180,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         placeholder="Leave empty for RAW structured output",
     )
-    image_input = gr.Image(type="pil", label="Upload Image (desktop/mobile)")
-    webcam_input = gr.Image(type="pil", label="Take Photo (mobile/desktop)")
     with gr.Accordion("⚙️ Advanced Options", open=False):
         max_new_tokens = gr.Slider(1, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=1, label="Max new tokens")
         temperature = gr.Slider(0.1, 2.0, value=0.1, step=0.05, label="Temperature")
@@ -189,41 +190,40 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         top_k = gr.Slider(0, 1000, value=0, step=1, label="Top-k")
         repetition_penalty = gr.Slider(0.8, 2.0, value=1.0, step=0.05, label="Repetition penalty")
     raw_output = gr.Textbox(
         label="📜 RAW Structured Output (exact as written)",
         lines=18,
         show_copy_button=True,
     )
-    pdf_file = gr.File(label="PDF File")
-    word_file = gr.File(label="Word File")
-    audio_file = gr.File(label="Audio File")
-    def choose_image(uploaded, webcam):
-        return webcam if webcam is not None else uploaded
-    extract_btn = gr.Button("📤 Extract RAW Text", variant="primary")
     extract_btn.click(
-        fn=lambda uploaded, webcam, model, query, max_tokens, temp, top_p, top_k, rep:
-            ocr_image_gpu(choose_image(uploaded, webcam), model, query, max_tokens, temp, top_p, top_k, rep),
-        inputs=[image_input, webcam_input, model_choice, query_input,
                 max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output]
     )
-    pdf_btn = gr.Button("⬇️ Download as PDF")
-    pdf_btn.click(fn=save_as_pdf, inputs=[raw_output], outputs=[pdf_file])
-    word_btn = gr.Button("⬇️ Download as Word")
-    word_btn.click(fn=save_as_word, inputs=[raw_output], outputs=[word_file])
-    audio_btn = gr.Button("🔊 Download as Audio")
-    audio_btn.click(fn=save_as_audio, inputs=[raw_output], outputs=[audio_file])
     clear_btn = gr.Button("🧹 Clear")
     clear_btn.click(
-        fn=lambda: ("", None, None, "", MAX_NEW_TOKENS_DEFAULT, 0.1, 1.0, 0, 1.0),
-        outputs=[raw_output, image_input, webcam_input, query_input,
                  max_new_tokens, temperature, top_p, top_k, repetition_penalty],
     )

+# app.py — HTR Space (GPU-only, no webcam, mobile-ready)
 import os
 from threading import Thread
 from reportlab.platypus import SimpleDocTemplate, Paragraph
 from reportlab.lib.styles import getSampleStyleSheet
 from docx import Document
 MAX_NEW_TOKENS_DEFAULT = 512
+DEVICE = "cuda"  # GPU-only
 # ---------------------------
 # Models config
     return str(output_ids)
 # ---------------------------
+# GPU OCR function
 # ---------------------------
+from spaces import GPU
 @GPU
 def ocr_image_gpu(image: Image.Image, model_choice: str, query: str = None,
                   max_new_tokens: int = MAX_NEW_TOKENS_DEFAULT, temperature: float = 0.1,
                   top_p: float = 1.0, top_k: int = 0, repetition_penalty: float = 1.0):
     if image is None:
+        return "Please upload an image."
     if model_choice not in MODEL_PATHS:
         return f"Invalid model: {model_choice}"
         placeholder="Leave empty for RAW structured output",
     )
+    image_input = gr.Image(type="pil", label="Upload Image")
+    # Advanced Options
     with gr.Accordion("⚙️ Advanced Options", open=False):
         max_new_tokens = gr.Slider(1, 2048, value=MAX_NEW_TOKENS_DEFAULT, step=1, label="Max new tokens")
         temperature = gr.Slider(0.1, 2.0, value=0.1, step=0.05, label="Temperature")
         top_k = gr.Slider(0, 1000, value=0, step=1, label="Top-k")
         repetition_penalty = gr.Slider(0.8, 2.0, value=1.0, step=0.05, label="Repetition penalty")
+    # ✅ Extract Button ABOVE output
+    extract_btn = gr.Button("📤 Extract RAW Text", variant="primary")
     raw_output = gr.Textbox(
         label="📜 RAW Structured Output (exact as written)",
         lines=18,
         show_copy_button=True,
     )
+    pdf_btn = gr.Button("⬇️ Download as PDF")
+    word_btn = gr.Button("⬇️ Download as Word")
+    audio_btn = gr.Button("🔊 Download as Audio")
+    # ---------------------------
+    # Button Callbacks
+    # ---------------------------
+    def on_extract(uploaded, model, query, max_tokens, temp, top_p, top_k, rep):
+        return ocr_image_gpu(uploaded, model, query, max_tokens, temp, top_p, top_k, rep)
     extract_btn.click(
+        fn=on_extract,
+        inputs=[image_input, model_choice, query_input,
                 max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[raw_output]
     )
+    pdf_btn.click(fn=save_as_pdf, inputs=[raw_output], outputs=[pdf_btn])
+    word_btn.click(fn=save_as_word, inputs=[raw_output], outputs=[word_btn])
+    audio_btn.click(fn=save_as_audio, inputs=[raw_output], outputs=[audio_btn])
     clear_btn = gr.Button("🧹 Clear")
     clear_btn.click(
+        fn=lambda: ("", None, "", MAX_NEW_TOKENS_DEFAULT, 0.1, 1.0, 0, 1.0),
+        outputs=[raw_output, image_input, query_input,
                  max_new_tokens, temperature, top_p, top_k, repetition_penalty],
     )