Spaces:

prithivMLmods
/

Multimodal-OCR2

Running on Zero

App Files Files Community

prithivMLmods committed on Jun 20

Commit

5d40470

verified ·

1 Parent(s): ddc0e4b

initial commit (#1)

Browse files

- initial commit (f32c616c2efd2941b1ed6d239721513531329842)

Files changed (12) hide show

.gitattributes +5 -0
app.py +383 -0
images/1.png +3 -0
images/2.jpg +3 -0
images/3.png +3 -0
images/4.png +0 -0
images/5.jpg +0 -0
images/6.jpg +0 -0
images/7.jpg +0 -0
requirements.txt +15 -0
videos/1.mp4 +3 -0
videos/2.mp4 +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/1.png filter=lfs diff=lfs merge=lfs -text
+images/2.jpg filter=lfs diff=lfs merge=lfs -text
+images/3.png filter=lfs diff=lfs merge=lfs -text
+videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
+videos/2.mp4 filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,383 @@

+import os
+import random
+import uuid
+import json
+import time
+import asyncio
+from threading import Thread
+import gradio as gr
+import spaces
+import torch
+import numpy as np
+from PIL import Image, ImageOps
+import cv2
+from transformers import (
+    Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+    TextIteratorStreamer,
+)
+from transformers.image_utils import load_image
+from docling_core.types.doc import DoclingDocument, DocTagsDocument
+import re
+import ast
+import html
+# Constants for text generation
+# Upper bound and default value used by the "Max new tokens" slider in the UI below.
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+# Read from the MAX_INPUT_TOKEN_LENGTH environment variable (default 4096).
+# NOTE(review): not referenced anywhere else in the visible code.
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+# Use the first CUDA device when one is available, otherwise fall back to the CPU.
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Every model below is loaded with repository-provided code enabled and with
+# half-precision weights, then moved to `device` and put in eval mode.
+_MODEL_LOAD_KWARGS = {"trust_remote_code": True, "torch_dtype": torch.float16}
+# Nanonets-OCR-s
+MODEL_ID_M = "nanonets/Nanonets-OCR-s"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID_M, **_MODEL_LOAD_KWARGS)
+model_m = model_m.to(device).eval()
+# MonkeyOCR (processor and weights both live in the "Recognition" subfolder)
+MODEL_ID_G = "echo840/MonkeyOCR"
+SUBFOLDER = "Recognition"
+processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER)
+model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_G, subfolder=SUBFOLDER, **_MODEL_LOAD_KWARGS
+)
+model_g = model_g.to(device).eval()
+# typhoon-ocr-7b
+MODEL_ID_L = "scb10x/typhoon-ocr-7b"
+processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
+model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID_L, **_MODEL_LOAD_KWARGS)
+model_l = model_l.to(device).eval()
+#--------------------------------------------------#
+# SmolDocling-256M-preview (loaded through AutoModelForVision2Seq, unlike the models above)
+MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+model_x = AutoModelForVision2Seq.from_pretrained(MODEL_ID_X, **_MODEL_LOAD_KWARGS)
+model_x = model_x.to(device).eval()
+#--------------------------------------------------#
+#--------------------------------------------------#
+# Preprocessing functions for SmolDocling-256M
+def add_random_padding(image, min_percent=0.1, max_percent=0.10):
+    """Return an RGB copy of ``image`` surrounded by a border of random size.
+
+    The horizontal and vertical border sizes are drawn independently as a
+    fraction of the image width/height, uniformly between ``min_percent`` and
+    ``max_percent``. The border is filled with the colour of the top-left pixel.
+    """
+    rgb = image.convert("RGB")
+    width, height = rgb.size
+    # Draw the width fraction first, then the height fraction.
+    frac_w = random.uniform(min_percent, max_percent)
+    frac_h = random.uniform(min_percent, max_percent)
+    # Same padding on opposite sides: (left, top, right, bottom).
+    border = (int(width * frac_w), int(height * frac_h)) * 2
+    return ImageOps.expand(rgb, border=border, fill=rgb.getpixel((0, 0)))
+def normalize_values(text, target_max=500):
+    """Rewrite bracketed number lists in ``text`` as ``<loc_N>`` tags.
+
+    Every ``[a, b, ...]`` group made of digits, dots, whitespace and commas is
+    parsed as a Python list, scaled so that its largest value maps to
+    ``target_max``, rounded, and replaced by the concatenation of
+    ``<loc_{value}>`` tags.
+
+    Groups that match the pattern but are not a valid list of numbers
+    (for example ``[1,,2]`` or ``[...]``) are left untouched instead of
+    raising, because ``text`` comes straight from the user's query.
+
+    Args:
+        text: Input string, possibly containing bracketed number lists.
+        target_max: Value the largest number of each list is scaled to.
+
+    Returns:
+        The text with every valid bracketed list replaced by ``<loc_N>`` tags.
+    """
+    def normalize_list(values):
+        # ``or 1`` covers both an empty list and an all-zero list, which would
+        # otherwise divide by zero.
+        max_value = max(values, default=0) or 1
+        return [round((v / max_value) * target_max) for v in values]
+    def process_match(match):
+        raw = match.group(0)
+        try:
+            num_list = ast.literal_eval(raw)
+        except (ValueError, SyntaxError):
+            # Looks like a list to the regex but is not valid Python; keep as is.
+            return raw
+        if not all(isinstance(v, (int, float)) for v in num_list):
+            # e.g. "[...]" parses to [Ellipsis], which cannot be scaled.
+            return raw
+        normalized = normalize_list(num_list)
+        return "".join(f"<loc_{num}>" for num in normalized)
+    pattern = r"\[([\d\.\s,]+)\]"
+    return re.sub(pattern, process_match, text)
+def downsample_video(video_path, num_frames=10):
+    """Sample evenly spaced frames from a video.
+
+    Args:
+        video_path: Path of the video file to open with OpenCV.
+        num_frames: Number of frame positions to sample (default 10).
+
+    Returns:
+        A list of ``(PIL.Image, timestamp_seconds)`` tuples, one per frame that
+        could be decoded. The list is empty when nothing could be read.
+    """
+    vidcap = cv2.VideoCapture(video_path)
+    frames = []
+    try:
+        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = vidcap.get(cv2.CAP_PROP_FPS)
+        # Clamp so a reported frame count of 0 never produces a negative seek index.
+        last_index = max(total_frames - 1, 0)
+        frame_indices = np.linspace(0, last_index, num_frames, dtype=int)
+        for i in frame_indices:
+            vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+            success, image = vidcap.read()
+            if not success:
+                continue
+            # OpenCV decodes to BGR; PIL expects RGB.
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            # Some containers report fps == 0; avoid inf/NaN timestamps in that case.
+            timestamp = round(i / fps, 2) if fps > 0 else 0.0
+            frames.append((pil_image, timestamp))
+    finally:
+        # Release the capture even if decoding or conversion raised.
+        vidcap.release()
+    return frames
+@spaces.GPU
+def generate_image(model_name: str, text: str, image: Image.Image,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """Stream a model response for a single image and a text query.
+
+    Args:
+        model_name: One of the model labels offered in the UI.
+        text: The user's query.
+        image: The uploaded image, or ``None`` when nothing was uploaded.
+        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
+            Forwarded unchanged to ``model.generate``.
+
+    Yields:
+        The accumulated output text after each streamed chunk. For
+        SmolDocling one extra final item is yielded: a markdown rendering when
+        the output contains doctags, otherwise the cleaned raw output.
+    """
+    # Label shown in the UI -> (processor, model) pair loaded at import time.
+    registry = {
+        "Nanonets-OCR-s": (processor_m, model_m),
+        "MonkeyOCR-Recognition": (processor_g, model_g),
+        "SmolDocling-256M-preview": (processor_x, model_x),
+        "Typhoon-OCR-7B": (processor_l, model_l),
+    }
+    if model_name not in registry:
+        yield "Invalid model selected."
+        return
+    if image is None:
+        yield "Please upload an image."
+        return
+    processor, model = registry[model_name]
+    is_smoldocling = model_name == "SmolDocling-256M-preview"
+    # Image inference always works on a one-element list.
+    images = [image]
+    if is_smoldocling:
+        # SmolDocling-specific preprocessing, keyed on phrases in the query.
+        if any(key in text for key in ("OTSL", "code")):
+            images = [add_random_padding(img) for img in images]
+        if any(key in text for key in ("OCR at text at", "Identify element", "formula")):
+            text = normalize_values(text, target_max=500)
+    # One user turn: an image placeholder per image, followed by the query text.
+    content = [{"type": "image"} for _ in images]
+    content.append({"type": "text", "text": text})
+    messages = [{"role": "user", "content": content}]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    # Run generation in a background thread and consume it through the streamer.
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    sampling = {
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
+    generation_kwargs = {**inputs, "streamer": streamer, **sampling}
+    Thread(target=model.generate, kwargs=generation_kwargs).start()
+    pieces = []  # raw chunks, kept for the SmolDocling post-processing
+    shown = ""   # what the user sees, with the end-of-turn marker removed
+    for chunk in streamer:
+        pieces.append(chunk)
+        shown += chunk.replace("<|im_end|>", "")
+        yield shown
+    if not is_smoldocling:
+        return
+    # SmolDocling-256M specific postprocessing
+    cleaned_output = "".join(pieces).replace("<end_of_utterance>", "").strip()
+    doc_tags = ("<doctag>", "<otsl>", "<code>", "<chart>", "<formula>")
+    if not any(tag in cleaned_output for tag in doc_tags):
+        yield cleaned_output
+        return
+    if "<chart>" in cleaned_output:
+        # Charts are re-labelled as OTSL tables before conversion.
+        cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+        cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+    doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+    yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
+@spaces.GPU
+def generate_video(model_name: str, text: str, video_path: str,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """Stream a model response for a video and a text query.
+
+    The video is reduced to a handful of frames by ``downsample_video`` and
+    those frames are passed to the model as a list of images.
+
+    Args:
+        model_name: One of the model labels offered in the UI.
+        text: The user's query.
+        video_path: Path of the uploaded video, or ``None`` when nothing was uploaded.
+        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
+            Forwarded unchanged to ``model.generate``.
+
+    Yields:
+        The accumulated output text after each streamed chunk. For
+        SmolDocling one extra final item is yielded: a markdown rendering when
+        the output contains doctags, otherwise the cleaned raw output. A single
+        error message is yielded for an unknown model, a missing video, or a
+        video from which no frame could be decoded.
+    """
+    # Model selection
+    if model_name == "Nanonets-OCR-s":
+        processor = processor_m
+        model = model_m
+    elif model_name == "MonkeyOCR-Recognition":
+        processor = processor_g
+        model = model_g
+    elif model_name == "SmolDocling-256M-preview":
+        processor = processor_x
+        model = model_x
+    elif model_name == "Typhoon-OCR-7B":
+        processor = processor_l
+        model = model_l
+    else:
+        yield "Invalid model selected."
+        return
+    if video_path is None:
+        yield "Please upload a video."
+        return
+    # Extract frames from video (timestamps are not used here)
+    frames = downsample_video(video_path)
+    images = [frame for frame, _ in frames]
+    if not images:
+        # Unreadable or empty video: do not call the processor with zero images.
+        yield "Could not read any frames from the video."
+        return
+    is_smoldocling = model_name == "SmolDocling-256M-preview"
+    # SmolDocling-256M specific preprocessing
+    if is_smoldocling:
+        if "OTSL" in text or "code" in text:
+            images = [add_random_padding(img) for img in images]
+        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+            text = normalize_values(text, target_max=500)
+    # Unified message structure for all models: one image placeholder per
+    # sampled frame, followed by the query text.
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"} for _ in images] + [
+                {"type": "text", "text": text}
+            ]
+        }
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    # Generation runs in a background thread and is consumed through the streamer
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    # Stream output and collect full response
+    buffer = ""
+    full_output = ""
+    for new_text in streamer:
+        full_output += new_text
+        buffer += new_text.replace("<|im_end|>", "")
+        yield buffer
+    # The streamer is exhausted, so generation is finished; reap the worker thread.
+    thread.join()
+    if not is_smoldocling:
+        return
+    # SmolDocling-256M specific postprocessing
+    cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+    if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+        if "<chart>" in cleaned_output:
+            # Charts are re-labelled as OTSL tables before conversion.
+            cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+            cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+        # NOTE(review): a single doctags string is paired with every sampled
+        # frame here - confirm this is what from_doctags_and_image_pairs expects.
+        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+        doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+        markdown_output = doc.export_to_markdown()
+        yield f"**MD Output:**\n\n{markdown_output}"
+    else:
+        yield cleaned_output
+# Define examples for image and video inference
+# Each row is [query text, media path], in the same order as the
+# (query textbox, upload component) inputs the examples are bound to.
+image_examples = [
+    ["OCR the image", "images/2.jpg"],
+    ["Convert this page to docling", "images/1.png"],
+    ["Convert this page to docling", "images/3.png"],
+    ["Convert chart to OTSL.", "images/4.png"],
+    ["Convert code to text", "images/5.jpg"],
+    ["Convert this table to OTSL.", "images/6.jpg"],
+    # Was "Convert formula to late." (typo in the prompt).
+    ["Convert formula to LaTeX.", "images/7.jpg"],
+]
+video_examples = [
+    ["Explain the video in detail.", "videos/1.mp4"],
+    ["Explain the video in detail.", "videos/2.mp4"],
+]
+# Custom CSS passed to gr.Blocks: colours the buttons tagged with the
+# "submit-btn" class blue, with a lighter blue on hover.
+css = """
+.submit-btn {
+    background-color: #2980b9 !important;
+    color: white !important;
+}
+.submit-btn:hover {
+    background-color: #3498db !important;
+}
+"""
+# Create the Gradio Interface
+# Layout: a left column with the Image/Video input tabs and the shared
+# generation sliders, and a right column with the output box, the model
+# selector and short model descriptions.
+with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+    gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+    with gr.Row():
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("Image Inference"):
+                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    # type="pil" so the handler receives a PIL image object.
+                    image_upload = gr.Image(type="pil", label="Image")
+                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    gr.Examples(
+                        examples=image_examples,
+                        inputs=[image_query, image_upload]
+                    )
+                with gr.TabItem("Video Inference"):
+                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    video_upload = gr.Video(label="Video")
+                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    gr.Examples(
+                        examples=video_examples,
+                        inputs=[video_query, video_upload]
+                    )
+            # Generation settings shared by both tabs; passed to the handlers below.
+            with gr.Accordion("Advanced options", open=False):
+                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+        with gr.Column():
+            # Single output box used by both the image and the video handler.
+            output = gr.Textbox(label="Output", interactive=False, lines=3, scale=2)
+            # The choice strings must match the model names checked in
+            # generate_image / generate_video.
+            model_choice = gr.Radio(
+                choices=["SmolDocling-256M-preview", "Nanonets-OCR-s", "MonkeyOCR-Recognition", "Typhoon-OCR-7B"],
+                label="Select Model",
+                value="Nanonets-OCR-s"
+            )
+            gr.Markdown("**Model Info 💻**")
+            gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
+            gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
+            gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
+            gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
+    # Wire each Submit button to its handler. The input order must match the
+    # handler's parameter order: model, query, media, then the five sliders.
+    image_submit.click(
+        fn=generate_image,
+        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=output
+    )
+    video_submit.click(
+        fn=generate_video,
+        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=output
+    )
+# Start the app only when run as a script: queue configured with max_size=40,
+# then launched with a share link, the MCP server enabled, SSR disabled and
+# errors shown in the UI.
+if __name__ == "__main__":
+    demo.queue(max_size=40).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)

images/1.png ADDED Viewed

Git LFS Details

SHA256: 077c718039fba1cfd412d6716129febb1b8bdc54f77f3a2dccc6ed4176846252
Pointer size: 131 Bytes
Size of remote file: 270 kB

images/2.jpg ADDED Viewed

Git LFS Details

SHA256: 02e9aa9ccdfe57430119b7ae7dc7a2d9967df58450c059bac795ac32aecf5900
Pointer size: 131 Bytes
Size of remote file: 332 kB

images/3.png ADDED Viewed

Git LFS Details

SHA256: bee89438e58beb702aa6940c002d3ff7e5dfd2bae8e697164e718f2170014d6f
Pointer size: 131 Bytes
Size of remote file: 431 kB

images/4.png ADDED Viewed

images/5.jpg ADDED Viewed

images/6.jpg ADDED Viewed

images/7.jpg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+gradio
+transformers
+transformers-stream-generator
+qwen-vl-utils
+torchvision
+docling-core
+torch
+requests
+huggingface_hub
+albumentations
+spaces
+accelerate
+pillow
+opencv-python
+av

videos/1.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9127aaafccef6f02fce6812bc9c89e1e4026832cf133492481952cc4b94cb595
+size 791367

videos/2.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bdf85ced4e76f2afd1a66b2c41e93868ccd9f928a02105de5e7db3c8651c692e
+size 1040341