wjm55 committed
Commit 052c825
Parent: d5d8604

Add YOLOv11 model integration and Gradio interface for text detection

Files changed (2):
  1. app.py +158 -95
  2. requirements.txt +2 -0
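
At its core, the commit swaps the hard-coded model.pt for YOLOv11 checkpoints pulled from the Hugging Face Hub. As a quick orientation before the full diff, here is a condensed sketch of that integration; the repo id, checkpoint filename, and threshold values are taken from the diff below, while page.jpg is a hypothetical test image:

# Condensed sketch of the new loading and inference path (not part of the commit)
from huggingface_hub import hf_hub_download
from ultralytics import YOLO

path = hf_hub_download(
    repo_id="biglam/medieval-manuscript-yolov11",
    filename="medieval-yolov11n.pt",  # one of the five MODEL_OPTIONS entries
)
model = YOLO(path)
# classes=0 restricts detection to the text-line class, as in detect_and_recognize
results = model.predict("page.jpg", conf=0.25, iou=0.45, classes=0)[0]
print(f"{len(results.boxes)} text lines detected")
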
app.py CHANGED
@@ -1,4 +1,3 @@
-import streamlit as st
 import warnings
 warnings.simplefilter("ignore", UserWarning)
 
@@ -18,13 +17,39 @@ import cv2
 import numpy as np
 import pandas as pd
 import logging
-from typing import List, Optional
+from typing import List, Optional, Tuple, Dict
+from huggingface_hub import hf_hub_download
+import gradio as gr
+import supervision as sv
+import os
+import spaces
+import torch
+
+# Define models
+MODEL_OPTIONS = {
+    "YOLOv11-Nano": "medieval-yolov11n.pt",
+    "YOLOv11-Small": "medieval-yolov11s.pt",
+    "YOLOv11-Medium": "medieval-yolov11m.pt",
+    "YOLOv11-Large": "medieval-yolov11l.pt",
+    "YOLOv11-XLarge": "medieval-yolov11x.pt"
+}
+
+# Dictionary to store loaded models
+models: Dict[str, YOLO] = {}
+
+# Load all models
+for name, model_file in MODEL_OPTIONS.items():
+    model_path = hf_hub_download(
+        repo_id="biglam/medieval-manuscript-yolov11",
+        filename=model_file
+    )
+    models[name] = YOLO(model_path)
 
 # Configure logging
 logging.getLogger("lightning.pytorch").setLevel(logging.ERROR)
 
 # Load YOLOv8 model
-model = YOLO('model.pt')
+model = YOLO(model_path)
 images = Path(mkdtemp())
 DEFAULT_HEIGHT = 128
 TEXT_DIRECTION = "LTR"
@@ -36,6 +61,13 @@ CONFIDENCE_PATTERN = r"(?P<confidence>[0-9.]+)" # For line
 TEXT_PATTERN = r"\s*(?P<text>.*)\s*"
 LINE_PREDICTION = re.compile(rf"{IMAGE_ID_PATTERN} {CONFIDENCE_PATTERN} {TEXT_PATTERN}")
 
+# Create annotators
+LABEL_ANNOTATOR = sv.LabelAnnotator(text_color=sv.Color.BLACK)
+BOX_ANNOTATOR = sv.BoxAnnotator()
+
+# Select device
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
 def get_width(image, height=DEFAULT_HEIGHT):
     aspect_ratio = image.width / image.height
     return height * aspect_ratio
@@ -65,7 +97,8 @@ def simplify_polygons(polygons: List[np.ndarray], approx_level: float = 0.01) ->
         result.append(approx.squeeze())
     return result
 
-def predict(model_name, input_img):
+def predict_text(input_img):
+    """PyLaia text recognition function"""
     model_dir = 'catmus-medieval'
     temperature = 2.0
     batch_size = 1
@@ -121,96 +154,126 @@ def predict(model_name, input_img):
     predictions = Path(pred_stdout.name).read_text().strip().splitlines()
 
     _, score, text = LINE_PREDICTION.match(predictions[0]).groups()
-    if TEXT_DIRECTION == "RTL":
-        return input_img, {"text": get_display(text), "score": score}
-    else:
-        return input_img, {"text": text, "score": score}
-
-def process_image(image):
-    # Perform inference on an image, select textline only
-    results = model(image, classes=0)
-
-    img_cv2 = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-    masks = results[0].masks
-    polygons = []
-    texts = []
-
-    if masks is not None:
-        # Get masks data and original image dimensions
-        masks = masks.data.cpu().numpy()
-        img_height, img_width = img_cv2.shape[:2]
-
-        # Get bounding boxes in xyxy format
-        boxes = results[0].boxes.xyxy.cpu().numpy()
-
-        # Sort by y-coordinate of the top-left corner
-        sorted_indices = np.argsort(boxes[:, 1])
-        masks = masks[sorted_indices]
-        boxes = boxes[sorted_indices]
-
-        for i, (mask, box) in enumerate(zip(masks, boxes)):
-            # Scale the mask to original image size
-            mask = cv2.resize(mask.squeeze(), (img_width, img_height), interpolation=cv2.INTER_LINEAR)
-            mask = (mask > 0.5).astype(np.uint8) * 255  # Apply threshold
-
-            # Convert mask to polygon
-            contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-
-            if contours:
-                # Get the largest contour
-                largest_contour = max(contours, key=cv2.contourArea)
-                simplified_polygon = simplify_polygons([largest_contour])[0]
-
-                if simplified_polygon is not None:
-                    # Crop the image using the bounding box for text recognition
-                    x1, y1, x2, y2 = map(int, box)
-                    crop_img = img_cv2[y1:y2, x1:x2]
-                    crop_pil = Image.fromarray(cv2.cvtColor(crop_img, cv2.COLOR_BGR2RGB))
-
-                    # Recognize text using PyLaia model
-                    predicted = predict('pylaia-samaritan_v1', crop_pil)
-                    texts.append(predicted[1]["text"])
-
-                    # Convert polygon to list of points for display
-                    poly_points = simplified_polygon.reshape(-1, 2).astype(int).tolist()
-                    polygons.append(f"Line {i+1}: {poly_points}")
-
-                    # Draw polygon on the image
-                    cv2.polylines(img_cv2, [simplified_polygon.reshape(-1, 1, 2).astype(int)],
-                                  True, (0, 255, 0), 2)
-
-    # Convert image back to RGB for display in Streamlit
-    img_result = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2RGB)
-
-    # Combine polygons and texts into a DataFrame for table display
-    table_data = pd.DataFrame({"Polygons": polygons, "Recognized Text": texts})
-    return Image.fromarray(img_result), table_data
-
-def segment_and_recognize(image):
-    segmented_image, table_data = process_image(image)
-    return segmented_image, table_data
-
-# Streamlit app layout
-st.set_page_config(layout="wide")  # Use full page width
-st.title("YOLOv11 Text Line Segmentation & PyLaia Text Recognition on CATMuS/medieval")
-
-# File uploader
-uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
-
-# Process the image if uploaded
-if uploaded_image is not None:
-    image = Image.open(uploaded_image)
-
-    if st.button("Segment and Recognize"):
-        # Perform segmentation and recognition
-        segmented_image, table_data = segment_and_recognize(image)
-
-        # Layout: Image on the left, Table on the right
-        col1, col2 = st.columns([2, 3])  # Adjust the ratio if needed
-
-        with col1:
-            st.image(segmented_image, caption="Segmented Image with Polygon Masks", use_container_width=True)
-
-        with col2:
-            st.table(table_data)
+    return text, float(score)
+
+@spaces.GPU
+def detect_and_recognize(image, model_name, conf_threshold, iou_threshold):
+    if image is None:
+        return None, ""
+
+    # Get model path
+    model_path = hf_hub_download(
+        repo_id="biglam/medieval-manuscript-yolov11",
+        filename=MODEL_OPTIONS[model_name]
+    )
+
+    # Load model
+    model = YOLO(model_path)
+
+    # Perform inference
+    results = model.predict(
+        image,
+        conf=conf_threshold,
+        iou=iou_threshold,
+        classes=0,
+        device=device
+    )[0]
+
+    # Convert results to supervision Detections
+    boxes = results.boxes.xyxy.cpu().numpy()
+    confidence = results.boxes.conf.cpu().numpy()
+    class_ids = results.boxes.cls.cpu().numpy().astype(int)
+
+    # Sort boxes by y-coordinate
+    sorted_indices = np.argsort(boxes[:, 1])
+    boxes = boxes[sorted_indices]
+    confidence = confidence[sorted_indices]
+
+    # Create Detections object
+    detections = sv.Detections(
+        xyxy=boxes,
+        confidence=confidence,
+        class_id=class_ids
+    )
+
+    # Create labels
+    labels = [
+        f"Line {i+1} ({conf:.2f})"
+        for i, conf in enumerate(confidence)
+    ]
+
+    # Annotate image
+    annotated_image = image.copy()
+    annotated_image = BOX_ANNOTATOR.annotate(scene=annotated_image, detections=detections)
+    annotated_image = LABEL_ANNOTATOR.annotate(scene=annotated_image, detections=detections, labels=labels)
+
+    # Create text summary
+    text_summary = "\n".join([f"Line {i+1}: Confidence {conf:.2f}" for i, conf in enumerate(confidence)])
+
+    return annotated_image, text_summary
+
+def gradio_reset():
+    return None, None, ""
+
+if __name__ == "__main__":
+    print(f"Using device: {device}")
+
+    with gr.Blocks() as demo:
+        gr.Markdown("# Medieval Manuscript Text Detection")
+
+        with gr.Row():
+            with gr.Column():
+                input_image = gr.Image(
+                    label="Input Image",
+                    type="numpy"
+                )
+                with gr.Accordion("Detection Settings", open=True):
+                    model_selector = gr.Dropdown(
+                        choices=list(MODEL_OPTIONS.keys()),
+                        value=list(MODEL_OPTIONS.keys())[0],
+                        label="Model",
+                        info="Select YOLO model variant"
+                    )
+                    with gr.Row():
+                        conf_threshold = gr.Slider(
+                            label="Confidence Threshold",
+                            minimum=0.0,
+                            maximum=1.0,
+                            step=0.05,
+                            value=0.25,
+                        )
+                        iou_threshold = gr.Slider(
+                            label="IoU Threshold",
+                            minimum=0.0,
+                            maximum=1.0,
+                            step=0.05,
+                            value=0.45,
+                        )
+                with gr.Row():
+                    clear_btn = gr.Button("Clear")
+                    detect_btn = gr.Button("Detect", variant="primary")
+
+            with gr.Column():
+                output_image = gr.Image(
+                    label="Detection Result",
+                    type="numpy"
+                )
+                text_output = gr.Textbox(
+                    label="Detection Summary",
+                    lines=10
+                )
+
+        # Connect buttons to functions
+        detect_btn.click(
+            detect_and_recognize,
+            inputs=[input_image, model_selector, conf_threshold, iou_threshold],
+            outputs=[output_image, text_output]
+        )
+        clear_btn.click(
+            gradio_reset,
+            inputs=None,
+            outputs=[input_image, output_image, text_output]
+        )
+
+    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
 
 
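With the Streamlit front end removed, the new pipeline can also be smoke-tested without launching the Gradio UI. The sketch below is not part of the commit: it assumes app.py imports cleanly (the model downloads above run at import time) and uses a hypothetical local scan page.jpg; the argument values mirror the slider defaults. Note that detect_and_recognize re-downloads the selected checkpoint with hf_hub_download on every call instead of reusing the models dict populated at startup; this works because the Hub cache makes repeat downloads cheap, but it leaves the preloaded dictionary unused.

# Headless smoke test for detect_and_recognize (a sketch, not part of the commit)
import numpy as np
from PIL import Image

from app import detect_and_recognize

# gr.Image(type="numpy") hands the function an RGB array, so mimic that here
img = np.array(Image.open("page.jpg").convert("RGB"))

annotated, summary = detect_and_recognize(
    img,
    "YOLOv11-Nano",  # any key of MODEL_OPTIONS
    0.25,            # conf_threshold, the slider default
    0.45,            # iou_threshold, the slider default
)
print(summary)
Image.fromarray(annotated).save("annotated.png")
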
requirements.txt CHANGED
@@ -20,3 +20,5 @@ python-bidi==0.6.0
 streamlit==1.44.0
 transformers==4.50.3
 ultralytics==8.3.99
+gradio
+supervision
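
Unlike the pinned entries above, gradio and supervision are added without version constraints, so fresh builds of the Space will pick up whatever releases are current. If reproducibility matters, they could be pinned in the file's existing style; the versions below are illustrative, not values from this commit:

gradio==4.44.0
supervision==0.25.0

Since app.py no longer imports streamlit after this commit, the streamlit==1.44.0 pin may also be droppable in a follow-up.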