kraken-yiddish / yolo2xml.py

Create yolo2xml.py

bd0e32e verified 5 months ago

13.7 kB

	from typing import Dict, List
	import os
	import sys
	import glob
	import argparse
	import datetime
	import shutil
	import numpy as np
	import cv2
	from PIL import Image
	from ultralytics import YOLO
	from huggingface_hub import hf_hub_download

	# XML generation imports
	import xml.etree.ElementTree as ET
	from xml.dom import minidom

	# Selectable model names mapped to the segmentation weights file each one
	# uses, listed from Nano to XLarge.
	MODEL_OPTIONS = {
	    f"YOLOv11-{size_label}": f"yolov11{size_code}-seg.pt"
	    for size_label, size_code in (
	        ("Nano", "n"),
	        ("Small", "s"),
	        ("Medium", "m"),
	        ("Large", "l"),
	        ("XLarge", "x"),
	    )
	}

	# Cache of already-loaded YOLO models, keyed by the model's display name
	# (a key of MODEL_OPTIONS). load_model() consults it before downloading,
	# so each set of weights is loaded at most once per process.
	models: Dict[str, YOLO] = {}

	def load_model(model_name: str = "YOLOv11-Nano") -> YOLO:
	    """Return the YOLO model registered under *model_name*, loading it on first use.

	    The weights file named in ``MODEL_OPTIONS`` is fetched with
	    ``hf_hub_download`` from the ``wjbmattingly/kraken-yiddish`` repository,
	    wrapped in a ``YOLO`` object and stored in the module-level ``models``
	    cache; later calls with the same name return the cached object.

	    Raises:
	        KeyError: if *model_name* is not a key of ``MODEL_OPTIONS``.
	    """
	    # Fast path: this model has already been loaded in this process.
	    if model_name in models:
	        return models[model_name]

	    weights_path = hf_hub_download(
	        repo_id="wjbmattingly/kraken-yiddish",
	        filename=MODEL_OPTIONS[model_name],
	    )
	    loaded = YOLO(weights_path)
	    models[model_name] = loaded
	    return loaded

	def process_image(
	    image_path: str,
	    model_name: str = "YOLOv11-Medium",
	    conf_threshold: float = 0.25,
	    iou_threshold: float = 0.45,
	    device: str = "cpu",
	) -> tuple:
	    """Run segmentation on one image and return the detections plus a preview.

	    Args:
	        image_path: Path of the image file to read with OpenCV.
	        model_name: Key of ``MODEL_OPTIONS`` selecting the model to run.
	        conf_threshold: Confidence threshold forwarded to the model as ``conf``.
	        iou_threshold: IoU threshold forwarded to the model as ``iou``.
	        device: Inference device forwarded to the model (defaults to the CPU).

	    Returns:
	        A ``(result, annotated_image, width, height)`` tuple: the first
	        result object returned by the model, the image produced by
	        ``result.plot()`` (BGR channel order, suitable for ``cv2.imwrite``),
	        and the pixel width and height of the input image.

	    Raises:
	        ValueError: if OpenCV cannot read *image_path*.
	    """
	    image = cv2.imread(image_path)
	    if image is None:
	        raise ValueError(f"Cannot read image: {image_path}")

	    height, width = image.shape[:2]
	    model = load_model(model_name)

	    # Ultralytics treats NumPy inputs as OpenCV-style BGR and performs the
	    # BGR->RGB swap itself, so the array from cv2.imread is passed unchanged.
	    # Converting it to RGB first would make the model see swapped channels.
	    results = model(
	        image,
	        conf=conf_threshold,
	        iou=iou_threshold,
	        verbose=False,
	        device=device,
	    )

	    # A single input image yields a single result.
	    result = results[0]

	    # plot() draws on the (BGR) input image and returns a BGR array, so no
	    # further colour conversion is needed before saving with OpenCV.
	    annotated_image = result.plot(
	        conf=True,
	        line_width=None,
	        font_size=None,
	        boxes=True,
	        masks=True,
	        probs=True,
	        labels=True,
	    )

	    return result, annotated_image, width, height

	def _finite_points(mask_points) -> list:
	    """Return the (x, y) pairs of *mask_points* whose coordinates are both non-NaN."""
	    return [
	        (p[0], p[1])
	        for p in mask_points
	        if not (np.isnan(p[0]) or np.isnan(p[1]))
	    ]


	def _baseline_points(polygon) -> str:
	    """Return a flat two-point PAGE ``points`` string approximating a line's baseline.

	    The baseline spans the polygon's full horizontal extent at the mean y of
	    the vertices with the largest y values (roughly the last third of the
	    vertices when sorted by y).
	    """
	    xs = [p[0] for p in polygon]
	    by_y = sorted(polygon, key=lambda p: p[1])
	    # The slice start is always < len(by_y), so at least one vertex remains.
	    lowest = by_y[int(len(by_y) * 0.67):]
	    avg_y = int(sum(p[1] for p in lowest) / len(lowest))
	    return f"{int(min(xs))},{avg_y} {int(max(xs))},{avg_y}"


	def create_page_xml(
	    image_filename: str,
	    result,
	    width: int,
	    height: int
	) -> str:
	    """Build a PAGE XML document (as a pretty-printed string) from YOLO results.

	    Every mask polygon in ``result.masks.xy`` that has at least three
	    vertices with finite coordinates becomes a ``TextLine`` (polygon, flat
	    baseline and empty ``TextEquiv``) inside one ``TextRegion`` whose
	    rectangle is the bounding box of all such polygons, clamped to the page.
	    An additional empty region covering the area to the left of the text is
	    added when the text starts more than 10 px from the left edge.  If there
	    is no usable polygon, a warning is printed and a single page-sized
	    ``TextRegion`` is emitted instead.

	    Args:
	        image_filename: Path of the source image; only its base name is
	            written to the ``Page`` element.
	        result: Detection result; only ``result.masks`` (optional, may be
	            ``None``) and ``result.masks.xy`` are read.
	        width: Image width in pixels.
	        height: Image height in pixels.

	    Returns:
	        The XML document as text.
	    """
	    page_ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
	    root = ET.Element("PcGts", {
	        "xmlns": page_ns,
	        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
	        "xsi:schemaLocation": f"{page_ns} {page_ns}/pagecontent.xsd",
	    })

	    metadata = ET.SubElement(root, "Metadata")
	    ET.SubElement(metadata, "Creator").text = "escriptorium"

	    # NOTE(review): the timestamps are deliberately one year in the future,
	    # mirroring the example file this output was modelled on - confirm that
	    # consumers really need this before relying on these dates.
	    future_date = (datetime.datetime.now() + datetime.timedelta(days=365)).isoformat()
	    ET.SubElement(metadata, "Created").text = future_date
	    ET.SubElement(metadata, "LastChange").text = future_date

	    page = ET.SubElement(root, "Page", {
	        "imageFilename": os.path.basename(image_filename),
	        "imageWidth": str(width),
	        "imageHeight": str(height),
	    })

	    # Keep only polygons that still have enough vertices once NaNs are dropped.
	    masks = getattr(result, "masks", None)
	    polygons = []
	    if masks is not None:
	        for mask_points in masks.xy:
	            points = _finite_points(mask_points)
	            if len(points) >= 3:
	                polygons.append(points)

	    if not polygons:
	        print("Warning: No valid masks detected. Creating a default text region.")
	        default_region = ET.SubElement(page, "TextRegion", {
	            "id": f"eSc_textblock_default_{int(datetime.datetime.now().timestamp())}",
	            "custom": "structure {type:text_zone;}",
	        })
	        default_points = f"0,0 {width},0 {width},{height} 0,{height}"
	        ET.SubElement(default_region, "Coords", {"points": default_points})
	    else:
	        # Bounding box of every line polygon, clamped to the page.
	        xs = [x for polygon in polygons for x, _ in polygon]
	        ys = [y for polygon in polygons for _, y in polygon]
	        min_x = max(0, int(min(xs)))
	        max_x = min(width, int(max(xs)))
	        min_y = max(0, int(min(ys)))
	        max_y = min(height, int(max(ys)))

	        # The region id embeds the current Unix time in whole seconds.
	        timestamp = int(datetime.datetime.now().timestamp())
	        main_text_region = ET.SubElement(page, "TextRegion", {
	            "id": f"eSc_textblock_TextRegion_{timestamp}",
	            "custom": "structure {type:text_zone;}",
	        })
	        region_points = f"{min_x},{min_y} {max_x},{min_y} {max_x},{max_y} {min_x},{max_y}"
	        ET.SubElement(main_text_region, "Coords", {"points": region_points})

	        for index, polygon in enumerate(polygons):
	            # The first line keeps the fixed id used by the original output;
	            # later lines are numbered from 2.
	            line_id = f"eSc_line_r2l{index + 1}" if index > 0 else "eSc_line_line_1610719743362_3154"
	            text_line = ET.SubElement(main_text_region, "TextLine", {
	                "id": line_id,
	                "custom": "structure {type:text_line;}",
	            })

	            # Integer coordinates avoid scientific notation in the output.
	            points_str = " ".join(f"{int(x)},{int(y)}" for x, y in polygon)
	            ET.SubElement(text_line, "Coords", {"points": points_str})
	            ET.SubElement(text_line, "Baseline", {"points": _baseline_points(polygon)})

	            # Empty transcription placeholder.
	            text_equiv = ET.SubElement(text_line, "TextEquiv")
	            ET.SubElement(text_equiv, "Unicode")

	        # Empty companion region to the left of the text.  It is only
	        # emitted when there is room for it: otherwise its x coordinate
	        # would be zero or negative, which is not a valid PAGE coordinate.
	        left_edge = min_x - 10
	        if left_edge > 0:
	            left_region = ET.SubElement(page, "TextRegion", {
	                "id": "eSc_textblock_r1",
	                "custom": "structure {type:text_zone;}",
	            })
	            left_region_points = f"0,0 {left_edge},{min_y} {left_edge},{max_y} 0,{max_y}"
	            ET.SubElement(left_region, "Coords", {"points": left_region_points})

	    # Re-parse with minidom purely to get indented output.
	    return minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")

	def save_results(image_path: str, annotated_image: np.ndarray, xml_content: str):
	    """Copy the source image to ``output/`` and write its PAGE XML to ``annotations/``.

	    Both directories are relative to the current working directory and are
	    created when missing.  The image is copied byte-for-byte under its
	    original file name, so its extension keeps matching its real format and
	    the ``imageFilename`` written by ``create_page_xml`` (the image's base
	    name).  The XML file is named after the image's stem.

	    Args:
	        image_path: Path of the original image file.
	        annotated_image: Accepted for interface compatibility; it is not
	            written anywhere by this function.
	        xml_content: XML text to store, written as UTF-8.
	    """
	    output_dir = "output"
	    annotations_dir = "annotations"
	    os.makedirs(output_dir, exist_ok=True)
	    os.makedirs(annotations_dir, exist_ok=True)

	    base_name = os.path.basename(image_path)
	    stem = os.path.splitext(base_name)[0]

	    # Keep the original name: renaming a PNG/TIFF copy to ".jpg" would leave
	    # a file whose extension contradicts its content and the XML reference.
	    output_image_path = os.path.join(output_dir, base_name)
	    # shutil.copy refuses to copy a file onto itself, so skip the copy when
	    # the image already lives in the output directory.
	    if os.path.abspath(image_path) != os.path.abspath(output_image_path):
	        shutil.copy(image_path, output_image_path)

	    output_xml_path = os.path.join(annotations_dir, f"{stem}.xml")
	    with open(output_xml_path, "w", encoding="utf-8") as f:
	        f.write(xml_content)

	    print("Results saved to:")
	    print(f" Image: {output_image_path}")
	    print(f" XML: {output_xml_path}")

	def _find_images(directory: str) -> list:
	    """Return the sorted paths of supported image files directly inside *directory*.

	    Extensions are compared case-insensitively in a single pass, so each
	    file is listed exactly once whatever the file system's case sensitivity.
	    Hidden files (names starting with a dot) are ignored.
	    """
	    extensions = {".jpg", ".jpeg", ".png", ".tif", ".tiff"}
	    return sorted(
	        os.path.join(directory, name)
	        for name in os.listdir(directory)
	        if not name.startswith(".")
	        and os.path.splitext(name)[1].lower() in extensions
	        and os.path.isfile(os.path.join(directory, name))
	    )


	def _convert_image(image_path: str, model_name: str, conf: float, iou: float) -> None:
	    """Run detection on one image, build its PAGE XML and save both outputs."""
	    result, annotated_image, width, height = process_image(
	        image_path,
	        model_name,
	        conf,
	        iou
	    )
	    xml_content = create_page_xml(image_path, result, width, height)
	    save_results(image_path, annotated_image, xml_content)


	def main():
	    """Command-line entry point: convert one image, or a directory with ``--batch``.

	    In batch mode a failure on one image is reported (with traceback) and
	    processing continues with the next image.  In single-image mode a
	    failure is reported and the process exits with status 1.  The process
	    also exits with status 1 when a directory contains no supported image,
	    or when a directory is given without ``--batch``.
	    """
	    import traceback

	    parser = argparse.ArgumentParser(description="Convert YOLO segmentation to PAGE XML format")
	    parser.add_argument("image_path", help="Path to the input image or directory of images")
	    parser.add_argument("--model", default="YOLOv11-Medium", choices=list(MODEL_OPTIONS),
	                        help="Model to use for detection")
	    parser.add_argument("--conf", type=float, default=0.25,
	                        help="Confidence threshold for detection")
	    parser.add_argument("--iou", type=float, default=0.45,
	                        help="IoU threshold for detection")
	    parser.add_argument("--batch", action="store_true",
	                        help="Process all images in the directory if image_path is a directory")

	    args = parser.parse_args()

	    if os.path.isdir(args.image_path):
	        if not args.batch:
	            # Without this check the directory would be handed to the image
	            # reader and fail with a misleading "Cannot read image" error.
	            print(f"Error: {args.image_path} is a directory; use --batch to process its images")
	            sys.exit(1)

	        image_files = _find_images(args.image_path)
	        if not image_files:
	            print(f"No image files found in directory: {args.image_path}")
	            sys.exit(1)

	        print(f"Found {len(image_files)} images to process")

	        for i, image_path in enumerate(image_files):
	            print(f"Processing {i+1}/{len(image_files)}: {os.path.basename(image_path)}")
	            try:
	                _convert_image(image_path, args.model, args.conf, args.iou)
	            except Exception as e:
	                # Best effort: report the failure and carry on with the rest.
	                print(f"Error processing {image_path}: {e}")
	                traceback.print_exc()
	    else:
	        try:
	            _convert_image(args.image_path, args.model, args.conf, args.iou)
	        except Exception as e:
	            print(f"Error: {e}")
	            traceback.print_exc()
	            sys.exit(1)


	if __name__ == "__main__":
	    main()