"""
Map UIED-detected components to placeholder boxes for one processing run.

Usage:
    python script/mapping.py --run_id <run_id>

All inputs and outputs live under data/tmp/<run_id>/ next to this script:
    <run_id>_bboxes.json        regions and placeholders (proportional bboxes)
    ip/<run_id>.json            UIED detection output
    <run_id>.png                source screenshot
    mapping_full_<run_id>.json  mapping written by this script
    overlay_test_<run_id>.png   debug overlay written by this script
"""
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional

import cv2
import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist
from sklearn.linear_model import RANSACRegressor
# Minimum CIoU score an assigned pair must reach to be kept as a one-to-one mapping.
# NOTE(review): -0.9 sits near the bottom of the CIoU range, so only very poor
# matches are rejected -- confirm that this permissive threshold is intended.
CIOU_STRICT = -0.9
# UIED filter: components whose width or height is below this value are ignored.
FILTER_MIN_WH = 10
# Tools | |
def ciou(a, b):
    """
    Calculate Complete IoU (CIoU) between two bounding boxes.

    `a`, `b`: bounding boxes in format (x, y, w, h) with non-negative w and h.

    Returns IoU minus a center-distance penalty and an aspect-ratio penalty.
    Higher is better and identical boxes score just below 1. For disjoint
    boxes both penalties apply on top of an IoU of 0, so the score can fall
    below -1 (down to roughly -1.5 for far-apart boxes with opposite aspect
    ratios).
    """
    # Epsilon keeps every denominator strictly positive for well-formed boxes,
    # so no NaN/inf special-casing is required afterwards.
    epsilon = 1e-7

    xa, ya, wa, ha = a
    xb, yb, wb, hb = b

    # Standard IoU
    inter_w = max(0, min(xa + wa, xb + wb) - max(xa, xb))
    inter_h = max(0, min(ya + ha, yb + hb) - max(ya, yb))
    intersection_area = inter_w * inter_h
    union_area = (wa * ha) + (wb * hb) - intersection_area
    iou_val = intersection_area / (union_area + epsilon)

    # Squared distance between the two box centers
    center_dx = (xa + wa / 2) - (xb + wb / 2)
    center_dy = (ya + ha / 2) - (yb + hb / 2)
    center_distance_sq = center_dx ** 2 + center_dy ** 2

    # Squared diagonal of the smallest box enclosing both inputs
    enclose_w = max(xa + wa, xb + wb) - min(xa, xb)
    enclose_h = max(ya + ha, yb + hb) - min(ya, yb)
    enclose_diag_sq = enclose_w ** 2 + enclose_h ** 2
    distance_penalty = center_distance_sq / (enclose_diag_sq + epsilon)

    # Aspect ratio consistency
    arctan_a = np.arctan(wa / (ha + epsilon))
    arctan_b = np.arctan(wb / (hb + epsilon))
    v = (4 / (np.pi ** 2)) * ((arctan_a - arctan_b) ** 2)

    # Trade-off parameter alpha: weights the aspect-ratio term more heavily
    # the better the boxes already overlap.
    alpha = v / (1 - iou_val + v + epsilon)
    aspect_ratio_penalty = alpha * v

    return iou_val - distance_penalty - aspect_ratio_penalty
def center(box):
    """Return the center of an (x, y, w, h) box as a NumPy array [cx, cy]."""
    x, y, w, h = box
    return np.array([x + w / 2, y + h / 2])
def load_regions_and_placeholders(p: Path, W_img, H_img):
    """
    Loads region and placeholder data from the specified JSON file.

    The file is expected to have 'regions' and 'placeholders' keys whose
    entries carry proportional 'x', 'y', 'w', 'h' values. Each entry is
    returned with all of its original keys plus a 'bbox' tuple
    (x, y, w, h) scaled to absolute pixels using `W_img` and `H_img`.

    Returns (regions, placeholders). A warning is printed when either list
    is empty; the (possibly empty) lists are still returned.
    """
    # JSON is read as UTF-8 explicitly instead of relying on the platform default.
    data = json.loads(p.read_text(encoding="utf-8"))

    def to_pixels(b):
        return (b['x'] * W_img, b['y'] * H_img, b['w'] * W_img, b['h'] * H_img)

    regions = [{**d, "bbox": to_pixels(d)} for d in data.get("regions", [])]
    placeholders = [{**d, "bbox": to_pixels(d)} for d in data.get("placeholders", [])]

    if not regions or not placeholders:
        print(f"Warning: JSON file {p} does not contain 'regions' or 'placeholders' keys.")
    return regions, placeholders
def load_uied_boxes(p: Path, min_wh: Optional[float] = None):
    """
    Loads UIED component detection data.

    `p`: path to the UIED JSON file. Its 'compos' list supplies the
    components ('id', 'column_min', 'row_min', 'width', 'height') and its
    'img_shape' entry gives the shape of the image UIED processed
    (e.g. [800, 571, 3]), which is needed later to compute scaling factors.
    `min_wh`: components whose width or height is below this value are
    skipped. Defaults to the module-level FILTER_MIN_WH.

    Returns (items, shape) where each item is {'id', 'bbox': (x, y, w, h)}
    in UIED coordinates and `shape` is None when 'img_shape' is absent.
    """
    if min_wh is None:
        min_wh = FILTER_MIN_WH

    # JSON is read as UTF-8 explicitly instead of relying on the platform default.
    data = json.loads(p.read_text(encoding="utf-8"))
    shape = data.get("img_shape")

    items = []
    for d in data.get("compos", []):
        w, h = d.get("width", 0), d.get("height", 0)
        if w < min_wh or h < min_wh:
            continue
        items.append({"id": d["id"],
                      "bbox": (d["column_min"], d["row_min"], w, h)})
    return items, shape
def estimate_global_transform(pixel_placeholders, uied_boxes, uied_shape, W_orig, H_orig):
    """
    Estimates a global affine transform from the UIED coordinate space to the
    original screenshot's coordinate space. This is used for rough alignment.

    `pixel_placeholders`: dicts with a 'bbox' (x, y, w, h) in screenshot pixels.
    `uied_boxes`: dicts with a 'bbox' (x, y, w, h) in UIED coordinates.
    `uied_shape`: shape of the image UIED processed; only the first two
    entries (height, width) are used, so 2- and 3-element shapes both work.
    `W_orig`, `H_orig`: size of the original screenshot in pixels.

    Returns (scale_x, scale_y, dx, dy). The translation is (0, 0) when either
    box list is empty.
    """
    # 1. Base scaling from image dimension ratios
    H_proc, W_proc = uied_shape[:2]
    scale_x = W_orig / W_proc
    scale_y = H_orig / H_proc

    if not pixel_placeholders or not uied_boxes:
        return scale_x, scale_y, 0, 0

    # 2. Scale all UIED boxes and compute box centers as (N, 2) arrays
    ph_boxes = np.asarray([p["bbox"] for p in pixel_placeholders], dtype=float)
    uied_scaled = np.asarray([u["bbox"] for u in uied_boxes], dtype=float)
    uied_scaled = uied_scaled * np.array([scale_x, scale_y, scale_x, scale_y])
    ph_centers = ph_boxes[:, :2] + ph_boxes[:, 2:] / 2
    uied_scaled_centers = uied_scaled[:, :2] + uied_scaled[:, 2:] / 2

    # 3. Residual translation: pair each placeholder with its nearest scaled
    #    UIED center and take the per-axis median offset, which tolerates a
    #    minority of wrong pairings.
    nearest = cdist(ph_centers, uied_scaled_centers).argmin(axis=1)
    translations = ph_centers - uied_scaled_centers[nearest]
    dx, dy = np.median(translations, axis=0)
    return scale_x, scale_y, float(dx), float(dy)
def apply_affine_transform(box, scale_x, scale_y, dx, dy):
    """
    Apply a per-axis scale followed by a translation to an (x, y, w, h) box.

    The origin is scaled and shifted; width and height are only scaled.
    Returns the transformed box as an (x, y, w, h) tuple.
    """
    x, y, w, h = box
    return (x * scale_x + dx, y * scale_y + dy, w * scale_x, h * scale_y)
# Mapping Function | |
def find_local_mapping_and_transform(placeholders, uied_boxes, uied_shape, W_orig, H_orig):
    """
    Finds the optimal one-to-one mapping and the local affine transform for a given
    subset of placeholders and UIED boxes.

    `placeholders`: dicts with 'id' and a 'bbox' (x, y, w, h) in screenshot pixels.
    `uied_boxes`: dicts with 'id' and a 'bbox' (x, y, w, h) in UIED coordinates.
    `uied_shape`, `W_orig`, `H_orig`: forwarded to the transform estimation.

    Returns (mapping, transform): `mapping` is {placeholder id: UIED id} for
    every assigned pair whose CIoU is at least CIOU_STRICT, and `transform`
    is (scale_x, scale_y, dx, dy). When either input list is empty the result
    is ({}, (1, 1, 0, 0)).
    """
    if not placeholders or not uied_boxes:
        return {}, (1, 1, 0, 0)

    # 1. Estimate the local affine transform. The estimator is the same one
    #    used for the global alignment, simply run on this subset of boxes.
    transform = estimate_global_transform(placeholders, uied_boxes, uied_shape, W_orig, H_orig)

    # 2. Apply the full transformation to all UIED boxes in this subset
    uied_tf = [{**u, "bbox_tf": apply_affine_transform(u["bbox"], *transform)} for u in uied_boxes]

    # 3. Cost matrix (rows: placeholders, columns: UIED boxes). The assignment
    #    solver minimises cost, so CIoU is negated to maximise total overlap.
    cost_matrix = np.array(
        [[-ciou(p["bbox"], u["bbox_tf"]) for u in uied_tf] for p in placeholders]
    )
    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    # 4. Keep only assigned pairs that reach the minimum CIoU score
    mapping = {}
    for r, c in zip(row_ind, col_ind):
        score = -cost_matrix[r, c]
        if score >= CIOU_STRICT:
            mapping[placeholders[r]["id"]] = uied_tf[c]["id"]
    return mapping, transform
def generate_debug_overlay(img_path, all_uied_boxes, region_results, uied_shape, out_png):
    """
    Generates a debug image by drawing the mapped UIED boxes on the original screenshot.
    This version uses a simple scaling based on image dimensions, without any translation.

    `img_path`: screenshot to draw on.
    `all_uied_boxes`: dicts with 'id' and a 'bbox' (x, y, w, h) in UIED coordinates.
    `region_results`: per-region dicts; only each entry's 'mapping'
    ({placeholder id: UIED id}) is used.
    `uied_shape`: shape of the image UIED processed (height, width, ...).
    `out_png`: destination path of the overlay image.

    Prints an error and returns without writing when the screenshot cannot
    be read; prints an error when the overlay cannot be written.
    """
    canvas = cv2.imread(str(img_path))
    if canvas is None:
        print(f"Error: Could not read debug source image at {img_path}.")
        return

    # Use a fixed red color for all bounding boxes for consistency
    color = (0, 0, 255)  # Red in BGR

    # 1. Simple scaling factors from the two image shapes (height, width first).
    H_proc, W_proc = uied_shape[:2]
    H_orig, W_orig = canvas.shape[:2]
    scale_x = W_orig / W_proc
    scale_y = H_orig / H_proc

    # Index boxes by id once instead of scanning the list for every mapping;
    # setdefault keeps the first box when an id occurs more than once.
    boxes_by_id = {}
    for box in all_uied_boxes:
        boxes_by_id.setdefault(box["id"], box)

    # 2. Draw all mapped UIED boxes using only this simple scaling.
    for result in region_results.values():
        for uid in result.get("mapping", {}).values():
            u_box = boxes_by_id.get(uid)
            if u_box is None:
                continue

            # Apply simple scaling directly, without any translation offset.
            x_proc, y_proc, w_proc, h_proc = u_box["bbox"]
            x = x_proc * scale_x
            y = y_proc * scale_y
            w = w_proc * scale_x
            h = h_proc * scale_y

            cv2.rectangle(canvas, (int(x), int(y)), (int(x + w), int(y + h)), color, 2)
            cv2.putText(canvas, f"uied_{uid}", (int(x), int(y) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # imwrite signals failure through its return value rather than raising.
    if not cv2.imwrite(str(out_png), canvas):
        print(f"Error: Could not write debug overlay to {out_png}.")
def main():
    """
    Run the full mapping pipeline for the run id given on the command line.

    Reads the placeholder JSON, the UIED JSON and the source screenshot from
    data/tmp/<run_id>/ next to this script, matches UIED components to
    placeholders region by region, writes the per-region transforms and
    mappings to mapping_full_<run_id>.json and renders a debug overlay.
    Exits with an error message when a required input is missing or unusable.
    """
    args = get_args()
    run_id = args.run_id

    # --- Dynamic Path Construction ---
    base_dir = Path(__file__).parent.resolve()
    tmp_dir = base_dir / 'data' / 'tmp' / run_id

    gray_json_path = tmp_dir / f"{run_id}_bboxes.json"
    uied_json_path = tmp_dir / "ip" / f"{run_id}.json"
    mapping_output_path = tmp_dir / f"mapping_full_{run_id}.json"
    debug_src_path = tmp_dir / f"{run_id}.png"
    debug_overlay_path = tmp_dir / f"overlay_test_{run_id}.png"

    # --- Input Validation ---
    if not gray_json_path.exists():
        sys.exit(f"Error: Placeholder JSON not found at {gray_json_path}")
    if not uied_json_path.exists():
        sys.exit(f"Error: UIED JSON not found at {uied_json_path}")
    if not debug_src_path.exists():
        sys.exit(f"Error: Source image for coordinate conversion not found at {debug_src_path}")

    print(f"--- Starting Mapping for run_id: {run_id} ---")

    # 1. Load the original screenshot to get its absolute dimensions
    orig_img = cv2.imread(str(debug_src_path))
    if orig_img is None:
        sys.exit(f"Error: Could not read debug source image at {debug_src_path}.")
    H_orig, W_orig, _ = orig_img.shape

    # 2. Load proportional data and convert to absolute pixel coordinates
    pixel_regions, pixel_placeholders = load_regions_and_placeholders(gray_json_path, W_orig, H_orig)

    # 3. Load UIED data
    all_uied_boxes, uied_shape = load_uied_boxes(uied_json_path)

    if not pixel_placeholders or not all_uied_boxes:
        print("Error: Could not proceed without placeholder and UIED data.")
        return
    # Without the processed-image shape no scaling factor can be derived;
    # fail here with a clear message instead of an unpacking error later.
    if not isinstance(uied_shape, (list, tuple)) or len(uied_shape) < 2:
        sys.exit(f"Error: UIED JSON at {uied_json_path} has no usable 'img_shape' entry.")

    # 4. Estimate a GLOBAL transform for rough, initial alignment of all UIED boxes
    g_scale_x, g_scale_y, g_dx, g_dy = estimate_global_transform(pixel_placeholders, all_uied_boxes, uied_shape, W_orig, H_orig)
    print(f"Estimated Global Transform: scale_x={g_scale_x:.3f}, scale_y={g_scale_y:.3f}, dx={g_dx:.1f}, dy={g_dy:.1f}")

    # Apply the global transform to all UIED boxes to get them into the main coordinate space
    uied_tf_global = [{**u, "bbox_tf": apply_affine_transform(u["bbox"], g_scale_x, g_scale_y, g_dx, g_dy)} for u in all_uied_boxes]
    # Centers do not depend on the region, so compute them once up front.
    uied_tf_centers = [center(u["bbox_tf"]) for u in uied_tf_global]

    # 5. Loop through regions and perform LOCALIZED matching and transform estimation
    final_results = {}
    total_placeholders_count = len(pixel_placeholders)
    total_mappings_count = 0

    for region in pixel_regions:
        # Filter placeholders for the current region
        region_placeholders = [p for p in pixel_placeholders if p.get("region_id") == region["id"]]
        if not region_placeholders:
            continue

        # Filter UIED boxes for the current region using the globally transformed coordinates
        rx, ry, rw, rh = region["bbox"]
        region_uied_ids = {
            u["id"]
            for u, (cx, cy) in zip(uied_tf_global, uied_tf_centers)
            if rx <= cx <= rx + rw and ry <= cy <= ry + rh
        }

        # Get the original uied boxes that correspond to this region
        region_uied_boxes = [u for u in all_uied_boxes if u["id"] in region_uied_ids]
        if not region_uied_boxes:
            print(f"Warning: No UIED boxes found in region {region['id']} after global alignment.")
            continue

        # Find the precise LOCAL mapping and transform for this region
        region_mapping, region_transform = find_local_mapping_and_transform(
            region_placeholders, region_uied_boxes, uied_shape, W_orig, H_orig
        )

        total_mappings_count += len(region_mapping)

        l_scale_x, l_scale_y, l_dx, l_dy = region_transform
        final_results[region["id"]] = {
            # Plain floats keep the JSON output independent of NumPy scalar types.
            "transform": {"scale_x": float(l_scale_x), "scale_y": float(l_scale_y), "dx": float(l_dx), "dy": float(l_dy)},
            "mapping": region_mapping
        }

    # 6. Report and save results
    print(f"Successfully created {total_mappings_count} one-to-one mappings out of {total_placeholders_count} placeholders.")

    # ensure_ascii=False may emit non-ASCII characters, so write UTF-8 explicitly.
    mapping_output_path.write_text(json.dumps(final_results, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"Mapping data written to {mapping_output_path}")

    # Always generate the debug image if the source exists
    generate_debug_overlay(debug_src_path, all_uied_boxes, final_results, uied_shape, debug_overlay_path)

    print(f"--- Mapping Complete for run_id: {run_id} ---")
def get_args(argv=None):
    """
    Parse the command-line arguments of the mapping script.

    `argv`: optional list of argument strings; when None (the default) the
    arguments are taken from sys.argv, as argparse does on its own.

    Returns the parsed namespace, whose `run_id` attribute holds the
    required --run_id value.
    """
    ap = argparse.ArgumentParser(description="Map UIED components to placeholder boxes.")
    ap.add_argument('--run_id', required=True, type=str, help="A unique identifier for the processing run.")
    return ap.parse_args(argv)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()