Spaces:

Jimmyzheng-10
/

ScreenCoder

Running

App Files Files Community

ScreenCoder / screencoder /image_box_detection.py

Jimmyzheng-10

update

ddb2dc9 3 months ago

raw

history blame

9.25 kB

	import argparse, asyncio, cv2, json, os, sys
	from pathlib import Path
	import numpy as np
	from playwright.async_api import async_playwright

	# ---------- Main logic ----------
	async def extract_bboxes_from_html(html_path: Path):
	    """Render an HTML layout in headless Chromium and measure its boxes.

	    The page is loaded from disk in a 1280x720 viewport and measured with
	    ``getBoundingClientRect``.

	    Args:
	        html_path: Path to the layout HTML file; it is resolved and opened
	            through a ``file://`` URI.

	    Returns:
	        A 4-tuple ``(region_bboxes, placeholder_bboxes, layout_width,
	        layout_height)``:

	        * ``region_bboxes``: one dict ``{id, x, y, w, h}`` per element
	          matching ``.box[id]``.
	        * ``placeholder_bboxes``: one dict ``{id, x, y, w, h, region_id}``
	          per ``img[src="placeholder.png"]`` whose center lies inside a
	          region. The ``id`` is ``ph0``, ``ph1``, ... in document order;
	          placeholders outside every region are dropped.
	        * ``layout_width`` / ``layout_height``: size of the bounding rect
	          of ``document.documentElement``.
	    """
	    async with async_playwright() as p:
	        browser = await p.chromium.launch()
	        try:
	            ctx = await browser.new_context(
	                viewport={"width": 1280, "height": 720},
	            )
	            page = await ctx.new_page()
	            await page.goto(html_path.resolve().as_uri())

	            metrics = await page.evaluate("""
	            () => {
	                const region_containers = Array.from(document.querySelectorAll('.box[id]'));
	                const region_bboxes = region_containers.map(el => {
	                    const rect = el.getBoundingClientRect();
	                    return { id: el.id, x: rect.x, y: rect.y, w: rect.width, h: rect.height };
	                });

	                const placeholder_bboxes = [];
	                let ph_id_counter = 0;
	                const all_potential_placeholders = document.querySelectorAll('img[src="placeholder.png"]');

	                for (const el of all_potential_placeholders) {
	                    // Apply the same filters as before
	                    if (el.tagName === 'SVG') continue;
	                    if (el.innerText && el.innerText.trim() !== '') continue;

	                    const el_rect = el.getBoundingClientRect();
	                    const el_center = { x: el_rect.left + el_rect.width / 2, y: el_rect.top + el_rect.height / 2 };

	                    // Find which region this placeholder is inside
	                    let containing_region_id = null;
	                    for (const region_el of region_containers) {
	                        const region_rect = region_el.getBoundingClientRect();
	                        if (el_center.x >= region_rect.left && el_center.x <= region_rect.right &&
	                            el_center.y >= region_rect.top && el_center.y <= region_rect.bottom) {
	                            containing_region_id = region_el.id;
	                            break; // Assume non-overlapping regions
	                        }
	                    }

	                    if (containing_region_id) {
	                        placeholder_bboxes.push({
	                            id: 'ph' + ph_id_counter++,
	                            x: el_rect.x,
	                            y: el_rect.y,
	                            w: el_rect.width,
	                            h: el_rect.height,
	                            region_id: containing_region_id
	                        });
	                    }
	                }

	                const layout_rect = document.documentElement.getBoundingClientRect();
	                return {
	                    region_bboxes,
	                    placeholder_bboxes,
	                    layout_width: layout_rect.width,
	                    layout_height: layout_rect.height
	                };
	            }
	            """)
	        finally:
	            # Close the browser even when navigation or evaluation fails,
	            # so a broken layout file does not leave Chromium running.
	            await browser.close()

	        return (
	            metrics['region_bboxes'],
	            metrics['placeholder_bboxes'],
	            metrics['layout_width'],
	            metrics['layout_height'],
	        )


	def draw_bboxes_on_image(img, region_bboxes, placeholder_bboxes):
	    """Draw region (green) and placeholder (red) boxes with labels on a copy of img.

	    Args:
	        img: Image array; only ``img.shape[:2]`` (height, width) and
	            ``img.copy()`` are used here. Colors below are written in BGR
	            order, as OpenCV expects.
	        region_bboxes: Iterable of dicts with pixel ``x``, ``y``, ``w``,
	            ``h`` and an optional ``id``; labelled ``Area_<id>``.
	        placeholder_bboxes: Iterable of dicts with pixel ``x``, ``y``,
	            ``w``, ``h`` plus ``region_id`` and ``id``; labelled
	            ``<region_id>_<id>``.

	    Returns:
	        A new image with the boxes and labels drawn; ``img`` is not modified.
	    """
	    boxed = img.copy()
	    H, W = img.shape[:2]

	    # Label styling is the same for every box, so set it up once.
	    font = cv2.FONT_HERSHEY_SIMPLEX
	    font_scale = 0.8
	    font_thickness = 2
	    text_color = (255, 255, 255)

	    # --- Helper to draw a single box with label ---
	    def draw_box_with_label(b, color, label_text):
	        # OpenCV drawing calls need integer pixel coordinates.
	        x, y = int(b["x"]), int(b["y"])
	        w, h = int(b["w"]), int(b["h"])

	        # Clip both corners to the image. Clamping only the origin (and
	        # keeping the full width/height) would push the far edge outward
	        # for boxes that start at a negative coordinate.
	        x1, y1 = max(0, x), max(0, y)
	        x2, y2 = min(W, x + w), min(H, y + h)
	        if x2 < x1 or y2 < y1:
	            # Nothing of this box is inside the image.
	            return
	        cv2.rectangle(boxed, (x1, y1), (x2, y2), color, 3)  # Thicker lines

	        (text_width, text_height), baseline = cv2.getTextSize(
	            label_text, font, font_scale, font_thickness
	        )

	        # Put the label just above the visible box; if that would leave the
	        # top of the image, place it just inside the box instead.
	        label_y_start = y1 - text_height - baseline - 5
	        if label_y_start < 0:
	            label_y_start = y1 + 5
	        label_y_end = label_y_start + text_height + baseline

	        cv2.rectangle(
	            boxed,
	            (x1, label_y_start),
	            (x1 + text_width, label_y_end),
	            color,
	            cv2.FILLED,
	        )
	        cv2.putText(
	            boxed,
	            label_text,
	            (x1 + 2, label_y_start + text_height),
	            font,
	            font_scale,
	            text_color,
	            font_thickness,
	        )

	    # --- Draw Regions (Green) ---
	    for b in region_bboxes:
	        draw_box_with_label(b, color=(0, 255, 0), label_text=f'Area_{b.get("id", "")}')

	    # --- Draw Placeholders (Red) ---
	    for b in placeholder_bboxes:
	        draw_box_with_label(b, color=(0, 0, 255), label_text=f'{b.get("region_id")}_{b.get("id")}')

	    return boxed


	def _scale_bboxes(bboxes, scale_x, scale_y):
	    """Return copies of bboxes with x/w scaled by scale_x and y/h by scale_y, truncated to int."""
	    return [
	        {
	            **b,
	            "x": int(b["x"] * scale_x), "y": int(b["y"] * scale_y),
	            "w": int(b["w"] * scale_x), "h": int(b["h"] * scale_y),
	        }
	        for b in bboxes
	    ]


	def _to_proportions(bboxes, width, height):
	    """Return copies of bboxes with x/w divided by width and y/h divided by height."""
	    return [
	        {
	            **b,
	            "x": b["x"] / width, "y": b["y"] / height,
	            "w": b["w"] / width, "h": b["h"] / height,
	        }
	        for b in bboxes
	    ]


	def main():
	    """Map the layout HTML's region/placeholder boxes onto the run's screenshot.

	    For the ``--run_id`` given on the command line this reads
	    ``data/output/<run_id>/<run_id>_layout.html`` and
	    ``data/tmp/<run_id>/<run_id>.png`` (both relative to this file's
	    directory), then writes into ``data/tmp/<run_id>/``:

	    * ``debug_gray_bboxes_<run_id>.png`` - the screenshot with boxes drawn.
	    * ``<run_id>_bboxes.json`` - ``{"regions": [...], "placeholders": [...]}``
	      with ``x``/``y``/``w``/``h`` expressed as fractions of the screenshot
	      width/height.

	    Exits via ``sys.exit`` with an error message if the HTML or screenshot
	    is missing or the screenshot cannot be decoded.
	    """
	    args = get_args()
	    run_id = args.run_id

	    # --- Dynamic Path Construction ---
	    base_dir = Path(__file__).parent.resolve()
	    tmp_dir = base_dir / 'data' / 'tmp' / run_id
	    output_dir = base_dir / 'data' / 'output' / run_id

	    html_path = output_dir / f"{run_id}_layout.html"
	    screenshot_path = tmp_dir / f"{run_id}.png"
	    output_json_path = tmp_dir / f"{run_id}_bboxes.json"
	    debug_image_path = tmp_dir / f"debug_gray_bboxes_{run_id}.png"

	    if not html_path.exists():
	        sys.exit(f"Error: HTML file not found at {html_path}")
	    if not screenshot_path.exists():
	        sys.exit(f"Error: Screenshot not found at {screenshot_path}")

	    print(f"--- Starting Image Box Detection for run_id: {run_id} ---")

	    # Read original screenshot
	    img = cv2.imread(str(screenshot_path))
	    if img is None:
	        sys.exit(f"Error: Cannot read image {screenshot_path}")
	    if img.std() < 5:
	        print("Warning: The screenshot is almost pure color, it may not be the original screenshot with real thumbnails.")

	    H, W = img.shape[:2]

	    # Parse HTML -> Get bboxes
	    region_bboxes, placeholder_bboxes, layout_width, layout_height = asyncio.run(
	        extract_bboxes_from_html(html_path)
	    )
	    if not placeholder_bboxes:
	        # This is not necessarily an error; some UIs might not have placeholders.
	        print("Info: No gray placeholder blocks found.")

	    # Calculate separate scale factors for X and Y to handle aspect ratio differences
	    scale_x = W / layout_width if layout_width > 0 else 1
	    scale_y = H / layout_height if layout_height > 0 else 1

	    if abs(scale_x - scale_y) > 0.05:
	        print(f"[*] Detected different X/Y scales. X: {scale_x:.2f}, Y: {scale_y:.2f}")
	    elif abs(scale_x - 1.0) > 0.05:
	        print(f"[*] Detected uniform scale: {scale_x:.2f}")

	    # Scale all bboxes to the original image coordinate system
	    scaled_regions = _scale_bboxes(region_bboxes, scale_x, scale_y)
	    scaled_placeholders = _scale_bboxes(placeholder_bboxes, scale_x, scale_y)

	    # Draw boxes using the now-scaled data
	    overlay = draw_bboxes_on_image(img, scaled_regions, scaled_placeholders)

	    # Save debug image. cv2.imwrite reports failure through its return
	    # value, so only claim success when the file was actually written.
	    debug_image_path.parent.mkdir(parents=True, exist_ok=True)
	    if cv2.imwrite(str(debug_image_path), overlay):
	        print(f"Success: BBox overlay saved to {debug_image_path}")
	    else:
	        print(f"Warning: Failed to write BBox overlay to {debug_image_path}")

	    # Convert absolute pixel coordinates to proportions for the final JSON output
	    output_data = {
	        "regions": _to_proportions(scaled_regions, W, H),
	        "placeholders": _to_proportions(scaled_placeholders, W, H),
	    }

	    # Print/save bbox array
	    print("\n=== BBox (proportional to image dimensions) ===")
	    output_json = json.dumps(output_data, indent=2, ensure_ascii=False)
	    print(output_json)

	    output_json_path.parent.mkdir(parents=True, exist_ok=True)
	    # ensure_ascii=False can emit non-ASCII ids verbatim, so pin the file
	    # encoding instead of relying on the platform's locale default.
	    output_json_path.write_text(output_json, encoding="utf-8")
	    print(f"Success: BBox list saved to {output_json_path}")
	    print(f"--- Image Box Detection Complete for run_id: {run_id} ---")

	def get_args():
	    """Parse the command line and return the resulting namespace.

	    The only option is the mandatory ``--run_id`` string, exposed as
	    ``run_id`` on the returned ``argparse.Namespace``.
	    """
	    cli = argparse.ArgumentParser(
	        description="Extracts placeholder bounding boxes from an HTML file and maps them to a screenshot.",
	    )
	    run_id_options = {
	        "required": True,
	        "type": str,
	        "help": "A unique identifier for the processing run.",
	    }
	    cli.add_argument('--run_id', **run_id_options)
	    namespace = cli.parse_args()
	    return namespace

	# ---------- Script entry point ----------
	# Run the pipeline only when this file is executed directly, not on import.
	if __name__ == "__main__":
	    main()