Spaces:

Jimmyzheng-10
/

ScreenCoder

Running

File size: 6,669 Bytes

import argparse
import json
from pathlib import Path
from bs4 import BeautifulSoup
import cv2
import re
import sys

def main():
    args = get_args()
    run_id = args.run_id

    # --- Dynamic Path Construction ---
    base_dir = Path(__file__).parent.resolve()
    tmp_dir = base_dir / 'data' / 'tmp' / run_id
    output_dir = base_dir / 'data' / 'output' / run_id

    mapping_path = tmp_dir / f"mapping_full_{run_id}.json"
    uied_path = tmp_dir / "ip" / f"{run_id}.json"
    original_image_path = tmp_dir / f"{run_id}.png"
    # This is the input HTML with placeholders
    gray_html_path = output_dir / f"{run_id}_layout.html"
    # This will be the final output of the entire pipeline
    final_html_path = output_dir / f"{run_id}_layout_final.html"

    # --- Input Validation ---
    if not all([p.exists() for p in [mapping_path, uied_path, original_image_path, gray_html_path]]):
        print("Error: One or more required input files are missing.", file=sys.stderr)
        if not mapping_path.exists(): print(f"- Missing: {mapping_path}", file=sys.stderr)
        if not uied_path.exists(): print(f"- Missing: {uied_path}", file=sys.stderr)
        if not original_image_path.exists(): print(f"- Missing: {original_image_path}", file=sys.stderr)
        if not gray_html_path.exists(): print(f"- Missing: {gray_html_path}", file=sys.stderr)
        sys.exit(1)

    print(f"--- Starting Image Replacement for run_id: {run_id} ---")

    # --- Phase 1: Crop and Save All Images First ---
    
    # 1. Load data
    mapping_data = json.loads(mapping_path.read_text())
    uied_data = json.loads(uied_path.read_text())
    original_image = cv2.imread(str(original_image_path))
    
    if original_image is None:
        raise ValueError(f"Could not load the original image from {original_image_path}")

    # Get image shapes to calculate a simple, global scaling factor
    H_proc, W_proc, _ = uied_data['img_shape']
    H_orig, W_orig, _ = original_image.shape
    scale_x = W_orig / W_proc
    scale_y = H_orig / H_proc
    print(f"Using global scaling for cropping: scale_x={scale_x:.3f}, scale_y={scale_y:.3f}")

    uied_boxes = {
        comp['id']: (comp['column_min'], comp['row_min'], comp['width'], comp['height'])
        for comp in uied_data['compos']
    }

    # 2. Create a directory for cropped images
    crop_dir = final_html_path.parent / f"cropped_images_{run_id}"
    crop_dir.mkdir(exist_ok=True)
    print(f"Saving cropped images to: {crop_dir.resolve()}")

    # 3. Iterate through mappings and save cropped images to files
    for region_id, region_data in mapping_data.items():
        for placeholder_id, uied_id in region_data['mapping'].items():
            if uied_id not in uied_boxes:
                print(f"Warning: UIED ID {uied_id} from mapping not found. Skipping placeholder {placeholder_id}.")
                continue

            uied_bbox = uied_boxes[uied_id]
            
            x_proc, y_proc, w_proc, h_proc = uied_bbox
            x_tf = x_proc * scale_x
            y_tf = y_proc * scale_y
            w_tf = w_proc * scale_x
            h_tf = h_proc * scale_y

            x1, y1 = int(x_tf), int(y_tf)
            x2, y2 = int(x_tf + w_tf), int(y_tf + h_tf)
            
            h_img, w_img, _ = original_image.shape
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(w_img, x2), min(h_img, y2)
            
            cropped_img = original_image[y1:y2, x1:x2]
            
            if cropped_img.size == 0:
                print(f"Warning: Cropped image for {placeholder_id} is empty. Skipping.")
                continue
            
            output_path = crop_dir / f"{placeholder_id}.png"
            cv2.imwrite(str(output_path), cropped_img)

    # --- Phase 2: Use BeautifulSoup to Replace Placeholders by Order ---
    
    print("\nStarting offline HTML processing with BeautifulSoup...")
    html_content = gray_html_path.read_text()
    soup = BeautifulSoup(html_content, 'html.parser')

    # 1. Find all placeholder elements by their class, in document order.
    placeholder_elements = soup.find_all('img', src="placeholder.png")

    # 2. Get the placeholder IDs from the mapping file in the correct, sorted order.
    def natural_sort_key(s):
        return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)]

    ordered_placeholder_ids = []
    # Sort region IDs numerically to process them in order
    for region_id in sorted(mapping_data.keys(), key=int):
        region_mapping = mapping_data[region_id]['mapping']
        # Sort the placeholder IDs within each region naturally (e.g., ph1, ph2, ph10)
        sorted_ph_ids = sorted(region_mapping.keys(), key=natural_sort_key)
        ordered_placeholder_ids.extend(sorted_ph_ids)
    
    # 3. Check for count mismatches
    if len(placeholder_elements) != len(ordered_placeholder_ids):
        print(f"Warning: Mismatch in counts! Found {len(placeholder_elements)} placeholder images in HTML, but {len(ordered_placeholder_ids)} mappings.")
    else:
        print(f"Found {len(placeholder_elements)} placeholder images to replace.")

    # 4. Iterate through both lists, create a proper <img> tag, and replace the placeholder.
    for i, ph_element in enumerate(placeholder_elements):
        if i >= len(ordered_placeholder_ids):
            print(f"Warning: More placeholder images in HTML than mappings. Stopping at image {i+1}.")
            break
        
        ph_id = ordered_placeholder_ids[i]
        # Fix: Use the correct relative path from HTML file to image directory
        relative_img_path = f"{crop_dir.name}/{ph_id}.png"
        
        # Debug: Print the path being used
        print(f"Setting image path for {ph_id}: {relative_img_path}")
        
        # --- Update the img tag's src attribute ---
        # Since we're now working with img tags instead of div tags,
        # we just need to update the src attribute
        ph_element['src'] = relative_img_path

    # Save the modified HTML
    final_html_path.write_text(str(soup))
    
    print(f"\nSuccessfully replaced {min(len(placeholder_elements), len(ordered_placeholder_ids))} placeholders.")
    print(f"Final HTML generated at {final_html_path.resolve()}")
    print(f"--- Image Replacement Complete for run_id: {run_id} ---")

def get_args():
    parser = argparse.ArgumentParser(description="Replace placeholder divs in an HTML file with cropped images based on UIED mappings.")
    parser.add_argument("--run_id", type=str, required=True, help="A unique identifier for the processing run.")
    return parser.parse_args()

if __name__ == "__main__":
    main()