Spaces:
Running
Running
File size: 6,669 Bytes
a383d0e 0246ff9 a383d0e 0246ff9 a383d0e 0246ff9 a383d0e 0246ff9 a383d0e 0246ff9 a383d0e 1cc14d1 a383d0e 1cc14d1 a383d0e 1cc14d1 a383d0e 1cc14d1 a383d0e 1cc14d1 a383d0e 1cc14d1 a383d0e 1cc14d1 a383d0e 0246ff9 a383d0e 0246ff9 a383d0e 0246ff9 a383d0e 0246ff9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import argparse
import json
from pathlib import Path
from bs4 import BeautifulSoup
import cv2
import re
import sys
def _natural_sort_key(s):
    """Return a sort key that orders digit runs in *s* numerically (ph2 < ph10).

    ``re.split`` with one capturing group always alternates non-digit text
    (even indices) with the captured ``[0-9]+`` runs (odd indices).  Using the
    index parity, rather than ``str.isdigit()``, guarantees that every key has
    the same str/int layout, so two keys can always be compared.
    """
    parts = re.split(r'([0-9]+)', s)
    return [int(part) if i % 2 else part.lower() for i, part in enumerate(parts)]


def _ordered_placeholder_ids(mapping_data):
    """Return all placeholder IDs, regions in numeric order, IDs naturally sorted.

    *mapping_data* maps a region ID (a string that parses as an int) to a dict
    whose ``'mapping'`` entry maps placeholder IDs to UIED component IDs.
    """
    ordered_ids = []
    for region_id in sorted(mapping_data, key=int):
        region_mapping = mapping_data[region_id]['mapping']
        ordered_ids.extend(sorted(region_mapping, key=_natural_sort_key))
    return ordered_ids


def _crop_placeholder_images(mapping_data, uied_data, original_image, crop_dir):
    """Crop every mapped UIED component out of *original_image* into *crop_dir*.

    UIED boxes are expressed in the coordinates of ``uied_data['img_shape']``;
    they are scaled to the original image's size and clamped to its bounds
    before cropping.  Each crop is written as ``<placeholder_id>.png``.

    Returns:
        The set of placeholder IDs whose image file was actually written.
        IDs with an unknown UIED component, an empty crop, or a failed write
        are reported on stdout and left out of the set.
    """
    # Only height and width are needed; slicing tolerates a missing channel entry.
    h_img, w_img = original_image.shape[:2]
    h_proc, w_proc = uied_data['img_shape'][:2]
    scale_x = w_img / w_proc
    scale_y = h_img / h_proc
    print(f"Using global scaling for cropping: scale_x={scale_x:.3f}, scale_y={scale_y:.3f}")

    uied_boxes = {
        comp['id']: (comp['column_min'], comp['row_min'], comp['width'], comp['height'])
        for comp in uied_data['compos']
    }

    crop_dir.mkdir(exist_ok=True)
    print(f"Saving cropped images to: {crop_dir.resolve()}")

    saved_ids = set()
    for region_data in mapping_data.values():
        for placeholder_id, uied_id in region_data['mapping'].items():
            if uied_id not in uied_boxes:
                print(f"Warning: UIED ID {uied_id} from mapping not found. Skipping placeholder {placeholder_id}.")
                continue

            x_proc, y_proc, w_box, h_box = uied_boxes[uied_id]
            x_tf = x_proc * scale_x
            y_tf = y_proc * scale_y
            w_tf = w_box * scale_x
            h_tf = h_box * scale_y

            # Truncate to pixel indices, then clamp the box to the image bounds.
            x1, y1 = max(0, int(x_tf)), max(0, int(y_tf))
            x2, y2 = min(w_img, int(x_tf + w_tf)), min(h_img, int(y_tf + h_tf))

            cropped_img = original_image[y1:y2, x1:x2]
            if cropped_img.size == 0:
                print(f"Warning: Cropped image for {placeholder_id} is empty. Skipping.")
                continue

            # NOTE(review): files are keyed by placeholder ID only; if two regions
            # reuse the same ID the later crop overwrites the earlier one - confirm
            # that IDs are unique across regions.
            output_path = crop_dir / f"{placeholder_id}.png"
            # cv2.imwrite signals failure through its return value.
            if not cv2.imwrite(str(output_path), cropped_img):
                print(f"Warning: Failed to write cropped image for {placeholder_id} to {output_path}. Skipping.")
                continue
            saved_ids.add(placeholder_id)

    return saved_ids


def main():
    """Replace placeholder images in the layout HTML with crops of the original.

    All paths are derived from the ``--run_id`` command-line argument relative
    to this file's directory:

    * inputs (under ``data/tmp/<run_id>``): ``mapping_full_<run_id>.json``,
      ``ip/<run_id>.json`` (UIED output) and ``<run_id>.png``;
    * input HTML: ``data/output/<run_id>/<run_id>_layout.html``;
    * outputs: ``data/output/<run_id>/cropped_images_<run_id>/*.png`` and
      ``data/output/<run_id>/<run_id>_layout_final.html``.

    Phase 1 crops every mapped UIED component out of the original image.
    Phase 2 pairs the ``<img src="placeholder.png">`` elements, in document
    order, with the placeholder IDs in sorted order, and points each element
    at its cropped file.  Placeholders whose crop could not be produced keep
    their original ``src`` so the final HTML never references a missing file.

    Raises:
        SystemExit: with status 1 when a required input file is missing.
        ValueError: when the original image cannot be decoded.
    """
    args = get_args()
    run_id = args.run_id

    # --- Dynamic Path Construction ---
    base_dir = Path(__file__).parent.resolve()
    tmp_dir = base_dir / 'data' / 'tmp' / run_id
    output_dir = base_dir / 'data' / 'output' / run_id

    mapping_path = tmp_dir / f"mapping_full_{run_id}.json"
    uied_path = tmp_dir / "ip" / f"{run_id}.json"
    original_image_path = tmp_dir / f"{run_id}.png"
    # This is the input HTML with placeholders
    gray_html_path = output_dir / f"{run_id}_layout.html"
    # This will be the final output of the entire pipeline
    final_html_path = output_dir / f"{run_id}_layout_final.html"

    # --- Input Validation ---
    required_inputs = [mapping_path, uied_path, original_image_path, gray_html_path]
    missing_inputs = [path for path in required_inputs if not path.exists()]
    if missing_inputs:
        print("Error: One or more required input files are missing.", file=sys.stderr)
        for path in missing_inputs:
            print(f"- Missing: {path}", file=sys.stderr)
        sys.exit(1)

    print(f"--- Starting Image Replacement for run_id: {run_id} ---")

    # --- Phase 1: Crop and Save All Images First ---
    # Read text files as UTF-8 explicitly instead of relying on the platform
    # default encoding.
    mapping_data = json.loads(mapping_path.read_text(encoding="utf-8"))
    uied_data = json.loads(uied_path.read_text(encoding="utf-8"))

    original_image = cv2.imread(str(original_image_path))
    if original_image is None:
        raise ValueError(f"Could not load the original image from {original_image_path}")

    crop_dir = final_html_path.parent / f"cropped_images_{run_id}"
    saved_ids = _crop_placeholder_images(mapping_data, uied_data, original_image, crop_dir)

    # --- Phase 2: Use BeautifulSoup to Replace Placeholders by Order ---
    print("\nStarting offline HTML processing with BeautifulSoup...")
    html_content = gray_html_path.read_text(encoding="utf-8")
    soup = BeautifulSoup(html_content, 'html.parser')

    # 1. Find all placeholder <img> elements (matched by their src), in document order.
    placeholder_elements = soup.find_all('img', src="placeholder.png")

    # 2. Get the placeholder IDs from the mapping file in sorted order.
    ordered_placeholder_ids = _ordered_placeholder_ids(mapping_data)

    # 3. Check for count mismatches
    if len(placeholder_elements) != len(ordered_placeholder_ids):
        print(f"Warning: Mismatch in counts! Found {len(placeholder_elements)} placeholder images in HTML, but {len(ordered_placeholder_ids)} mappings.")
    else:
        print(f"Found {len(placeholder_elements)} placeholder images to replace.")

    # 4. Pair elements and IDs positionally; zip stops at the shorter list.
    replaced_count = 0
    for ph_element, ph_id in zip(placeholder_elements, ordered_placeholder_ids):
        if ph_id not in saved_ids:
            # No file was written for this ID in Phase 1; rewriting src would
            # produce a broken image link.
            print(f"Warning: No cropped image was saved for {ph_id}. Leaving placeholder image in place.")
            continue

        # The crop directory sits next to the final HTML file, so a path
        # relative to the HTML is just "<crop dir name>/<id>.png".
        relative_img_path = f"{crop_dir.name}/{ph_id}.png"
        print(f"Setting image path for {ph_id}: {relative_img_path}")
        ph_element['src'] = relative_img_path
        replaced_count += 1

    if len(placeholder_elements) > len(ordered_placeholder_ids):
        print(f"Warning: More placeholder images in HTML than mappings. Stopping at image {len(ordered_placeholder_ids) + 1}.")

    # Save the modified HTML
    final_html_path.write_text(str(soup), encoding="utf-8")
    print(f"\nSuccessfully replaced {replaced_count} placeholders.")
    print(f"Final HTML generated at {final_html_path.resolve()}")
    print(f"--- Image Replacement Complete for run_id: {run_id} ---")
def get_args():
    """Parse the command-line arguments of this script.

    The only option is the mandatory ``--run_id`` string.  argparse exits the
    process with a usage error when it is missing.

    Returns:
        argparse.Namespace: namespace whose ``run_id`` attribute holds the value.
    """
    cli = argparse.ArgumentParser(
        description="Replace placeholder divs in an HTML file with cropped images based on UIED mappings.",
    )
    cli.add_argument(
        "--run_id",
        required=True,
        type=str,
        help="A unique identifier for the processing run.",
    )
    parsed = cli.parse_args()
    return parsed
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|