jixin0101 committed on
Commit
7d4b8c8
·
0 Parent(s):

Clean history

.gitattributes ADDED
@@ -0,0 +1,38 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.jpg filter=lfs diff=lfs merge=lfs -text
38
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
Logo.png ADDED

Git LFS Details

  • SHA256: 8cfd430ff41ed80e783027809fabbb2dcd742c76e7f96469da4f7274d003f514
  • Pointer size: 130 Bytes
  • Size of remote file: 52 kB
README.md ADDED
@@ -0,0 +1,12 @@
1
+ ---
2
+ title: ObjectClear
3
+ emoji: 🪄
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.30.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,677 @@
1
+ import gradio as gr
2
+ import os
3
+ from PIL import Image
4
+ import torch
5
+ from diffusers.utils import load_image, check_min_version
6
+ from pipeline_objectclear import ObjectClearPipeline
7
+ from tools.download_util import load_file_from_url
8
+ from tools.painter import mask_painter
9
+ import argparse
10
+ from safetensors.torch import load_file
11
+ from model import CLIPImageEncoder, PostfuseModule
12
+ import numpy as np
13
+ import torchvision.transforms.functional as TF
14
+ from scipy.ndimage import convolve, zoom
15
+ import cv2
16
+ import time
17
+ from huggingface_hub import hf_hub_download
18
+ import spaces
19
+
20
+ from tools.interact_tools import SamControler
21
+ from tools.misc import get_device
22
+ import json
23
+
24
+ check_min_version("0.30.2")
25
+
26
+
27
+ def parse_augment():
28
+ parser = argparse.ArgumentParser()
29
+ parser.add_argument('--device', type=str, default=None)
30
+ parser.add_argument('--sam_model_type', type=str, default="vit_h")
31
+ parser.add_argument('--port', type=int, default=8000, help="only useful when running gradio applications")
32
+ args = parser.parse_args()
33
+
34
+ if not args.device:
35
+ args.device = str(get_device())
36
+
37
+ return args
38
+
39
+
40
+ def pad_to_multiple(image: np.ndarray, multiple: int = 8):
41
+ h, w = image.shape[:2]
42
+ pad_h = (multiple - h % multiple) % multiple
43
+ pad_w = (multiple - w % multiple) % multiple
44
+ if image.ndim == 3:
45
+ padded = np.pad(image, ((0, pad_h), (0, pad_w), (0,0)), mode='reflect')
46
+ else:
47
+ padded = np.pad(image, ((0, pad_h), (0, pad_w)), mode='reflect')
48
+ return padded, h, w
49
+
50
+ def crop_to_original(image: np.ndarray, h: int, w: int):
51
+ return image[:h, :w]
52
+
53
+ def wavelet_blur_np(image: np.ndarray, radius: int):
54
+ kernel = np.array([
55
+ [0.0625, 0.125, 0.0625],
56
+ [0.125, 0.25, 0.125],
57
+ [0.0625, 0.125, 0.0625]
58
+ ], dtype=np.float32)
59
+
60
+ blurred = np.empty_like(image)
61
+ for c in range(image.shape[0]):
62
+ blurred_c = convolve(image[c], kernel, mode='nearest')
63
+ if radius > 1:
64
+ blurred_c = zoom(zoom(blurred_c, 1 / radius, order=1), radius, order=1)
65
+ blurred[c] = blurred_c
66
+ return blurred
67
+
68
+ def wavelet_decomposition_np(image: np.ndarray, levels=5):
69
+ high_freq = np.zeros_like(image)
70
+ for i in range(levels):
71
+ radius = 2 ** i
72
+ low_freq = wavelet_blur_np(image, radius)
73
+ high_freq += (image - low_freq)
74
+ image = low_freq
75
+ return high_freq, low_freq
76
+
77
+ def wavelet_reconstruction_np(content_feat: np.ndarray, style_feat: np.ndarray):
78
+ content_high, _ = wavelet_decomposition_np(content_feat)
79
+ _, style_low = wavelet_decomposition_np(style_feat)
80
+ return content_high + style_low
81
+
82
+ def wavelet_color_fix_np(fused: np.ndarray, mask: np.ndarray) -> np.ndarray:
83
+ fused_np = fused.astype(np.float32) / 255.0
84
+ mask_np = mask.astype(np.float32) / 255.0
85
+
86
+ fused_np = fused_np.transpose(2, 0, 1)
87
+ mask_np = mask_np.transpose(2, 0, 1)
88
+
89
+ result_np = wavelet_reconstruction_np(fused_np, mask_np)
90
+
91
+ result_np = result_np.transpose(1, 2, 0)
92
+ result_np = np.clip(result_np * 255.0, 0, 255).astype(np.uint8)
93
+
94
+ return result_np
95
+
96
+ def fuse_with_wavelet(ori: np.ndarray, removed: np.ndarray, attn_map: np.ndarray, multiple: int = 8):
97
+ H, W = ori.shape[:2]
98
+ attn_map = attn_map.astype(np.float32)
99
+ _, attn_map = cv2.threshold(attn_map, 128, 255, cv2.THRESH_BINARY)
100
+ am = attn_map.astype(np.float32)
101
+ am = am/255.0
102
+ am_up = cv2.resize(am, (W, H), interpolation=cv2.INTER_NEAREST)
103
+
104
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (21,21))
105
+ am_d = cv2.dilate(am_up, kernel, iterations=1)
106
+ am_d = cv2.GaussianBlur(am_d.astype(np.float32), (9,9), sigmaX=2)
107
+
108
+ am_merged = np.maximum(am_up, am_d)
109
+ am_merged = np.clip(am_merged, 0, 1)
110
+
111
+ attn_up_3c = np.stack([am_merged]*3, axis=-1)
112
+ attn_up_ori_3c = np.stack([am_up]*3, axis=-1)
113
+
114
+ ori_out = ori * (1 - attn_up_ori_3c)
115
+ rem_out = removed * (1 - attn_up_ori_3c)
116
+
117
+ ori_pad, h0, w0 = pad_to_multiple(ori_out, multiple)
118
+ rem_pad, _, _ = pad_to_multiple(rem_out, multiple)
119
+
120
+ wave_rgb = wavelet_color_fix_np(ori_pad, rem_pad)
121
+ wave = crop_to_original(wave_rgb, h0, w0)
122
+ # fusion
123
+ fused = (wave * (1 - attn_up_3c) + removed * attn_up_3c).astype(np.uint8)
124
+ return fused
125
+
126
+
127
+ def resize_by_short_side(image, target_short=512, resample=Image.BICUBIC):
128
+ w, h = image.size
129
+ if w < h:
130
+ new_w = target_short
131
+ new_h = int(h * target_short / w)
132
+ new_h = (new_h + 15) // 16 * 16
133
+ else:
134
+ new_h = target_short
135
+ new_w = int(w * target_short / h)
136
+ new_w = (new_w + 15) // 16 * 16
137
+ return image.resize((new_w, new_h), resample=resample)
138
+
139
+ # convert points input to prompt state
140
+ def get_prompt(click_state, click_input):
141
+ inputs = json.loads(click_input)
142
+ points = click_state[0]
143
+ labels = click_state[1]
144
+ for input in inputs:
145
+ points.append(input[:2])
146
+ labels.append(input[2])
147
+ click_state[0] = points
148
+ click_state[1] = labels
149
+ prompt = {
150
+ "prompt_type":["click"],
151
+ "input_point":click_state[0],
152
+ "input_label":click_state[1],
153
+ "multimask_output":"True",
154
+ }
155
+ return prompt
156
+
157
+ # use sam to get the mask
158
+ @spaces.GPU
159
+ def sam_refine(image_state, point_prompt, click_state, evt:gr.SelectData):
160
+ if point_prompt == "Positive":
161
+ coordinate = "[[{},{},1]]".format(evt.index[0], evt.index[1])
162
+ else:
163
+ coordinate = "[[{},{},0]]".format(evt.index[0], evt.index[1])
164
+
165
+ # prompt for sam model
166
+ model.samcontroler.sam_controler.reset_image()
167
+ model.samcontroler.sam_controler.set_image(image_state["origin_image"])
168
+ prompt = get_prompt(click_state=click_state, click_input=coordinate)
169
+
170
+ mask, logit, painted_image = model.first_frame_click(
171
+ image=image_state["origin_image"],
172
+ points=np.array(prompt["input_point"]),
173
+ labels=np.array(prompt["input_label"]),
174
+ multimask=prompt["multimask_output"],
175
+ )
176
+ image_state["mask"] = mask
177
+ image_state["logit"] = logit
178
+ image_state["painted_image"] = painted_image
179
+
180
+ return painted_image, image_state, click_state
181
+
182
+
183
+ def add_multi_mask(image_state, interactive_state, mask_dropdown):
184
+ mask = image_state["mask"]
185
+ interactive_state["masks"].append(mask)
186
+ interactive_state["mask_names"].append("mask_{:03d}".format(len(interactive_state["masks"])))
187
+ mask_dropdown.append("mask_{:03d}".format(len(interactive_state["masks"])))
188
+ select_frame = show_mask(image_state, interactive_state, mask_dropdown)
189
+
190
+ return interactive_state, gr.update(choices=interactive_state["mask_names"], value=mask_dropdown), select_frame, [[],[]]
191
+
192
+ def clear_click(image_state, click_state):
193
+ click_state = [[],[]]
194
+ input_image = image_state["origin_image"]
195
+ return input_image, click_state
196
+
197
+ def remove_multi_mask(interactive_state, click_state, image_state):
198
+ interactive_state["mask_names"]= []
199
+ interactive_state["masks"] = []
200
+ click_state = [[],[]]
201
+ input_image = image_state["origin_image"]
202
+
203
+ return interactive_state, gr.update(choices=[],value=[]), input_image, click_state
204
+
205
+ def show_mask(image_state, interactive_state, mask_dropdown):
206
+ mask_dropdown.sort()
207
+ if image_state["origin_image"] is not None:
208
+ select_frame = image_state["origin_image"]
209
+ for i in range(len(mask_dropdown)):
210
+ mask_number = int(mask_dropdown[i].split("_")[1]) - 1
211
+ mask = interactive_state["masks"][mask_number]
212
+ select_frame = mask_painter(select_frame, mask.astype('uint8'), mask_color=mask_number+2)
213
+
214
+ return select_frame
215
+
216
+ @spaces.GPU
217
+ def upload_and_reset(image_input, interactive_state):
218
+ click_state = [[], []]
219
+
220
+ interactive_state["mask_names"]= []
221
+ interactive_state["masks"] = []
222
+
223
+ image_state, image_info, image_input = update_image_state_on_upload(image_input)
224
+
225
+ return (
226
+ image_state,
227
+ image_info,
228
+ image_input,
229
+ interactive_state,
230
+ click_state,
231
+ gr.update(choices=[], value=[]),
232
+ )
233
+
234
+ def update_image_state_on_upload(image_input):
235
+ frame = image_input
236
+
237
+ image_size = (frame.size[1], frame.size[0])
238
+
239
+ frame_np = np.array(frame)
240
+
241
+ image_state = {
242
+ "origin_image": frame_np,
243
+ "painted_image": frame_np.copy(),
244
+ "mask": np.zeros((image_size[0], image_size[1]), np.uint8),
245
+ "logit": None,
246
+ }
247
+
248
+ image_info = f"Image Name: uploaded.png,\nImage Size: {image_size}"
249
+
250
+ model.samcontroler.sam_controler.reset_image()
251
+ model.samcontroler.sam_controler.set_image(frame_np)
252
+
253
+ return image_state, image_info, image_input
254
+
255
+
256
+
257
+ # SAM generator
258
+ class MaskGenerator():
259
+ def __init__(self, sam_checkpoint, args):
260
+ self.args = args
261
+ self.samcontroler = SamControler(sam_checkpoint, args.sam_model_type, args.device)
262
+
263
+ def first_frame_click(self, image: np.ndarray, points:np.ndarray, labels: np.ndarray, multimask=True):
264
+ mask, logit, painted_image = self.samcontroler.first_frame_click(image, points, labels, multimask)
265
+ return mask, logit, painted_image
266
+
267
+
268
+ # command-line args (adapted from track_anything.py)
269
+ args = parse_augment()
270
+ sam_checkpoint_url_dict = {
271
+ 'vit_h': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
272
+ 'vit_l': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
273
+ 'vit_b': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
274
+ }
275
+ checkpoint_folder = os.path.join('/home/user/app/', 'pretrained_models')
276
+
277
+ sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[args.sam_model_type], checkpoint_folder)
278
+ # initialize the SAM mask generator
279
+ model = MaskGenerator(sam_checkpoint, args)
280
+
281
+ # Build pipeline
282
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
283
+ pipe = ObjectClearPipeline.from_pretrained_with_custom_modules(
284
+ "jixin0101/ObjectClear",
285
+ torch_dtype=torch.float16,
286
+ save_cross_attn=True,
287
+ cache_dir="/home/jovyan/shared/jixinzhao/models",
288
+ )
289
+
290
+ pipe.to(device)
291
+
292
+ @spaces.GPU
293
+ def process(image_state, interactive_state, mask_dropdown, guidance_scale, seed, num_inference_steps, strength
294
+ ):
295
+ generator = torch.Generator(device="cuda").manual_seed(seed)
296
+ image_np = image_state["origin_image"]
297
+ image = Image.fromarray(image_np)
298
+ if interactive_state["masks"]:
299
+ if len(mask_dropdown) == 0:
300
+ mask_dropdown = ["mask_001"]
301
+ mask_dropdown.sort()
302
+ template_mask = interactive_state["masks"][int(mask_dropdown[0].split("_")[1]) - 1] * (int(mask_dropdown[0].split("_")[1]))
303
+ for i in range(1,len(mask_dropdown)):
304
+ mask_number = int(mask_dropdown[i].split("_")[1]) - 1
305
+ template_mask = np.clip(template_mask+interactive_state["masks"][mask_number]*(mask_number+1), 0, mask_number+1)
306
+ image_state["mask"]= template_mask
307
+ else:
308
+ template_mask = image_state["mask"]
309
+ mask = Image.fromarray((template_mask).astype(np.uint8) * 255)
310
+ image_or = image.copy()
311
+
312
+ image = image.convert("RGB")
313
+ mask = mask.convert("RGB")
314
+
315
+ image = resize_by_short_side(image, 512, resample=Image.BICUBIC)
316
+ mask = resize_by_short_side(mask, 512, resample=Image.NEAREST)
317
+
318
+ w, h = image.size
319
+
320
+ result = pipe(
321
+ prompt="remove the instance of object",
322
+ image=image,
323
+ mask_image=mask,
324
+ generator=generator,
325
+ num_inference_steps=num_inference_steps,
326
+ strength=strength,
327
+ guidance_scale=guidance_scale,
328
+ height=h,
329
+ width=w,
330
+ )
331
+
332
+ inpainted_img = result[0].images[0]
333
+ attn_map = result[1]
334
+ attn_np = attn_map.mean(dim=1)[0].cpu().numpy() * 255.
335
+
336
+ fused_img = fuse_with_wavelet(np.array(image), np.array(inpainted_img), attn_np)
337
+ fused_img_pil = Image.fromarray(fused_img.astype(np.uint8))
338
+
339
+ return fused_img_pil.resize((image_or.size[:2])), (image.resize((image_or.size[:2])), fused_img_pil.resize((image_or.size[:2])))
340
+
341
+ import base64
342
+ with open("./Logo.png", "rb") as f:
343
+ img_bytes = f.read()
344
+ img_b64 = base64.b64encode(img_bytes).decode()
345
+
346
+ html_img = f'''
347
+ <div style="display:flex; justify-content:center; align-items:center; width:100%;">
348
+ <img src="data:image/png;base64,{img_b64}" style="border:none; width:200px; height:auto;"/>
349
+ </div>
350
+ '''
351
+
352
+ tutorial_url = "https://github.com/zjx0101/ObjectClear/releases/download/media/tutorial.mp4"
353
+ assets_path = os.path.join('/home/user/app/hugging_face/', "assets/")
354
+ load_file_from_url(tutorial_url, assets_path)
355
+
356
+ description = r"""
357
+ <b>Official Gradio demo</b> for <a href='https://github.com/zjx0101/ObjectClear' target='_blank'><b>ObjectClear: Complete Object Removal via Object-Effect Attention</b></a>.<br>
358
+ 🔥 ObjectClear is an object removal model that can jointly eliminate the target object and its associated effects by leveraging Object-Effect Attention, while preserving background consistency.<br>
359
+ 🖼️ Drop in your image, assign the target masks with a few clicks, and get the object removal result!<br>
360
+
361
+ *Note: Due to online GPU memory constraints, all input images are resized during inference so that the shortest side is 512 pixels.*<br>
362
+ """
363
+
364
+ article = r"""<h3>
365
+ <b>If ObjectClear is helpful, please help star the <a href='https://github.com/zjx0101/ObjectClear' target='_blank'>GitHub repo</a>. Thanks!</b></h3>
366
+ <hr>
367
+
368
+ 📑 **Citation**
369
+ <br>
370
+ If our work is useful for your research, please consider citing:
371
+ ```bibtex
372
+ @InProceedings{zhao2025ObjectClear,
373
+ title = {{ObjectClear}: Complete Object Removal via Object-Effect Attention},
374
+ author = {Zhao, Jixin and Zhou, Shangchen and Wang, Zhouxia and Yang, Peiqing and Loy, Chen Change},
375
+ booktitle = {arXiv preprint arXiv:2505.22636},
376
+ year = {2025}
377
+ }
378
+ ```
379
+ 📧 **Contact**
380
+ <br>
381
+ If you have any questions, please feel free to reach out to me at <b>[email protected]</b>.
382
+ <br>
383
+ 👏 **Acknowledgement**
384
+ <br>
385
+ This demo is adapted from [MatAnyone](https://github.com/pq-yang/MatAnyone) and leverages segmentation capabilities from [Segment Anything](https://github.com/facebookresearch/segment-anything). Thanks for their awesome work!
386
+ """
387
+
388
+ custom_css = """
389
+ #input-image {
390
+ aspect-ratio: 1 / 1;
391
+ width: 100%;
392
+ max-width: 100%;
393
+ height: auto;
394
+ display: flex;
395
+ align-items: center;
396
+ justify-content: center;
397
+ }
398
+
399
+ #input-image img {
400
+ max-width: 100%;
401
+ max-height: 100%;
402
+ object-fit: contain;
403
+ display: block;
404
+ }
405
+
406
+ #main-columns {
407
+ gap: 60px;
408
+ }
409
+
410
+ #main-columns > .gr-column {
411
+ flex: 1;
412
+ }
413
+
414
+ #compare-image {
415
+ width: 100%;
416
+ aspect-ratio: 1 / 1;
417
+ display: flex;
418
+ align-items: center;
419
+ justify-content: center;
420
+ margin: 0;
421
+ padding: 0;
422
+ max-width: 100%;
423
+ box-sizing: border-box;
424
+ }
425
+
426
+ #compare-image svg.svelte-zyxd38 {
427
+ position: absolute !important;
428
+ top: 50% !important;
429
+ left: 50% !important;
430
+ transform: translate(-50%, -50%) !important;
431
+ }
432
+
433
+ #compare-image .icon.svelte-1oiin9d {
434
+ position: absolute;
435
+ top: 50%;
436
+ left: 50%;
437
+ transform: translate(-50%, -50%);
438
+ }
439
+
440
+ #compare-image {
441
+ position: relative;
442
+ overflow: hidden;
443
+ }
444
+
445
+ .new_button {background-color: #171717 !important; color: #ffffff !important; border: none !important;}
446
+ .new_button:hover {background-color: #4b4b4b !important;}
447
+
448
+ #start-button {
449
+ background: linear-gradient(135deg, #2575fc 0%, #6a11cb 100%);
450
+ color: white;
451
+ border: none;
452
+ padding: 12px 24px;
453
+ font-size: 16px;
454
+ font-weight: bold;
455
+ border-radius: 12px;
456
+ cursor: pointer;
457
+ box-shadow: 0 0 12px rgba(100, 100, 255, 0.7);
458
+ transition: all 0.3s ease;
459
+ }
460
+ #start-button:hover {
461
+ transform: scale(1.05);
462
+ box-shadow: 0 0 20px rgba(100, 100, 255, 1);
463
+ }
464
+
465
+ <style>
466
+ .button-wrapper {
467
+ width: 30%;
468
+ text-align: center;
469
+ }
470
+ .wide-button {
471
+ width: 83% !important;
472
+ background-color: black !important;
473
+ color: white !important;
474
+ border: none !important;
475
+ padding: 8px 0 !important;
476
+ font-size: 16px !important;
477
+ display: inline-block;
478
+ margin: 30px 0px 0px 50px ;
479
+ }
480
+ .wide-button:hover {
481
+ background-color: #656262 !important;
482
+ }
483
+ </style>
484
+ """
485
+
486
+
487
+ with gr.Blocks(css=custom_css) as demo:
488
+ gr.HTML(html_img)
489
+ gr.Markdown(description)
490
+ with gr.Group(elem_classes="gr-monochrome-group", visible=True):
491
+ with gr.Row():
492
+ with gr.Accordion('SAM Settings (click to expand)', open=False):
493
+ with gr.Row():
494
+ point_prompt = gr.Radio(
495
+ choices=["Positive", "Negative"],
496
+ value="Positive",
497
+ label="Point Prompt",
498
+ info="Click to add positive or negative point for target mask",
499
+ interactive=True,
500
+ min_width=100,
501
+ scale=1)
502
+ mask_dropdown = gr.Dropdown(multiselect=True, value=[], label="Mask Selection", info="Choose 1~all mask(s) added in Step 2")
503
+
504
+ with gr.Row(elem_id="main-columns"):
505
+ with gr.Column():
506
+
507
+ click_state = gr.State([[],[]])
508
+
509
+ interactive_state = gr.State(
510
+ {
511
+ "mask_names": [],
512
+ "masks": []
513
+ }
514
+ )
515
+
516
+ image_state = gr.State(
517
+ {
518
+ "origin_image": None,
519
+ "painted_image": None,
520
+ "mask": None,
521
+ "logit": None
522
+ }
523
+ )
524
+
525
+ image_info = gr.Textbox(label="Image Info", visible=False)
526
+ input_image = gr.Image(
527
+ label='Input',
528
+ type='pil',
529
+ sources=["upload"],
530
+ image_mode='RGB',
531
+ interactive=True,
532
+ elem_id="input-image"
533
+ )
534
+
535
+ with gr.Row(equal_height=True, elem_classes="mask_button_group"):
536
+ clear_button_click = gr.Button(value="Clear Clicks",elem_classes="new_button", min_width=100)
537
+ add_mask_button = gr.Button(value="Add Mask", elem_classes="new_button", min_width=100)
538
+ remove_mask_button = gr.Button(value="Delete Mask", elem_classes="new_button", min_width=100)
539
+
540
+ submit_button_component = gr.Button(
541
+ value='Start ObjectClear', elem_id="start-button"
542
+ )
543
+
544
+ with gr.Accordion('ObjectClear Settings', open=True):
545
+ strength = gr.Radio(
546
+ choices=[0.99, 1.0],
547
+ value=0.99,
548
+ label="Strength",
549
+ info="0.99 better preserves the background and color; use 1.0 if object/shadow is not fully removed (default: 0.99)"
550
+ )
551
+
552
+ guidance_scale = gr.Slider(
553
+ minimum=1, maximum=10, step=0.5, value=2.5,
554
+ label="Guidance Scale",
555
+ info="Higher = stronger removal; lower = better background preservation (default: 2.5)"
556
+ )
557
+
558
+ seed = gr.Slider(
559
+ minimum=0, maximum=1000000, step=1, value=300000,
560
+ label="Seed Value",
561
+ info="Different seeds can lead to noticeably different object removal results (default: 300000)"
562
+ )
563
+
564
+ num_inference_steps = gr.Slider(
565
+ minimum=1, maximum=40, step=1, value=20,
566
+ label="Num Inference Steps",
567
+ info="Higher values may improve quality but take longer (default: 20)"
568
+ )
569
+
570
+
571
+ with gr.Column():
572
+ output_image_component = gr.Image(
573
+ type='pil', image_mode='RGB', label='Output', format="png", elem_id="input-image")
574
+
575
+ output_compare_image_component = gr.ImageSlider(
576
+ label="Comparison",
577
+ type="pil",
578
+ format='png',
579
+ elem_id="compare-image"
580
+ )
581
+
582
+ input_image.upload(
583
+ fn=upload_and_reset,
584
+ inputs=[input_image, interactive_state],
585
+ outputs=[
586
+ image_state,
587
+ image_info,
588
+ input_image,
589
+ interactive_state,
590
+ click_state,
591
+ mask_dropdown,
592
+ ]
593
+ )
594
+
595
+ # click select image to get mask using sam
596
+ input_image.select(
597
+ fn=sam_refine,
598
+ inputs=[image_state, point_prompt, click_state],
599
+ outputs=[input_image, image_state, click_state]
600
+ )
601
+
602
+ # add different mask
603
+ add_mask_button.click(
604
+ fn=add_multi_mask,
605
+ inputs=[image_state, interactive_state, mask_dropdown],
606
+ outputs=[interactive_state, mask_dropdown, input_image, click_state]
607
+ )
608
+
609
+ remove_mask_button.click(
610
+ fn=remove_multi_mask,
611
+ inputs=[interactive_state, click_state, image_state],
612
+ outputs=[interactive_state, mask_dropdown, input_image, click_state]
613
+ )
614
+
615
+ # points clear
616
+ clear_button_click.click(
617
+ fn = clear_click,
618
+ inputs = [image_state, click_state,],
619
+ outputs = [input_image, click_state],
620
+ )
621
+
622
+ submit_button_component.click(
623
+ fn=process,
624
+ inputs=[
625
+ image_state,
626
+ interactive_state,
627
+ mask_dropdown,
628
+ guidance_scale,
629
+ seed,
630
+ num_inference_steps,
631
+ strength
632
+ ],
633
+ outputs=[
634
+ output_image_component, output_compare_image_component
635
+ ]
636
+ )
637
+
638
+ with gr.Accordion("📕 Video Tutorial (click to expand)", open=False, elem_classes="custom-bg"):
639
+ with gr.Row():
640
+ gr.Video(value="/home/user/app/hugging_face/assets/tutorial.mp4", elem_classes="video")
641
+
642
+ gr.Markdown("---")
643
+ gr.Markdown("## Examples")
644
+
645
+ example_images = [
646
+ os.path.join(os.path.dirname(__file__), "examples", f"test{i}.png")
647
+ for i in range(10)
648
+ ]
649
+
650
+ examples_data = [
651
+ [example_images[i], None] for i in range(len(example_images))
652
+ ]
653
+
654
+ examples = gr.Examples(
655
+ examples=examples_data,
656
+ inputs=[input_image, interactive_state],
657
+ outputs=[image_state, image_info, input_image,
658
+ interactive_state, click_state, mask_dropdown],
659
+ fn=upload_and_reset,
660
+ run_on_click=True,
661
+ cache_examples=False,
662
+ label="Click below to load example images"
663
+ )
664
+
665
+ gr.Markdown(article)
666
+
667
+ def pre_update_input_image():
668
+ return gr.update(value=None)
669
+
670
+ demo.load(
671
+ fn=pre_update_input_image,
672
+ inputs=[],
673
+ outputs=[input_image]
674
+ )
675
+
676
+
677
+ demo.launch(debug=True, show_error=True)
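
A minimal sketch of driving the same removal pipeline headlessly, mirroring the calls app.py makes above; the file paths are placeholders and the parameter values simply reuse the UI defaults. Note that app.py additionally resizes inputs so the shortest side is 512 px and both dimensions are multiples of 16 before calling the pipeline.

```python
# Illustrative sketch (assumptions: placeholder paths, UI-default parameters).
import torch
from PIL import Image
from pipeline_objectclear import ObjectClearPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = ObjectClearPipeline.from_pretrained_with_custom_modules(
    "jixin0101/ObjectClear",
    torch_dtype=torch.float16,
    save_cross_attn=True,  # app.py uses the returned attention map for wavelet fusion
)
pipe.to(device)

image = Image.open("input.png").convert("RGB")   # placeholder path
mask = Image.open("mask.png").convert("RGB")     # white = object (and its effects) to remove

result = pipe(
    prompt="remove the instance of object",
    image=image,
    mask_image=mask,
    generator=torch.Generator(device=device).manual_seed(300000),
    num_inference_steps=20,
    strength=0.99,
    guidance_scale=2.5,
    height=image.height,   # app.py first resizes to a 512-short-side, /16-aligned size
    width=image.width,
)
result[0].images[0].save("output.png")  # result[1] is the cross-attention map
```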
examples/test0.png ADDED

Git LFS Details

  • SHA256: 66cb4a2ef645cdb1e2e9c68892b6e94c38211673f97c9eaa09c6f1998788cee4
  • Pointer size: 131 Bytes
  • Size of remote file: 724 kB
examples/test1.png ADDED

Git LFS Details

  • SHA256: 097677dbe298b5b20f580ed7f42684b5ea2d8a2b011c567a88a346897ddd2b1a
  • Pointer size: 131 Bytes
  • Size of remote file: 617 kB
examples/test2.png ADDED

Git LFS Details

  • SHA256: 738793c28578dd0acf7fcf1111d58b307045984a1e5dbdedc65f6ce1644f11dc
  • Pointer size: 131 Bytes
  • Size of remote file: 467 kB
examples/test3.png ADDED

Git LFS Details

  • SHA256: b5dbd3dccc28294bcdf719b7b5a1e098f46d0e74cf5fbdc05ee3419f3d9ffd2c
  • Pointer size: 131 Bytes
  • Size of remote file: 817 kB
examples/test4.png ADDED

Git LFS Details

  • SHA256: 348a0175866d26b31b4035fb9864efe5578a483b66b6e951879756b5c04c7190
  • Pointer size: 131 Bytes
  • Size of remote file: 602 kB
examples/test5.png ADDED

Git LFS Details

  • SHA256: 59e3857564e9aafd6d1d3aceb3944da0a2e94682e47460315934dfdf623ce758
  • Pointer size: 131 Bytes
  • Size of remote file: 522 kB
examples/test6.png ADDED

Git LFS Details

  • SHA256: 0ac8ba5cfe64e48caa296640a83980f3b4e177432bc3e3512e715c736895100f
  • Pointer size: 131 Bytes
  • Size of remote file: 548 kB
examples/test7.png ADDED

Git LFS Details

  • SHA256: 3dac86ced73d33ae75143978112757695afd763a134bb9b7bde344fe22d46897
  • Pointer size: 131 Bytes
  • Size of remote file: 570 kB
examples/test8.png ADDED

Git LFS Details

  • SHA256: dfbe01aa72c61b03ae5c9f57bdb31abd95460ec20b9dc03260847b0cc668ec85
  • Pointer size: 131 Bytes
  • Size of remote file: 291 kB
examples/test9.png ADDED

Git LFS Details

  • SHA256: ba42d15d3c7a4419bc0dad7896edacec91d80c8dff8dc35a0ea74251957eb5e1
  • Pointer size: 131 Bytes
  • Size of remote file: 849 kB
model.py ADDED
@@ -0,0 +1,115 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torchvision.transforms as T
5
+ from transformers.models.clip.modeling_clip import (
6
+ CLIPTextTransformer,
7
+ CLIPPreTrainedModel,
8
+ CLIPModel,
9
+ )
10
+
11
+
12
+ class CLIPImageEncoder(CLIPPreTrainedModel):
13
+ @staticmethod
14
+ def from_pretrained(
15
+ global_model_name_or_path,
16
+ cache_dir
17
+ ):
18
+ model = CLIPModel.from_pretrained(
19
+ global_model_name_or_path,
20
+ subfolder="image_prompt_encoder",
21
+ cache_dir=cache_dir
22
+ )
23
+ vision_model = model.vision_model
24
+ visual_projection = model.visual_projection
25
+ vision_processor = T.Normalize(
26
+ (0.48145466, 0.4578275, 0.40821073),
27
+ (0.26862954, 0.26130258, 0.27577711),
28
+ )
29
+ return CLIPImageEncoder(
30
+ vision_model,
31
+ visual_projection,
32
+ vision_processor,
33
+ )
34
+
35
+ def __init__(
36
+ self,
37
+ vision_model,
38
+ visual_projection,
39
+ vision_processor,
40
+ ):
41
+ super().__init__(vision_model.config)
42
+ self.vision_model = vision_model
43
+ self.visual_projection = visual_projection
44
+ self.vision_processor = vision_processor
45
+
46
+ self.image_size = vision_model.config.image_size
47
+
48
+ def forward(self, object_pixel_values):
49
+ b, c, h, w = object_pixel_values.shape
50
+
51
+ if h != self.image_size or w != self.image_size:
52
+ h, w = self.image_size, self.image_size
53
+ object_pixel_values = F.interpolate(
54
+ object_pixel_values, (h, w), mode="bilinear", antialias=True
55
+ )
56
+
57
+ object_pixel_values = self.vision_processor(object_pixel_values)
58
+ object_embeds = self.vision_model(object_pixel_values)[1]
59
+ object_embeds = self.visual_projection(object_embeds)
60
+ object_embeds = object_embeds.view(b, 1, -1)
61
+ return object_embeds
62
+
63
+
64
+ class MLP(nn.Module):
65
+ def __init__(self, in_dim, out_dim, hidden_dim, use_residual=True):
66
+ super().__init__()
67
+ if use_residual:
68
+ assert in_dim == out_dim
69
+ self.layernorm = nn.LayerNorm(in_dim)
70
+ self.fc1 = nn.Linear(in_dim, hidden_dim)
71
+ self.fc2 = nn.Linear(hidden_dim, out_dim)
72
+ self.use_residual = use_residual
73
+ self.act_fn = nn.GELU()
74
+
75
+ def forward(self, x):
76
+ residual = x
77
+ x = self.layernorm(x)
78
+ x = self.fc1(x)
79
+ x = self.act_fn(x)
80
+ x = self.fc2(x)
81
+ if self.use_residual:
82
+ x = x + residual
83
+ return x
84
+
85
+ class PostfuseModule(nn.Module):
86
+ def __init__(self, embed_dim, embed_dim_img):
87
+ super().__init__()
88
+ self.mlp1 = MLP(embed_dim_img, embed_dim, embed_dim, use_residual=False)
89
+ self.mlp2 = MLP(embed_dim, embed_dim, embed_dim, use_residual=True)
90
+ self.layer_norm = nn.LayerNorm(embed_dim)
91
+
92
+ @property
93
+ def dtype(self):
94
+ try:
95
+ return next(self.parameters()).dtype
96
+ except StopIteration:
97
+ return torch.float32
98
+
99
+ def fuse_fn(self, object_embeds):
100
+ text_object_embeds = self.mlp1(object_embeds)
101
+ text_object_embeds = self.mlp2(text_object_embeds)
102
+ text_object_embeds = self.layer_norm(text_object_embeds)
103
+ return text_object_embeds
104
+
105
+ def forward(
106
+ self,
107
+ text_embeds,
108
+ object_embeds,
109
+ fuse_index,
110
+ ) -> torch.Tensor:
111
+ text_object_embed = self.fuse_fn(object_embeds)
112
+ text_embeds_new = text_embeds.clone()
113
+ text_embeds_new[:, fuse_index, :] = text_object_embed.squeeze(1)
114
+
115
+ return text_embeds_new
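
At inference time the two modules above are combined roughly as follows: CLIPImageEncoder turns an object crop into a single embedding, and PostfuseModule injects that embedding into the prompt embedding at one token position. Below is a shape-level sketch with random tensors; the dimensions and fuse_index are assumptions for illustration, not values read from the checkpoint.

```python
# Shape-level sketch of PostfuseModule; all dimensions and fuse_index are assumed values.
import torch
from model import PostfuseModule

embed_dim, embed_dim_img, seq_len = 1024, 768, 77   # assumed text dim, image dim, prompt length
postfuse = PostfuseModule(embed_dim=embed_dim, embed_dim_img=embed_dim_img)

text_embeds = torch.randn(2, seq_len, embed_dim)     # prompt embeddings (B, L, D_text)
object_embeds = torch.randn(2, 1, embed_dim_img)     # CLIPImageEncoder output (B, 1, D_img)
fuse_index = 5                                        # hypothetical token slot to overwrite

fused = postfuse(text_embeds, object_embeds, fuse_index)
assert fused.shape == text_embeds.shape              # only the embedding at fuse_index changes
```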
pipeline_objectclear.py ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch==2.2.0
2
+ torchvision
3
+ numpy==1.26.4
4
+ opencv-python
5
+ pillow
6
+ transformers
7
+ scipy
8
+ diffusers
9
+ segment-anything
10
+ matplotlib
tools/__init__.py ADDED
File without changes
tools/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (168 Bytes).
 
tools/__pycache__/base_segmenter.cpython-310.pyc ADDED
Binary file (4.11 kB).
 
tools/__pycache__/download_util.cpython-310.pyc ADDED
Binary file (3.5 kB).
 
tools/__pycache__/interact_tools.cpython-310.pyc ADDED
Binary file (2.49 kB).
 
tools/__pycache__/mask_painter.cpython-310.pyc ADDED
Binary file (6.52 kB).
 
tools/__pycache__/misc.cpython-310.pyc ADDED
Binary file (4.34 kB).
 
tools/__pycache__/painter.cpython-310.pyc ADDED
Binary file (4.81 kB).
 
tools/base_segmenter.py ADDED
@@ -0,0 +1,129 @@
1
+ import time
2
+ import torch
3
+ import cv2
4
+ from PIL import Image, ImageDraw, ImageOps
5
+ import numpy as np
6
+ from typing import Union
7
+ from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
8
+ import matplotlib.pyplot as plt
9
+ import PIL
10
+ from .mask_painter import mask_painter
11
+
12
+
13
+ class BaseSegmenter:
14
+ def __init__(self, SAM_checkpoint, model_type, device='cuda:0'):
15
+ """
16
+ device: model device
17
+ SAM_checkpoint: path of SAM checkpoint
18
+ model_type: vit_b, vit_l, vit_h
19
+ """
20
+ print(f"Initializing BaseSegmenter to {device}")
21
+ assert model_type in ['vit_b', 'vit_l', 'vit_h'], 'model_type must be vit_b, vit_l, or vit_h'
22
+
23
+ self.device = device
24
+ self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
25
+ self.model = sam_model_registry[model_type](checkpoint=SAM_checkpoint)
26
+ self.model.to(device=self.device)
27
+ self.predictor = SamPredictor(self.model)
28
+ self.embedded = False
29
+
30
+ @torch.no_grad()
31
+ def set_image(self, image: np.ndarray):
32
+ # PIL.open(image_path) 3channel: RGB
33
+ # image embedding: avoid encoding the same image multiple times
34
+ self.orignal_image = image
35
+ if self.embedded:
36
+ print('repeat embedding, please reset_image.')
37
+ return
38
+ self.predictor.set_image(image)
39
+ self.embedded = True
40
+ return
41
+
42
+ @torch.no_grad()
43
+ def reset_image(self):
44
+ # reset image embedding
45
+ self.predictor.reset_image()
46
+ self.embedded = False
47
+
48
+ def predict(self, prompts, mode, multimask=True):
49
+ """
50
+ image: numpy array, h, w, 3
51
+ prompts: dictionary, 3 keys: 'point_coords', 'point_labels', 'mask_input'
52
+ prompts['point_coords']: numpy array [N,2]
53
+ prompts['point_labels']: numpy array [1,N]
54
+ prompts['mask_input']: numpy array [1,256,256]
55
+ mode: 'point' (points only), 'mask' (mask only), 'both' (consider both)
56
+ multimask: True (return 3 masks), False (return 1 mask only)
57
+ when multimask=True, mask_input=logits[np.argmax(scores), :, :][None, :, :]
58
+ """
59
+ assert self.embedded, 'prediction is called before set_image (feature embedding).'
60
+ assert mode in ['point', 'mask', 'both'], 'mode must be point, mask, or both'
61
+
62
+ if mode == 'point':
63
+ masks, scores, logits = self.predictor.predict(point_coords=prompts['point_coords'],
64
+ point_labels=prompts['point_labels'],
65
+ multimask_output=multimask)
66
+ elif mode == 'mask':
67
+ masks, scores, logits = self.predictor.predict(mask_input=prompts['mask_input'],
68
+ multimask_output=multimask)
69
+ elif mode == 'both': # both
70
+ masks, scores, logits = self.predictor.predict(point_coords=prompts['point_coords'],
71
+ point_labels=prompts['point_labels'],
72
+ mask_input=prompts['mask_input'],
73
+ multimask_output=multimask)
74
+ else:
75
+ raise("Not implement now!")
76
+ # masks (n, h, w), scores (n,), logits (n, 256, 256)
77
+ return masks, scores, logits
78
+
79
+
80
+ if __name__ == "__main__":
81
+ # load and show an image
82
+ image = cv2.imread('/hhd3/gaoshang/truck.jpg')
83
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # numpy array (h, w, 3)
84
+
85
+ # initialise BaseSegmenter
86
+ SAM_checkpoint= '/ssd1/gaomingqi/checkpoints/sam_vit_h_4b8939.pth'
87
+ model_type = 'vit_h'
88
+ device = "cuda:4"
89
+ base_segmenter = BaseSegmenter(SAM_checkpoint=SAM_checkpoint, model_type=model_type, device=device)
90
+
91
+ # image embedding (once embedded, multiple prompts can be applied)
92
+ base_segmenter.set_image(image)
93
+
94
+ # examples
95
+ # point only ------------------------
96
+ mode = 'point'
97
+ prompts = {
98
+ 'point_coords': np.array([[500, 375], [1125, 625]]),
99
+ 'point_labels': np.array([1, 1]),
100
+ }
101
+ masks, scores, logits = base_segmenter.predict(prompts, mode, multimask=False) # masks (n, h, w), scores (n,), logits (n, 256, 256)
102
+ painted_image = mask_painter(image, masks[np.argmax(scores)].astype('uint8'), background_alpha=0.8)
103
+ painted_image = cv2.cvtColor(painted_image, cv2.COLOR_RGB2BGR) # numpy array (h, w, 3)
104
+ cv2.imwrite('/hhd3/gaoshang/truck_point.jpg', painted_image)
105
+
106
+ # both ------------------------
107
+ mode = 'both'
108
+ mask_input = logits[np.argmax(scores), :, :]
109
+ prompts = {'mask_input': mask_input [None, :, :]}
110
+ prompts = {
111
+ 'point_coords': np.array([[500, 375], [1125, 625]]),
112
+ 'point_labels': np.array([1, 0]),
113
+ 'mask_input': mask_input[None, :, :]
114
+ }
115
+ masks, scores, logits = base_segmenter.predict(prompts, mode, multimask=True) # masks (n, h, w), scores (n,), logits (n, 256, 256)
116
+ painted_image = mask_painter(image, masks[np.argmax(scores)].astype('uint8'), background_alpha=0.8)
117
+ painted_image = cv2.cvtColor(painted_image, cv2.COLOR_RGB2BGR) # numpy array (h, w, 3)
118
+ cv2.imwrite('/hhd3/gaoshang/truck_both.jpg', painted_image)
119
+
120
+ # mask only ------------------------
121
+ mode = 'mask'
122
+ mask_input = logits[np.argmax(scores), :, :]
123
+
124
+ prompts = {'mask_input': mask_input[None, :, :]}
125
+
126
+ masks, scores, logits = base_segmenter.predict(prompts, mode, multimask=True) # masks (n, h, w), scores (n,), logits (n, 256, 256)
127
+ painted_image = mask_painter(image, masks[np.argmax(scores)].astype('uint8'), background_alpha=0.8)
128
+ painted_image = cv2.cvtColor(painted_image, cv2.COLOR_RGB2BGR) # numpy array (h, w, 3)
129
+ cv2.imwrite('/hhd3/gaoshang/truck_mask.jpg', painted_image)
tools/download_util.py ADDED
@@ -0,0 +1,109 @@
1
+ import math
2
+ import os
3
+ import requests
4
+ from torch.hub import download_url_to_file, get_dir
5
+ from tqdm import tqdm
6
+ from urllib.parse import urlparse
7
+
8
+ def sizeof_fmt(size, suffix='B'):
9
+ """Get human readable file size.
10
+
11
+ Args:
12
+ size (int): File size.
13
+ suffix (str): Suffix. Default: 'B'.
14
+
15
+ Return:
16
+ str: Formatted file size.
17
+ """
18
+ for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
19
+ if abs(size) < 1024.0:
20
+ return f'{size:3.1f} {unit}{suffix}'
21
+ size /= 1024.0
22
+ return f'{size:3.1f} Y{suffix}'
23
+
24
+
25
+ def download_file_from_google_drive(file_id, save_path):
26
+ """Download files from google drive.
27
+ Ref:
28
+ https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive # noqa E501
29
+ Args:
30
+ file_id (str): File id.
31
+ save_path (str): Save path.
32
+ """
33
+
34
+ session = requests.Session()
35
+ URL = 'https://docs.google.com/uc?export=download'
36
+ params = {'id': file_id}
37
+
38
+ response = session.get(URL, params=params, stream=True)
39
+ token = get_confirm_token(response)
40
+ if token:
41
+ params['confirm'] = token
42
+ response = session.get(URL, params=params, stream=True)
43
+
44
+ # get file size
45
+ response_file_size = session.get(URL, params=params, stream=True, headers={'Range': 'bytes=0-2'})
46
+ print(response_file_size)
47
+ if 'Content-Range' in response_file_size.headers:
48
+ file_size = int(response_file_size.headers['Content-Range'].split('/')[1])
49
+ else:
50
+ file_size = None
51
+
52
+ save_response_content(response, save_path, file_size)
53
+
54
+
55
+ def get_confirm_token(response):
56
+ for key, value in response.cookies.items():
57
+ if key.startswith('download_warning'):
58
+ return value
59
+ return None
60
+
61
+
62
+ def save_response_content(response, destination, file_size=None, chunk_size=32768):
63
+ if file_size is not None:
64
+ pbar = tqdm(total=math.ceil(file_size / chunk_size), unit='chunk')
65
+
66
+ readable_file_size = sizeof_fmt(file_size)
67
+ else:
68
+ pbar = None
69
+
70
+ with open(destination, 'wb') as f:
71
+ downloaded_size = 0
72
+ for chunk in response.iter_content(chunk_size):
73
+ downloaded_size += chunk_size
74
+ if pbar is not None:
75
+ pbar.update(1)
76
+ pbar.set_description(f'Download {sizeof_fmt(downloaded_size)} / {readable_file_size}')
77
+ if chunk: # filter out keep-alive new chunks
78
+ f.write(chunk)
79
+ if pbar is not None:
80
+ pbar.close()
81
+
82
+
83
+ def load_file_from_url(url, model_dir=None, progress=True, file_name=None):
84
+ """Load file form http url, will download models if necessary.
85
+ Ref:https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py
86
+ Args:
87
+ url (str): URL to be downloaded.
88
+ model_dir (str): The path to save the downloaded model. Should be a full path. If None, use pytorch hub_dir.
89
+ Default: None.
90
+ progress (bool): Whether to show the download progress. Default: True.
91
+ file_name (str): The downloaded file name. If None, use the file name in the url. Default: None.
92
+ Returns:
93
+ str: The path to the downloaded file.
94
+ """
95
+ if model_dir is None: # use the pytorch hub_dir
96
+ hub_dir = get_dir()
97
+ model_dir = os.path.join(hub_dir, 'checkpoints')
98
+
99
+ os.makedirs(model_dir, exist_ok=True)
100
+
101
+ parts = urlparse(url)
102
+ filename = os.path.basename(parts.path)
103
+ if file_name is not None:
104
+ filename = file_name
105
+ cached_file = os.path.abspath(os.path.join(model_dir, filename))
106
+ if not os.path.exists(cached_file):
107
+ print(f'Downloading: "{url}" to {cached_file}\n')
108
+ download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
109
+ return cached_file
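
load_file_from_url is how app.py fetches the SAM checkpoint and the tutorial video; it downloads once and returns the cached path on later calls. A short usage sketch (the target directory is a placeholder):

```python
# Usage sketch: fetch (or reuse a cached copy of) the SAM ViT-H checkpoint.
from tools.download_util import load_file_from_url

url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
ckpt_path = load_file_from_url(url, model_dir="./pretrained_models", progress=True)
print(ckpt_path)  # absolute path to the cached file
```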
tools/interact_tools.py ADDED
@@ -0,0 +1,99 @@
1
+ import time
2
+ import torch
3
+ import cv2
4
+ from PIL import Image, ImageDraw, ImageOps
5
+ import numpy as np
6
+ from typing import Union
7
+ from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
8
+ import matplotlib.pyplot as plt
9
+ import PIL
10
+ from .mask_painter import mask_painter as mask_painter2
11
+ from .base_segmenter import BaseSegmenter
12
+ from .painter import mask_painter, point_painter
13
+ import os
14
+ import requests
15
+ import sys
16
+
17
+
18
+ mask_color = 3
19
+ mask_alpha = 0.7
20
+ contour_color = 1
21
+ contour_width = 5
22
+ point_color_ne = 8
23
+ point_color_ps = 50
24
+ point_alpha = 0.9
25
+ point_radius = 15
26
+ contour_color = 2
27
+ contour_width = 5
28
+
29
+
30
+ class SamControler():
31
+ def __init__(self, SAM_checkpoint, model_type, device):
32
+ '''
33
+ initialize sam controler
34
+ '''
35
+ self.sam_controler = BaseSegmenter(SAM_checkpoint, model_type, device)
36
+
37
+
38
+ # def seg_again(self, image: np.ndarray):
39
+ # '''
40
+ # it is used when interact in video
41
+ # '''
42
+ # self.sam_controler.reset_image()
43
+ # self.sam_controler.set_image(image)
44
+ # return
45
+
46
+
47
+ def first_frame_click(self, image: np.ndarray, points:np.ndarray, labels: np.ndarray, multimask=True,mask_color=3):
48
+ '''
49
+ used to segment the clicked object on a single frame
50
+ return: mask, logit, painted image(mask+point)
51
+ '''
52
+ # self.sam_controler.set_image(image)
53
+ origal_image = self.sam_controler.orignal_image
54
+ neg_flag = labels[-1]
55
+ if neg_flag==1:
56
+ #find neg
57
+ prompts = {
58
+ 'point_coords': points,
59
+ 'point_labels': labels,
60
+ }
61
+ masks, scores, logits = self.sam_controler.predict(prompts, 'point', multimask)
62
+ mask, logit = masks[np.argmax(scores)], logits[np.argmax(scores), :, :]
63
+ prompts = {
64
+ 'point_coords': points,
65
+ 'point_labels': labels,
66
+ 'mask_input': logit[None, :, :]
67
+ }
68
+ masks, scores, logits = self.sam_controler.predict(prompts, 'both', multimask)
69
+ mask, logit = masks[np.argmax(scores)], logits[np.argmax(scores), :, :]
70
+ else:
71
+ #find positive
72
+ prompts = {
73
+ 'point_coords': points,
74
+ 'point_labels': labels,
75
+ }
76
+ masks, scores, logits = self.sam_controler.predict(prompts, 'point', multimask)
77
+ mask, logit = masks[np.argmax(scores)], logits[np.argmax(scores), :, :]
78
+
79
+
80
+ assert len(points)==len(labels)
81
+
82
+ painted_image = mask_painter(image, mask.astype('uint8'), mask_color, mask_alpha, contour_color, contour_width)
83
+ painted_image = point_painter(painted_image, np.squeeze(points[np.argwhere(labels>0)],axis = 1), point_color_ne, point_alpha, point_radius, contour_color, contour_width)
84
+ painted_image = point_painter(painted_image, np.squeeze(points[np.argwhere(labels<1)],axis = 1), point_color_ps, point_alpha, point_radius, contour_color, contour_width)
85
+ painted_image = Image.fromarray(painted_image)
86
+
87
+ return mask, logit, painted_image
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
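
SamControler wraps BaseSegmenter for click-based prompting; app.py's sam_refine builds the point/label arrays from UI clicks and calls first_frame_click. Below is a sketch of the same flow outside the UI; the checkpoint path, device, and click coordinates are placeholders.

```python
# Sketch: one positive click -> mask, following app.py's sam_refine flow.
import numpy as np
from PIL import Image
from tools.interact_tools import SamControler

controler = SamControler("./pretrained_models/sam_vit_h_4b8939.pth", "vit_h", "cuda:0")

image = np.array(Image.open("input.png").convert("RGB"))  # placeholder image
controler.sam_controler.set_image(image)                   # embed once, prompt many times

points = np.array([[320, 240]])  # (x, y) of a positive click
labels = np.array([1])           # 1 = positive, 0 = negative
mask, logit, painted = controler.first_frame_click(image, points, labels, multimask=True)
```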
tools/mask_painter.py ADDED
@@ -0,0 +1,288 @@
1
+ import cv2
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ import copy
6
+ import time
7
+
8
+
9
+ def colormap(rgb=True):
10
+ color_list = np.array(
11
+ [
12
+ 0.000, 0.000, 0.000,
13
+ 1.000, 1.000, 1.000,
14
+ 1.000, 0.498, 0.313,
15
+ 0.392, 0.581, 0.929,
16
+ 0.000, 0.447, 0.741,
17
+ 0.850, 0.325, 0.098,
18
+ 0.929, 0.694, 0.125,
19
+ 0.494, 0.184, 0.556,
20
+ 0.466, 0.674, 0.188,
21
+ 0.301, 0.745, 0.933,
22
+ 0.635, 0.078, 0.184,
23
+ 0.300, 0.300, 0.300,
24
+ 0.600, 0.600, 0.600,
25
+ 1.000, 0.000, 0.000,
26
+ 1.000, 0.500, 0.000,
27
+ 0.749, 0.749, 0.000,
28
+ 0.000, 1.000, 0.000,
29
+ 0.000, 0.000, 1.000,
30
+ 0.667, 0.000, 1.000,
31
+ 0.333, 0.333, 0.000,
32
+ 0.333, 0.667, 0.000,
33
+ 0.333, 1.000, 0.000,
34
+ 0.667, 0.333, 0.000,
35
+ 0.667, 0.667, 0.000,
36
+ 0.667, 1.000, 0.000,
37
+ 1.000, 0.333, 0.000,
38
+ 1.000, 0.667, 0.000,
39
+ 1.000, 1.000, 0.000,
40
+ 0.000, 0.333, 0.500,
41
+ 0.000, 0.667, 0.500,
42
+ 0.000, 1.000, 0.500,
43
+ 0.333, 0.000, 0.500,
44
+ 0.333, 0.333, 0.500,
45
+ 0.333, 0.667, 0.500,
46
+ 0.333, 1.000, 0.500,
47
+ 0.667, 0.000, 0.500,
48
+ 0.667, 0.333, 0.500,
49
+ 0.667, 0.667, 0.500,
50
+ 0.667, 1.000, 0.500,
51
+ 1.000, 0.000, 0.500,
52
+ 1.000, 0.333, 0.500,
53
+ 1.000, 0.667, 0.500,
54
+ 1.000, 1.000, 0.500,
55
+ 0.000, 0.333, 1.000,
56
+ 0.000, 0.667, 1.000,
57
+ 0.000, 1.000, 1.000,
58
+ 0.333, 0.000, 1.000,
59
+ 0.333, 0.333, 1.000,
60
+ 0.333, 0.667, 1.000,
61
+ 0.333, 1.000, 1.000,
62
+ 0.667, 0.000, 1.000,
63
+ 0.667, 0.333, 1.000,
64
+ 0.667, 0.667, 1.000,
65
+ 0.667, 1.000, 1.000,
66
+ 1.000, 0.000, 1.000,
67
+ 1.000, 0.333, 1.000,
68
+ 1.000, 0.667, 1.000,
69
+ 0.167, 0.000, 0.000,
70
+ 0.333, 0.000, 0.000,
71
+ 0.500, 0.000, 0.000,
72
+ 0.667, 0.000, 0.000,
73
+ 0.833, 0.000, 0.000,
74
+ 1.000, 0.000, 0.000,
75
+ 0.000, 0.167, 0.000,
76
+ 0.000, 0.333, 0.000,
77
+ 0.000, 0.500, 0.000,
78
+ 0.000, 0.667, 0.000,
79
+ 0.000, 0.833, 0.000,
80
+ 0.000, 1.000, 0.000,
81
+ 0.000, 0.000, 0.167,
82
+ 0.000, 0.000, 0.333,
83
+ 0.000, 0.000, 0.500,
84
+ 0.000, 0.000, 0.667,
85
+ 0.000, 0.000, 0.833,
86
+ 0.000, 0.000, 1.000,
87
+ 0.143, 0.143, 0.143,
88
+ 0.286, 0.286, 0.286,
89
+ 0.429, 0.429, 0.429,
90
+ 0.571, 0.571, 0.571,
91
+ 0.714, 0.714, 0.714,
92
+ 0.857, 0.857, 0.857
93
+ ]
94
+ ).astype(np.float32)
95
+ color_list = color_list.reshape((-1, 3)) * 255
96
+ if not rgb:
97
+ color_list = color_list[:, ::-1]
98
+ return color_list
99
+
100
+
101
+ color_list = colormap()
102
+ color_list = color_list.astype('uint8').tolist()
103
+
104
+
105
+ def vis_add_mask(image, background_mask, contour_mask, background_color, contour_color, background_alpha, contour_alpha):
106
+ background_color = np.array(background_color)
107
+ contour_color = np.array(contour_color)
108
+
109
+ # background_mask = 1 - background_mask
110
+ # contour_mask = 1 - contour_mask
111
+
112
+ for i in range(3):
113
+ image[:, :, i] = image[:, :, i] * (1-background_alpha+background_mask*background_alpha) \
114
+ + background_color[i] * (background_alpha-background_mask*background_alpha)
115
+
116
+ image[:, :, i] = image[:, :, i] * (1-contour_alpha+contour_mask*contour_alpha) \
117
+ + contour_color[i] * (contour_alpha-contour_mask*contour_alpha)
118
+
119
+ return image.astype('uint8')
120
+
121
+
122
+ def mask_generator_00(mask, background_radius, contour_radius):
123
+ # no background width when '00'
124
+ # distance map
125
+ dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
126
+ dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
127
+ dist_map = dist_transform_fore - dist_transform_back
128
+ # ...:::!!!:::...
129
+ contour_radius += 2
130
+ contour_mask = np.abs(np.clip(dist_map, -contour_radius, contour_radius))
131
+ contour_mask = contour_mask / np.max(contour_mask)
132
+ contour_mask[contour_mask>0.5] = 1.
133
+
134
+ return mask, contour_mask
135
+
136
+
137
+ def mask_generator_01(mask, background_radius, contour_radius):
138
+ # no background width when '01'
139
+ # distance map
140
+ dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
141
+ dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
142
+ dist_map = dist_transform_fore - dist_transform_back
143
+ # ...:::!!!:::...
144
+ contour_radius += 2
145
+ contour_mask = np.abs(np.clip(dist_map, -contour_radius, contour_radius))
146
+ contour_mask = contour_mask / np.max(contour_mask)
147
+ return mask, contour_mask
148
+
149
+
150
+ def mask_generator_10(mask, background_radius, contour_radius):
151
+ # distance map
152
+ dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
153
+ dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
154
+ dist_map = dist_transform_fore - dist_transform_back
155
+ # .....:::::!!!!!
156
+ background_mask = np.clip(dist_map, -background_radius, background_radius)
157
+ background_mask = (background_mask - np.min(background_mask))
158
+ background_mask = background_mask / np.max(background_mask)
159
+ # ...:::!!!:::...
160
+ contour_radius += 2
161
+ contour_mask = np.abs(np.clip(dist_map, -contour_radius, contour_radius))
162
+ contour_mask = contour_mask / np.max(contour_mask)
163
+ contour_mask[contour_mask>0.5] = 1.
164
+ return background_mask, contour_mask
165
+
166
+
167
+ def mask_generator_11(mask, background_radius, contour_radius):
168
+ # distance map
169
+ dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
170
+ dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
171
+ dist_map = dist_transform_fore - dist_transform_back
172
+ # .....:::::!!!!!
173
+ background_mask = np.clip(dist_map, -background_radius, background_radius)
174
+ background_mask = (background_mask - np.min(background_mask))
175
+ background_mask = background_mask / np.max(background_mask)
176
+ # ...:::!!!:::...
177
+ contour_radius += 2
178
+ contour_mask = np.abs(np.clip(dist_map, -contour_radius, contour_radius))
179
+ contour_mask = contour_mask / np.max(contour_mask)
180
+ return background_mask, contour_mask
181
+
182
+
183
+ def mask_painter(input_image, input_mask, background_alpha=0.5, background_blur_radius=7, contour_width=3, contour_color=3, contour_alpha=1, mode='11'):
184
+ """
185
+ Input:
186
+ input_image: numpy array
187
+ input_mask: numpy array
188
+ background_alpha: transparency of background, [0, 1], 1: all black, 0: do nothing
189
+ background_blur_radius: radius of background blur, must be odd number
190
+ contour_width: width of mask contour, must be odd number
191
+ contour_color: color index (in color map) of mask contour, 0: black, 1: white, >1: others
192
+ contour_alpha: transparency of mask contour, [0, 1], if 0: no contour highlighted
193
+ mode: painting mode, '00', no blur, '01' only blur contour, '10' only blur background, '11' blur both
194
+
195
+ Output:
196
+ painted_image: numpy array
197
+ """
198
+ assert input_image.shape[:2] == input_mask.shape, 'different shape'
199
+ assert background_blur_radius % 2 * contour_width % 2 > 0, 'background_blur_radius and contour_width must be ODD'
200
+ assert mode in ['00', '01', '10', '11'], 'mode should be 00, 01, 10, or 11'
201
+
202
+ # downsample input image and mask
203
+ width, height = input_image.shape[0], input_image.shape[1]
204
+ res = 1024
205
+ ratio = min(1.0 * res / max(width, height), 1.0)
206
+ input_image = cv2.resize(input_image, (int(height*ratio), int(width*ratio)))
207
+ input_mask = cv2.resize(input_mask, (int(height*ratio), int(width*ratio)))
208
+
209
+ # 0: background, 1: foreground
210
+ msk = np.clip(input_mask, 0, 1)
211
+
212
+ # generate masks for background and contour pixels
213
+ background_radius = (background_blur_radius - 1) // 2
214
+ contour_radius = (contour_width - 1) // 2
215
+ generator_dict = {'00':mask_generator_00, '01':mask_generator_01, '10':mask_generator_10, '11':mask_generator_11}
216
+ background_mask, contour_mask = generator_dict[mode](msk, background_radius, contour_radius)
217
+
218
+ # paint
219
+ painted_image = vis_add_mask\
220
+ (input_image, background_mask, contour_mask, color_list[0], color_list[contour_color], background_alpha, contour_alpha) # black for background
221
+
222
+ return painted_image
223
+
224
+
225
+ if __name__ == '__main__':
226
+
227
+ background_alpha = 0.7 # transparency of background 1: all black, 0: do nothing
228
+ background_blur_radius = 31 # radius of background blur, must be odd number
229
+ contour_width = 11 # contour width, must be odd number
230
+ contour_color = 3 # id in color map, 0: black, 1: white, >1: others
231
+ contour_alpha = 1 # transparency of contour, 0: no contour highlighted
232
+
233
+ # load input image and mask
234
+ input_image = np.array(Image.open('./test_img/painter_input_image.jpg').convert('RGB'))
235
+ input_mask = np.array(Image.open('./test_img/painter_input_mask.jpg').convert('P'))
236
+
237
+ # paint
238
+ overall_time_1 = 0
239
+ overall_time_2 = 0
240
+ overall_time_3 = 0
241
+ overall_time_4 = 0
242
+ overall_time_5 = 0
243
+
244
+ for i in range(50):
245
+ t2 = time.time()
246
+ painted_image_00 = mask_painter(input_image, input_mask, background_alpha, background_blur_radius, contour_width, contour_color, contour_alpha, mode='00')
247
+ e2 = time.time()
248
+
249
+ t3 = time.time()
250
+ painted_image_10 = mask_painter(input_image, input_mask, background_alpha, background_blur_radius, contour_width, contour_color, contour_alpha, mode='10')
251
+ e3 = time.time()
252
+
253
+ t1 = time.time()
254
+ painted_image = mask_painter(input_image, input_mask, background_alpha, background_blur_radius, contour_width, contour_color, contour_alpha)
255
+ e1 = time.time()
256
+
257
+ t4 = time.time()
258
+ painted_image_01 = mask_painter(input_image, input_mask, background_alpha, background_blur_radius, contour_width, contour_color, contour_alpha, mode='01')
259
+ e4 = time.time()
260
+
261
+ t5 = time.time()
262
+ painted_image_11 = mask_painter(input_image, input_mask, background_alpha, background_blur_radius, contour_width, contour_color, contour_alpha, mode='11')
263
+ e5 = time.time()
264
+
265
+ overall_time_1 += (e1 - t1)
266
+ overall_time_2 += (e2 - t2)
267
+ overall_time_3 += (e3 - t3)
268
+ overall_time_4 += (e4 - t4)
269
+ overall_time_5 += (e5 - t5)
270
+
271
+ print(f'average time (default mode): {overall_time_1/50}')
272
+ print(f'average time (mode 00, no blur): {overall_time_2/50}')
273
+ print(f'average time (mode 10, blur background only): {overall_time_3/50}')
274
+ print(f'average time (mode 01, blur contour only): {overall_time_4/50}')
275
+ print(f'average time (mode 11, blur both): {overall_time_5/50}')
276
+
277
+ # save
278
+ painted_image_00 = Image.fromarray(painted_image_00)
279
+ painted_image_00.save('./test_img/painter_output_image_00.png')
280
+
281
+ painted_image_10 = Image.fromarray(painted_image_10)
282
+ painted_image_10.save('./test_img/painter_output_image_10.png')
283
+
284
+ painted_image_01 = Image.fromarray(painted_image_01)
285
+ painted_image_01.save('./test_img/painter_output_image_01.png')
286
+
287
+ painted_image_11 = Image.fromarray(painted_image_11)
288
+ painted_image_11.save('./test_img/painter_output_image_11.png')
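
A minimal usage sketch for the mode-based mask_painter above. The import path and output filename are assumptions (the module name is not visible in this hunk); the argument values and their order mirror the __main__ demo.

    import numpy as np
    from PIL import Image
    # hypothetical import path -- adjust to wherever the mask_painter defined above lives
    from tools.mask_painter import mask_painter

    image = np.array(Image.open('./test_img/painter_input_image.jpg').convert('RGB'))
    mask = np.array(Image.open('./test_img/painter_input_mask.jpg').convert('P'))

    # positional args: background_alpha, background_blur_radius, contour_width, contour_color, contour_alpha
    # mode '10' blurs only the background and keeps the contour sharp
    out = mask_painter(image, mask, 0.7, 31, 11, 3, 1, mode='10')
    Image.fromarray(out).save('./test_img/painter_mode_10.png')
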
tools/misc.py ADDED
@@ -0,0 +1,131 @@
1
+ import os
2
+ import re
3
+ import random
4
+ import time
5
+ import torch
6
+ import torch.nn as nn
7
+ import logging
8
+ import numpy as np
9
+ from os import path as osp
10
+
11
+ def constant_init(module, val, bias=0):
12
+ if hasattr(module, 'weight') and module.weight is not None:
13
+ nn.init.constant_(module.weight, val)
14
+ if hasattr(module, 'bias') and module.bias is not None:
15
+ nn.init.constant_(module.bias, bias)
16
+
17
+ initialized_logger = {}
18
+ def get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=None):
19
+ """Get the root logger.
20
+ The logger will be initialized if it has not been initialized. By default a
21
+ StreamHandler will be added. If `log_file` is specified, a FileHandler will
22
+ also be added.
23
+ Args:
24
+ logger_name (str): root logger name. Default: 'basicsr'.
25
+ log_file (str | None): The log filename. If specified, a FileHandler
26
+ will be added to the root logger.
27
+ log_level (int): The root logger level. Note that only the process of
28
+ rank 0 is affected, while other processes will set the level to
29
+ "Error" and be silent most of the time.
30
+ Returns:
31
+ logging.Logger: The root logger.
32
+ """
33
+ logger = logging.getLogger(logger_name)
34
+ # if the logger has been initialized, just return it
35
+ if logger_name in initialized_logger:
36
+ return logger
37
+
38
+ format_str = '%(asctime)s %(levelname)s: %(message)s'
39
+ stream_handler = logging.StreamHandler()
40
+ stream_handler.setFormatter(logging.Formatter(format_str))
41
+ logger.addHandler(stream_handler)
42
+ logger.propagate = False
43
+
44
+ if log_file is not None:
45
+ logger.setLevel(log_level)
46
+ # add file handler
47
+ # file_handler = logging.FileHandler(log_file, 'w')
48
+ file_handler = logging.FileHandler(log_file, 'a') #Shangchen: keep the previous log
49
+ file_handler.setFormatter(logging.Formatter(format_str))
50
+ file_handler.setLevel(log_level)
51
+ logger.addHandler(file_handler)
52
+ initialized_logger[logger_name] = True
53
+ return logger
54
+
55
+
56
+ IS_HIGH_VERSION = [int(m) for m in list(re.findall(r"^([0-9]+)\.([0-9]+)\.([0-9]+)([^0-9][a-zA-Z0-9]*)?(\+git.*)?$",\
57
+ torch.__version__)[0][:3])] >= [1, 12, 0]
58
+
59
+ def gpu_is_available():
60
+ if IS_HIGH_VERSION:
61
+ if torch.backends.mps.is_available():
62
+ return True
63
+ return True if torch.cuda.is_available() and torch.backends.cudnn.is_available() else False
64
+
65
+ def get_device(gpu_id=None):
66
+ if gpu_id is None:
67
+ gpu_str = ''
68
+ elif isinstance(gpu_id, int):
69
+ gpu_str = f':{gpu_id}'
70
+ else:
71
+ raise TypeError('Input should be int value.')
72
+
73
+ if IS_HIGH_VERSION:
74
+ if torch.backends.mps.is_available():
75
+ return torch.device('mps'+gpu_str)
76
+ return torch.device('cuda'+gpu_str if torch.cuda.is_available() and torch.backends.cudnn.is_available() else 'cpu')
77
+
78
+
79
+ def set_random_seed(seed):
80
+ """Set random seeds."""
81
+ random.seed(seed)
82
+ np.random.seed(seed)
83
+ torch.manual_seed(seed)
84
+ torch.cuda.manual_seed(seed)
85
+ torch.cuda.manual_seed_all(seed)
86
+
87
+
88
+ def get_time_str():
89
+ return time.strftime('%Y%m%d_%H%M%S', time.localtime())
90
+
91
+
92
+ def scandir(dir_path, suffix=None, recursive=False, full_path=False):
93
+ """Scan a directory to find the files of interest.
94
+
95
+ Args:
96
+ dir_path (str): Path of the directory.
97
+ suffix (str | tuple(str), optional): File suffix that we are
98
+ interested in. Default: None.
99
+ recursive (bool, optional): If set to True, recursively scan the
100
+ directory. Default: False.
101
+ full_path (bool, optional): If set to True, include the dir_path.
102
+ Default: False.
103
+
104
+ Returns:
105
+ A generator for all the files of interest with relative paths.
106
+ """
107
+
108
+ if (suffix is not None) and not isinstance(suffix, (str, tuple)):
109
+ raise TypeError('"suffix" must be a string or tuple of strings')
110
+
111
+ root = dir_path
112
+
113
+ def _scandir(dir_path, suffix, recursive):
114
+ for entry in os.scandir(dir_path):
115
+ if not entry.name.startswith('.') and entry.is_file():
116
+ if full_path:
117
+ return_path = entry.path
118
+ else:
119
+ return_path = osp.relpath(entry.path, root)
120
+
121
+ if suffix is None:
122
+ yield return_path
123
+ elif return_path.endswith(suffix):
124
+ yield return_path
125
+ else:
126
+ if recursive:
127
+ yield from _scandir(entry.path, suffix=suffix, recursive=recursive)
128
+ else:
129
+ continue
130
+
131
+ return _scandir(dir_path, suffix=suffix, recursive=recursive)
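
A short usage sketch for the helpers in tools/misc.py above; the log filename and results directory are illustrative placeholders.

    from tools.misc import get_root_logger, get_device, set_random_seed, scandir

    set_random_seed(42)                             # seed python, numpy and torch RNGs
    device = get_device()                           # cuda / mps if available, else cpu
    logger = get_root_logger(log_file='train.log')  # StreamHandler + appending FileHandler
    logger.info(f'running on {device}')

    # iterate over all .png files directly under ./results (relative paths)
    for path in scandir('./results', suffix='.png'):
        logger.info(path)
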
tools/painter.py ADDED
@@ -0,0 +1,215 @@
1
+ # paint masks, contours, or points on images, with specified colors
2
+ import cv2
3
+ import torch
4
+ import numpy as np
5
+ from PIL import Image
6
+ import copy
7
+ import time
8
+
9
+
10
+ def colormap(rgb=True):
11
+ color_list = np.array(
12
+ [
13
+ 0.000, 0.000, 0.000,
14
+ 1.000, 1.000, 1.000,
15
+ 1.000, 0.498, 0.313,
16
+ 0.392, 0.581, 0.929,
17
+ 0.000, 0.447, 0.741,
18
+ 0.850, 0.325, 0.098,
19
+ 0.929, 0.694, 0.125,
20
+ 0.494, 0.184, 0.556,
21
+ 0.466, 0.674, 0.188,
22
+ 0.301, 0.745, 0.933,
23
+ 0.635, 0.078, 0.184,
24
+ 0.300, 0.300, 0.300,
25
+ 0.600, 0.600, 0.600,
26
+ 1.000, 0.000, 0.000,
27
+ 1.000, 0.500, 0.000,
28
+ 0.749, 0.749, 0.000,
29
+ 0.000, 1.000, 0.000,
30
+ 0.000, 0.000, 1.000,
31
+ 0.667, 0.000, 1.000,
32
+ 0.333, 0.333, 0.000,
33
+ 0.333, 0.667, 0.000,
34
+ 0.333, 1.000, 0.000,
35
+ 0.667, 0.333, 0.000,
36
+ 0.667, 0.667, 0.000,
37
+ 0.667, 1.000, 0.000,
38
+ 1.000, 0.333, 0.000,
39
+ 1.000, 0.667, 0.000,
40
+ 1.000, 1.000, 0.000,
41
+ 0.000, 0.333, 0.500,
42
+ 0.000, 0.667, 0.500,
43
+ 0.000, 1.000, 0.500,
44
+ 0.333, 0.000, 0.500,
45
+ 0.333, 0.333, 0.500,
46
+ 0.333, 0.667, 0.500,
47
+ 0.333, 1.000, 0.500,
48
+ 0.667, 0.000, 0.500,
49
+ 0.667, 0.333, 0.500,
50
+ 0.667, 0.667, 0.500,
51
+ 0.667, 1.000, 0.500,
52
+ 1.000, 0.000, 0.500,
53
+ 1.000, 0.333, 0.500,
54
+ 1.000, 0.667, 0.500,
55
+ 1.000, 1.000, 0.500,
56
+ 0.000, 0.333, 1.000,
57
+ 0.000, 0.667, 1.000,
58
+ 0.000, 1.000, 1.000,
59
+ 0.333, 0.000, 1.000,
60
+ 0.333, 0.333, 1.000,
61
+ 0.333, 0.667, 1.000,
62
+ 0.333, 1.000, 1.000,
63
+ 0.667, 0.000, 1.000,
64
+ 0.667, 0.333, 1.000,
65
+ 0.667, 0.667, 1.000,
66
+ 0.667, 1.000, 1.000,
67
+ 1.000, 0.000, 1.000,
68
+ 1.000, 0.333, 1.000,
69
+ 1.000, 0.667, 1.000,
70
+ 0.167, 0.000, 0.000,
71
+ 0.333, 0.000, 0.000,
72
+ 0.500, 0.000, 0.000,
73
+ 0.667, 0.000, 0.000,
74
+ 0.833, 0.000, 0.000,
75
+ 1.000, 0.000, 0.000,
76
+ 0.000, 0.167, 0.000,
77
+ 0.000, 0.333, 0.000,
78
+ 0.000, 0.500, 0.000,
79
+ 0.000, 0.667, 0.000,
80
+ 0.000, 0.833, 0.000,
81
+ 0.000, 1.000, 0.000,
82
+ 0.000, 0.000, 0.167,
83
+ 0.000, 0.000, 0.333,
84
+ 0.000, 0.000, 0.500,
85
+ 0.000, 0.000, 0.667,
86
+ 0.000, 0.000, 0.833,
87
+ 0.000, 0.000, 1.000,
88
+ 0.143, 0.143, 0.143,
89
+ 0.286, 0.286, 0.286,
90
+ 0.429, 0.429, 0.429,
91
+ 0.571, 0.571, 0.571,
92
+ 0.714, 0.714, 0.714,
93
+ 0.857, 0.857, 0.857
94
+ ]
95
+ ).astype(np.float32)
96
+ color_list = color_list.reshape((-1, 3)) * 255
97
+ if not rgb:
98
+ color_list = color_list[:, ::-1]
99
+ return color_list
100
+
101
+
102
+ color_list = colormap()
103
+ color_list = color_list.astype('uint8').tolist()
104
+
105
+
106
+ def vis_add_mask(image, mask, color, alpha):
107
+ color = np.array(color_list[color])
108
+ mask = mask > 0.5
109
+ image[mask] = image[mask] * (1-alpha) + color * alpha
110
+ return image.astype('uint8')
111
+
112
+ def point_painter(input_image, input_points, point_color=5, point_alpha=0.9, point_radius=15, contour_color=2, contour_width=5):
113
+ h, w = input_image.shape[:2]
114
+ point_mask = np.zeros((h, w)).astype('uint8')
115
+ for point in input_points:
116
+ point_mask[point[1], point[0]] = 1
117
+
118
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (point_radius, point_radius))
119
+ point_mask = cv2.dilate(point_mask, kernel)
120
+
121
+ contour_radius = (contour_width - 1) // 2
122
+ dist_transform_fore = cv2.distanceTransform(point_mask, cv2.DIST_L2, 3)
123
+ dist_transform_back = cv2.distanceTransform(1-point_mask, cv2.DIST_L2, 3)
124
+ dist_map = dist_transform_fore - dist_transform_back
125
+ # ...:::!!!:::...
126
+ contour_radius += 2
127
+ contour_mask = np.abs(np.clip(dist_map, -contour_radius, contour_radius))
128
+ contour_mask = contour_mask / np.max(contour_mask)
129
+ contour_mask[contour_mask>0.5] = 1.
130
+
131
+ # paint mask
132
+ painted_image = vis_add_mask(input_image.copy(), point_mask, point_color, point_alpha)
133
+ # paint contour
134
+ painted_image = vis_add_mask(painted_image.copy(), 1-contour_mask, contour_color, 1)
135
+ return painted_image
136
+
137
+ def mask_painter(input_image, input_mask, mask_color=5, mask_alpha=0.7, contour_color=1, contour_width=3):
138
+ assert input_image.shape[:2] == input_mask.shape, 'different shape between image and mask'
139
+ # 0: background, 1: foreground
140
+ mask = np.clip(input_mask, 0, 1)
141
+ contour_radius = (contour_width - 1) // 2
142
+
143
+ dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
144
+ dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
145
+ dist_map = dist_transform_fore - dist_transform_back
146
+ # ...:::!!!:::...
147
+ contour_radius += 2
148
+ contour_mask = np.abs(np.clip(dist_map, -contour_radius, contour_radius))
149
+ contour_mask = contour_mask / np.max(contour_mask)
150
+ contour_mask[contour_mask>0.5] = 1.
151
+
152
+ # paint mask
153
+ painted_image = vis_add_mask(input_image.copy(), mask.copy(), mask_color, mask_alpha)
154
+ # paint contour
155
+ painted_image = vis_add_mask(painted_image.copy(), 1-contour_mask, contour_color, 1)
156
+
157
+ return painted_image
158
+
159
+ def background_remover(input_image, input_mask):
160
+ """
161
+ input_image: H, W, 3, np.array
162
+ input_mask: H, W, np.array
163
+
164
+ image_wo_background: PIL.Image
165
+ """
166
+ assert input_image.shape[:2] == input_mask.shape, 'different shape between image and mask'
167
+ # 0: background, 1: foreground
168
+ mask = np.expand_dims(np.clip(input_mask, 0, 1), axis=2)*255
169
+ image_wo_background = np.concatenate([input_image, mask], axis=2) # H, W, 4
170
+ image_wo_background = Image.fromarray(image_wo_background).convert('RGBA')
171
+
172
+ return image_wo_background
173
+
174
+ if __name__ == '__main__':
175
+ input_image = np.array(Image.open('images/painter_input_image.jpg').convert('RGB'))
176
+ input_mask = np.array(Image.open('images/painter_input_mask.jpg').convert('P'))
177
+
178
+ # example of mask painter
179
+ mask_color = 3
180
+ mask_alpha = 0.7
181
+ contour_color = 1
182
+ contour_width = 5
183
+
184
+ # save
185
+ painted_image = Image.fromarray(input_image)
186
+ painted_image.save('images/original.png')
187
+
188
+ painted_image = mask_painter(input_image, input_mask, mask_color, mask_alpha, contour_color, contour_width)
189
+ # save
190
+ painted_image = Image.fromarray(painted_image)  # save the painted overlay
191
+ painted_image.save('images/original1.png')
192
+
193
+ # example of point painter
194
+ input_image = np.array(Image.open('images/painter_input_image.jpg').convert('RGB'))
195
+ input_points = np.array([[500, 375], [70, 600]]) # x, y
196
+ point_color = 5
197
+ point_alpha = 0.9
198
+ point_radius = 15
199
+ contour_color = 2
200
+ contour_width = 5
201
+ painted_image_1 = point_painter(input_image, input_points, point_color, point_alpha, point_radius, contour_color, contour_width)
202
+ # save
203
+ painted_image = Image.fromarray(painted_image_1)
204
+ painted_image.save('images/point_painter_1.png')
205
+
206
+ input_image = np.array(Image.open('images/painter_input_image.jpg').convert('RGB'))
207
+ painted_image_2 = point_painter(input_image, input_points, point_color=9, point_radius=20, contour_color=29)
208
+ # save
209
+ painted_image = Image.fromarray(painted_image_2)
210
+ painted_image.save('images/point_painter_2.png')
211
+
212
+ # example of background remover
213
+ input_image = np.array(Image.open('images/original.png').convert('RGB'))
214
+ image_wo_background = background_remover(input_image, input_mask) # return PIL.Image
215
+ image_wo_background.save('images/image_wo_background.png')
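
The __main__ demo above calls the painters positionally; a compact keyword-argument variant is sketched below, where the output paths are placeholders.

    import numpy as np
    from PIL import Image
    from tools.painter import mask_painter, point_painter, background_remover

    image = np.array(Image.open('images/painter_input_image.jpg').convert('RGB'))
    mask = np.array(Image.open('images/painter_input_mask.jpg').convert('P'))

    # colored mask overlay with a white contour (color index 1)
    overlay = mask_painter(image.copy(), mask, mask_color=3, contour_color=1)
    Image.fromarray(overlay).save('images/overlay.png')

    # mark two clicks; points are (x, y) pixel coordinates
    points = np.array([[500, 375], [70, 600]])
    Image.fromarray(point_painter(image.copy(), points, point_color=5)).save('images/points.png')

    # foreground cut-out as RGBA (background removed via the alpha channel)
    background_remover(image, mask).save('images/cutout.png')
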