Spaces:

Svane20
/

unet-swin-sky-replacement

Running

App Files Files Community

Svane20 committed on Jun 12

Commit

f28556a

1 Parent(s): 16f064e

Updated model to use PyTorch instead of ONNX

Browse files

Files changed (5) hide show

app.py +10 -165
model.py → models.py +0 -0
models/.gitkeep +0 -0
pipeline.py +80 -0
replacements.py +59 -0

app.py CHANGED Viewed

@@ -1,183 +1,28 @@
 import gradio as gr
-import torch
-from torchvision.transforms import Compose, Resize, ToTensor, Normalize
-import pymatting
 import numpy as np
 from PIL import Image
-from typing import Tuple
-import random
-from pathlib import Path
-from model import SwinMattingModel
-def _load_checkpoint(model, checkpoint_path):
-    # Load the checkpoint
-    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
-    # Check if there are any errors when loading the state dictionary
-    missing_keys, unexpected_keys = model.load_state_dict(checkpoint)
-    if missing_keys:
-        print(missing_keys)
-        raise RuntimeError("Missing keys in checkpoint.")
-    if unexpected_keys:
-        print(unexpected_keys)
-        raise RuntimeError("Unexpected keys in checkpoint.")
-def _load_model(checkpoint, device):
-    model = SwinMattingModel({
-        "encoder": {
-            "model_name": "microsoft/swin-small-patch4-window7-224"
-        },
-        "decoder": {
-            "use_attn": True,
-            "refine_channels": 16
-        }
-    })
-    _load_checkpoint(model, checkpoint)
-    model.to(device)
-    model.eval()
-    return model
-transforms = Compose(
-    [
-        Resize(size=(512, 512)),
-        ToTensor(),
-        Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
-    ],
-)
-share_repo = False
-checkpoint_path = "swin_small_patch4_window7_224_512_v1_latest.pt"
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = _load_model(checkpoint_path, device)
-print(f"Using device: {device}")
-if device.type == "cuda":
-    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
-def _get_foreground_estimation(image, alpha):
-    """
-    Estimate the foreground using the image and the predicted alpha mask.
-    Args:
-        image (np.ndarray): The input image.
-        alpha (np.ndarray): The predicted alpha mask.
-    Returns:
-        np.ndarray: The estimated foreground.
-    """
-    # Normalize the image to [0, 1] range
-    normalized_image = np.array(image) / 255.0
-    # Invert the alpha mask since the pymatting library expects the sky to be the background
-    inverted_alpha = 1 - alpha
-    return pymatting.estimate_foreground_ml(image=normalized_image, alpha=inverted_alpha)
-def _sky_replacement(foreground, alpha_mask):
-    """
-    Perform sky replacement using the estimated foreground and predicted alpha mask.
-    Args:
-        foreground (np.ndarray): The estimated foreground.
-        alpha_mask (np.ndarray): The predicted alpha mask.
-    Returns:
-        np.ndarray: The sky-replaced image.
-    """
-    new_sky_path = Path(__file__).parent / "assets/skies/francesco-ungaro-i75WTJn-RBY-unsplash.jpg"
-    new_sky_img = Image.open(new_sky_path).convert("RGB")
-    # Get the target size from the foreground image
-    h, w = foreground.shape[:2]
-    # Check the size of the sky image
-    sky_width, sky_height = new_sky_img.size
-    # If the sky image is smaller than the target size
-    if sky_width < w or sky_height < h:
-        scale = max(w / sky_width, h / sky_height)
-        new_size = (int(sky_width * scale), int(sky_height * scale))
-        new_sky_img = new_sky_img.resize(new_size, resample=Image.Resampling.LANCZOS)
-        sky_width, sky_height = new_sky_img.size
-    # Determine the maximum possible top-left coordinates for the crop
-    max_left = sky_width - w
-    max_top = sky_height - h
-    # Choose random offsets for left and top within the valid range
-    left = random.randint(a=0, b=max_left) if max_left > 0 else 0
-    top = random.randint(a=0, b=max_top) if max_top > 0 else 0
-    # Crop the sky image to the target size using the random offsets
-    new_sky_img = new_sky_img.crop((left, top, left + w, top + h))
-    new_sky = np.asarray(new_sky_img).astype(np.float32) / 255.0
-    if foreground.dtype != np.float32:
-        foreground = foreground.astype(np.float32) / 255.0
-    if foreground.shape[2] == 4:
-        foreground = foreground[:, :, :3]
-    # Ensure that the alpha mask values are within the range [0, 1]
-    alpha_mask = np.clip(alpha_mask, a_min=0, a_max=1)
-    # Blend the foreground with the new sky using the alpha mask
-    return (1 - alpha_mask[:, :, None]) * foreground + alpha_mask[:, :, None] * new_sky
-def _inference(image):
-    """
-    Perform inference on the input image using the ONNX model.
-    Args:
-        image (Image): The input image.
-    Returns:
-        np.ndarray: The predicted alpha mask.
-    """
-    with torch.inference_mode():
-        output = model(image)
-    # Ensure the output is in valid range [0, 1]
-    output = output.detach().cpu().numpy()
-    output = np.clip(output, a_min=0, a_max=1)
-    return np.squeeze(output, axis=0).squeeze()
 def predict(image):
-    """
-    Perform sky replacement on the input image.
-    Args:
-        image (Image): The input image.
-    Returns:
-        Tuple[Image, Image]: The predicted alpha mask and the sky-replaced image.
-    """
-    image_tensor = transforms(image).unsqueeze(0).to(device)
-    predicted_alpha = _inference(image_tensor)
-    # Downscale the input image to match predicted_alpha
     h, w = predicted_alpha.shape
-    downscaled_image = image.resize((w, h), Image.Resampling.LANCZOS)
     # Estimate foreground and run sky_replacement
-    foreground = _get_foreground_estimation(downscaled_image, predicted_alpha)
-    replaced_sky = _sky_replacement(foreground, predicted_alpha)
     # Resize the predicted alpha and replaced sky to original dimensions
     predicted_alpha_pil = Image.fromarray((predicted_alpha * 255).astype(np.uint8), mode='L')
     predicted_alpha_pil = predicted_alpha_pil.resize((h, w), Image.Resampling.LANCZOS)
-    replaced_sky_pil = Image.fromarray((replaced_sky * 255).astype(np.uint8))  # mode='RGB' typically
     replaced_sky_pil = replaced_sky_pil.resize((h, w), Image.Resampling.LANCZOS)
     return predicted_alpha_pil, replaced_sky_pil
@@ -291,4 +136,4 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
     run_button.click(fn=predict, inputs=input_image, outputs=[output_mask, output_sky])
 # Launch the interface
-demo.launch(share=share_repo, ssr_mode=False)

 import gradio as gr
 import numpy as np
 from PIL import Image
+from pipeline import Pipeline
+from replacements import get_foreground_estimation, sky_replacement
+SHARE_REPO = False
+pipeline = Pipeline(model_name="swin_small_patch4_window7_224")
 def predict(image):
+    """
+    Perform sky replacement on the input image.
+    Args:
+        image (Image): The input image. It has to be a PIL image, since its `size` is read here and the helpers call PIL methods on it.
+    Returns:
+        Tuple[Image, Image]: The predicted alpha mask (mode 'L') and the sky-replaced image, both resized to the dimensions of the input image.
+    """
+    # PIL sizes are (width, height); remember the input size so the results can be scaled back to it
+    original_size = image.size
+    # Run inference to get the predicted alpha mask
+    predicted_alpha = pipeline.inference(image)
     # Estimate foreground and run sky_replacement
+    foreground = get_foreground_estimation(image, predicted_alpha)
+    replaced_sky = sky_replacement(foreground, predicted_alpha)
     # Resize the predicted alpha and replaced sky to original dimensions
     predicted_alpha_pil = Image.fromarray((predicted_alpha * 255).astype(np.uint8), mode='L')
+    predicted_alpha_pil = predicted_alpha_pil.resize(original_size, Image.Resampling.LANCZOS)
+    replaced_sky_pil = Image.fromarray((replaced_sky * 255).astype(np.uint8))
+    replaced_sky_pil = replaced_sky_pil.resize(original_size, Image.Resampling.LANCZOS)
     return predicted_alpha_pil, replaced_sky_pil
     run_button.click(fn=predict, inputs=input_image, outputs=[output_mask, output_sky])
 # Launch the interface
+demo.launch(share=SHARE_REPO, ssr_mode=False)

model.py → models.py RENAMED Viewed

File without changes

models/.gitkeep ADDED Viewed

File without changes

pipeline.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import torch
+from torchvision.transforms import Compose, Resize, ToTensor, Normalize
+import numpy as np
+from models import SwinMattingModel
+class Pipeline:
+    """
+    Load the Swin matting model and predict alpha masks for images.
+    A TorchScript export is loaded when running on CPU; on CUDA the PyTorch model is built and a state-dict checkpoint is loaded into it.
+    """
+    def __init__(self, model_name: str):
+        """
+        Set up the preprocessing, select the device and load the model.
+        Args:
+            model_name (str): Base name of the checkpoint files inside the `models/` directory.
+        """
+        self.transforms = Compose(
+            [
+                Resize(size=(512, 512)),
+                ToTensor(),
+                Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
+            ],
+        )
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # The TorchScript export is only used when running on CPU
+        self.is_torch_script = self.device.type == 'cpu'
+        self.model = self._load_model(model_name)
+        self._log_device_info()
+    def inference(self, image) -> np.ndarray:
+        """
+        Predict the alpha mask for an image.
+        Args:
+            image: The input image; assumed to be a PIL image (TODO confirm against the caller).
+        Returns:
+            np.ndarray: The model output clipped to [0, 1], with the batch axis and any remaining size-1 axes removed.
+        Raises:
+            RuntimeError: If no model is loaded.
+        """
+        if self.model is None:
+            raise RuntimeError("Model is not loaded.")
+        # Normalize() is configured for 3 channels, so e.g. RGBA images have to be converted first
+        if getattr(image, "mode", "RGB") != "RGB":
+            image = image.convert("RGB")
+        tensor = self.transforms(image).unsqueeze(0).to(self.device)
+        with torch.inference_mode():
+            output = self.model(tensor)
+        output = output.detach().cpu().numpy()
+        # Ensure the output is in valid range [0, 1]
+        output = np.clip(output, a_min=0, a_max=1)
+        return np.squeeze(output, axis=0).squeeze()
+    def _load_pytorch_model(self, checkpoint):
+        """
+        Build the PyTorch model and load the checkpoint weights into it.
+        Args:
+            checkpoint: Path of the state-dict checkpoint.
+        Returns:
+            The model with the loaded weights. Moving it to the device and switching it to eval mode is left to `_load_model`.
+        """
+        model = SwinMattingModel({
+            "encoder": {
+                "model_name": "microsoft/swin-small-patch4-window7-224"
+            },
+            "decoder": {
+                "use_attn": True,
+                "refine_channels": 16
+            }
+        })
+        self._load_checkpoint(model, checkpoint)
+        return model
+    def _load_model(self, model_name):
+        """
+        Load the TorchScript or PyTorch variant of the model, depending on `is_torch_script`.
+        Args:
+            model_name (str): Base name of the checkpoint files inside the `models/` directory.
+        Returns:
+            The loaded model, moved to `self.device` and set to eval mode.
+        """
+        checkpoint_path = self._get_model_checkpoint(model_name)
+        if self.is_torch_script:
+            model = torch.jit.load(checkpoint_path, map_location=self.device)
+        else:
+            model = self._load_pytorch_model(checkpoint_path)
+        model.to(self.device)
+        model.eval()
+        return model
+    def _get_model_checkpoint(self, model_name) -> str:
+        """
+        Return the checkpoint path for the model variant that is in use.
+        Args:
+            model_name (str): Base name of the checkpoint files.
+        Returns:
+            str: The path inside `models/`, relative to the current working directory.
+        """
+        suffix = "torch_script" if self.is_torch_script else "minimal"
+        return f"models/{model_name}_{suffix}.pt"
+    def _load_checkpoint(self, model, checkpoint_path):
+        """
+        Load a state-dict checkpoint into the model and verify that all keys match.
+        Args:
+            model: The model to load the weights into.
+            checkpoint_path: Path of the state-dict checkpoint.
+        Raises:
+            RuntimeError: If the checkpoint has missing or unexpected keys.
+        """
+        checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
+        # strict=False so that key mismatches are reported by the checks below instead of load_state_dict raising first
+        missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
+        if missing_keys:
+            print(missing_keys)
+            raise RuntimeError("Missing keys in checkpoint.")
+        if unexpected_keys:
+            print(unexpected_keys)
+            raise RuntimeError("Unexpected keys in checkpoint.")
+    def _log_device_info(self):
+        """Print the name of the CUDA device when running on CUDA; nothing is printed on CPU."""
+        if self.device.type == 'cuda':
+            print(f"Hardware: {torch.cuda.get_device_name(torch.cuda.current_device())}")

replacements.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import pymatting
+import numpy as np
+from PIL import Image
+import random
+from pathlib import Path
+def get_foreground_estimation(image, alpha):
+    """
+    Estimate the foreground of an image from the predicted alpha mask.
+    Args:
+        image (Image): The input image; it is resized to the resolution of `alpha`.
+        alpha (np.ndarray): The predicted 2D alpha mask.
+    Returns:
+        The foreground returned by `pymatting.estimate_foreground_ml`.
+    """
+    # PIL takes (width, height) while the mask shape is (height, width)
+    mask_height, mask_width = alpha.shape
+    resized = image.resize((mask_width, mask_height), Image.Resampling.LANCZOS)
+    # Scale the pixel values to the [0, 1] range
+    pixels = np.array(resized) / 255.0
+    # pymatting expects the sky to be the background, so the mask is inverted
+    return pymatting.estimate_foreground_ml(image=pixels, alpha=1 - alpha)
+def sky_replacement(foreground, alpha_mask):
+    """
+    Perform sky replacement using the estimated foreground and predicted alpha mask.
+    Args:
+        foreground (np.ndarray): The estimated foreground with 3 or 4 channels. Integer arrays are treated as 0-255 data, floating point arrays as already being in [0, 1].
+        alpha_mask (np.ndarray): The predicted 2D alpha mask; a value of 1 selects the new sky.
+    Returns:
+        np.ndarray: The sky-replaced image.
+    """
+    new_sky_path = Path(__file__).parent / "assets/skies/francesco-ungaro-i75WTJn-RBY-unsplash.jpg"
+    # convert() returns a fully loaded copy, so the file can be closed right away
+    with Image.open(new_sky_path) as sky_file:
+        new_sky_img = sky_file.convert("RGB")
+    # Get the target size from the foreground image
+    h, w = foreground.shape[:2]
+    # Check the size of the sky image
+    sky_width, sky_height = new_sky_img.size
+    # If the sky image is smaller than the target size
+    if sky_width < w or sky_height < h:
+        scale = max(w / sky_width, h / sky_height)
+        # Never end up below the target size: crop() pads a too-small sky with black pixels
+        new_size = (max(w, round(sky_width * scale)), max(h, round(sky_height * scale)))
+        new_sky_img = new_sky_img.resize(new_size, resample=Image.Resampling.LANCZOS)
+        sky_width, sky_height = new_sky_img.size
+    # Determine the maximum possible top-left coordinates for the crop
+    max_left = sky_width - w
+    max_top = sky_height - h
+    # Choose random offsets for left and top within the valid range
+    left = random.randint(a=0, b=max_left) if max_left > 0 else 0
+    top = random.randint(a=0, b=max_top) if max_top > 0 else 0
+    # Crop the sky image to the target size using the random offsets
+    new_sky_img = new_sky_img.crop((left, top, left + w, top + h))
+    new_sky = np.asarray(new_sky_img).astype(np.float32) / 255.0
+    # Only integer data is on the 0-255 scale; floats of any precision are kept as they are
+    if np.issubdtype(foreground.dtype, np.integer):
+        foreground = foreground.astype(np.float32) / 255.0
+    else:
+        foreground = foreground.astype(np.float32, copy=False)
+    # Drop the alpha channel of a 4-channel foreground
+    if foreground.shape[2] == 4:
+        foreground = foreground[:, :, :3]
+    # Ensure that the alpha mask values are within the range [0, 1]
+    alpha_mask = np.clip(alpha_mask, a_min=0, a_max=1)
+    # Blend the foreground with the new sky using the alpha mask
+    return (1 - alpha_mask[:, :, None]) * foreground + alpha_mask[:, :, None] * new_sky