Spaces:

dawn17
/

Unet

Runtime error

App Files Files Community

dawn17 committed on Aug 16, 2023

Commit

bcc0f94

1 Parent(s): e0678c3

Upload 35 files

Browse files

Files changed (35) hide show

app.py +29 -0
checkpoint/resunet/decoder.pt +3 -0
requirements.txt +12 -0
sample/bird_plane.jpeg +0 -0
sample/dog.jpeg +0 -0
sample/group.webp +0 -0
sample/horse_person_cycle.jpeg +0 -0
sample/mask.jpeg +0 -0
sample/people.jpeg +0 -0
sample/titanic.jpeg +0 -0
src/datasets/__init__.py +0 -0
src/datasets/coco/README.md +6 -0
src/datasets/coco/dataset.ipynb +0 -0
src/datasets/coco/dataset.py +137 -0
src/datasets/coco/samples/airplane.png +0 -0
src/datasets/coco/samples/giraffe.png +0 -0
src/datasets/coco/samples/people.png +0 -0
src/datasets/coco/samples/zebra.png +0 -0
src/models/unet/__init__.py +0 -0
src/models/unet/config/carvana_config.yml +81 -0
src/models/unet/config/paper_config.yml +60 -0
src/models/unet/config/resnet_config.yml +32 -0
src/models/unet/decoder/__init__.py +1 -0
src/models/unet/decoder/decoder.py +76 -0
src/models/unet/encoder/__init__.py +2 -0
src/models/unet/encoder/encoder.py +80 -0
src/models/unet/encoder/resnet.py +30 -0
src/models/unet/example/model_sample.ipynb +532 -0
src/models/unet/resunet.py +66 -0
src/run/unet/example/binary_segmentation_resunet.ipynb +0 -0
src/run/unet/inference.py +111 -0
src/unet/__init__.py +0 -0
src/unet/config/carvana_config.yml +81 -0
src/unet/config/paper_config.yml +60 -0
src/unet/model.py +175 -0

app.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import os
+import gradio as gr
+from src.run.unet.inference import ResUnetInfer
+infer = ResUnetInfer(
+    model_path="./checkpoint/resunet/decoder.pt",
+    config_path="./src/models/unet/config/resnet_config.yml",
+)
+demo = gr.Interface(
+    fn=infer.infer,
+    inputs=[
+        gr.Image(
+            shape=(224, 224),
+            label="Input Image",
+            value="./sample/bird_plane.jpeg",
+        )
+    ],
+    outputs=[
+        gr.Image(),
+    ],
+    examples=[[os.path.join("./sample/", f)] for f in os.listdir("./sample/")],
+)
+demo.launch()

checkpoint/resunet/decoder.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df2780f1ec58f0a9653c951b341102097ef20a8bbd9cd9aba2ea8e789876b9ae
+size 189285667

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+torch
+torchinfo
+easydict
+gradio
+torchvision
+numpy
+grad-cam
+Pillow
+albumentations
+tqdm
+opencv-python
+matplotlib

sample/bird_plane.jpeg ADDED Viewed

sample/dog.jpeg ADDED Viewed

sample/group.webp ADDED Viewed

sample/horse_person_cycle.jpeg ADDED Viewed

sample/mask.jpeg ADDED Viewed

sample/people.jpeg ADDED Viewed

sample/titanic.jpeg ADDED Viewed

src/datasets/__init__.py ADDED Viewed

File without changes

src/datasets/coco/README.md ADDED Viewed

	@@ -0,0 +1,6 @@

+# Coco Dataset Sample
+![Image1](samples/people.png)
+![Image2](samples/giraffe.png)
+![Image3](samples/airplane.png)
+![Image4](samples/zebra.png)

src/datasets/coco/dataset.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

src/datasets/coco/dataset.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import os.path
+from typing import Any, Callable, List, Optional, Tuple
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+from torchvision.datasets import VisionDataset
+class CocoDetection(VisionDataset):
+    def __init__(
+        self,
+        root: str,
+        annFile: str,
+        class_names: Optional[List] = None,
+        transform: Optional[Callable] = None,
+        target_transform: Optional[Callable] = None,
+        transforms: Optional[Callable] = None,
+    ) -> None:
+        super().__init__(root, transforms, transform, target_transform)
+        from pycocotools.coco import COCO
+        self.coco = COCO(annFile)
+        if class_names is not None:
+            cat_ids = self._get_category_ids_from_name(category_names=class_names)
+            self.ids = list(
+                sorted((self._get_img_ids_for_category_ids(category_ids=cat_ids)))
+            )
+        else:
+            cat_ids = self.coco.getCatIds()
+            self.ids = list(sorted(self.coco.imgs.keys()))
+        self.cat2idx = {cat_id: idx + 1 for idx, cat_id in enumerate(cat_ids)}
+        self.cat2idx[0] = 0
+    def _load_image(self, id: int) -> Image.Image:
+        path = self.coco.loadImgs(id)[0]["file_name"]
+        return Image.open(os.path.join(self.root, path)).convert("RGB")
+    def _load_target(self, id: int) -> List[Any]:
+        return self.coco.loadAnns(self.coco.getAnnIds(id))
+    def __getitem__(self, index: int) -> Tuple[Any, Any]:
+        id = self.ids[index]
+        image = self._load_image(id)
+        mask = self._load_target(id)
+        mask = self._get_mask_in_channels(image, mask)
+        if self.transform is not None:
+            image = self.transform(image=np.array(image))["image"]
+        if self.target_transform is not None:
+            mask = self.target_transform(image=mask)["image"]
+        return image, (mask != 0).int()
+    def __len__(self) -> int:
+        return len(self.ids)
+    def _get_all_classes(self):
+        catIDs = self.coco.getCatIds()
+        return self.coco.loadCats(catIDs)
+    def _get_category_info_from_ids(self, ids: list):
+        all_cat = self._get_all_classes()
+        return [category for category in all_cat if category["id"] in ids]
+    def _get_category_ids_from_name(self, category_names: list):
+        return self.coco.getCatIds(catNms=category_names)
+    def _get_img_ids_for_category_ids(self, category_ids: list):
+        img_ids = []
+        for catIds in category_ids:
+            img_ids.extend(self.coco.getImgIds(catIds=catIds))
+        return img_ids
+    def _get_img_ids_for_category_names(self, category_names: list):
+        img_ids = []
+        category_ids = self._get_category_ids_from_name(category_names=class_names)
+        for catIds in category_ids:
+            img_ids.extend(self.coco.getImgIds(catIds=catIds))
+        return img_ids
+    def _get_all_category_ids_in_img_id(self, img_id: int) -> List:
+        target = self._load_target(img_id)
+        return list({annotation["category_id"] for annotation in target})
+    def _get_mask_aggregated(self, image: Image, annotations: List) -> np.array:
+        w, h = image.size
+        mask = np.zeros((h, w))
+        for annotation in annotations:
+            category_id = annotation["category_id"]
+            if category_id in self.cat2idx:
+                pixel_value = self.cat2idx[category_id]
+                mask = np.maximum(self.coco.annToMask(annotation) * pixel_value, mask)
+        return mask
+    def _get_mask_in_channels(self, image: Image, annotations: List) -> np.array:
+        w, h = image.size
+        mask = np.zeros((len(self.cat2idx), h, w))
+        for annotation in annotations:
+            category_id = annotation["category_id"]
+            if category_id in self.cat2idx:
+                pixel_value = self.cat2idx[category_id]
+                mask[pixel_value] = np.maximum(
+                    self.coco.annToMask(annotation), mask[pixel_value]
+                )
+        # [h, w, channels]
+        mask = np.transpose(mask, (1, 2, 0))
+        return mask
+    def _plot_image_and_mask(self, index):
+        image, mask = self.__getitem__(index)
+        # Create a figure with two subplots side by side
+        fig, axs = plt.subplots(1, 2, figsize=(7, 3))
+        axs[0].imshow(image.permute(1, 2, 0))
+        axs[0].set_title("Image")
+        axs[1].imshow(mask.sum(0, keepdim=True).permute(1, 2, 0))
+        axs[1].set_title("Mask")
+        plt.show()

src/datasets/coco/samples/airplane.png ADDED Viewed

src/datasets/coco/samples/giraffe.png ADDED Viewed

src/datasets/coco/samples/people.png ADDED Viewed

src/datasets/coco/samples/zebra.png ADDED Viewed

src/models/unet/__init__.py ADDED Viewed

File without changes

src/models/unet/config/carvana_config.yml ADDED Viewed

	@@ -0,0 +1,81 @@

+# Input (1, 512, 512)
+# Output (64, 512, 512)
+decoder_config:
+  block5: # (1024, 32, 32)
+    in_channels: 1024
+    kernel_size: 3
+    out_channels: 1024
+    padding:
+    - 1
+    - 1
+    stride: 1 # (1024, 32, 32)
+  block4: # (1024, 32, 32)
+    in_channels: 1024
+    kernel_size: 2
+    out_channels: 512
+    padding:
+    - 0
+    - 1
+    stride: 2 # (512, 64, 64)
+  block3: # (512, 64, 64)
+    in_channels: 512
+    kernel_size: 2
+    out_channels: 256
+    padding:
+    - 0
+    - 1
+    stride: 2 # (256, 128, 128)
+  block2: # (256, 128, 128)
+    in_channels: 256
+    kernel_size: 2
+    out_channels: 128
+    padding:
+    - 0
+    - 1
+    stride: 2 # (128, 256, 256)
+  block1: # (128, 256, 256)
+    in_channels: 128
+    kernel_size: 2
+    out_channels: 64
+    padding:
+    - 0
+    - 1
+    stride: 2 # (64, 512, 512)
+encoder_config:
+  block1: # (1, 512, 512)
+    all_padding: true
+    in_channels: 1
+    maxpool: true
+    n_layers: 2
+    out_channels: 64 # (64, 256, 256)
+  block2: # (64, 256, 256)
+    all_padding: true
+    in_channels: 64
+    maxpool: true
+    n_layers: 2
+    out_channels: 128 # (128, 128, 128)
+  block3: # (128, 128, 128)
+    all_padding: true
+    in_channels: 128
+    maxpool: true
+    n_layers: 2
+    out_channels: 256 # (256, 64, 64)
+  block4: # (256, 64, 64)
+    all_padding: true
+    in_channels: 256
+    maxpool: true
+    n_layers: 2
+    out_channels: 512 # (512, 32, 32)
+  block5: # (512, 32, 32)
+    all_padding: true
+    in_channels: 512
+    maxpool: false
+    n_layers: 2
+    out_channels: 512 # (512, 32, 32)
+  block6: # (512, 32, 32)
+    all_padding: true
+    in_channels: 512
+    maxpool: false
+    n_layers: 2
+    out_channels: 1024 # (1024, 32, 32)
+nclasses: 2

src/models/unet/config/paper_config.yml ADDED Viewed

	@@ -0,0 +1,60 @@

+# Original UNet Paper Configuration
+# Input shape [1, 572, 572]
+# Output shape [64, 388, 388]
+decoder_config:
+  block4: # [1024, 28, 28]
+    in_channels: 1024
+    kernel_size: 2
+    out_channels: 512
+    padding: [0, 0]
+    stride: 2 # [512, 52, 52]
+  block3: # [512, 52, 52]
+    in_channels: 512
+    kernel_size: 2
+    out_channels: 256
+    padding: [0, 0]
+    stride: 2 # [256, 100, 100]
+  block2: # [256, 100, 100]
+    in_channels: 256
+    kernel_size: 2
+    out_channels: 128
+    padding: [0, 0]
+    stride: 2 # [128, 196, 196]
+  block1: # [128, 196, 196]
+    in_channels: 128
+    kernel_size: 2
+    out_channels: 64
+    padding: [0, 0]
+    stride: 2 # [64, 388, 388]
+encoder_config:
+  block1: # [1, 572, 572]
+    all_padding: false
+    in_channels: 1
+    maxpool: true
+    n_layers: 2
+    out_channels: 64 # [64, 568/2, 568/2] = [64, 284, 284]
+  block2: # [64, 568/2, 568/2] = [64, 284, 284]
+    all_padding: false
+    in_channels: 64
+    maxpool: true
+    n_layers: 2
+    out_channels: 128 # [128, 280/2, 280/2] = [128, 140, 140]
+  block3: # [128, 280/2, 280/2] = [128, 140, 140]
+    all_padding: false
+    in_channels: 128
+    maxpool: true
+    n_layers: 2
+    out_channels: 256 # [256, 136/2, 136/2] = [256, 68, 68]
+  block4: # [256, 136/2, 136/2] = [256, 68, 68]
+    all_padding: false
+    in_channels: 256
+    maxpool: true
+    n_layers: 2
+    out_channels: 512  # [512, 64/2, 64/2] = [512, 32, 32]
+  block5: # [512, 64/2, 64/2] = [512, 32, 32]
+    all_padding: false
+    in_channels: 512
+    maxpool: false
+    n_layers: 2
+    out_channels: 1024 # [1024, 28, 28]
+nclasses: 2

src/models/unet/config/resnet_config.yml ADDED Viewed

	@@ -0,0 +1,32 @@

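+# ResUNet decoder configuration (used with the ResNet encoder)
+# Input size and normalisation statistics: see `input_size`, `mean` and `std` below
+# NOTE: the per-block shape comments were carried over from the paper config and may not match this model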
+decoder_config:
+  block4: # [2048, 16, 16]
+    in_channels: 2048
+    kernel_size: 2
+    out_channels: 1024
+    padding: [0, 0]
+    stride: 2 # [1024, 28, 28]
+  block3: # [1024, 28, 28]
+    in_channels: 1024
+    kernel_size: 2
+    out_channels: 512
+    padding: [0, 0]
+    stride: 2 # [512, 52, 52]
+  block2: # [512, 52, 52]
+    in_channels: 512
+    kernel_size: 2
+    out_channels: 128
+    padding: [0, 0]
+    stride: 2 # [256, 100, 100]
+  block1: # [256, 100, 100]
+    in_channels: 128
+    kernel_size: 2
+    out_channels: 64
+    padding: [0, 0]
+    stride: 2 # [128, 196, 196]
+nclasses: 1
+input_size: [224, 224]
+mean: [0.485, 0.456, 0.406]
+std: [0.229, 0.224, 0.225]

src/models/unet/decoder/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .decoder import Decoder as CustomDecoder

src/models/unet/decoder/decoder.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import torch
+import torch.nn as nn
+class DecoderLayer(nn.Module):
+    def __init__(
+        self, in_channels, out_channels, kernel_size=2, stride=2, padding=[0, 0]
+    ):
+        super(DecoderLayer, self).__init__()
+        self.up_conv = nn.ConvTranspose2d(
+            in_channels=in_channels,
+            out_channels=in_channels // 2,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding[0],
+        )
+        self.bn1 = nn.BatchNorm2d(in_channels)
+        self.conv = nn.Sequential(
+            *[
+                self._conv_relu_layer(
+                    in_channels=in_channels if i == 0 else out_channels,
+                    out_channels=out_channels,
+                    padding=padding[1],
+                )
+                for i in range(2)
+            ]
+        )
+    def _conv_relu_layer(self, in_channels, out_channels, padding=0):
+        return nn.Sequential(
+            nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=3,
+                padding=padding,
+            ),
+            nn.ReLU(),
+            nn.BatchNorm2d(out_channels),
+        )
+    @staticmethod
+    def crop_cat(x, encoder_output):
+        delta = (encoder_output.shape[-1] - x.shape[-1]) // 2
+        encoder_output = encoder_output[
+            :, :, delta : delta + x.shape[-1], delta : delta + x.shape[-1]
+        ]
+        return torch.cat((encoder_output, x), dim=1)
+    def forward(self, x, encoder_output):
+        x = self.crop_cat(self.up_conv(x), encoder_output)
+        x = self.bn1(x)
+        return self.conv(x)
+class Decoder(nn.Module):
+    def __init__(self, config):
+        super(Decoder, self).__init__()
+        self.decoder = nn.ModuleDict(
+            {
+                name: DecoderLayer(
+                    in_channels=block["in_channels"],
+                    out_channels=block["out_channels"],
+                    kernel_size=block["kernel_size"],
+                    stride=block["stride"],
+                    padding=block["padding"],
+                )
+                for name, block in config.items()
+            }
+        )
+    def forward(self, x, encoder_output):
+        for name, block in self.decoder.items():
+            x = block(x, encoder_output[name])
+        return x

src/models/unet/encoder/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .encoder import Encoder as CustomEncoder
2	+ from .resnet import Encoder as ResnetEncoder

src/models/unet/encoder/encoder.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import torch.nn as nn
+"""
+downsampling blocks
+(first half of the 'U' in UNet)
+[ENCODER]
+"""
+class EncoderLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels=1,
+        out_channels=64,
+        n_layers=2,
+        all_padding=False,
+        maxpool=True,
+    ):
+        super(EncoderLayer, self).__init__()
+        f_in_channel = lambda layer: in_channels if layer == 0 else out_channels
+        f_padding = lambda layer: 1 if layer >= 2 or all_padding else 0
+        self.layer = nn.Sequential(
+            *[
+                self._conv_relu_layer(
+                    in_channels=f_in_channel(i),
+                    out_channels=out_channels,
+                    padding=f_padding(i),
+                )
+                for i in range(n_layers)
+            ]
+        )
+        self.maxpool = maxpool
+    def _conv_relu_layer(self, in_channels, out_channels, padding=0):
+        return nn.Sequential(
+            nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=3,
+                padding=padding,
+            ),
+            nn.ReLU(),
+            nn.BatchNorm2d(out_channels),
+        )
+    def forward(self, x):
+        return self.layer(x)
+class Encoder(nn.Module):
+    def __init__(self, config):
+        super(Encoder, self).__init__()
+        self.encoder = nn.ModuleDict(
+            {
+                name: EncoderLayer(
+                    in_channels=block["in_channels"],
+                    out_channels=block["out_channels"],
+                    n_layers=block["n_layers"],
+                    all_padding=block["all_padding"],
+                    maxpool=block["maxpool"],
+                )
+                for name, block in config.items()
+            }
+        )
+        self.maxpool = nn.MaxPool2d(2)
+    def forward(self, x):
+        output = dict()
+        for i, (block_name, block) in enumerate(self.encoder.items()):
+            x = block(x)
+            output[block_name] = x
+            if block.maxpool:
+                x = self.maxpool(x)
+        return x, output

src/models/unet/encoder/resnet.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from torchvision.models import resnet50, ResNet50_Weights
+import torch.nn as nn
+class Encoder(nn.Module):
+    def __init__(self):
+        super(Encoder, self).__init__()
+        resnet = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
+        for param in resnet.parameters():
+            param.requires_grad_(False)
+        self.stages = nn.ModuleDict(
+            {
+                "block1": nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu),
+                "block2": nn.Sequential(resnet.maxpool, resnet.layer1),
+                "block3": resnet.layer2,
+                "block4": resnet.layer3,
+                "block5": resnet.layer4,
+            }
+        )
+    def forward(self, x):
+        stages = {}
+        for name, stage in self.stages.items():
+            x = stage(x)
+            stages[name] = x
+        return x, stages

src/models/unet/example/model_sample.ipynb ADDED Viewed

	@@ -0,0 +1,532 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "310eb987-37b7-4620-b533-089644fbb440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.functional as F\n",
+    "import torch.nn as nn\n",
+    "import yaml\n",
+    "from easydict import EasyDict\n",
+    "from torchinfo import summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f8cff897-df8f-4e6d-893b-321805699e1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_path = \"./config/paper_config.yml\"\n",
+    "\n",
+    "with open(config_path, \"r\") as file:\n",
+    "    yaml_data = yaml.safe_load(file)\n",
+    "\n",
+    "config = EasyDict(yaml_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca66846e-d2b4-4dd2-83eb-eee746c26c74",
+   "metadata": {},
+   "source": [
+    "# Encoder "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "975a6f86-68ff-4fda-b7d8-acf453addade",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "==========================================================================================\n",
+       "Layer (type:depth-idx)                   Output Shape              Param #\n",
+       "==========================================================================================\n",
+       "EncoderLayer                             [64, 568, 568]            --\n",
+       "├─Sequential: 1-1                        [64, 568, 568]            --\n",
+       "│    └─Sequential: 2-1                   [64, 570, 570]            --\n",
+       "│    │    └─Conv2d: 3-1                  [64, 570, 570]            640\n",
+       "│    │    └─ReLU: 3-2                    [64, 570, 570]            --\n",
+       "│    └─Sequential: 2-2                   [64, 568, 568]            --\n",
+       "│    │    └─Conv2d: 3-3                  [64, 568, 568]            36,928\n",
+       "│    │    └─ReLU: 3-4                    [64, 568, 568]            --\n",
+       "==========================================================================================\n",
+       "Total params: 37,568\n",
+       "Trainable params: 37,568\n",
+       "Non-trainable params: 0\n",
+       "Total mult-adds (G): 1.37\n",
+       "==========================================================================================\n",
+       "Input size (MB): 1.31\n",
+       "Forward/backward pass size (MB): 331.53\n",
+       "Params size (MB): 0.15\n",
+       "Estimated Total Size (MB): 332.99\n",
+       "=========================================================================================="
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"\"\"\n",
+    "downsampling blocks \n",
+    "(first half of the 'U' in UNet) \n",
+    "[ENCODER]\n",
+    "\"\"\"\n",
+    "\n",
+    "\n",
+    "class EncoderLayer(nn.Module):\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        in_channels=1,\n",
+    "        out_channels=64,\n",
+    "        n_layers=2,\n",
+    "        all_padding=False,\n",
+    "        maxpool=True,\n",
+    "    ):\n",
+    "        super(EncoderLayer, self).__init__()\n",
+    "\n",
+    "        f_in_channel = lambda layer: in_channels if layer == 0 else out_channels\n",
+    "        f_padding = lambda layer: 1 if layer >= 2 or all_padding else 0\n",
+    "\n",
+    "        self.layer = nn.Sequential(\n",
+    "            *[\n",
+    "                self._conv_relu_layer(\n",
+    "                    in_channels=f_in_channel(i),\n",
+    "                    out_channels=out_channels,\n",
+    "                    padding=f_padding(i),\n",
+    "                )\n",
+    "                for i in range(n_layers)\n",
+    "            ]\n",
+    "        )\n",
+    "        self.maxpool = maxpool\n",
+    "\n",
+    "    def _conv_relu_layer(self, in_channels, out_channels, padding=0):\n",
+    "        return nn.Sequential(\n",
+    "            nn.Conv2d(\n",
+    "                in_channels=in_channels,\n",
+    "                out_channels=out_channels,\n",
+    "                kernel_size=3,\n",
+    "                padding=padding,\n",
+    "            ),\n",
+    "            nn.ReLU(),\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        return self.layer(x)\n",
+    "\n",
+    "\n",
+    "summary(\n",
+    "    EncoderLayer(in_channels=1, out_channels=64, n_layers=2, all_padding=False).cuda(),\n",
+    "    input_size=(1, 572, 572),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4eb7eedd-6530-44e2-9486-fbd8f39fd0ad",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "==========================================================================================\n",
+       "Layer (type:depth-idx)                   Output Shape              Param #\n",
+       "==========================================================================================\n",
+       "Encoder                                  [1024, 28, 28]            --\n",
+       "├─ModuleDict: 1-9                        --                        (recursive)\n",
+       "│    └─EncoderLayer: 2-1                 [64, 568, 568]            --\n",
+       "│    │    └─Sequential: 3-1              [64, 568, 568]            37,568\n",
+       "├─MaxPool2d: 1-2                         [64, 284, 284]            --\n",
+       "├─ModuleDict: 1-9                        --                        (recursive)\n",
+       "│    └─EncoderLayer: 2-2                 [128, 280, 280]           --\n",
+       "│    │    └─Sequential: 3-2              [128, 280, 280]           221,440\n",
+       "├─MaxPool2d: 1-4                         [128, 140, 140]           --\n",
+       "├─ModuleDict: 1-9                        --                        (recursive)\n",
+       "│    └─EncoderLayer: 2-3                 [256, 136, 136]           --\n",
+       "│    │    └─Sequential: 3-3              [256, 136, 136]           885,248\n",
+       "├─MaxPool2d: 1-6                         [256, 68, 68]             --\n",
+       "├─ModuleDict: 1-9                        --                        (recursive)\n",
+       "│    └─EncoderLayer: 2-4                 [512, 64, 64]             --\n",
+       "│    │    └─Sequential: 3-4              [512, 64, 64]             3,539,968\n",
+       "├─MaxPool2d: 1-8                         [512, 32, 32]             --\n",
+       "├─ModuleDict: 1-9                        --                        (recursive)\n",
+       "│    └─EncoderLayer: 2-5                 [512, 28, 28]             --\n",
+       "│    │    └─Sequential: 3-5              [512, 28, 28]             4,719,616\n",
+       "│    └─EncoderLayer: 2-6                 [1024, 28, 28]            --\n",
+       "│    │    └─Sequential: 3-6              [1024, 28, 28]            14,157,824\n",
+       "==========================================================================================\n",
+       "Total params: 23,561,664\n",
+       "Trainable params: 23,561,664\n",
+       "Non-trainable params: 0\n",
+       "Total mult-adds (G): 633.51\n",
+       "==========================================================================================\n",
+       "Input size (MB): 1.31\n",
+       "Forward/backward pass size (MB): 624.49\n",
+       "Params size (MB): 94.25\n",
+       "Estimated Total Size (MB): 720.05\n",
+       "=========================================================================================="
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "class Encoder(nn.Module):\n",
+    "    def __init__(self, config):\n",
+    "        super(Encoder, self).__init__()\n",
+    "        self.encoder = nn.ModuleDict(\n",
+    "            {\n",
+    "                name: EncoderLayer(\n",
+    "                    in_channels=block[\"in_channels\"],\n",
+    "                    out_channels=block[\"out_channels\"],\n",
+    "                    n_layers=block[\"n_layers\"],\n",
+    "                    all_padding=block[\"all_padding\"],\n",
+    "                    maxpool=block[\"maxpool\"],\n",
+    "                )\n",
+    "                for name, block in config.items()\n",
+    "            }\n",
+    "        )\n",
+    "        self.maxpool = nn.MaxPool2d(2)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        output = dict()\n",
+    "\n",
+    "        for i, (block_name, block) in enumerate(self.encoder.items()):\n",
+    "            x = block(x)\n",
+    "            output[block_name] = x\n",
+    "\n",
+    "            if block.maxpool:\n",
+    "                x = self.maxpool(x)\n",
+    "\n",
+    "        return x, output\n",
+    "\n",
+    "\n",
+    "summary(\n",
+    "    Encoder(config.encoder_config).cuda(),\n",
+    "    input_size=(1, 572, 572),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a7ad06cb-61a2-4a66-ba58-f29d402a81f2",
+   "metadata": {},
+   "source": [
+    "# Decoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "735322d0-0dc3-4137-b906-ac7e54c43a79",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "==========================================================================================\n",
+       "Layer (type:depth-idx)                   Output Shape              Param #\n",
+       "==========================================================================================\n",
+       "DecoderLayer                             [1, 512, 52, 52]          --\n",
+       "├─ConvTranspose2d: 1-1                   [1, 512, 56, 56]          2,097,664\n",
+       "├─Sequential: 1-2                        [1, 512, 52, 52]          --\n",
+       "│    └─Sequential: 2-1                   [1, 512, 54, 54]          --\n",
+       "│    │    └─Conv2d: 3-1                  [1, 512, 54, 54]          4,719,104\n",
+       "│    │    └─ReLU: 3-2                    [1, 512, 54, 54]          --\n",
+       "│    └─Sequential: 2-2                   [1, 512, 52, 52]          --\n",
+       "│    │    └─Conv2d: 3-3                  [1, 512, 52, 52]          2,359,808\n",
+       "│    │    └─ReLU: 3-4                    [1, 512, 52, 52]          --\n",
+       "==========================================================================================\n",
+       "Total params: 9,176,576\n",
+       "Trainable params: 9,176,576\n",
+       "Non-trainable params: 0\n",
+       "Total mult-adds (G): 26.72\n",
+       "==========================================================================================\n",
+       "Input size (MB): 11.60\n",
+       "Forward/backward pass size (MB): 35.86\n",
+       "Params size (MB): 36.71\n",
+       "Estimated Total Size (MB): 84.17\n",
+       "=========================================================================================="
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "class DecoderLayer(nn.Module):\n",
+    "    def __init__(\n",
+    "        self, in_channels, out_channels, kernel_size=2, stride=2, padding=[0, 0]\n",
+    "    ):\n",
+    "        super(DecoderLayer, self).__init__()\n",
+    "        self.up_conv = nn.ConvTranspose2d(\n",
+    "            in_channels=in_channels,\n",
+    "            out_channels=in_channels // 2,\n",
+    "            kernel_size=kernel_size,\n",
+    "            stride=stride,\n",
+    "            padding=padding[0],\n",
+    "        )\n",
+    "\n",
+    "        self.conv = nn.Sequential(\n",
+    "            *[\n",
+    "                self._conv_relu_layer(\n",
+    "                    in_channels=in_channels if i == 0 else out_channels,\n",
+    "                    out_channels=out_channels,\n",
+    "                    padding=padding[1],\n",
+    "                )\n",
+    "                for i in range(2)\n",
+    "            ]\n",
+    "        )\n",
+    "\n",
+    "    def _conv_relu_layer(self, in_channels, out_channels, padding=0):\n",
+    "        return nn.Sequential(\n",
+    "            nn.Conv2d(\n",
+    "                in_channels=in_channels,\n",
+    "                out_channels=out_channels,\n",
+    "                kernel_size=3,\n",
+    "                padding=padding,\n",
+    "            ),\n",
+    "            nn.ReLU(),\n",
+    "        )\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def crop_cat(x, encoder_output):\n",
+    "        delta = (encoder_output.shape[-1] - x.shape[-1]) // 2\n",
+    "        encoder_output = encoder_output[\n",
+    "            :, :, delta : delta + x.shape[-1], delta : delta + x.shape[-1]\n",
+    "        ]\n",
+    "        return torch.cat((encoder_output, x), dim=1)\n",
+    "\n",
+    "    def forward(self, x, encoder_output):\n",
+    "        x = self.crop_cat(self.up_conv(x), encoder_output)\n",
+    "        return self.conv(x)\n",
+    "\n",
+    "\n",
+    "# summary\n",
+    "input_data = [torch.rand((1, 1024, 28, 28)), torch.rand((1, 512, 64, 64))]\n",
+    "summary(\n",
+    "    DecoderLayer(in_channels=1024, out_channels=512),\n",
+    "    input_data=input_data,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "3795e85d-ff83-457c-9c12-af6cc6e2830c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "==========================================================================================\n",
+       "Layer (type:depth-idx)                   Output Shape              Param #\n",
+       "==========================================================================================\n",
+       "Decoder                                  [1, 64, 388, 388]         --\n",
+       "├─ModuleDict: 1-1                        --                        --\n",
+       "│    └─DecoderLayer: 2-1                 [1, 1024, 28, 28]         --\n",
+       "│    │    └─ConvTranspose2d: 3-1         [1, 512, 28, 28]          4,719,104\n",
+       "│    │    └─Sequential: 3-2              [1, 1024, 28, 28]         18,876,416\n",
+       "│    └─DecoderLayer: 2-2                 [1, 512, 52, 52]          --\n",
+       "│    │    └─ConvTranspose2d: 3-3         [1, 512, 56, 56]          2,097,664\n",
+       "│    │    └─Sequential: 3-4              [1, 512, 52, 52]          7,078,912\n",
+       "│    └─DecoderLayer: 2-3                 [1, 256, 100, 100]        --\n",
+       "│    │    └─ConvTranspose2d: 3-5         [1, 256, 104, 104]        524,544\n",
+       "│    │    └─Sequential: 3-6              [1, 256, 100, 100]        1,769,984\n",
+       "│    └─DecoderLayer: 2-4                 [1, 128, 196, 196]        --\n",
+       "│    │    └─ConvTranspose2d: 3-7         [1, 128, 200, 200]        131,200\n",
+       "│    │    └─Sequential: 3-8              [1, 128, 196, 196]        442,624\n",
+       "│    └─DecoderLayer: 2-5                 [1, 64, 388, 388]         --\n",
+       "│    │    └─ConvTranspose2d: 3-9         [1, 64, 392, 392]         32,832\n",
+       "│    │    └─Sequential: 3-10             [1, 64, 388, 388]         110,720\n",
+       "==========================================================================================\n",
+       "Total params: 35,784,000\n",
+       "Trainable params: 35,784,000\n",
+       "Non-trainable params: 0\n",
+       "Total mult-adds (G): 113.38\n",
+       "==========================================================================================\n",
+       "Input size (MB): 158.09\n",
+       "Forward/backward pass size (MB): 469.93\n",
+       "Params size (MB): 143.14\n",
+       "Estimated Total Size (MB): 771.16\n",
+       "=========================================================================================="
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "class Decoder(nn.Module):\n",
+    "    def __init__(self, config):\n",
+    "        super(Decoder, self).__init__()\n",
+    "        self.decoder = nn.ModuleDict(\n",
+    "            {\n",
+    "                name: DecoderLayer(\n",
+    "                    in_channels=block[\"in_channels\"],\n",
+    "                    out_channels=block[\"out_channels\"],\n",
+    "                    kernel_size=block[\"kernel_size\"],\n",
+    "                    stride=block[\"stride\"],\n",
+    "                    padding=block[\"padding\"],\n",
+    "                )\n",
+    "                for name, block in config.items()\n",
+    "            }\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, x, encoder_output):\n",
+    "        for name, block in self.decoder.items():\n",
+    "            x = block(x, encoder_output[name])\n",
+    "        return x\n",
+    "\n",
+    "\n",
+    "# summary\n",
+    "encoder_input = torch.rand((1, 1, 572, 572), device=\"cuda\")\n",
+    "x, encoder_output = Encoder(config.encoder_config).cuda()(encoder_input)\n",
+    "\n",
+    "input_data = [x, encoder_output]\n",
+    "summary(\n",
+    "    Decoder(config.decoder_config).cuda(),\n",
+    "    input_data=input_data,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6cd06e02-abd4-4537-8bce-5a15c4ad4f85",
+   "metadata": {},
+   "source": [
+    "# UNet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "24fd0355-3603-4a55-b827-068eda70b78a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "===============================================================================================\n",
+       "Layer (type:depth-idx)                        Output Shape              Param #\n",
+       "===============================================================================================\n",
+       "UNet                                          [1, 2, 388, 388]          --\n",
+       "├─Encoder: 1-1                                [1, 1024, 28, 28]         --\n",
+       "│    └─ModuleDict: 2-9                        --                        (recursive)\n",
+       "│    │    └─EncoderLayer: 3-1                 [1, 64, 568, 568]         37,568\n",
+       "│    └─MaxPool2d: 2-2                         [1, 64, 284, 284]         --\n",
+       "│    └─ModuleDict: 2-9                        --                        (recursive)\n",
+       "│    │    └─EncoderLayer: 3-2                 [1, 128, 280, 280]        221,440\n",
+       "│    └─MaxPool2d: 2-4                         [1, 128, 140, 140]        --\n",
+       "│    └─ModuleDict: 2-9                        --                        (recursive)\n",
+       "│    │    └─EncoderLayer: 3-3                 [1, 256, 136, 136]        885,248\n",
+       "│    └─MaxPool2d: 2-6                         [1, 256, 68, 68]          --\n",
+       "│    └─ModuleDict: 2-9                        --                        (recursive)\n",
+       "│    │    └─EncoderLayer: 3-4                 [1, 512, 64, 64]          3,539,968\n",
+       "│    └─MaxPool2d: 2-8                         [1, 512, 32, 32]          --\n",
+       "│    └─ModuleDict: 2-9                        --                        (recursive)\n",
+       "│    │    └─EncoderLayer: 3-5                 [1, 512, 28, 28]          4,719,616\n",
+       "│    │    └─EncoderLayer: 3-6                 [1, 1024, 28, 28]         14,157,824\n",
+       "├─Decoder: 1-2                                [1, 64, 388, 388]         --\n",
+       "│    └─ModuleDict: 2-10                       --                        --\n",
+       "│    │    └─DecoderLayer: 3-7                 [1, 1024, 28, 28]         23,595,520\n",
+       "│    │    └─DecoderLayer: 3-8                 [1, 512, 52, 52]          9,176,576\n",
+       "│    │    └─DecoderLayer: 3-9                 [1, 256, 100, 100]        2,294,528\n",
+       "│    │    └─DecoderLayer: 3-10                [1, 128, 196, 196]        573,824\n",
+       "│    │    └─DecoderLayer: 3-11                [1, 64, 388, 388]         143,552\n",
+       "├─Conv2d: 1-3                                 [1, 2, 388, 388]          130\n",
+       "===============================================================================================\n",
+       "Total params: 59,345,794\n",
+       "Trainable params: 59,345,794\n",
+       "Non-trainable params: 0\n",
+       "Total mult-adds (G): 189.38\n",
+       "===============================================================================================\n",
+       "Input size (MB): 1.31\n",
+       "Forward/backward pass size (MB): 1096.83\n",
+       "Params size (MB): 237.38\n",
+       "Estimated Total Size (MB): 1335.52\n",
+       "==============================================================================================="
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "class UNet(nn.Module):\n",
+    "    def __init__(self, encoder_config, decoder_config, nclasses):\n",
+    "        super(UNet, self).__init__()\n",
+    "        self.encoder = Encoder(config=encoder_config)\n",
+    "        self.decoder = Decoder(config=decoder_config)\n",
+    "\n",
+    "        self.output = nn.Conv2d(\n",
+    "            in_channels=decoder_config[\"block1\"][\"out_channels\"],\n",
+    "            out_channels=nclasses,\n",
+    "            kernel_size=1,\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        x, encoder_step_output = self.encoder(x)\n",
+    "        x = self.decoder(x, encoder_step_output)\n",
+    "        return self.output(x)\n",
+    "\n",
+    "\n",
+    "summary(\n",
+    "    UNet(\n",
+    "        config[\"encoder_config\"], config[\"decoder_config\"], nclasses=config[\"nclasses\"]\n",
+    "    ),\n",
+    "    input_data=torch.rand((1, 1, 572, 572)),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "550824e4-2151-4c0b-8a12-383fa092b4ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # if config is a dict\n",
+    "# with open('custom_config.yml', 'w') as outfile:\n",
+    "#     yaml.dump(config, outfile, sort_keys=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

src/models/unet/resunet.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import torch.nn as nn
+from .encoder import ResnetEncoder as Encoder
+from .decoder import CustomDecoder as Decoder
+class UNet(nn.Module):
+    def __init__(self, decoder_config, nclasses, input_shape=(224, 224)):
+        super(UNet, self).__init__()
+        self.encoder = Encoder()
+        self.decoder = Decoder(config=decoder_config)
+        self.output = nn.Sequential(
+            nn.Conv2d(
+                in_channels=decoder_config["block1"]["out_channels"],
+                out_channels=nclasses,
+                kernel_size=1,
+            ),
+            nn.UpsamplingBilinear2d(size=input_shape),
+        )
+    def forward(self, x):
+        x, encoder_step_output = self.encoder(x)
+        x = self.decoder(x, encoder_step_output)
+        x = self.output(x)
+        return x
+if __name__ == "__main__":
+    import torch
+    import yaml
+    from easydict import EasyDict
+    from torchinfo import summary
+    # load config
+    config_path = "./config/resnet_config.yml"
+    with open(config_path, "r") as file:
+        yaml_data = yaml.safe_load(file)
+    config = EasyDict(yaml_data)
+    # input shape
+    input_shape = (224, 224)
+    # device
+    use_cuda = torch.cuda.is_available()
+    device = torch.device("cuda" if use_cuda else "cpu")
+    # model definition
+    model = UNet(
+        decoder_config=config["decoder_config"], nclasses=1, input_shape=input_shape
+    ).to(device)
+    summary(
+        model,
+        input_data=torch.rand((1, 3, input_shape[0], input_shape[1])),
+        device=device,
+    )
+    # load weights (if any)
+    model_path = None
+    if model_path is not None:
+        checkpoint = torch.load(model_path, map_location=device)
+        model.decoder.load_state_dict(checkpoint["decoder_state_dict"], strict=False)
+        model.output.load_state_dict(checkpoint["output_state_dict"], strict=False)

src/run/unet/example/binary_segmentation_resunet.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

src/run/unet/inference.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import os
+import albumentations as A
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+import yaml
+from albumentations.pytorch import ToTensorV2
+from easydict import EasyDict
+from PIL import Image
+from src.models.unet.resunet import UNet as Model
+class ResUnetInfer:
+    def __init__(self, model_path, config_path):
+        use_cuda = torch.cuda.is_available()
+        self.device = torch.device("cuda" if use_cuda else "cpu")
+        self.config = self.load_config(config_path=config_path)
+        self.model = self.load_model(model_path=model_path)
+        self.transform = A.Compose(
+            [
+                A.Resize(self.config.input_size[0], self.config.input_size[1]),
+                A.Normalize(
+                    mean=self.config.mean,
+                    std=self.config.std,
+                    max_pixel_value=255,
+                ),
+                ToTensorV2(),
+            ]
+        )
+    def load_model(self, model_path):
+        model = Model(
+            decoder_config=self.config.decoder_config, nclasses=self.config.nclasses
+        ).to(self.device)
+        if os.path.isfile(model_path):
+            checkpoint = torch.load(model_path, map_location=self.device)
+            model.decoder.load_state_dict(
+                checkpoint["decoder_state_dict"], strict=False
+            )
+            model.output.load_state_dict(checkpoint["output_state_dict"], strict=False)
+        return model
+    def load_config(self, config_path):
+        with open(config_path, "r") as file:
+            yaml_data = yaml.safe_load(file)
+        return EasyDict(yaml_data)
+    def infer(self, image, image_weight=0.01):
+        self.model.eval()
+        input_tensor = self.transform(image=image)["image"].unsqueeze(0)
+        # get mask
+        with torch.no_grad():
+            """
+            output_tensor = [batch, 1, 224, 224]
+            batch = 1
+            """
+            output_tensor = self.model(input_tensor.to(self.device))
+        mask = torch.sigmoid(output_tensor)
+        mask = nn.UpsamplingBilinear2d(size=(image.shape[0], image.shape[1]))(mask)
+        mask = mask.squeeze(0)
+        # add zeros for green and blue channels
+        # our mask will be red in colour
+        zero_channels = torch.zeros((2, image.shape[0], image.shape[1]), device=self.device)
+        mask = torch.cat([mask, zero_channels], dim=0)
+        mask = mask.permute(1,2,0).cpu().numpy()
+        mask = np.uint8(255 * mask)
+        # overlap image and mask
+        mask = (1 - image_weight) * mask + image_weight * image
+        mask = mask / np.max(mask)
+        return np.uint8(255 * mask)
+    @staticmethod
+    def load_image_as_array(image_path):
+        # Load a PIL image
+        pil_image = Image.open(image_path)
+        # Convert PIL image to NumPy array
+        return np.array(pil_image.convert("RGB"))
+    @staticmethod
+    def plot_array(array: np.array, figsize=(10, 10)):
+        plt.figure(figsize=figsize)
+        plt.imshow(array)
+        plt.show()
+    @staticmethod
+    def save_numpy_as_image(numpy_array, image_path):
+        """
+        Saves a NumPy array as an image.
+        Args:
+            numpy_array (numpy.ndarray): The NumPy array to be saved as an image.
+            image_path (str): The path where the image will be saved.
+        """
+        # Convert the NumPy array to a PIL image
+        image = Image.fromarray(numpy_array)
+        # Save the PIL image to the specified path
+        image.save(image_path)

src/unet/__init__.py ADDED Viewed

File without changes

src/unet/config/carvana_config.yml ADDED Viewed

	@@ -0,0 +1,81 @@

+# Input (1, 512, 512)
+# Decoder output (64, 512, 512); the final 1x1 conv then maps 64 -> nclasses channels
+decoder_config:
+  block5: # (1024, 32, 32)
+    in_channels: 1024
+    kernel_size: 3
+    out_channels: 1024
+    padding:
+    - 1
+    - 1
+    stride: 1 # (1024, 32, 32)
+  block4: # (1024, 32, 32)
+    in_channels: 1024
+    kernel_size: 2
+    out_channels: 512
+    padding:
+    - 0
+    - 1
+    stride: 2 # (512, 64, 64)
+  block3: # (512, 64, 64)
+    in_channels: 512
+    kernel_size: 2
+    out_channels: 256
+    padding:
+    - 0
+    - 1
+    stride: 2 # (256, 128, 128)
+  block2: # (256, 128, 128)
+    in_channels: 256
+    kernel_size: 2
+    out_channels: 128
+    padding:
+    - 0
+    - 1
+    stride: 2 # (128, 256, 256)
+  block1: # (128, 256, 256)
+    in_channels: 128
+    kernel_size: 2
+    out_channels: 64
+    padding:
+    - 0
+    - 1
+    stride: 2 # (64, 512, 512)
+encoder_config:
+  block1: # (1, 512, 512)
+    all_padding: true
+    in_channels: 1
+    maxpool: true
+    n_layers: 2
+    out_channels: 64 # (64, 256, 256)
+  block2: # (64, 256, 256)
+    all_padding: true
+    in_channels: 64
+    maxpool: true
+    n_layers: 2
+    out_channels: 128 # (128, 128, 128)
+  block3: # (128, 128, 128)
+    all_padding: true
+    in_channels: 128
+    maxpool: true
+    n_layers: 2
+    out_channels: 256 # (256, 64, 64)
+  block4: # (256, 64, 64)
+    all_padding: true
+    in_channels: 256
+    maxpool: true
+    n_layers: 2
+    out_channels: 512 # (512, 32, 32)
+  block5: # (512, 32, 32)
+    all_padding: true
+    in_channels: 512
+    maxpool: false
+    n_layers: 2
+    out_channels: 512 # (512, 32, 32)
+  block6: # (512, 32, 32)
+    all_padding: true
+    in_channels: 512
+    maxpool: false
+    n_layers: 2
+    out_channels: 1024 # (1024, 32, 32)
+nclasses: 2

src/unet/config/paper_config.yml ADDED Viewed

	@@ -0,0 +1,60 @@

+# Original UNet Paper Configuration
+# Input shape [1, 572, 572]
+# Decoder output shape [64, 388, 388]; the final 1x1 conv then maps 64 -> nclasses channels
+decoder_config:
+  block4: # [1024, 28, 28]
+    in_channels: 1024
+    kernel_size: 2
+    out_channels: 512
+    padding: [0, 0]
+    stride: 2 # [512, 52, 52]
+  block3: # [512, 52, 52]
+    in_channels: 512
+    kernel_size: 2
+    out_channels: 256
+    padding: [0, 0]
+    stride: 2 # [256, 100, 100]
+  block2: # [256, 100, 100]
+    in_channels: 256
+    kernel_size: 2
+    out_channels: 128
+    padding: [0, 0]
+    stride: 2 # [128, 196, 196]
+  block1: # [128, 196, 196]
+    in_channels: 128
+    kernel_size: 2
+    out_channels: 64
+    padding: [0, 0]
+    stride: 2 # [64, 388, 388]
+encoder_config:
+  block1: # [1, 572, 572]
+    all_padding: false
+    in_channels: 1
+    maxpool: true
+    n_layers: 2
+    out_channels: 64 # [64, 568/2, 568/2] = [64, 284, 284]
+  block2: # [64, 568/2, 568/2] = [64, 284, 284]
+    all_padding: false
+    in_channels: 64
+    maxpool: true
+    n_layers: 2
+    out_channels: 128 # [128, 280/2, 280/2] = [128, 140, 140]
+  block3: # [128, 280/2, 280/2] = [128, 140, 140]
+    all_padding: false
+    in_channels: 128
+    maxpool: true
+    n_layers: 2
+    out_channels: 256 # [256, 136/2, 136/2] = [256, 68, 68]
+  block4: # [256, 136/2, 136/2] = [256, 68, 68]
+    all_padding: false
+    in_channels: 256
+    maxpool: true
+    n_layers: 2
+    out_channels: 512  # [512, 64/2, 64/2] = [512, 32, 32]
+  block5: # [512, 64/2, 64/2] = [512, 32, 32]
+    all_padding: false
+    in_channels: 512
+    maxpool: false
+    n_layers: 2
+    out_channels: 1024 # [1024, 28, 28]
+nclasses: 2

src/unet/model.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import torch
+import torch.nn as nn
+"""
+downsampling blocks
+(first half of the 'U' in UNet)
+[ENCODER]
+"""
+class EncoderLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels=1,
+        out_channels=64,
+        n_layers=2,
+        all_padding=False,
+        maxpool=True,
+    ):
+        super(EncoderLayer, self).__init__()
+        f_in_channel = lambda layer: in_channels if layer == 0 else out_channels
+        f_padding = lambda layer: 1 if layer >= 2 or all_padding else 0
+        self.layer = nn.Sequential(
+            *[
+                self._conv_relu_layer(
+                    in_channels=f_in_channel(i),
+                    out_channels=out_channels,
+                    padding=f_padding(i),
+                )
+                for i in range(n_layers)
+            ]
+        )
+        self.maxpool = maxpool
+    def _conv_relu_layer(self, in_channels, out_channels, padding=0):
+        return nn.Sequential(
+            nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=3,
+                padding=padding,
+            ),
+            nn.ReLU(),
+        )
+    def forward(self, x):
+        return self.layer(x)
+class Encoder(nn.Module):
+    def __init__(self, config):
+        super(Encoder, self).__init__()
+        self.encoder = nn.ModuleDict(
+            {
+                name: EncoderLayer(
+                    in_channels=block["in_channels"],
+                    out_channels=block["out_channels"],
+                    n_layers=block["n_layers"],
+                    all_padding=block["all_padding"],
+                    maxpool=block["maxpool"],
+                )
+                for name, block in config.items()
+            }
+        )
+        self.maxpool = nn.MaxPool2d(2)
+    def forward(self, x):
+        output = dict()
+        for i, (block_name, block) in enumerate(self.encoder.items()):
+            x = block(x)
+            output[block_name] = x
+            if block.maxpool:
+                x = self.maxpool(x)
+        return x, output
+"""
+upsampling blocks
+(second half of the 'U' in UNet)
+[DECODER]
+"""
+class DecoderLayer(nn.Module):
+    def __init__(
+        self, in_channels, out_channels, kernel_size=2, stride=2, padding=[0, 0]
+    ):
+        super(DecoderLayer, self).__init__()
+        self.up_conv = nn.ConvTranspose2d(
+            in_channels=in_channels,
+            out_channels=in_channels // 2,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding[0],
+        )
+        self.conv = nn.Sequential(
+            *[
+                self._conv_relu_layer(
+                    in_channels=in_channels if i == 0 else out_channels,
+                    out_channels=out_channels,
+                    padding=padding[1],
+                )
+                for i in range(2)
+            ]
+        )
+    def _conv_relu_layer(self, in_channels, out_channels, padding=0):
+        return nn.Sequential(
+            nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=3,
+                padding=padding,
+            ),
+            nn.ReLU(),
+        )
+    @staticmethod
+    def crop_cat(x, encoder_output):
+        delta = (encoder_output.shape[-1] - x.shape[-1]) // 2
+        encoder_output = encoder_output[
+            :, :, delta : delta + x.shape[-1], delta : delta + x.shape[-1]
+        ]
+        return torch.cat((encoder_output, x), dim=1)
+    def forward(self, x, encoder_output):
+        x = self.crop_cat(self.up_conv(x), encoder_output)
+        return self.conv(x)
+class Decoder(nn.Module):
+    def __init__(self, config):
+        super(Decoder, self).__init__()
+        self.decoder = nn.ModuleDict(
+            {
+                name: DecoderLayer(
+                    in_channels=block["in_channels"],
+                    out_channels=block["out_channels"],
+                    kernel_size=block["kernel_size"],
+                    stride=block["stride"],
+                    padding=block["padding"],
+                )
+                for name, block in config.items()
+            }
+        )
+    def forward(self, x, encoder_output):
+        for name, block in self.decoder.items():
+            x = block(x, encoder_output[name])
+        return x
+class UNet(nn.Module):
+    def __init__(self, encoder_config, decoder_config, nclasses):
+        super(UNet, self).__init__()
+        self.encoder = Encoder(config=encoder_config)
+        self.decoder = Decoder(config=decoder_config)
+        self.output = nn.Conv2d(
+            in_channels=decoder_config["block1"]["out_channels"],
+            out_channels=nclasses,
+            kernel_size=1,
+        )
+    def forward(self, x):
+        x, encoder_step_output = self.encoder(x)
+        x = self.decoder(x, encoder_step_output)
+        return self.output(x)