ViktorDo committed
Commit b600a74 · verified · 1 Parent(s): 771c2d5

Update README.md

Files changed (1):
1. README.md  +97 -97
README.md CHANGED
@@ -41,109 +41,109 @@ using a window size of 1024x1024.
The simplest way to use this model to segment an image of the Coralscapes dataset is as follows:

```python
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
from PIL import Image
from datasets import load_dataset

# Load an image from the coralscapes dataset or load your own image
dataset = load_dataset("EPFL-ECEO/coralscapes")
image = dataset["test"][42]["image"]

preprocessor = SegformerImageProcessor.from_pretrained("EPFL-ECEO/segformer-b2-finetuned-coralscapes-1024-1024")
model = SegformerForSemanticSegmentation.from_pretrained("EPFL-ECEO/segformer-b2-finetuned-coralscapes-1024-1024")

inputs = preprocessor(image, return_tensors = "pt")
outputs = model(**inputs)
outputs = preprocessor.post_process_semantic_segmentation(outputs, target_sizes=[(image.size[1], image.size[0])])
- label_pred = outputs[0].cpu().numpy()
+ label_pred = outputs[0].numpy()
```
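`label_pred` is a 2-D NumPy array holding one predicted class index per pixel, at the resolution of the input image. For a quick visual check, the mask can be shown next to the input; this is a minimal sketch that is not part of the README example and assumes `matplotlib` is installed:

```python
import matplotlib.pyplot as plt  # extra dependency, assumed available for visualisation only

# Display the input image and the predicted class-index mask side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
axes[0].imshow(image)
axes[0].set_title("Input")
axes[1].imshow(label_pred, cmap="tab20", interpolation="nearest")
axes[1].set_title("Predicted classes")
for ax in axes:
    ax.axis("off")
plt.show()
```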

While the above approach should still work for images of different sizes and scales, for images that are not close to the training size of the model (1024x1024) we recommend the following sliding-window approach, which gives better results:

```python
import torch
import torch.nn.functional as F
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
from PIL import Image
import numpy as np
from datasets import load_dataset

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def resize_image(image, target_size=1024):
    """
    Resize the image such that its smaller side equals target_size.
    """
    h_img, w_img = image.size
    if h_img < w_img:
        new_h, new_w = target_size, int(w_img * (target_size / h_img))
    else:
        new_h, new_w = int(h_img * (target_size / w_img)), target_size
    resized_img = image.resize((new_h, new_w))
    return resized_img

def segment_image(image, preprocessor, model, crop_size = (1024, 1024), num_classes = 40, transform=None):
    """
    Finds an optimal stride based on the image size and aspect ratio to create
    overlapping sliding windows of size 1024x1024 which are then fed into the model.
    """
    h_crop, w_crop = crop_size

    img = torch.Tensor(np.array(resize_image(image, target_size=1024)).transpose(2, 0, 1)).unsqueeze(0)
    batch_size, _, h_img, w_img = img.size()

    if transform:
        img = torch.Tensor(transform(image = img.numpy())["image"]).to(device)

    # Number of windows per axis: roughly 1.5x the minimum needed, so neighbouring windows overlap
    h_grids = int(np.round(3/2*h_img/h_crop)) if h_img > h_crop else 1
    w_grids = int(np.round(3/2*w_img/w_crop)) if w_img > w_crop else 1

    # Strides spread the windows so that, together with the clamping below, they cover the whole image
    h_stride = int((h_img - h_crop + h_grids -1)/(h_grids -1)) if h_grids > 1 else h_crop
    w_stride = int((w_img - w_crop + w_grids -1)/(w_grids -1)) if w_grids > 1 else w_crop

    # Accumulators: summed logits and the number of windows covering each pixel
    preds = img.new_zeros((batch_size, num_classes, h_img, w_img))
    count_mat = img.new_zeros((batch_size, 1, h_img, w_img))

    for h_idx in range(h_grids):
        for w_idx in range(w_grids):
            y1 = h_idx * h_stride
            x1 = w_idx * w_stride
            y2 = min(y1 + h_crop, h_img)
            x2 = min(x1 + w_crop, w_img)
            y1 = max(y2 - h_crop, 0)
            x1 = max(x2 - w_crop, 0)
            crop_img = img[:, :, y1:y2, x1:x2]
            with torch.no_grad():
                if preprocessor:
                    inputs = preprocessor(crop_img, return_tensors = "pt")
                    inputs["pixel_values"] = inputs["pixel_values"].to(device)
                else:
                    inputs = crop_img.to(device)
                outputs = model(**inputs)

            # Upsample the window logits to the crop size and paste them at the window's position
            resized_logits = F.interpolate(
                outputs.logits[0].unsqueeze(dim=0), size=crop_img.shape[-2:], mode="bilinear", align_corners=False
            )
            preds += F.pad(resized_logits,
                           (int(x1), int(preds.shape[3] - x2), int(y1),
                            int(preds.shape[2] - y2))).cpu()
            count_mat[:, :, y1:y2, x1:x2] += 1

    assert (count_mat == 0).sum() == 0
    preds = preds / count_mat
    preds = preds.argmax(dim=1)
    # Resize the class-index map back to the original image resolution
    preds = F.interpolate(preds.unsqueeze(0).type(torch.uint8), size=image.size[::-1], mode='nearest')
    label_pred = preds.squeeze().cpu().numpy()
    return label_pred

# Load an image from the coralscapes dataset or load your own image
dataset = load_dataset("EPFL-ECEO/coralscapes")
image = dataset["test"][42]["image"]

preprocessor = SegformerImageProcessor.from_pretrained("EPFL-ECEO/segformer-b2-finetuned-coralscapes-1024-1024")
model = SegformerForSemanticSegmentation.from_pretrained("EPFL-ECEO/segformer-b2-finetuned-coralscapes-1024-1024")
model = model.to(device)  # the windows are moved to `device`, so the model must live there too

label_pred = segment_image(image, preprocessor, model)
```
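To make the window layout concrete, here is a small worked example (illustration only, not part of the README): for an image that is 1024x2048 after resizing, the formulas above place three 1024-pixel-wide windows across the width with a stride of 513 pixels, so neighbouring windows overlap by roughly half.

```python
import numpy as np

# Window layout for a 1024 x 2048 resized image, crop size 1024 (worked example)
w_img, w_crop = 2048, 1024
w_grids = int(np.round(3/2 * w_img / w_crop))                   # 3 windows across the width
w_stride = int((w_img - w_crop + w_grids - 1) / (w_grids - 1))  # 513-pixel stride
starts = [min(i * w_stride, w_img - w_crop) for i in range(w_grids)]
print(w_grids, w_stride, starts)  # 3 513 [0, 513, 1024]
```

In the height direction the image already matches the crop size in this example, so a single row of windows is used.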
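As in the simple example, the returned `label_pred` holds one class index per pixel at the original image resolution. Assuming the checkpoint stores its label names in the model config (the usual `id2label` mapping of `transformers` checkpoints), a quick per-class coverage summary can be printed; again a sketch for illustration, not part of the README:

```python
import numpy as np

id2label = model.config.id2label  # index -> class-name mapping, assumed to be populated for this checkpoint
ids, counts = np.unique(label_pred, return_counts=True)
for class_id, count in zip(ids, counts):
    share = 100.0 * count / label_pred.size
    print(f"{id2label[int(class_id)]:>30}: {share:5.1f}% of pixels")
```

Classes that do not occur in the image simply do not appear in the printout.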
  ## Training & Evaluation Details
 