Update image_preprocessing_molmo.py

This modifies the `resize_and_pad` function to rely on `torch` and `torchvision` instead of `tensorflow`, which removes the `tensorflow` dependency and everything it pulls in. Those dependencies caused serious, hard-to-resolve installation problems for me.

Files changed (1) hide show

image_preprocessing_molmo.py +78 -44

image_preprocessing_molmo.py CHANGED Viewed

@@ -66,59 +66,93 @@ def normalize_image(image, offset, scale):
 def resize_and_pad(
-    image,
-    desired_output_size,
-    resize_method=InterpolationMode.BILINEAR,
-    pad_value=0,
-    normalize=True,
-    image_mean=OPENAI_CLIP_MEAN,
-    image_std=OPENAI_CLIP_STD,
-):
     desired_height, desired_width = desired_output_size
     height, width = image.shape[:2]
-    # Cast into float32 since the training code did this in float32 and it (very rarely) effects
-    # the results after rounding.
-    image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
-    image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
-    image_scale = min(image_scale_x, image_scale_y)
-    scaled_height = int(np.array(height, np.float32) * image_scale)
-    scaled_width = int(np.array(width, np.float32) * image_scale)
-    # if resize_method == "tensorflow":
-    #     FIXME remove
-    import tensorflow as tf
-    image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
-    image = tf.image.resize(
-        image,
         [scaled_height, scaled_width],
-        method=tf.image.ResizeMethod.BILINEAR,
-        antialias=True,
-    )
-    image = tf.clip_by_value(image, 0.0, 1.0)
-    image = image.numpy()
-    # else:
-    #     image = torch.permute(torch.from_numpy(image), [2, 0, 1])
-    #     image = convert_image_dtype(image)  # resize in flaot32
-    #     image = torchvision.transforms.Resize(
-    #         [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
-    #     )(image)
-    #     image = torch.clip(image, 0.0, 1.0)
-    #     image = torch.permute(image, [1, 2, 0]).numpy()
     top_pad = (desired_height - scaled_height) // 2
     left_pad = (desired_width - scaled_width) // 2
-    padding = [
-        [top_pad, desired_height - scaled_height - top_pad],
-        [left_pad, desired_width - scaled_width - left_pad],
-        [0, 0]
-    ]
-    image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
-    image = np.pad(image, padding, constant_values=pad_value)
     if normalize:
-        image = normalize_image(image, offset=image_mean, scale=image_std)
-    return image, image_mask
 def select_tiling(h, w, patch_size, max_num_patches):
     """Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""

 def resize_and_pad(
+    image: np.ndarray,
+    desired_output_size: List[int],
+    resize_method: str = "bilinear",
+    pad_value: float = 0,
+    normalize: bool = True,
+    image_mean: Optional[List[float]] = OPENAI_CLIP_MEAN,
+    image_std: Optional[List[float]] = OPENAI_CLIP_STD,
+) -> (np.ndarray, np.ndarray):
+    """Resize an image to fit the target size, then pad it to exactly that size.
+
+    The image is scaled by a single factor (aspect ratio preserved) so that it
+    fits inside ``desired_output_size``, centred, and the remaining border is
+    filled with ``pad_value``.
+
+    Args:
+        image: Input image as a ``[height, width, channels]`` array. Integer
+            arrays are rescaled to ``[0, 1]`` by dividing by the dtype's
+            maximum value; other dtypes are only cast to float32.
+        desired_output_size: Output size as ``[height, width]``.
+        resize_method: ``"bilinear"``, ``"nearest"`` or ``"bicubic"``
+            (case-insensitive), or an ``InterpolationMode`` member.
+        pad_value: Value written into the padded border, before normalization.
+        normalize: Whether to pass the padded image through ``normalize_image``.
+        image_mean: Passed to ``normalize_image`` as ``offset``.
+        image_std: Passed to ``normalize_image`` as ``scale``.
+
+    Returns:
+        A pair ``(padded_image, image_mask)``. ``image_mask`` is a boolean
+        ``[height, width]`` array that is True where the resized image lies
+        and False in the padding.
+
+    Raises:
+        ValueError: If ``resize_method`` is a string that is not supported.
+    """
     desired_height, desired_width = desired_output_size
     height, width = image.shape[:2]
+    # Do the scale arithmetic in float32: the replaced implementation did so
+    # because it (very rarely) changes the scaled size after rounding.
+    scale_y = np.float32(desired_height) / np.float32(height)
+    scale_x = np.float32(desired_width) / np.float32(width)
+    scale = min(scale_x, scale_y)
+    scaled_height = int(np.float32(height) * scale)
+    scaled_width = int(np.float32(width) * scale)
+    # Only integer images are rescaled to [0, 1]; dividing a float image by
+    # 255 would shrink values that are already in range. The cast happens in
+    # NumPy so that dtypes and strides torch cannot wrap are handled as well.
+    if np.issubdtype(image.dtype, np.integer):
+        max_value = np.iinfo(image.dtype).max
+        image_float = image.astype(np.float32) / max_value
+    else:
+        image_float = image.astype(np.float32)
+    # torch.from_numpy rejects negatively-strided views (e.g. flipped images).
+    image_float = np.ascontiguousarray(image_float)
+    image_tensor = torch.from_numpy(image_float).permute(2, 0, 1)
+    # Accept either an InterpolationMode or its lower-case name. "lanczos" is
+    # not offered: torchvision documents it as unsupported for tensor input.
+    if isinstance(resize_method, InterpolationMode):
+        interpolation = resize_method
+    else:
+        supported_methods = {
+            "bilinear": InterpolationMode.BILINEAR,
+            "nearest": InterpolationMode.NEAREST,
+            "bicubic": InterpolationMode.BICUBIC,
+        }
+        try:
+            interpolation = supported_methods[str(resize_method).lower()]
+        except KeyError:
+            raise ValueError(f"Unsupported resize method: {resize_method}") from None
+    # Resize the image
+    resized_image = torchvision.transforms.Resize(
         [scaled_height, scaled_width],
+        interpolation=interpolation,
+        antialias=True
+    )(image_tensor)
+    # Interpolation can overshoot slightly, so clip back into [0, 1]
+    resized_image = torch.clamp(resized_image, 0.0, 1.0)
+    # Back to a [height, width, channels] NumPy array
+    resized_image_np = resized_image.permute(1, 2, 0).numpy()
+    # Centre the image; any odd leftover pixel goes to the bottom/right edge
     top_pad = (desired_height - scaled_height) // 2
+    bottom_pad = desired_height - scaled_height - top_pad
     left_pad = (desired_width - scaled_width) // 2
+    right_pad = desired_width - scaled_width - left_pad
+    # Pad the image using NumPy
+    padded_image = np.pad(
+        resized_image_np,
+        pad_width=((top_pad, bottom_pad), (left_pad, right_pad), (0, 0)),
+        mode='constant',
+        constant_values=pad_value
+    )
+    # The mask is True over the resized image and False over the padding
+    image_mask = np.pad(
+        np.ones((scaled_height, scaled_width), dtype=bool),
+        pad_width=((top_pad, bottom_pad), (left_pad, right_pad)),
+        mode='constant',
+        constant_values=False
+    )
     if normalize:
+        padded_image = normalize_image(padded_image, offset=image_mean, scale=image_std)
+    return padded_image, image_mask
 def select_tiling(h, w, patch_size, max_num_patches):
     """Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""