ctranslate2-4you committed on
Commit
964d596
1 Parent(s): b72f674

Update image_preprocessing_molmo.py

Browse files

This modifies the `resize_and_pad` function to rely on `torch` and `torchvision` instead of `tensorflow`, thus removing the `tensorflow` dependency and all of its related dependencies, which caused major and complex installation problems for me.

Files changed (1) hide show
  1. image_preprocessing_molmo.py +78 -44
image_preprocessing_molmo.py CHANGED
@@ -66,59 +66,93 @@ def normalize_image(image, offset, scale):
66
 
67
 
68
  def resize_and_pad(
69
- image,
70
- desired_output_size,
71
- resize_method=InterpolationMode.BILINEAR,
72
- pad_value=0,
73
- normalize=True,
74
- image_mean=OPENAI_CLIP_MEAN,
75
- image_std=OPENAI_CLIP_STD,
76
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  desired_height, desired_width = desired_output_size
78
  height, width = image.shape[:2]
79
 
80
- # Cast into float32 since the training code did this in float32 and it (very rarely) effects
81
- # the results after rounding.
82
- image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
83
- image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
84
- image_scale = min(image_scale_x, image_scale_y)
85
- scaled_height = int(np.array(height, np.float32) * image_scale)
86
- scaled_width = int(np.array(width, np.float32) * image_scale)
87
-
88
- # if resize_method == "tensorflow":
89
- # FIXME remove
90
- import tensorflow as tf
91
- image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
92
- image = tf.image.resize(
93
- image,
 
 
 
 
 
 
 
 
 
 
94
  [scaled_height, scaled_width],
95
- method=tf.image.ResizeMethod.BILINEAR,
96
- antialias=True,
97
- )
98
- image = tf.clip_by_value(image, 0.0, 1.0)
99
- image = image.numpy()
100
- # else:
101
- # image = torch.permute(torch.from_numpy(image), [2, 0, 1])
102
- # image = convert_image_dtype(image) # resize in flaot32
103
- # image = torchvision.transforms.Resize(
104
- # [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
105
- # )(image)
106
- # image = torch.clip(image, 0.0, 1.0)
107
- # image = torch.permute(image, [1, 2, 0]).numpy()
108
 
 
 
 
 
109
  top_pad = (desired_height - scaled_height) // 2
 
110
  left_pad = (desired_width - scaled_width) // 2
111
- padding = [
112
- [top_pad, desired_height - scaled_height - top_pad],
113
- [left_pad, desired_width - scaled_width - left_pad],
114
- [0, 0]
115
- ]
116
- image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
117
- image = np.pad(image, padding, constant_values=pad_value)
 
 
 
 
 
 
 
 
 
 
 
118
  if normalize:
119
- image = normalize_image(image, offset=image_mean, scale=image_std)
120
- return image, image_mask
121
 
 
122
 
123
  def select_tiling(h, w, patch_size, max_num_patches):
124
  """Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""
 
66
 
67
 
68
  def resize_and_pad(
69
+ image: np.ndarray,
70
+ desired_output_size: List[int],
71
+ resize_method: str = "bilinear",
72
+ pad_value: float = 0,
73
+ normalize: bool = True,
74
+ image_mean: Optional[List[float]] = OPENAI_CLIP_MEAN,
75
+ image_std: Optional[List[float]] = OPENAI_CLIP_STD,
76
+ ) -> (np.ndarray, np.ndarray):
77
+ """
78
+ Resize and pad the image to the desired output size.
79
+
80
+ Args:
81
+ image (np.ndarray): Input image as a NumPy array.
82
+ desired_output_size (List[int]): Desired output size as [height, width].
83
+ resize_method (str, optional): Resize interpolation method. Defaults to "bilinear".
84
+ pad_value (float, optional): Padding value. Defaults to 0.
85
+ normalize (bool, optional): Whether to normalize the image. Defaults to True.
86
+ image_mean (Optional[List[float]], optional): Mean for normalization. Defaults to OPENAI_CLIP_MEAN.
87
+ image_std (Optional[List[float]], optional): Standard deviation for normalization. Defaults to OPENAI_CLIP_STD.
88
+
89
+ Returns:
90
+ Tuple[np.ndarray, np.ndarray]: Resized and padded image, and image mask.
91
+ """
92
  desired_height, desired_width = desired_output_size
93
  height, width = image.shape[:2]
94
 
95
+ # Calculate scaling factors and determine the scaling factor to maintain aspect ratio
96
+ scale_y = desired_height / height
97
+ scale_x = desired_width / width
98
+ scale = min(scale_x, scale_y)
99
+ scaled_height = int(height * scale)
100
+ scaled_width = int(width * scale)
101
+
102
+ # Convert the image to a PyTorch tensor and normalize to [0, 1]
103
+ image_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
104
+
105
+ # Define the interpolation mode
106
+ if resize_method.lower() == "bilinear":
107
+ interpolation = InterpolationMode.BILINEAR
108
+ elif resize_method.lower() == "nearest":
109
+ interpolation = InterpolationMode.NEAREST
110
+ elif resize_method.lower() == "bicubic":
111
+ interpolation = InterpolationMode.BICUBIC
112
+ elif resize_method.lower() == "lanczos":
113
+ interpolation = InterpolationMode.LANCZOS
114
+ else:
115
+ raise ValueError(f"Unsupported resize method: {resize_method}")
116
+
117
+ # Resize the image
118
+ resized_image = torchvision.transforms.Resize(
119
  [scaled_height, scaled_width],
120
+ interpolation=interpolation,
121
+ antialias=True
122
+ )(image_tensor)
123
+
124
+ # Clip the image to ensure values are within [0, 1]
125
+ resized_image = torch.clamp(resized_image, 0.0, 1.0)
 
 
 
 
 
 
 
126
 
127
+ # Convert back to NumPy
128
+ resized_image_np = resized_image.permute(1, 2, 0).numpy()
129
+
130
+ # Calculate padding
131
  top_pad = (desired_height - scaled_height) // 2
132
+ bottom_pad = desired_height - scaled_height - top_pad
133
  left_pad = (desired_width - scaled_width) // 2
134
+ right_pad = desired_width - scaled_width - left_pad
135
+
136
+ # Pad the image using NumPy
137
+ padded_image = np.pad(
138
+ resized_image_np,
139
+ pad_width=((top_pad, bottom_pad), (left_pad, right_pad), (0, 0)),
140
+ mode='constant',
141
+ constant_values=pad_value
142
+ )
143
+
144
+ # Create the image mask
145
+ image_mask = np.pad(
146
+ np.ones((scaled_height, scaled_width), dtype=bool),
147
+ pad_width=((top_pad, bottom_pad), (left_pad, right_pad)),
148
+ mode='constant',
149
+ constant_values=False
150
+ )
151
+
152
  if normalize:
153
+ padded_image = normalize_image(padded_image, offset=image_mean, scale=image_std)
 
154
 
155
+ return padded_image, image_mask
156
 
157
  def select_tiling(h, w, patch_size, max_num_patches):
158
  """Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""