chrisc36 committed
Commit dd7f44e
1 Parent(s): ffdda66

Upload image_preprocessing_molmo.py with huggingface_hub

Files changed (1):
  1. image_preprocessing_molmo.py +38 -59
image_preprocessing_molmo.py CHANGED
@@ -15,36 +15,13 @@ from transformers.image_utils import (
     is_valid_image,
 )
 from transformers.processing_utils import ImagesKwargs
-from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
-from transformers.utils import TensorType, is_vision_available, logging
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.utils import logging


 logger = logging.get_logger(__name__)


-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-
-    Returns:
-        list: A list of images.
-    """
-    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
-        return [img for img_list in images for img in img_list]
-
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-
-    elif is_valid_image(images):
-        return [images]
-
-    raise ValueError(f"Could not make batched images from {images}")
-
-
 def pad_to_bounding_box(
     image, offset_height, offset_width, target_height,
     target_width, value=0
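For context on the deletion above: `make_batched_images` normalized the three accepted input shapes into a flat list before preprocessing. A minimal sketch of that behavior (assuming PIL images, which `is_valid_image` accepts; the removed function would need to be in scope):

```python
# Illustrative only: exercises the removed make_batched_images helper.
from PIL import Image

img = Image.new("RGB", (32, 32))
assert make_batched_images(img) == [img]                  # single image -> one-element list
assert make_batched_images([img, img]) == [img, img]      # flat list passes through
assert make_batched_images([[img], [img]]) == [img, img]  # nested lists are flattened
```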
@@ -68,7 +45,7 @@ def normalize_image(image, offset, scale):
 def resize_and_pad(
     image,
     desired_output_size,
-    resize_method=InterpolationMode.BILINEAR,
+    resize_method="torch-bilinear",
     pad_value=0,
     normalize=True,
     image_mean=OPENAI_CLIP_MEAN,
@@ -85,26 +62,29 @@ def resize_and_pad(
     scaled_height = int(np.array(height, np.float32) * image_scale)
     scaled_width = int(np.array(width, np.float32) * image_scale)

-    # if resize_method == "tensorflow":
-    # FIXME remove
-    import tensorflow as tf
-    image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
-    image = tf.image.resize(
-        image,
-        [scaled_height, scaled_width],
-        method=tf.image.ResizeMethod.BILINEAR,
-        antialias=True,
-    )
-    image = tf.clip_by_value(image, 0.0, 1.0)
-    image = image.numpy()
-    # else:
-    #     image = torch.permute(torch.from_numpy(image), [2, 0, 1])
-    #     image = convert_image_dtype(image)  # resize in flaot32
-    #     image = torchvision.transforms.Resize(
-    #         [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
-    #     )(image)
-    #     image = torch.clip(image, 0.0, 1.0)
-    #     image = torch.permute(image, [1, 2, 0]).numpy()
+    if resize_method == "tensorflow":
+        # This is how the original training code did resizing; it can produce slightly
+        # different results than torch resize, so we keep it just in case
+        import tensorflow as tf
+        image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
+        image = tf.image.resize(
+            image,
+            [scaled_height, scaled_width],
+            method=tf.image.ResizeMethod.BILINEAR,
+            antialias=True,
+        )
+        image = tf.clip_by_value(image, 0.0, 1.0)
+        image = image.numpy()
+    elif resize_method == "torch-bilinear":
+        image = torch.permute(torch.from_numpy(image), [2, 0, 1])
+        image = convert_image_dtype(image)  # resize in float32 to match the training code
+        image = torchvision.transforms.Resize(
+            [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
+        )(image)
+        image = torch.clip(image, 0.0, 1.0)
+        image = torch.permute(image, [1, 2, 0]).numpy()
+    else:
+        raise NotImplementedError(resize_method)

     top_pad = (desired_height - scaled_height) // 2
     left_pad = (desired_width - scaled_width) // 2
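The two branches above are not guaranteed to be bit-identical, which is the stated reason the TensorFlow path is retained. A standalone sketch (not part of the commit) that mirrors both branches on a dummy image to measure the gap:

```python
import numpy as np
import tensorflow as tf
import torch
import torchvision
from torchvision.transforms import InterpolationMode
from torchvision.transforms.functional import convert_image_dtype

image = np.random.default_rng(0).random((48, 64, 3), dtype=np.float32)

# torch-bilinear branch: HWC -> CHW, resize in float32, clip, back to HWC
x = torch.permute(torch.from_numpy(image), [2, 0, 1])
x = convert_image_dtype(x)
x = torchvision.transforms.Resize([24, 32], InterpolationMode.BILINEAR, antialias=True)(x)
torch_out = torch.permute(torch.clip(x, 0.0, 1.0), [1, 2, 0]).numpy()

# tensorflow branch: resize directly in HWC layout
y = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
y = tf.image.resize(y, [24, 32], method=tf.image.ResizeMethod.BILINEAR, antialias=True)
tf_out = tf.clip_by_value(y, 0.0, 1.0).numpy()

print(np.abs(torch_out - tf_out).max())  # small, but typically not exactly zero
```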
@@ -201,18 +181,6 @@ class MolmoImageProcessor(BaseImageProcessor):
         image_token_length_h: Optional[int] = None,
         image_patch_size: Optional[int] = None,
     ):
-        """Preprocesses an image
-
-        Returns:
-            crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
-                change between images but the other dimension are fixed
-            tokens: (n_tokens,) int32 tokens, pad tokens indicating where to insert the
-                patch features, might include other special tokens as well
-            patch_ordering: (n_crops, n_tokens_per_crop) order image features should be inserted
-                into the `tokens`, negative values indicates patches features to exclude
-            padding_mask: (n_crops, n_patches) what percent of each crop is padding, be None
-                if the image mask is not being used.
-        """
         if isinstance(base_image_input_size, int):
             base_image_input_size = (base_image_input_size, base_image_input_size)

@@ -438,7 +406,18 @@ class MolmoImageProcessor(BaseImageProcessor):
         image_patch_size: Optional[int] = None,
         **kwargs,
     ):
-        """Preprocesses a single image"""
+        """Preprocesses an image
+
+        Returns:
+            crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
+                change between images but the other dimensions are fixed
+            tokens: (n_tokens,) int32 tokens, pad tokens indicate where to insert the
+                patch features, might include other special tokens as well
+            image_idx: (n_crops, n_patches) index in `tokens` to put the patch features from the
+                crops after pooling, negative values indicate patch features to exclude
+            padding_mask: (n_crops, n_patches) what percent of each crop is padding, can be None
+                if the image mask is not being used.
+        """

         max_crops = max_crops or self.max_crops
         overlap_margins = overlap_margins or self.overlap_margins
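The relocated docstring documents how `image_idx` maps pooled crop features back into the token stream. A hypothetical sketch of a consumer (names and shapes are illustrative, not from this file): scatter each valid patch feature to its token position and skip negative indices:

```python
import numpy as np

def insert_patch_features(token_embeds, patch_features, image_idx):
    """token_embeds: (n_tokens, d); patch_features: (n_crops, n_patches, d);
    image_idx: (n_crops, n_patches) positions in token_embeds, negative = excluded."""
    flat_idx = image_idx.reshape(-1)
    flat_feat = patch_features.reshape(-1, patch_features.shape[-1])
    valid = flat_idx >= 0  # drop excluded patch features
    token_embeds[flat_idx[valid]] = flat_feat[valid]
    return token_embeds
```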
 