Bingsu committed on
Commit 4aaad99 · verified · 1 Parent(s): bf440f1

Upload processor

image_processing_hyperclovax.py ADDED
@@ -0,0 +1,789 @@
1
+ import copy
2
+ import math
3
+ import os
4
+ from typing import Dict, List, Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from PIL import Image
9
+ from transformers.feature_extraction_utils import BatchFeature
10
+ from transformers.image_processing_utils import (
11
+ BaseImageProcessor,
12
+ get_size_dict,
13
+ )
14
+ from transformers.image_transforms import (
15
+ convert_to_rgb,
16
+ get_resize_output_image_size,
17
+ resize,
18
+ to_channel_dimension_format,
19
+ )
20
+ from transformers.image_utils import (
21
+ OPENAI_CLIP_MEAN,
22
+ OPENAI_CLIP_STD,
23
+ ChannelDimension,
24
+ ImageInput,
25
+ PILImageResampling,
26
+ get_image_size,
27
+ infer_channel_dimension_format,
28
+ is_scaled_image,
29
+ make_list_of_images,
30
+ to_numpy_array,
31
+ valid_images,
32
+ )
33
+ from transformers.utils import TensorType, logging
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+
38
+ class HCXImageProcessor(BaseImageProcessor):
39
+ r"""
40
+ Constructs a VLM image processor. Based on [`CLIPImageProcessor`], incorporating additional techniques for processing high-resolution images.
41
+ Args:
42
+ anyres: (bool) whether or not to use the anyres feature
43
+ unpad: (bool) when anyres is used, whether or not to use the unpad feature (visual tokens that correspond to pure padding regions are removed from the LLM input)
44
+ num_queries_vis_abstractor: (int) number of visual queries per grid when a resampler is used
45
+ possible_resolutions: (List) possible resolution combinations when anyres is used, e.g. [[336, 336], [336, 672], [672, 336]]
46
+ patch_size: (int) ViT patch size
47
+ pad_to_square: (bool) whether or not to pad the image to a square. If False, the non-square image goes through a center crop before being fed into the ViT
48
+ """
49
+
50
+ model_input_names = ["pixel_values"]
51
+
52
+ def __init__(
53
+ self,
54
+ do_resize: bool = True,
55
+ size: Dict[str, int] = None,
56
+ anyres: bool = False,
57
+ unpad: bool = False,
58
+ num_queries_vis_abstractor_image: int = 81,
59
+ num_queries_vis_abstractor_video_slow: int = 81,
60
+ num_queries_vis_abstractor_video_fast: int = 9,
61
+ first_last_frames_slow_video: bool = False,
62
+ possible_resolutions: List = [],
63
+ patch_size: int = 14,
64
+ pad_to_square: bool = True,
65
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
66
+ do_center_crop: bool = True,
67
+ crop_size: Dict[str, int] = None,
68
+ do_rescale: bool = True,
69
+ rescale_factor: Union[int, float] = 1 / 255,
70
+ do_normalize: bool = True,
71
+ image_mean: Optional[Union[float, List[float]]] = None,
72
+ image_std: Optional[Union[float, List[float]]] = None,
73
+ do_convert_rgb: bool = True,
74
+ **kwargs,
75
+ ) -> None:
76
+ super().__init__(**kwargs)
77
+ size = size if size is not None else {"shortest_edge": 336}
78
+ size = get_size_dict(size, default_to_square=False)
79
+ crop_size = crop_size if crop_size is not None else {"height": 336, "width": 336}
80
+ crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
81
+
82
+ self.do_resize = do_resize
83
+ self.size = size
84
+ self.anyres = anyres
85
+ self.unpad = unpad
86
+ self.num_queries_vis_abstractor_image = num_queries_vis_abstractor_image
87
+ self.num_queries_vis_abstractor_video_slow = num_queries_vis_abstractor_video_slow
88
+ self.num_queries_vis_abstractor_video_fast = num_queries_vis_abstractor_video_fast
89
+ self.first_last_frames_slow_video = first_last_frames_slow_video
90
+ self.possible_resolutions = [_resolution for _resolution in possible_resolutions]
91
+ self.patch_size = patch_size
92
+ self.pad_to_square = pad_to_square
93
+ self.resample = resample
94
+ self.do_center_crop = do_center_crop
95
+ self.crop_size = crop_size
96
+ self.do_rescale = do_rescale
97
+ self.rescale_factor = rescale_factor
98
+ self.do_normalize = do_normalize
99
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
100
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
101
+ self.do_convert_rgb = do_convert_rgb
102
+
103
+ def resize(
104
+ self,
105
+ image: np.ndarray,
106
+ size: Dict[str, int],
107
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
108
+ data_format: Optional[Union[str, ChannelDimension]] = None,
109
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
110
+ **kwargs,
111
+ ) -> np.ndarray:
112
+ default_to_square = True
113
+ if "shortest_edge" in size:
114
+ size = size["shortest_edge"]
115
+ default_to_square = False
116
+ elif "height" in size and "width" in size:
117
+ size = (size["height"], size["width"])
118
+ else:
119
+ raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
120
+
121
+ output_size = get_resize_output_image_size(
122
+ image,
123
+ size=size,
124
+ default_to_square=default_to_square,
125
+ input_data_format=input_data_format,
126
+ )
127
+
128
+ return resize(
129
+ image,
130
+ size=output_size,
131
+ resample=resample,
132
+ data_format=data_format,
133
+ input_data_format=input_data_format,
134
+ **kwargs,
135
+ )
136
+
137
+ def _preprocess(
138
+ self,
139
+ images: ImageInput,
140
+ do_resize: bool = None,
141
+ size: Dict[str, int] = None,
142
+ resample: PILImageResampling = None,
143
+ do_center_crop: bool = None,
144
+ crop_size: int = None,
145
+ do_rescale: bool = None,
146
+ rescale_factor: float = None,
147
+ do_normalize: bool = None,
148
+ image_mean: Optional[Union[float, List[float]]] = None,
149
+ image_std: Optional[Union[float, List[float]]] = None,
150
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
151
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
152
+ ) -> Image.Image:
153
+ images = make_list_of_images(images)
154
+
155
+ if do_resize:
156
+ images = [
157
+ self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
158
+ for image in images
159
+ ]
160
+
161
+ if do_center_crop:
162
+ images = [
163
+ self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
164
+ ]
165
+
166
+ if do_rescale:
167
+ images = [
168
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) for image in images
169
+ ]
170
+
171
+ if do_normalize:
172
+ images = [
173
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
174
+ for image in images
175
+ ]
176
+
177
+ images = [
178
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
179
+ ]
180
+
181
+ return images
182
+
183
+ def _resize_for_local_grids(
184
+ self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension
185
+ ) -> np.array:
186
+ new_height, new_width = _get_local_grids_output_size(image, target_resolution, input_data_format)
187
+
188
+ # Resize the image
189
+ resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
190
+
191
+ return resized_image
192
+
193
+ def _pad_for_patching(
194
+ self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension
195
+ ) -> np.array:
196
+ """
197
+ Pad an image to a target resolution while maintaining aspect ratio.
198
+ """
199
+ target_height, target_width = target_resolution
200
+
201
+ background_color = tuple(int(x * 255) for x in self.image_mean)
202
+ padded_image = pad(
203
+ image,
204
+ target_size=(target_height, target_width),
205
+ background_color=background_color,
206
+ input_data_format=input_data_format,
207
+ )
208
+
209
+ return padded_image
210
+
211
+ def get_image_grids(
212
+ self,
213
+ image: np.array,
214
+ possible_resolutions,
215
+ grid_size: int,
216
+ resample: PILImageResampling,
217
+ data_format: ChannelDimension,
218
+ input_data_format: ChannelDimension,
219
+ ) -> List[np.array]:
220
+ if not isinstance(possible_resolutions, list):
221
+ raise ValueError("possible_resolutions must be a list of possible resolutions.")
222
+
223
+ image_size = get_image_size(image, channel_dim=input_data_format)
224
+ best_resolution = select_best_resolution(image_size, possible_resolutions)
225
+ resized_image = self._resize_for_local_grids(
226
+ image, best_resolution, resample=resample, input_data_format=input_data_format
227
+ )
228
+ padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format)
229
+ local_grids = divide_to_grids(padded_image, grid_size=grid_size, input_data_format=input_data_format)
230
+
231
+ # make sure that all patches are in the input data format
232
+ local_grids = [
233
+ to_channel_dimension_format(grid, channel_dim=data_format, input_channel_dim=input_data_format)
234
+ for grid in local_grids
235
+ ]
236
+
237
+ return local_grids
238
+
239
+ def preprocess(
240
+ self,
241
+ images: ImageInput,
242
+ do_resize: bool = None,
243
+ size: Dict[str, int] = None,
244
+ anyres: bool = None,
245
+ unpad: bool = None,
246
+ is_video: bool = False,
247
+ num_queries_vis_abstractor_image: int = None,
248
+ num_queries_vis_abstractor_video_slow: int = None,
249
+ num_queries_vis_abstractor_video_fast: int = None,
250
+ first_last_frames_slow_video: bool = None,
251
+ possible_resolutions: List = None,
252
+ patch_size: int = None,
253
+ pad_to_square: bool = None,
254
+ resample: PILImageResampling = None,
255
+ do_center_crop: bool = None,
256
+ crop_size: int = None,
257
+ do_rescale: bool = None,
258
+ rescale_factor: float = None,
259
+ do_normalize: bool = None,
260
+ image_mean: Optional[Union[float, List[float]]] = None,
261
+ image_std: Optional[Union[float, List[float]]] = None,
262
+ do_convert_rgb: bool = None,
263
+ return_tensors: Optional[Union[str, TensorType]] = None,
264
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
265
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
266
+ return_dummy_image: bool = False,
267
+ first_last_frames_slow: bool = False,
268
+ is_first_or_last_frames: bool = False,
269
+ **kwargs,
270
+ ):
271
+ """
272
+ Produces image tensors, original image sizes (width, height), and visual token counts with HCXVisionImageProcessor.
273
+ :return pixel_values: image tensors as a List of 4D tensors
274
+ :return image_sizes: image widths and heights as a List of Dicts, [{"width": width of image 1, "height": height of image 1}, {"width": width of image 2, "height": height of image 2}, ...]
275
+ :return vision_query_lengths: List of ints, the number of visual tokens each image is converted into when passed as LLM input
276
+ """
277
+
278
+ do_resize = do_resize if do_resize is not None else self.do_resize
279
+ size = size if size is not None else self.size
280
+ size = get_size_dict(size, param_name="size", default_to_square=False)
281
+ anyres = anyres if anyres is not None else self.anyres
282
+ unpad = unpad if unpad is not None else self.unpad
283
+ num_queries_vis_abstractor_image = (
284
+ num_queries_vis_abstractor_image
285
+ if num_queries_vis_abstractor_image is not None
286
+ else self.num_queries_vis_abstractor_image
287
+ )
288
+ num_queries_vis_abstractor_video_slow = (
289
+ num_queries_vis_abstractor_video_slow
290
+ if num_queries_vis_abstractor_video_slow is not None
291
+ else self.num_queries_vis_abstractor_video_slow
292
+ )
293
+ num_queries_vis_abstractor_video_fast = (
294
+ num_queries_vis_abstractor_video_fast
295
+ if num_queries_vis_abstractor_video_fast is not None
296
+ else self.num_queries_vis_abstractor_video_fast
297
+ )
298
+ first_last_frames_slow_video = (
299
+ first_last_frames_slow_video
300
+ if first_last_frames_slow_video is not None
301
+ else self.first_last_frames_slow_video
302
+ )
303
+ possible_resolutions = possible_resolutions if possible_resolutions is not None else self.possible_resolutions
304
+ patch_size = patch_size if patch_size is not None else self.patch_size
305
+ pad_to_square = pad_to_square if pad_to_square is not None else self.pad_to_square
306
+ resample = resample if resample is not None else self.resample
307
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
308
+ crop_size = crop_size if crop_size is not None else self.crop_size
309
+ crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
310
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
311
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
312
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
313
+ image_mean = image_mean if image_mean is not None else self.image_mean
314
+ image_std = image_std if image_std is not None else self.image_std
315
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
316
+
317
+ if is_video:
318
+ num_queries_vis_abstractor = num_queries_vis_abstractor_video_fast
319
+ num_queries_vis_abstractor_slow = num_queries_vis_abstractor_video_slow
320
+ unpad = False
321
+ else:
322
+ num_queries_vis_abstractor = num_queries_vis_abstractor_image
323
+ num_queries_vis_abstractor_slow = 0
324
+
325
+ if return_dummy_image:
326
+ images = Image.new("RGB", (224, 224), (0, 0, 0))
327
+
328
+ images = make_list_of_images(images)
329
+
330
+ if not valid_images(images):
331
+ raise ValueError(
332
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
333
+ "torch.Tensor, tf.Tensor or jax.ndarray."
334
+ )
335
+
336
+ if do_convert_rgb:
337
+ images = [convert_to_rgb(image) for image in images]
338
+
339
+ # All transformations expect numpy arrays.
340
+ images = [to_numpy_array(image) for image in images]
341
+
342
+ if is_scaled_image(images[0]) and do_rescale:
343
+ logger.warning_once(
344
+ "It looks like you are trying to rescale already rescaled images. If the input"
345
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
346
+ )
347
+
348
+ if input_data_format is None:
349
+ # We assume that all images have the same channel dimension format.
350
+ input_data_format = infer_channel_dimension_format(images[0])
351
+
352
+ new_images = []
353
+ image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
354
+ vision_query_lengths = []
355
+
356
+ assert crop_size["height"] == crop_size["width"]
357
+
358
+ # Padding the global image can become a bottleneck when the original image width and height are large.
359
+ # Resize so that the longer side becomes size["shortest_edge"] first, then pad.
360
+ if anyres:
361
+ anyres_global_images = copy.deepcopy(images)
362
+ if pad_to_square:
363
+ background_color = tuple(int(x * 255) for x in self.image_mean)
364
+ anyres_global_images = [
365
+ resize_longside(copy.deepcopy(image), size["shortest_edge"], resample, input_data_format)
366
+ for image in anyres_global_images
367
+ ]
368
+ anyres_global_images = [
369
+ expand2square(image, background_color=background_color, input_data_format=input_data_format)[0]
370
+ for image in anyres_global_images
371
+ ]
372
+ else:
373
+ anyres_global_images = [
374
+ self.resize(
375
+ image=image,
376
+ size={"height": size["shortest_edge"], "width": size["shortest_edge"]},
377
+ resample=resample,
378
+ input_data_format=input_data_format,
379
+ )
380
+ for image in anyres_global_images
381
+ ]
382
+ else:
383
+ anyres_global_images = [None for _ in range(len(images))]
384
+ if pad_to_square:
385
+ background_color = tuple(int(x * 255) for x in self.image_mean)
386
+ images = [
387
+ resize_longside(image, size["shortest_edge"], resample, input_data_format) for image in images
388
+ ]
389
+ images = [
390
+ expand2square(image, background_color=background_color, input_data_format=input_data_format)[0]
391
+ for image in images
392
+ ]
393
+
394
+ for image, anyres_global_image, image_size in zip(images, anyres_global_images, image_sizes):
395
+ if anyres:
396
+ # convert image into a list of grids
397
+ # we intentionally use the same data format as the input data format
398
+ image_grids = self.get_image_grids(
399
+ image,
400
+ possible_resolutions,
401
+ grid_size=crop_size["height"],
402
+ resample=resample,
403
+ data_format=input_data_format,
404
+ input_data_format=input_data_format,
405
+ )
406
+ # For videos, the global image (thumbnail) is not used.
407
+ if not is_video:
408
+ image_grids = [anyres_global_image] + image_grids
409
+ else:
410
+ image_grids = [image]
411
+
412
+ pixel_values = self._preprocess(
413
+ image_grids,
414
+ do_resize=do_resize,
415
+ size=size,
416
+ resample=resample,
417
+ do_center_crop=do_center_crop,
418
+ crop_size=crop_size,
419
+ do_rescale=do_rescale,
420
+ rescale_factor=rescale_factor,
421
+ do_normalize=do_normalize,
422
+ image_mean=image_mean,
423
+ image_std=image_std,
424
+ data_format=data_format,
425
+ input_data_format=input_data_format,
426
+ )
427
+
428
+ pixel_values = np.array(pixel_values)
429
+ new_images.append(pixel_values)
430
+
431
+ vision_query_length = determine_anyres_num_vision_patches(
432
+ image_size=image_size,
433
+ grid_size=crop_size["height"],
434
+ patch_size=patch_size,
435
+ possible_resolutions=possible_resolutions,
436
+ anyres=anyres,
437
+ unpad=unpad,
438
+ num_queries_vis_abstractor=num_queries_vis_abstractor,
439
+ num_queries_vis_abstractor_slow=num_queries_vis_abstractor_slow,
440
+ is_video=is_video,
441
+ first_last_frames_slow=first_last_frames_slow,
442
+ is_first_or_last_frames=is_first_or_last_frames,
443
+ )
444
+
445
+ vision_query_lengths.append(vision_query_length)
446
+
447
+ if return_dummy_image:
448
+ vision_query_lengths = []
449
+
450
+ data = {
451
+ "pixel_values": [torch.tensor(new_image) for new_image in new_images],
452
+ "image_sizes": [{"width": image_size[1], "height": image_size[0]} for image_size in image_sizes],
453
+ "vision_query_lengths": vision_query_lengths,
454
+ }
455
+
456
+ return BatchFeature(data=data, tensor_type=return_tensors)
457
+
458
+ def save_pretrained(
459
+ self,
460
+ save_directory: Union[str, os.PathLike],
461
+ *args,
462
+ **kwargs,
463
+ ):
464
+ self.register_for_auto_class()
465
+ super().save_pretrained(save_directory, *args, **kwargs)
466
+
467
+
468
+ def determine_anyres_num_vision_patches(
469
+ image_size,
470
+ grid_size,
471
+ patch_size,
472
+ possible_resolutions,
473
+ anyres=False,
474
+ unpad=True,
475
+ num_queries_vis_abstractor=0,
476
+ num_queries_vis_abstractor_slow=0,
477
+ is_video=False,
478
+ first_last_frames_slow=False, # sample-wise option
479
+ is_first_or_last_frames=False, # grid-wise option
480
+ ):
481
+ """
482
+ Computes the number of visual tokens (patches) based on image resolution, grid configuration, and patch size.
483
+
484
+ This function supports both fixed-size and any-resolution settings, as well as video-specific configurations
485
+ such as handling slow frames and frame position flags.
486
+
487
+ Args:
489
+ image_size (tuple): The original image size as (height, width).
490
+ grid_size (int): Size of each grid in pixels (e.g., 336).
491
+ patch_size (int): Size of each vision patch (e.g., 14 for ViT models).
492
+ possible_resolutions (list): List of possible resolution tuples [(h1, w1), (h2, w2), ...].
493
+ anyres (bool, optional): Whether to use any-resolution mode. Defaults to False.
494
+ unpad (bool, optional): Whether to unpad the image before computing patches. Defaults to True.
495
+ num_queries_vis_abstractor (int, optional): Number of query tokens for vision abstractor (fast path).
496
+ num_queries_vis_abstractor_slow (int, optional): Number of query tokens for vision abstractor (slow path).
497
+ is_video (bool, optional): Whether the input is a video. Defaults to False.
498
+ first_last_frames_slow (bool, optional): Whether to treat first/last video frames as "slow". Defaults to False.
499
+ is_first_or_last_frames (bool, optional): Whether current grid corresponds to first/last frame. Defaults to False.
500
+
501
+ Returns:
502
+ int: Total number of visual tokens (patches) after processing.
503
+ """
504
+
505
+ if not anyres:
506
+ return num_queries_vis_abstractor if num_queries_vis_abstractor > 0 else (grid_size // patch_size) ** 2
507
+
508
+ if num_queries_vis_abstractor > 0:
509
+ num_patch_per_grid = int(num_queries_vis_abstractor**0.5)
510
+ else:
511
+ num_patch_per_grid = grid_size // patch_size
512
+
513
+ num_global_per_grid = num_patch_per_grid
514
+
515
+ # In anyres mode, a global image is included, so there are always at least 2 grids.
516
+ # However, for video inputs, there is no global image, so it's possible to have only 1 grid.
517
+ # Therefore, the assertion below is commented out:
518
+ # assert num_grids > 1
519
+
520
+ # Compute the number of vision patches.
521
+ height, width = select_best_resolution(image_size, possible_resolutions)
522
+
523
+ num_patch_height = (height // grid_size) * num_patch_per_grid
524
+ num_patch_width = (width // grid_size) * num_patch_per_grid
525
+
526
+ # local images
527
+ if unpad:
528
+ original_height, original_width = image_size
529
+
530
+ original_aspect_ratio = original_width / original_height
531
+ current_aspect_ratio = num_patch_width / num_patch_height
532
+
533
+ if original_aspect_ratio > current_aspect_ratio:
534
+ scale_factor = num_patch_width / original_width
535
+ new_height = int(original_height * scale_factor)
536
+ padding = (num_patch_height - new_height) // 2
537
+ num_patch_height = num_patch_height - padding * 2
538
+ else:
539
+ scale_factor = num_patch_height / original_height
540
+ new_width = int(original_width * scale_factor)
541
+ padding = (num_patch_width - new_width) // 2
542
+ num_patch_width = num_patch_width - padding * 2
543
+
544
+ num_patches = num_patch_width * num_patch_height + num_patch_height
545
+ else:
546
+ num_patches = num_patch_width * num_patch_height
547
+
548
+ # In the "slow" strategy, when applying to first and last frames only, it is applied exclusively to those two frames.
549
+ if num_queries_vis_abstractor_slow > 0:
550
+ if first_last_frames_slow:
551
+ if is_first_or_last_frames:
552
+ num_patches += num_queries_vis_abstractor_slow - num_queries_vis_abstractor
553
+ else:
554
+ num_patches += num_queries_vis_abstractor_slow - num_queries_vis_abstractor
555
+ # The slowfast feature is only applicable when unpad is set to False.
556
+ assert unpad is False
557
+
558
+ # Global image is not included for video inputs.
559
+ if not is_video:
560
+ num_patches += num_global_per_grid**2
561
+
562
+ return num_patches
563
+
564
+
565
+ def divide_to_grids(image: np.array, grid_size: int, input_data_format=None) -> List[np.array]:
566
+ """
567
+ Divides a local image into grids of size (grid_size x grid_size).
568
+
569
+ Args:
570
+ image (np.array): Input image as a NumPy array.
571
+ grid_size (int): The size (in pixels) of each square grid.
572
+ input_data_format (optional): Optional format specifier (e.g., "channels_first" or "channels_last").
573
+
574
+ Returns:
575
+ List[np.array]: A list of image patches, each of size (grid_size x grid_size).
576
+ """
577
+ grids = []
578
+ height, width = get_image_size(image, channel_dim=input_data_format)
579
+ for i in range(0, height, grid_size):
580
+ for j in range(0, width, grid_size):
581
+ if input_data_format == ChannelDimension.LAST:
582
+ grid = image[i : i + grid_size, j : j + grid_size]
583
+ else:
584
+ grid = image[:, i : i + grid_size, j : j + grid_size]
585
+ grids.append(grid)
586
+
587
+ return grids
588
+
589
+
590
+ def pad(
591
+ image: np.array,
592
+ target_size: tuple,
593
+ background_color=(127, 127, 127),
594
+ input_data_format=None,
595
+ ) -> np.array:
596
+ """
597
+ Pads the input image on the sides (top/bottom and left/right) to match the target height and width.
598
+
599
+ Args:
600
+ image (np.array): Input image as a NumPy array.
601
+ target_size (tuple): Target size as (target_height, target_width).
602
+ background_color (tuple, optional): RGB color value used for padding. Defaults to (127, 127, 127).
603
+ input_data_format (optional): Optional format specifier (e.g., "channels_first" or "channels_last").
604
+
605
+ Returns:
606
+ np.array: The padded image with the specified target size.
607
+ """
608
+ target_height, target_width = target_size
609
+ height, width = get_image_size(image, channel_dim=input_data_format)
610
+
611
+ # result = np.ones((target_height, target_width, image.shape[2]), dtype=image.dtype) * background_color
612
+ result = np.empty((target_height, target_width, image.shape[2]), dtype=image.dtype)
613
+ for i in range(image.shape[2]):
614
+ result[..., i].fill(background_color[i])
615
+
616
+ paste_x = (target_width - width) // 2
617
+ paste_y = (target_height - height) // 2
618
+
619
+ result[paste_y : paste_y + height, paste_x : paste_x + width, :] = image
620
+
621
+ return result
622
+
623
+
624
+ def expand2square(
625
+ image: np.array,
626
+ bboxes_dict=None,
627
+ background_color=(127, 127, 127),
628
+ input_data_format=None,
629
+ ) -> np.array:
630
+ """
631
+ Expands the input image to a square shape by placing it at the center of a new square canvas,
632
+ with padding added to the shorter side (either top/bottom or left/right).
633
+
634
+ The image is always centered on the new canvas, and padding is applied symmetrically.
635
+
636
+ Args:
637
+ image (np.array): Input image as a NumPy array.
638
+ bboxes_dict (dict, optional): A dictionary of bounding boxes, where each value is an NDArray of shape (N, 4, 2)
639
+ with box coordinates in the format [[xtl, ytl], [xtr, ytr], [xbr, ybr], [xbl, ybl]].
640
+ Supports multiple categories (e.g., "ocr", "html") simultaneously.
641
+ background_color (tuple, optional): RGB color to fill the padding area. Defaults to (127, 127, 127).
642
+ input_data_format (optional): Optional format specifier for image data (e.g., "channels_first" or "channels_last").
643
+
644
+ Returns:
645
+ np.array: A square-shaped image with the original image centered and padded as needed.
646
+
647
+ Example:
648
+ >>> _img = np.ones((80, 100), dtype=np.uint8) * 100
649
+ >>> _bboxes_dict = {"words": np.array([[[10, 10], [20, 10], [20, 20], [10, 20]],
650
+ ... [[30, 30], [40, 30], [40, 40], [30, 40]]])}
651
+ >>> _img, _bboxes_dict = expand2square(_img, _bboxes_dict, (255, 255, 255))
652
+ >>> _img.shape
653
+ (100, 100)
654
+ >>> guessed_ocr_bboxes = np.array([[[20, 10], [30, 10], [30, 20], [20, 20]],
655
+ ... [[40, 30], [50, 30], [50, 40], [40, 40]]])
656
+ >>> np.testing.assert_array_almost_equal(_bboxes_dict["words"], guessed_ocr_bboxes) is None
657
+ True
658
+ """
659
+ height, width = get_image_size(image, channel_dim=input_data_format)
660
+ if width == height:
661
+ return image, bboxes_dict
662
+ elif width > height:
663
+ # result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color
664
+ result = np.empty((width, width, image.shape[2]), dtype=image.dtype)
665
+ for i in range(image.shape[2]):
666
+ result[..., i].fill(background_color[i])
667
+
668
+ result[(width - height) // 2 : (width - height) // 2 + height, :] = image
669
+ if bboxes_dict is not None:
670
+ for key in bboxes_dict:
671
+ bboxes_dict[key][:, :, 1] += (width - height) // 2
672
+ return result, bboxes_dict
673
+ else:
674
+ # result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color
675
+ result = np.empty((height, height, image.shape[2]), dtype=image.dtype)
676
+ for i in range(image.shape[2]):
677
+ result[..., i].fill(background_color[i])
678
+
679
+ result[:, (height - width) // 2 : (height - width) // 2 + width] = image
680
+ if bboxes_dict is not None:
681
+ for key in bboxes_dict:
682
+ bboxes_dict[key][:, :, 0] += (height - width) // 2
683
+ return result, bboxes_dict
684
+
685
+
686
+ def resize_longside(
687
+ image: np.array,
688
+ size: int,
689
+ resample: PILImageResampling = PILImageResampling.BICUBIC, # type: ignore
690
+ data_format: Optional[Union[str, ChannelDimension]] = None,
691
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
692
+ ):
693
+ """
694
+ Resizes the image so that its longer side matches the specified size, maintaining the original aspect ratio.
695
+
696
+ Args:
697
+ image (np.array): Input image as a NumPy array.
698
+ size (int): Target size for the longer side of the image.
699
+ resample (PILImageResampling, optional): Resampling method to use during resizing. Defaults to BICUBIC.
700
+ data_format (str or ChannelDimension, optional): Output data format (e.g., "channels_first" or "channels_last").
701
+ input_data_format (str or ChannelDimension, optional): Input data format of the image.
702
+
703
+ Returns:
704
+ np.array: The resized image with its aspect ratio preserved.
705
+ """
706
+ height, width = get_image_size(image, channel_dim=input_data_format)
707
+
708
+ if width == height:
709
+ target_height, target_width = size, size
710
+ elif width > height:
711
+ target_width = size
712
+ target_height = math.ceil(height / width * size)
713
+ else:
714
+ target_width = math.ceil(width / height * size)
715
+ target_height = size
716
+
717
+ return resize(
718
+ image,
719
+ size=(target_height, target_width),
720
+ resample=resample,
721
+ data_format=data_format,
722
+ input_data_format=input_data_format,
723
+ )
724
+
725
+
726
+ def _get_local_grids_output_size(image: np.array, target_resolution: tuple, input_data_format=None):
727
+ """
728
+ Computes the number of local grids (patches) along the height and width when resizing an image
729
+ to the target resolution.
730
+
731
+ Args:
732
+ image (np.array): Input image as a NumPy array.
733
+ target_resolution (tuple): Target resolution in the format (target_height, target_width).
734
+ input_data_format (optional): Optional format specifier (e.g., "channels_first" or "channels_last").
735
+
736
+ Returns:
737
+ tuple: A tuple (grid_h, grid_w) representing the number of grids along the height and width.
738
+ """
739
+ original_height, original_width = get_image_size(image, channel_dim=input_data_format)
740
+ target_height, target_width = target_resolution
741
+
742
+ scale_w = target_width / original_width
743
+ scale_h = target_height / original_height
744
+
745
+ if scale_w < scale_h:
746
+ new_width = target_width
747
+ new_height = min(math.ceil(original_height * scale_w), target_height)
748
+ else:
749
+ new_height = target_height
750
+ new_width = min(math.ceil(original_width * scale_h), target_width)
751
+
752
+ return new_height, new_width
753
+
754
+
755
+ def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
756
+ """
757
+ Selects the best-fit resolution from a list of possible resolutions based on the original image size.
758
+
759
+ This function, adapted from LLaVA-Next
760
+ (https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/llava_next/image_processing_llava_next.py),
761
+ evaluates each resolution by computing its effective and wasted area compared to the original size.
762
+ The optimal resolution is the one that maximizes the effective area while minimizing unused (wasted) space.
763
+
764
+ Args:
765
+ original_size (tuple): The original image size in the format (height, width).
766
+ possible_resolutions (list): A list of candidate resolutions in the format [(height1, width1), (height2, width2), ...].
767
+
768
+ Returns:
769
+ tuple: The best-fit resolution in the format (height, width).
770
+ """
771
+ original_height, original_width = original_size
772
+ best_fit = None
773
+ max_effective_resolution = 0
774
+ min_wasted_resolution = float("inf")
775
+
776
+ for height, width in possible_resolutions:
777
+ scale = min(width / original_width, height / original_height)
778
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
779
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
780
+ wasted_resolution = (width * height) - effective_resolution
781
+
782
+ if effective_resolution > max_effective_resolution or (
783
+ effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
784
+ ):
785
+ max_effective_resolution = effective_resolution
786
+ min_wasted_resolution = wasted_resolution
787
+ best_fit = (height, width)
788
+
789
+ return best_fit
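
For quick reference, a minimal usage sketch of the uploaded image processor. This is an assumption-laden example, not part of the commit: the repo id is taken from the auto_map entries below, remote code must be trusted, and "example.jpg" is a placeholder path.

from PIL import Image
from transformers import AutoImageProcessor

# Repo id taken from the auto_map in preprocessor_config.json; example.jpg is a placeholder.
repo_id = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
image_processor = AutoImageProcessor.from_pretrained(repo_id, trust_remote_code=True)

image = Image.open("example.jpg").convert("RGB")
features = image_processor(images=image)  # dispatches to HCXImageProcessor.preprocess

print(features["pixel_values"][0].shape)   # (num_grids, 3, crop, crop) for the first image
print(features["image_sizes"])             # [{"width": ..., "height": ...}]
print(features["vision_query_lengths"])    # visual tokens each image contributes to the LLM input
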
preprocessor_config.json CHANGED
@@ -1,8 +1,8 @@
 {
   "anyres": true,
   "auto_map": {
-    "AutoImageProcessor": "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B--preprocessor.HCXVisionProcessor",
-    "AutoProcessor": "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B--preprocessor.HCXVisionProcessor"
+    "AutoImageProcessor": "image_processing_hyperclovax.HCXImageProcessor",
+    "AutoProcessor": "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B--processing_hyperclovax.HCXProcessor"
   },
   "crop_size": {
     "height": 378,
@@ -13,21 +13,20 @@
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
-  "first_last_frames_slow": false,
+  "first_last_frames_slow_video": false,
   "image_mean": [
     0.5,
     0.5,
     0.5
   ],
-  "image_processor_type": "HCXVisionProcessor",
+  "image_processor_class": "AutoImageProcessor",
+  "image_processor_type": "HCXImageProcessor",
   "image_std": [
     0.5,
     0.5,
     0.5
   ],
-  "max_image_cnt": 12,
-  "max_num_grids": 9,
-  "num_queries_vis_abstractor": 81,
+  "num_queries_vis_abstractor_image": 81,
   "num_queries_vis_abstractor_video_fast": 9,
   "num_queries_vis_abstractor_video_slow": 81,
   "pad_to_square": true,
@@ -126,7 +125,7 @@
       378
     ]
   ],
-  "processor_class": "HCXVisionProcessor",
+  "processor_class": "HCXProcessor",
   "resample": 2,
   "rescale_factor": 0.00392156862745098,
   "size": {
tokenizer_config.json CHANGED
@@ -503,5 +503,6 @@
   "pad_token": "<|endoftext|>",
   "processor_class": "HCXProcessor",
   "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
+  "unk_token": "<|endoftext|>",
+  "use_fast": true
 }
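
As a worked example of the visual-token arithmetic implemented by determine_anyres_num_vision_patches in the uploaded file, the sketch below assumes the module is importable from the working directory; grid size, patch size, and query count match preprocessor_config.json, while the image size and the small possible_resolutions subset are illustrative only.

from image_processing_hyperclovax import (
    determine_anyres_num_vision_patches,
    select_best_resolution,
)

grid_size = 378   # crop_size from preprocessor_config.json
patch_size = 14
possible_resolutions = [[378, 378], [378, 756], [756, 378], [756, 756]]  # illustrative subset

image_size = (540, 960)  # hypothetical (height, width)
print(select_best_resolution(image_size, possible_resolutions))  # -> (756, 756)

num_tokens = determine_anyres_num_vision_patches(
    image_size=image_size,
    grid_size=grid_size,
    patch_size=patch_size,
    possible_resolutions=possible_resolutions,
    anyres=True,
    unpad=False,
    num_queries_vis_abstractor=81,  # 9x9 queries per local grid
)
print(num_tokens)  # 2x2 local grids (4 * 81) plus 81 for the global image = 405
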