Upload processor
- image_processing_hyperclovax.py  +789 -0
- preprocessor_config.json  +7 -8
- tokenizer_config.json  +2 -1
image_processing_hyperclovax.py
ADDED
@@ -0,0 +1,789 @@
import copy
import math
import os
from typing import Dict, List, Optional, Union

import numpy as np
import torch
from PIL import Image
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_processing_utils import (
    BaseImageProcessor,
    get_size_dict,
)
from transformers.image_transforms import (
    convert_to_rgb,
    get_resize_output_image_size,
    resize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from transformers.utils import TensorType, logging

logger = logging.get_logger(__name__)


class HCXImageProcessor(BaseImageProcessor):
    r"""
    Constructs a VLM image processor. Based on [`CLIPImageProcessor`], with additional techniques for processing high resolution images.

    Args:
        anyres: (bool) whether to enable the anyres feature.
        unpad: (bool) when anyres is used, whether to drop the visual tokens that correspond to pure padding from the LLM input.
        num_queries_vis_abstractor: (int) number of visual queries per grid when a resampler is used.
        possible_resolutions: (List) allowed resolution combinations for anyres, e.g. [[336, 336], [336, 672], [672, 336]].
        patch_size: (int) ViT patch size.
        pad_to_square: (bool) whether to pad the image to a square; if False, the (non-square) image is center cropped before being fed to the ViT.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        anyres: bool = False,
        unpad: bool = False,
        num_queries_vis_abstractor_image: int = 81,
        num_queries_vis_abstractor_video_slow: int = 81,
        num_queries_vis_abstractor_video_fast: int = 9,
        first_last_frames_slow_video: bool = False,
        possible_resolutions: List = [],
        patch_size: int = 14,
        pad_to_square: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_center_crop: bool = True,
        crop_size: Dict[str, int] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"shortest_edge": 336}
        size = get_size_dict(size, default_to_square=False)
        crop_size = crop_size if crop_size is not None else {"height": 336, "width": 336}
        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")

        self.do_resize = do_resize
        self.size = size
        self.anyres = anyres
        self.unpad = unpad
        self.num_queries_vis_abstractor_image = num_queries_vis_abstractor_image
        self.num_queries_vis_abstractor_video_slow = num_queries_vis_abstractor_video_slow
        self.num_queries_vis_abstractor_video_fast = num_queries_vis_abstractor_video_fast
        self.first_last_frames_slow_video = first_last_frames_slow_video
        self.possible_resolutions = [_resolution for _resolution in possible_resolutions]
        self.patch_size = patch_size
        self.pad_to_square = pad_to_square
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        default_to_square = True
        if "shortest_edge" in size:
            size = size["shortest_edge"]
            default_to_square = False
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")

        output_size = get_resize_output_image_size(
            image,
            size=size,
            default_to_square=default_to_square,
            input_data_format=input_data_format,
        )

        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def _preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_center_crop: bool = None,
        crop_size: int = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> Image.Image:
        images = make_list_of_images(images)

        if do_resize:
            images = [
                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
                for image in images
            ]

        if do_center_crop:
            images = [
                self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
            ]

        if do_rescale:
            images = [
                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) for image in images
            ]

        if do_normalize:
            images = [
                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
                for image in images
            ]

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
        ]

        return images

    def _resize_for_local_grids(
        self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension
    ) -> np.array:
        new_height, new_width = _get_local_grids_output_size(image, target_resolution, input_data_format)

        # Resize the image
        resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)

        return resized_image

    def _pad_for_patching(
        self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension
    ) -> np.array:
        """
        Pad an image to a target resolution while maintaining aspect ratio.
        """
        target_height, target_width = target_resolution

        background_color = tuple(int(x * 255) for x in self.image_mean)
        padded_image = pad(
            image,
            target_size=(target_height, target_width),
            background_color=background_color,
            input_data_format=input_data_format,
        )

        return padded_image

    def get_image_grids(
        self,
        image: np.array,
        possible_resolutions,
        grid_size: int,
        resample: PILImageResampling,
        data_format: ChannelDimension,
        input_data_format: ChannelDimension,
    ) -> List[np.array]:
        if not isinstance(possible_resolutions, list):
            raise ValueError("possible_resolutions must be a list of possible resolutions.")

        image_size = get_image_size(image, channel_dim=input_data_format)
        best_resolution = select_best_resolution(image_size, possible_resolutions)
        resized_image = self._resize_for_local_grids(
            image, best_resolution, resample=resample, input_data_format=input_data_format
        )
        padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format)
        local_grids = divide_to_grids(padded_image, grid_size=grid_size, input_data_format=input_data_format)

        # make sure that all patches are in the input data format
        local_grids = [
            to_channel_dimension_format(grid, channel_dim=data_format, input_channel_dim=input_data_format)
            for grid in local_grids
        ]

        return local_grids

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        anyres: bool = None,
        unpad: bool = None,
        is_video: bool = False,
        num_queries_vis_abstractor_image: int = None,
        num_queries_vis_abstractor_video_slow: int = None,
        num_queries_vis_abstractor_video_fast: int = None,
        first_last_frames_slow_video: bool = None,
        possible_resolutions: List = None,
        patch_size: int = None,
        pad_to_square: bool = None,
        resample: PILImageResampling = None,
        do_center_crop: bool = None,
        crop_size: int = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        return_dummy_image: bool = False,
        first_last_frames_slow: bool = False,
        is_first_or_last_frames: bool = False,
        **kwargs,
    ):
        """
        Preprocesses images into image tensors, original image sizes (width, height), and visual token counts.

        :return pixel_values: List of 4D image tensors.
        :return image_sizes: List of dicts with the image width and height, e.g.
            [{"width": width of image 1, "height": height of image 1}, {"width": width of image 2, "height": height of image 2}, ...]
        :return vision_query_lengths: List of ints, the number of visual tokens each image contributes to the LLM input.
        """

        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        size = get_size_dict(size, param_name="size", default_to_square=False)
        anyres = anyres if anyres is not None else self.anyres
        unpad = unpad if unpad is not None else self.unpad
        num_queries_vis_abstractor_image = (
            num_queries_vis_abstractor_image
            if num_queries_vis_abstractor_image is not None
            else self.num_queries_vis_abstractor_image
        )
        num_queries_vis_abstractor_video_slow = (
            num_queries_vis_abstractor_video_slow
            if num_queries_vis_abstractor_video_slow is not None
            else self.num_queries_vis_abstractor_video_slow
        )
        num_queries_vis_abstractor_video_fast = (
            num_queries_vis_abstractor_video_fast
            if num_queries_vis_abstractor_video_fast is not None
            else self.num_queries_vis_abstractor_video_fast
        )
        first_last_frames_slow_video = (
            first_last_frames_slow_video
            if first_last_frames_slow_video is not None
            else self.first_last_frames_slow_video
        )
        possible_resolutions = possible_resolutions if possible_resolutions is not None else self.possible_resolutions
        patch_size = patch_size if patch_size is not None else self.patch_size
        pad_to_square = pad_to_square if pad_to_square is not None else self.pad_to_square
        resample = resample if resample is not None else self.resample
        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
        crop_size = crop_size if crop_size is not None else self.crop_size
        crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if is_video:
            num_queries_vis_abstractor = num_queries_vis_abstractor_video_fast
            num_queries_vis_abstractor_slow = num_queries_vis_abstractor_video_slow
            unpad = False
        else:
            num_queries_vis_abstractor = num_queries_vis_abstractor_image
            num_queries_vis_abstractor_slow = 0

        if return_dummy_image:
            images = Image.new("RGB", (224, 224), (0, 0, 0))

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        new_images = []
        image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
        vision_query_lengths = []

        assert crop_size["height"] == crop_size["width"]

        # Padding the global image can become a bottleneck when the original width/height is large,
        # so the longer side is resized to size["shortest_edge"] first and padding is applied afterwards.
        if anyres:
            anyres_global_images = copy.deepcopy(images)
            if pad_to_square:
                background_color = tuple(int(x * 255) for x in self.image_mean)
                anyres_global_images = [
                    resize_longside(copy.deepcopy(image), size["shortest_edge"], resample, input_data_format)
                    for image in anyres_global_images
                ]
                anyres_global_images = [
                    expand2square(image, background_color=background_color, input_data_format=input_data_format)[0]
                    for image in anyres_global_images
                ]
            else:
                anyres_global_images = [
                    self.resize(
                        image=image,
                        size={"height": size["shortest_edge"], "width": size["shortest_edge"]},
                        resample=resample,
                        input_data_format=input_data_format,
                    )
                    for image in anyres_global_images
                ]
        else:
            anyres_global_images = [None for _ in range(len(images))]
            if pad_to_square:
                background_color = tuple(int(x * 255) for x in self.image_mean)
                images = [
                    resize_longside(image, size["shortest_edge"], resample, input_data_format) for image in images
                ]
                images = [
                    expand2square(image, background_color=background_color, input_data_format=input_data_format)[0]
                    for image in images
                ]

        for image, anyres_global_image, image_size in zip(images, anyres_global_images, image_sizes):
            if anyres:
                # convert the image into a list of grids
                # we intentionally use the same data format as the input data format
                image_grids = self.get_image_grids(
                    image,
                    possible_resolutions,
                    grid_size=crop_size["height"],
                    resample=resample,
                    data_format=input_data_format,
                    input_data_format=input_data_format,
                )
                # The global image (thumbnail) is not used for video inputs.
                if not is_video:
                    image_grids = [anyres_global_image] + image_grids
            else:
                image_grids = [image]

            pixel_values = self._preprocess(
                image_grids,
                do_resize=do_resize,
                size=size,
                resample=resample,
                do_center_crop=do_center_crop,
                crop_size=crop_size,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
                input_data_format=input_data_format,
            )

            pixel_values = np.array(pixel_values)
            new_images.append(pixel_values)

            vision_query_length = determine_anyres_num_vision_patches(
                image_size=image_size,
                grid_size=crop_size["height"],
                patch_size=patch_size,
                possible_resolutions=possible_resolutions,
                anyres=anyres,
                unpad=unpad,
                num_queries_vis_abstractor=num_queries_vis_abstractor,
                num_queries_vis_abstractor_slow=num_queries_vis_abstractor_slow,
                is_video=is_video,
                first_last_frames_slow=first_last_frames_slow,
                is_first_or_last_frames=is_first_or_last_frames,
            )

            vision_query_lengths.append(vision_query_length)

        if return_dummy_image:
            vision_query_lengths = []

        data = {
            "pixel_values": [torch.tensor(new_image) for new_image in new_images],
            "image_sizes": [{"width": image_size[1], "height": image_size[0]} for image_size in image_sizes],
            "vision_query_lengths": vision_query_lengths,
        }

        return BatchFeature(data=data, tensor_type=return_tensors)

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        *args,
        **kwargs,
    ):
        self.register_for_auto_class()
        super().save_pretrained(save_directory, *args, **kwargs)

def determine_anyres_num_vision_patches(
    image_size,
    grid_size,
    patch_size,
    possible_resolutions,
    anyres=False,
    unpad=True,
    num_queries_vis_abstractor=0,
    num_queries_vis_abstractor_slow=0,
    is_video=False,
    first_last_frames_slow=False,  # sample-wise option
    is_first_or_last_frames=False,  # grid-wise option
):
    """
    Computes the number of visual tokens (patches) based on image resolution, grid configuration, and patch size.

    This function supports both fixed-size and any-resolution settings, as well as video-specific configurations
    such as handling slow frames and frame position flags.

    Args:
        image_size (tuple): The original image size as (height, width).
        grid_size (int): Size of each grid in pixels (e.g., 336).
        patch_size (int): Size of each vision patch (e.g., 14 for ViT models).
        possible_resolutions (list): List of possible resolution tuples [(h1, w1), (h2, w2), ...].
        anyres (bool, optional): Whether to use any-resolution mode. Defaults to False.
        unpad (bool, optional): Whether to unpad the image before computing patches. Defaults to True.
        num_queries_vis_abstractor (int, optional): Number of query tokens for the vision abstractor (fast path).
        num_queries_vis_abstractor_slow (int, optional): Number of query tokens for the vision abstractor (slow path).
        is_video (bool, optional): Whether the input is a video. Defaults to False.
        first_last_frames_slow (bool, optional): Whether to treat first/last video frames as "slow". Defaults to False.
        is_first_or_last_frames (bool, optional): Whether the current grid corresponds to a first/last frame. Defaults to False.

    Returns:
        int: Total number of visual tokens (patches) after processing.
    """

    if not anyres:
        return num_queries_vis_abstractor if num_queries_vis_abstractor > 0 else (grid_size // patch_size) ** 2

    if num_queries_vis_abstractor > 0:
        num_patch_per_grid = int(num_queries_vis_abstractor**0.5)
    else:
        num_patch_per_grid = grid_size // patch_size

    num_global_per_grid = num_patch_per_grid

    # In anyres mode, a global image is included, so there are always at least 2 grids.
    # However, for video inputs, there is no global image, so it's possible to have only 1 grid.
    # Therefore, the assertion below is commented out:
    # assert num_grids > 1

    # Compute the number of vision patches.
    height, width = select_best_resolution(image_size, possible_resolutions)

    num_patch_height = (height // grid_size) * num_patch_per_grid
    num_patch_width = (width // grid_size) * num_patch_per_grid

    # local images
    if unpad:
        original_height, original_width = image_size

        original_aspect_ratio = original_width / original_height
        current_aspect_ratio = num_patch_width / num_patch_height

        if original_aspect_ratio > current_aspect_ratio:
            scale_factor = num_patch_width / original_width
            new_height = int(original_height * scale_factor)
            padding = (num_patch_height - new_height) // 2
            num_patch_height = num_patch_height - padding * 2
        else:
            scale_factor = num_patch_height / original_height
            new_width = int(original_width * scale_factor)
            padding = (num_patch_width - new_width) // 2
            num_patch_width = num_patch_width - padding * 2

        num_patches = num_patch_width * num_patch_height + num_patch_height
    else:
        num_patches = num_patch_width * num_patch_height

    # In the "slow" strategy, when it is restricted to the first and last frames, it is applied exclusively to those two frames.
    if num_queries_vis_abstractor_slow > 0:
        if first_last_frames_slow:
            if is_first_or_last_frames:
                num_patches += num_queries_vis_abstractor_slow - num_queries_vis_abstractor
        else:
            num_patches += num_queries_vis_abstractor_slow - num_queries_vis_abstractor
        # The slowfast feature is only applicable when unpad is set to False.
        assert unpad is False

    # The global image is not included for video inputs.
    if not is_video:
        num_patches += num_global_per_grid**2

    return num_patches

+
|
564 |
+
|
565 |
+
def divide_to_grids(image: np.array, grid_size: int, input_data_format=None) -> List[np.array]:
|
566 |
+
"""
|
567 |
+
Divides a local image into grids of size (grid_size x grid_size).
|
568 |
+
|
569 |
+
Args:
|
570 |
+
image (np.array): Input image as a NumPy array.
|
571 |
+
grid_size (int): The size (in pixels) of each square grid.
|
572 |
+
input_data_format (optional): Optional format specifier (e.g., "channels_first" or "channels_last").
|
573 |
+
|
574 |
+
Returns:
|
575 |
+
List[np.array]: A list of image patches, each of size (grid_size x grid_size).
|
576 |
+
"""
|
577 |
+
grids = []
|
578 |
+
height, width = get_image_size(image, channel_dim=input_data_format)
|
579 |
+
for i in range(0, height, grid_size):
|
580 |
+
for j in range(0, width, grid_size):
|
581 |
+
if input_data_format == ChannelDimension.LAST:
|
582 |
+
grid = image[i : i + grid_size, j : j + grid_size]
|
583 |
+
else:
|
584 |
+
grid = image[:, i : i + grid_size, j : j + grid_size]
|
585 |
+
grids.append(grid)
|
586 |
+
|
587 |
+
return grids
|
588 |
+
|
589 |
+
|
590 |
+
def pad(
|
591 |
+
image: np.array,
|
592 |
+
target_size: tuple,
|
593 |
+
background_color=(127, 127, 127),
|
594 |
+
input_data_format=None,
|
595 |
+
) -> np.array:
|
596 |
+
"""
|
597 |
+
Pads the input image on the sides (top/bottom and left/right) to match the target height and width.
|
598 |
+
|
599 |
+
Args:
|
600 |
+
image (np.array): Input image as a NumPy array.
|
601 |
+
target_size (tuple): Target size as (target_height, target_width).
|
602 |
+
background_color (tuple, optional): RGB color value used for padding. Defaults to (127, 127, 127).
|
603 |
+
input_data_format (optional): Optional format specifier (e.g., "channels_first" or "channels_last").
|
604 |
+
|
605 |
+
Returns:
|
606 |
+
np.array: The padded image with the specified target size.
|
607 |
+
"""
|
608 |
+
target_height, target_width = target_size
|
609 |
+
height, width = get_image_size(image, channel_dim=input_data_format)
|
610 |
+
|
611 |
+
# result = np.ones((target_height, target_width, image.shape[2]), dtype=image.dtype) * background_color
|
612 |
+
result = np.empty((target_height, target_width, image.shape[2]), dtype=image.dtype)
|
613 |
+
for i in range(image.shape[2]):
|
614 |
+
result[..., i].fill(background_color[i])
|
615 |
+
|
616 |
+
paste_x = (target_width - width) // 2
|
617 |
+
paste_y = (target_height - height) // 2
|
618 |
+
|
619 |
+
result[paste_y : paste_y + height, paste_x : paste_x + width, :] = image
|
620 |
+
|
621 |
+
return result
|
622 |
+
|
623 |
+
|
624 |
+
def expand2square(
|
625 |
+
image: np.array,
|
626 |
+
bboxes_dict=None,
|
627 |
+
background_color=(127, 127, 127),
|
628 |
+
input_data_format=None,
|
629 |
+
) -> np.array:
|
630 |
+
"""
|
631 |
+
Expands the input image to a square shape by placing it at the center of a new square canvas,
|
632 |
+
with padding added to the shorter side (either top/bottom or left/right).
|
633 |
+
|
634 |
+
The image is always centered on the new canvas, and padding is applied symmetrically.
|
635 |
+
|
636 |
+
Args:
|
637 |
+
image (np.array): Input image as a NumPy array.
|
638 |
+
bboxes_dict (dict, optional): A dictionary of bounding boxes, where each value is an NDArray of shape (N, 4, 2)
|
639 |
+
with box coordinates in the format [[xtl, ytl], [xtr, ytr], [xbr, ybr], [xbl, ybl]].
|
640 |
+
Supports multiple categories (e.g., "ocr", "html") simultaneously.
|
641 |
+
background_color (tuple, optional): RGB color to fill the padding area. Defaults to (127, 127, 127).
|
642 |
+
input_data_format (optional): Optional format specifier for image data (e.g., "channels_first" or "channels_last").
|
643 |
+
|
644 |
+
Returns:
|
645 |
+
np.array: A square-shaped image with the original image centered and padded as needed.
|
646 |
+
|
647 |
+
Example:
|
648 |
+
>>> _img = np.ones((80, 100), dtype=np.uint8) * 100
|
649 |
+
>>> _bboxes_dict = {"words": np.array([[[10, 10], [20, 10], [20, 20], [10, 20]],
|
650 |
+
... [[30, 30], [40, 30], [40, 40], [30, 40]]])}
|
651 |
+
>>> _img, _bboxes_dict = expand2square(_img, _bboxes_dict, (255, 255, 255))
|
652 |
+
>>> _img.shape
|
653 |
+
(100, 100)
|
654 |
+
>>> guessed_ocr_bboxes = np.array([[[20, 10], [30, 10], [30, 20], [20, 20]],
|
655 |
+
... [[40, 30], [50, 30], [50, 40], [40, 40]]])
|
656 |
+
>>> np.testing.assert_array_almost_equal(_bboxes_dict["words"], guessed_ocr_bboxes) is None
|
657 |
+
True
|
658 |
+
"""
|
659 |
+
height, width = get_image_size(image, channel_dim=input_data_format)
|
660 |
+
if width == height:
|
661 |
+
return image, bboxes_dict
|
662 |
+
elif width > height:
|
663 |
+
# result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color
|
664 |
+
result = np.empty((width, width, image.shape[2]), dtype=image.dtype)
|
665 |
+
for i in range(image.shape[2]):
|
666 |
+
result[..., i].fill(background_color[i])
|
667 |
+
|
668 |
+
result[(width - height) // 2 : (width - height) // 2 + height, :] = image
|
669 |
+
if bboxes_dict is not None:
|
670 |
+
for key in bboxes_dict:
|
671 |
+
bboxes_dict[key][:, :, 1] += (width - height) // 2
|
672 |
+
return result, bboxes_dict
|
673 |
+
else:
|
674 |
+
# result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color
|
675 |
+
result = np.empty((height, height, image.shape[2]), dtype=image.dtype)
|
676 |
+
for i in range(image.shape[2]):
|
677 |
+
result[..., i].fill(background_color[i])
|
678 |
+
|
679 |
+
result[:, (height - width) // 2 : (height - width) // 2 + width] = image
|
680 |
+
if bboxes_dict is not None:
|
681 |
+
for key in bboxes_dict:
|
682 |
+
bboxes_dict[key][:, :, 0] += (height - width) // 2
|
683 |
+
return result, bboxes_dict
|
684 |
+
|
685 |
+
|
686 |
+
def resize_longside(
|
687 |
+
image: np.array,
|
688 |
+
size: int,
|
689 |
+
resample: PILImageResampling = PILImageResampling.BICUBIC, # type: ignore
|
690 |
+
data_format: Optional[Union[str, ChannelDimension]] = None,
|
691 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
692 |
+
):
|
693 |
+
"""
|
694 |
+
Resizes the image so that its longer side matches the specified size, maintaining the original aspect ratio.
|
695 |
+
|
696 |
+
Args:
|
697 |
+
image (np.array): Input image as a NumPy array.
|
698 |
+
size (int): Target size for the longer side of the image.
|
699 |
+
resample (PILImageResampling, optional): Resampling method to use during resizing. Defaults to BICUBIC.
|
700 |
+
data_format (str or ChannelDimension, optional): Output data format (e.g., "channels_first" or "channels_last").
|
701 |
+
input_data_format (str or ChannelDimension, optional): Input data format of the image.
|
702 |
+
|
703 |
+
Returns:
|
704 |
+
np.array: The resized image with its aspect ratio preserved.
|
705 |
+
"""
|
706 |
+
height, width = get_image_size(image, channel_dim=input_data_format)
|
707 |
+
|
708 |
+
if width == height:
|
709 |
+
target_height, target_width = size, size
|
710 |
+
elif width > height:
|
711 |
+
target_width = size
|
712 |
+
target_height = math.ceil(height / width * size)
|
713 |
+
else:
|
714 |
+
target_width = math.ceil(width / height * size)
|
715 |
+
target_height = size
|
716 |
+
|
717 |
+
return resize(
|
718 |
+
image,
|
719 |
+
size=(target_height, target_width),
|
720 |
+
resample=resample,
|
721 |
+
data_format=data_format,
|
722 |
+
input_data_format=input_data_format,
|
723 |
+
)
|
724 |
+
|
725 |
+
|
def _get_local_grids_output_size(image: np.array, target_resolution: tuple, input_data_format=None):
    """
    Computes the output height and width for resizing an image to the target resolution
    while preserving its aspect ratio, used to lay out the local grids.

    Args:
        image (np.array): Input image as a NumPy array.
        target_resolution (tuple): Target resolution in the format (target_height, target_width).
        input_data_format (optional): Optional format specifier (e.g., "channels_first" or "channels_last").

    Returns:
        tuple: A tuple (new_height, new_width) for the resized image.
    """
    original_height, original_width = get_image_size(image, channel_dim=input_data_format)
    target_height, target_width = target_resolution

    scale_w = target_width / original_width
    scale_h = target_height / original_height

    if scale_w < scale_h:
        new_width = target_width
        new_height = min(math.ceil(original_height * scale_w), target_height)
    else:
        new_height = target_height
        new_width = min(math.ceil(original_width * scale_h), target_width)

    return new_height, new_width


def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
    """
    Selects the best-fit resolution from a list of possible resolutions based on the original image size.

    This function, adapted from LLaVA-Next
    (https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/llava_next/image_processing_llava_next.py),
    evaluates each resolution by computing its effective and wasted area compared to the original size.
    The optimal resolution is the one that maximizes the effective area while minimizing unused (wasted) space.

    Args:
        original_size (tuple): The original image size in the format (height, width).
        possible_resolutions (list): A list of candidate resolutions in the format [(height1, width1), (height2, width2), ...].

    Returns:
        tuple: The best-fit resolution in the format (height, width).
    """
    original_height, original_width = original_size
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float("inf")

    for height, width in possible_resolutions:
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
        wasted_resolution = (width * height) - effective_resolution

        if effective_resolution > max_effective_resolution or (
            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
        ):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (height, width)

    return best_fit
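
A minimal usage sketch, not part of the commit: it assumes the file above is saved locally as image_processing_hyperclovax.py and exercises the default (non-anyres, pad-to-square) path end to end with the 378-pixel crop size used by this repo's preprocessor_config.json.

# Sketch only: run HCXImageProcessor on a single PIL image (non-anyres path).
from PIL import Image

from image_processing_hyperclovax import HCXImageProcessor

processor = HCXImageProcessor(
    anyres=False,
    size={"shortest_edge": 378},
    crop_size={"height": 378, "width": 378},
)
out = processor.preprocess(Image.new("RGB", (640, 480)))

print(out["pixel_values"][0].shape)  # expected: torch.Size([1, 3, 378, 378]) - one grid per image
print(out["image_sizes"])            # expected: [{'width': 640, 'height': 480}]
print(out["vision_query_lengths"])   # expected: [81] with the default 81 image queries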
preprocessor_config.json
CHANGED
@@ -1,8 +1,8 @@
 {
   "anyres": true,
   "auto_map": {
-    "AutoImageProcessor": "
-    "AutoProcessor": "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B--
+    "AutoImageProcessor": "image_processing_hyperclovax.HCXImageProcessor",
+    "AutoProcessor": "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B--processing_hyperclovax.HCXProcessor"
   },
   "crop_size": {
     "height": 378,
@@ -13,21 +13,20 @@
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
-  "
+  "first_last_frames_slow_video": false,
   "image_mean": [
     0.5,
     0.5,
     0.5
   ],
-  "
+  "image_processor_class": "AutoImageProcessor",
+  "image_processor_type": "HCXImageProcessor",
   "image_std": [
     0.5,
     0.5,
     0.5
   ],
-  "
-  "max_num_grids": 9,
-  "num_queries_vis_abstractor": 81,
+  "num_queries_vis_abstractor_image": 81,
   "num_queries_vis_abstractor_video_fast": 9,
   "num_queries_vis_abstractor_video_slow": 81,
   "pad_to_square": true,
@@ -126,7 +125,7 @@
       378
     ]
   ],
-  "processor_class": "
+  "processor_class": "HCXProcessor",
   "resample": 2,
   "rescale_factor": 0.00392156862745098,
   "size": {
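
As a rough illustration (not from the commit) of how these config values turn into visual token counts, the snippet below calls determine_anyres_num_vision_patches from the added module with crop_size 378 and 81 queries per grid; the image size and the possible_resolutions list are assumptions made for the example.

# Illustrative only: token count for one 378x756 image under anyres with the
# settings above. possible_resolutions here is an assumed subset for the example.
from image_processing_hyperclovax import determine_anyres_num_vision_patches

num_tokens = determine_anyres_num_vision_patches(
    image_size=(378, 756),  # (height, width) of the original image
    grid_size=378,          # crop_size["height"]
    patch_size=14,
    possible_resolutions=[[378, 378], [378, 756], [756, 378]],
    anyres=True,
    unpad=False,
    num_queries_vis_abstractor=81,  # num_queries_vis_abstractor_image
)
# 81 queries per grid -> 9 tokens per side; the best-fit resolution (378, 756)
# gives a 1x2 arrangement of local grids (9 * 18 = 162 tokens) plus the
# 9 * 9 = 81 tokens of the global thumbnail.
print(num_tokens)  # 243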
tokenizer_config.json
CHANGED
@@ -503,5 +503,6 @@
   "pad_token": "<|endoftext|>",
   "processor_class": "HCXProcessor",
   "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
+  "unk_token": "<|endoftext|>",
+  "use_fast": true
 }
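
With the auto_map and processor_class entries above in place, the custom classes should resolve through the standard transformers Auto* APIs; a minimal sketch (the trust_remote_code mechanism is standard transformers behavior, and loading HCXProcessor assumes the referenced processing_hyperclovax.py exists in the repo, since it is not part of this commit):

# Sketch only: load the custom processors registered via auto_map.
# trust_remote_code=True is needed because the classes live in the repo itself.
from transformers import AutoImageProcessor, AutoProcessor

repo = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
image_processor = AutoImageProcessor.from_pretrained(repo, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)

print(type(image_processor).__name__)  # expected: HCXImageProcessor
print(type(processor).__name__)        # expected: HCXProcessor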