kjunh committed on
Commit b7d5f5c · 1 Parent(s): a157f67

Support Transformers AutoModel

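For orientation, a minimal sketch of the loading path this commit is aiming at: the new `__init__.py` registers the V1 classes with the `Auto*` factories, and `config.json` gains an `auto_map` entry. The package name and checkpoint path below are placeholders, not taken from the commit.

```python
# Hedged sketch: `v1_model` and "./checkpoint-dir" stand in for however this
# repository is actually packaged and wherever the weights live.
import v1_model  # noqa: F401  (importing runs __init__.py, which calls the Auto*.register hooks)

from transformers import AutoModelForCausalLM, AutoProcessor

model = AutoModelForCausalLM.from_pretrained("./checkpoint-dir")  # resolves to V1ForConditionalGeneration
processor = AutoProcessor.from_pretrained("./checkpoint-dir")     # resolves to V1Processor
```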
.gitignore ADDED
@@ -0,0 +1,5 @@
+backup/
+__pycache__/
+*.pyc
+*.swo
+*.swp
__init__.py ADDED
@@ -0,0 +1,23 @@
+from transformers import (
+    AutoModelForCausalLM,
+    AutoProcessor,
+    AutoImageProcessor,
+    AutoConfig,
+)
+
+from .processor import (
+    Qwen2VLImagePointerProcessor,
+    get_processor,
+    V1Processor,
+    collate_fn,
+)
+from .modeling_v1 import V1ForConditionalGeneration
+from .configuration_v1 import V1Config
+
+print("Registering V1 model and processor with Transformers")
+AutoConfig.register("v1", V1Config)
+AutoModelForCausalLM.register(
+    V1Config, V1ForConditionalGeneration
+)
+AutoProcessor.register(V1Config, V1Processor)
+AutoImageProcessor.register(V1Config, Qwen2VLImagePointerProcessor)
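A quick way to confirm the registration above took effect, assuming the package has been imported so the `register` calls have run:

```python
from transformers import AutoConfig

# AutoConfig.for_model maps a registered model_type string to its config class,
# so after importing this package it should hand back a V1Config instance.
config = AutoConfig.for_model("v1")
print(type(config).__name__)  # "V1Config"
```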
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "Qwen2_5_VL_PGNForConditionalGeneration"
+    "V1ForConditionalGeneration"
   ],
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
@@ -18,7 +18,7 @@
   "label_smoothing": 0.0,
   "max_position_embeddings": 128000,
   "max_window_layers": 28,
-  "model_type": "qwen2_5_vl_pgn",
+  "model_type": "v1",
   "normalize_copy_states": false,
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
@@ -75,5 +75,9 @@
   "vision_token_id": 151654,
   "vocab_size": 152064,
   "z_loss_top_k": 40,
-  "z_loss_weight": 1e-05
+  "z_loss_weight": 1e-05,
+  "auto_map": {
+    "AutoConfig": "configuration_v1.V1Config",
+    "AutoModelForConditionalGeneration": "modeling_v1.V1ForConditionalGeneration"
+  }
 }
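The new `auto_map` block also wires the checkpoint up for remote-code loading: with `trust_remote_code=True`, `AutoConfig` pulls `configuration_v1.V1Config` straight from the repository. A hedged sketch, with the repo id left as a placeholder:

```python
from transformers import AutoConfig

# "<user>/<this-repo>" is a placeholder; trust_remote_code executes configuration_v1.py from the repo.
config = AutoConfig.from_pretrained("<user>/<this-repo>", trust_remote_code=True)
print(type(config).__name__)  # "V1Config"
```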
configuration_v1.py ADDED
@@ -0,0 +1,236 @@
1
+ from typing import Optional
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.modeling_rope_utils import rope_config_validation
5
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
6
+ Qwen2_5_VLVisionConfig,
7
+ )
8
+
9
+
10
+ class V1Config(PretrainedConfig):
11
+ r"""
12
+ This is the configuration class to store the configuration of a [`V1ForConditionalGeneration`], a Qwen2.5-VL backbone
+ extended with a pointer/copy head. It is used to instantiate a V1 model according to the specified arguments, defining
+ the model architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of
+ the Qwen2.5-VL family, e.g. [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
16
+
17
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
18
+ documentation from [`PretrainedConfig`] for more information.
19
+
20
+
21
+ Args:
22
+ vocab_size (`int`, *optional*, defaults to 152064):
23
+ Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
24
+ `inputs_ids` passed when calling [`Qwen2_5_VLModel`]
25
+ hidden_size (`int`, *optional*, defaults to 8192):
26
+ Dimension of the hidden representations.
27
+ intermediate_size (`int`, *optional*, defaults to 29568):
28
+ Dimension of the MLP representations.
29
+ num_hidden_layers (`int`, *optional*, defaults to 80):
30
+ Number of hidden layers in the Transformer encoder.
31
+ num_attention_heads (`int`, *optional*, defaults to 64):
32
+ Number of attention heads for each attention layer in the Transformer encoder.
33
+ num_key_value_heads (`int`, *optional*, defaults to 8):
34
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
35
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
36
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
37
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
38
+ by meanpooling all the original heads within that group. For more details checkout [this
39
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
40
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
41
+ The non-linear activation function (function or string) in the decoder.
42
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
43
+ The maximum sequence length that this model might ever be used with.
44
+ initializer_range (`float`, *optional*, defaults to 0.02):
45
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
46
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
47
+ The epsilon used by the rms normalization layers.
48
+ use_cache (`bool`, *optional*, defaults to `True`):
49
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
50
+ relevant if `config.is_decoder=True`.
51
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
52
+ Whether the model's input and output word embeddings should be tied.
53
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
54
+ The base period of the RoPE embeddings.
55
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
56
+ Whether to use sliding window attention.
57
+ sliding_window (`int`, *optional*, defaults to 4096):
58
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
59
+ max_window_layers (`int`, *optional*, defaults to 80):
60
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
61
+ attention_dropout (`float`, *optional*, defaults to 0.0):
62
+ The dropout ratio for the attention probabilities.
63
+ vision_config (`Dict`, *optional*):
64
+ The config for the visual encoder initialization.
65
+ rope_scaling (`Dict`, *optional*):
66
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
67
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
68
+ accordingly.
69
+ Expected contents:
70
+ `rope_type` (`str`):
71
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
72
+ 'llama3'], with 'default' being the original RoPE implementation.
73
+ `factor` (`float`, *optional*):
74
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
75
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
76
+ original maximum pre-trained length.
77
+ `original_max_position_embeddings` (`int`, *optional*):
78
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
79
+ pretraining.
80
+ `attention_factor` (`float`, *optional*):
81
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
82
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
83
+ `factor` field to infer the suggested value.
84
+ `beta_fast` (`float`, *optional*):
85
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
86
+ ramp function. If unspecified, it defaults to 32.
87
+ `beta_slow` (`float`, *optional*):
88
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
89
+ ramp function. If unspecified, it defaults to 1.
90
+ `short_factor` (`List[float]`, *optional*):
91
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
92
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
93
+ size divided by the number of attention heads divided by 2
94
+ `long_factor` (`List[float]`, *optional*):
95
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
96
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
97
+ size divided by the number of attention heads divided by 2
98
+ `low_freq_factor` (`float`, *optional*):
99
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
100
+ `high_freq_factor` (`float`, *optional*):
101
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
102
+
103
+ ```python
104
+ >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
105
+
106
+ >>> # Initializing a Qwen2_5_VL style configuration
107
+ >>> configuration = Qwen2_5_VLConfig()
108
+
109
+ >>> # Initializing a model from the Qwen2-VL-7B style configuration
110
+ >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
111
+
112
+ >>> # Accessing the model configuration
113
+ >>> configuration = model.config
114
+ ```"""
115
+
116
+ model_type = "v1"
117
+ sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
118
+ keys_to_ignore_at_inference = ["past_key_values"]
119
+ # Default tensor parallel plan for base model `Qwen2_5_VL`
120
+ base_model_tp_plan = {
121
+ "layers.*.self_attn.q_proj": "colwise",
122
+ "layers.*.self_attn.k_proj": "colwise",
123
+ "layers.*.self_attn.v_proj": "colwise",
124
+ "layers.*.self_attn.o_proj": "rowwise",
125
+ "layers.*.mlp.gate_proj": "colwise",
126
+ "layers.*.mlp.up_proj": "colwise",
127
+ "layers.*.mlp.down_proj": "rowwise",
128
+ }
129
+ base_model_pp_plan = {
130
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
131
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
132
+ "norm": (["hidden_states"], ["hidden_states"]),
133
+ }
134
+
135
+ def __init__(
136
+ self,
137
+ vocab_size=152064,
138
+ hidden_size=8192,
139
+ intermediate_size=29568,
140
+ num_hidden_layers=80,
141
+ num_attention_heads=64,
142
+ num_key_value_heads=8,
143
+ hidden_act="silu",
144
+ max_position_embeddings=32768,
145
+ initializer_range=0.02,
146
+ rms_norm_eps=1e-05,
147
+ use_cache=True,
148
+ tie_word_embeddings=False,
149
+ rope_theta=1000000.0,
150
+ use_sliding_window=False,
151
+ sliding_window=4096,
152
+ max_window_layers=80,
153
+ attention_dropout=0.0,
154
+ vision_config=None,
155
+ rope_scaling=None,
156
+ region_token_id: int = 151662, # <|fim_pad|>
157
+ copy_token_start: int = 151665,
158
+ copy_token_num: int = 30000,
159
+ copy_scaler: float = 0.1,
160
+ use_embeddings_as_keys: bool = False,
161
+ normalize_copy_states: bool = False,
162
+ copy_extraction_layer: int = -1,
163
+ tie_copy_heads: bool = False,
164
+ use_cfg: bool = False,
165
+ copy_hidden_size: Optional[int] = None,
166
+ z_loss_weight: float = 1e-5,
167
+ z_loss_top_k: int = 40,
168
+ use_gate: bool = False,
169
+ label_smoothing: bool = False,
170
+ separate_copy_loss: bool = False,
171
+ do_copy: bool = True,
172
+ **kwargs,
173
+ ):
174
+ if isinstance(vision_config, dict):
175
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
176
+ elif vision_config is None:
177
+ self.vision_config = self.sub_configs["vision_config"]()
178
+
179
+ self.vocab_size = vocab_size
180
+ self.max_position_embeddings = max_position_embeddings
181
+ self.hidden_size = hidden_size
182
+ self.intermediate_size = intermediate_size
183
+ self.num_hidden_layers = num_hidden_layers
184
+ self.num_attention_heads = num_attention_heads
185
+ self.use_sliding_window = use_sliding_window
186
+ self.sliding_window = sliding_window
187
+ self.max_window_layers = max_window_layers
188
+
189
+ # for backward compatibility
190
+ if num_key_value_heads is None:
191
+ num_key_value_heads = num_attention_heads
192
+
193
+ self.num_key_value_heads = num_key_value_heads
194
+ self.hidden_act = hidden_act
195
+ self.initializer_range = initializer_range
196
+ self.rms_norm_eps = rms_norm_eps
197
+ self.use_cache = use_cache
198
+ self.rope_theta = rope_theta
199
+ self.attention_dropout = attention_dropout
200
+ self.rope_scaling = rope_scaling
201
+
202
+ # Validate the correctness of rotary position embeddings parameters
203
+ # BC: if there is a 'type' field, move it to 'rope_type'.
204
+ # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
205
+ # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
206
+ # TODO: @raushan update config in the hub
207
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
208
+ if self.rope_scaling["type"] == "mrope":
209
+ self.rope_scaling["type"] = "default"
210
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
211
+ rope_config_validation(self, ignore_keys={"mrope_section"})
212
+
213
+ self.region_token_id = region_token_id
214
+ self.copy_token_start = copy_token_start
215
+ self.copy_token_num = copy_token_num
216
+ self.copy_scaler = copy_scaler
217
+ self.use_embeddings_as_keys = use_embeddings_as_keys
218
+ self.normalize_copy_states = normalize_copy_states
219
+ self.copy_extraction_layer = copy_extraction_layer
220
+ self.tie_copy_heads = tie_copy_heads
221
+ self.use_cfg = use_cfg
222
+
223
+ if copy_hidden_size is None:
224
+ copy_hidden_size = self.hidden_size
225
+ self.copy_hidden_size = copy_hidden_size
226
+ self.z_loss_weight = z_loss_weight
227
+ self.z_loss_top_k = z_loss_top_k
228
+ self.use_gate = use_gate
229
+ self.label_smoothing = label_smoothing
230
+ self.separate_copy_loss = separate_copy_loss
231
+ self.do_copy = do_copy
232
+
233
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
234
+
235
+
236
+ __all__ = ["V1Config"]
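On top of the Qwen2.5-VL arguments, the constructor adds several pointer/copy-specific knobs. A minimal construction sketch (run from the repository root so the import resolves; the overrides below simply echo the defaults and are illustrative only):

```python
from configuration_v1 import V1Config

config = V1Config(
    do_copy=True,             # keep the pointer/copy head enabled
    copy_token_start=151665,  # token id of <|copy_0|>
    copy_token_num=30000,     # number of <|copy_i|> tokens added to the tokenizer
    z_loss_weight=1e-5,       # weight of the top-k logsumexp regularizer
)
print(config.model_type, config.copy_hidden_size)  # "v1", falls back to hidden_size
```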
grounding.py ADDED
@@ -0,0 +1,559 @@
1
+ from typing import Tuple, List, Optional, Union
2
+ import re
3
+ import math
4
+
5
+ from PIL import Image
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from qwen_vl_utils import process_vision_info
10
+ from transformers.feature_extraction_utils import BatchFeature
11
+ from transformers.image_utils import ImageInput, VideoInput
12
+ from transformers.processing_utils import (
13
+ Unpack,
14
+ )
15
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
16
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
17
+ smart_resize,
18
+ Qwen2VLImageProcessor,
19
+ )
20
+ from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import (
21
+ Qwen2_5_VLProcessorKwargs,
22
+ Qwen2_5_VLProcessor,
23
+ )
24
+
25
+
26
+ """
27
+ Qwen2.5-VL does not use AnyRes to my relief.
28
+ Things to take into account:
29
+ - smart_resize
30
+ - temporal dimension
31
+ - grid_t = patches.shape[0] // self.temporal_patch_size
32
+ - grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
33
+ - merge_size (2)
34
+
35
+
36
+ Usage:
37
+
38
+ model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
39
+
40
+
41
+ processor = Qwen2_5_VLPointerProcessor.from_pretrained(model_name)
42
+ processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(model_name)
43
+
44
+ messages = [
45
+ {
46
+ "role": "user",
47
+ "content": [
48
+ {
49
+ "type": "image",
50
+ "image": "https://example---/demo.jpeg",
51
+ },
52
+ {"type": "text", "text": "Describe this image."},
53
+ ],
54
+ },
55
+ {
56
+ 'role': 'assistant',
57
+ 'content': [
58
+ {
59
+ 'type': 'text', 'text': '<think>Theres a cat at <|region|>, a dog at <|region|>.</think>A calico cat hanging out with a golden retriever.'
60
+ }
61
+ ]
62
+ }
63
+ ]
64
+
65
+ # Preparation for inference
66
+ text = processor.apply_chat_template(
67
+ messages, tokenize=False, add_generation_prompt=True
68
+ )
69
+ regions = [
70
+ [0, 10, 100, 200],
71
+ [300, 0, 600, 250]
72
+ ]
73
+ image_inputs, video_inputs = process_vision_info(messages)
74
+ inputs = processor(
75
+ text=[text],
76
+ images=image_inputs,
77
+ videos=video_inputs,
78
+ regions=[regions],
79
+ padding=True,
80
+ return_tensors="pt",
81
+ )
82
+ inputs = inputs.to("cuda")
83
+
84
+
85
+ # Qwen2VLImageProcessor in a nutshell
86
+ '(tl tp) c (hlm hm hp) (wlm wm wp) -> (tl hlm wlm hm wm) (c tp hp wp)'
87
+ """
88
+
89
+
90
+ BBOX = Tuple[int, int, int, int]
91
+
92
+
93
+ class PointerProcessor:
94
+ @staticmethod
95
+ def normalize_bbox(image_size: Tuple[int, int], bbox: BBOX):
96
+ w, h = image_size
97
+ bbox = [
98
+ bbox[0] / w,
99
+ bbox[1] / h,
100
+ bbox[2] / w,
101
+ bbox[3] / h,
102
+ ]
103
+ return "[{}]".format(", ".join([f"{v:.2f}" for v in bbox]))
104
+
105
+ def get_masks(self, image_size: Tuple[int, int], indices: List[int]):
106
+ width, height = image_size
107
+ resized_height, resized_width = smart_resize(
108
+ height,
109
+ width,
110
+ factor=self.patch_size * self.merge_size,
111
+ min_pixels=self.min_pixels,
112
+ max_pixels=self.max_pixels,
113
+ )
114
+
115
+ # grid_h = resized_height // self.patch_size // self.merge_size
116
+ grid_w_m = resized_width // self.patch_size // self.merge_size
117
+
118
+ mask = torch.zeros(resized_height, resized_width)
119
+ for index in indices:
120
+ index_h = index // grid_w_m
121
+ index_w = index % grid_w_m
122
+ bbox = (
123
+ max(index_w * self.patch_size * self.merge_size, 0),
124
+ max(index_h * self.patch_size * self.merge_size, 0),
125
+ min((index_w + 1) * self.patch_size * self.merge_size, resized_width),
126
+ min((index_h + 1) * self.patch_size * self.merge_size, resized_height),
127
+ )
128
+ x1, y1, x2, y2 = bbox
129
+ mask[y1:y2, x1:x2] = 1
130
+ # mask = mask.t() # to width, height
131
+ return mask, (resized_width, resized_height)
132
+
133
+ def get_patch_pointers(
134
+ self, image_size: Tuple[int, int], region: Union[BBOX, np.ndarray]
135
+ ):
136
+ if isinstance(region, np.ndarray):
137
+ return self.get_mask_patch_pointers(image_size, region)
138
+ else:
139
+ return self.get_bbox_patch_pointers(image_size, region)
140
+
141
+ def get_bbox_patch_pointers(self, image_size: Tuple[int, int], bbox: BBOX):
142
+ factor = self.merge_size
143
+ # factor = 1
144
+ width, height = image_size
145
+ resized_height, resized_width = smart_resize(
146
+ height,
147
+ width,
148
+ factor=self.patch_size * self.merge_size,
149
+ min_pixels=self.min_pixels,
150
+ max_pixels=self.max_pixels,
151
+ )
152
+ x0, y0, x1, y1 = bbox
153
+ resized_bbox = [
154
+ max(x0 / width * resized_width, 0),
155
+ max(y0 / height * resized_height, 0),
156
+ min(x1 / width * resized_width, resized_width),
157
+ min(y1 / height * resized_height, resized_height),
158
+ ]
159
+ # patch_bbox = [v / self.patch_size / self.merge_size for v in resized_bbox]
160
+ patch_bbox = [v / self.patch_size / factor for v in resized_bbox]
161
+ x0, y0, x1, y1 = patch_bbox
162
+ boundaries = [
163
+ math.floor(x0),
164
+ math.floor(y0),
165
+ math.ceil(x1),
166
+ math.ceil(y1),
167
+ ]
168
+ x0, y0, x1, y1 = boundaries
169
+
170
+ # t, h, w
171
+ grid_w = resized_width // self.patch_size
172
+ grid_w_m = grid_w // factor
173
+ rows, cols = np.meshgrid(np.arange(y0, y1), np.arange(x0, x1), indexing="ij")
174
+ grid_indices = np.column_stack((rows.ravel(), cols.ravel()))
175
+ indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
176
+ base_ids = list(indices)
177
+ # reorder
178
+ # t, hl, wl, hm, wm
179
+ # ids_map = torch.arange(grid_h * grid_w).reshape(grid_h, grid_w)
180
+ # ids_map = rearrange(
181
+ # ids_map,
182
+ # "(hl hm) (wl wm) -> (hl wl) (hm wm)",
183
+ # hm=self.merge_size,
184
+ # wm=self.merge_size,
185
+ # ).reshape(-1)
186
+ # inv_map = ids_map.argsort()
187
+ # ids = inv_map[base_ids].numpy()
188
+ ids = np.array(base_ids)
189
+ # ids.sort()
190
+ return ids
191
+
192
+ def get_mask_patch_pointers(self, image_size: Tuple[int, int], mask: np.ndarray):
193
+ # mask size: w h
194
+ width, height = image_size
195
+ resized_height, resized_width = smart_resize(
196
+ height,
197
+ width,
198
+ factor=self.patch_size * self.merge_size,
199
+ min_pixels=self.min_pixels,
200
+ max_pixels=self.max_pixels,
201
+ )
202
+ grid_w_m = resized_width // self.patch_size // self.merge_size
203
+ grid_h_m = resized_height // self.patch_size // self.merge_size
204
+
205
+ m = torch.from_numpy(mask).float()
206
+ m = F.interpolate(
207
+ m[None, None], (grid_h_m, grid_w_m), mode="bilinear", antialias=True
208
+ )[0, 0]
209
+ # m = m > 0 # upper bound
210
+
211
+ grid_indices = m.nonzero(as_tuple=False)
212
+ indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
213
+ ids = indices.numpy()
214
+ return ids
215
+
216
+ def renormalize(self, tensor):
217
+ # crude - non-accurate implementation for the lazy
218
+ mean = np.array(self.image_mean).mean()
219
+ std = np.array(self.image_std).mean()
220
+ return tensor * std + mean
221
+
222
+
223
+ class Qwen2VLImagePointerProcessor(Qwen2VLImageProcessor, PointerProcessor):
224
+ pass
225
+
226
+
227
+ class Qwen2_5_VLPointerProcessor(Qwen2_5_VLProcessor):
228
+ image_processor_class = "Qwen2VLImagePointerProcessor"
229
+
230
+ def __init__(
231
+ self,
232
+ image_processor=None,
233
+ tokenizer=None,
234
+ chat_template=None,
235
+ prepend_raw_region_to_text: bool = True,
236
+ **kwargs,
237
+ ):
238
+ super().__init__(
239
+ image_processor=image_processor,
240
+ tokenizer=tokenizer,
241
+ chat_template=chat_template,
242
+ **kwargs,
243
+ )
244
+
245
+ self.region_token = "<|region|>"
246
+ self.copy_token_start = None
247
+ self.prepend_raw_region_to_text = prepend_raw_region_to_text
248
+
249
+ def extract_masks(self, image_size: Tuple[int, int], text: str):
250
+ # first, gather region indices from text
251
+ region_pattern = re.compile(r"<region>(.*?)</region>")
252
+ regions = region_pattern.findall(text)
253
+
254
+ indices = []
255
+ copy_pattern = re.compile(r"<\|copy_(\d+)\|>")
256
+
257
+ for region in regions:
258
+ # Extract all numbers inside <|copy_X|> tags within the region
259
+ numbers = [int(match) for match in copy_pattern.findall(region)]
260
+ indices.append(numbers)
261
+
262
+ # Then, convert region indices into masks
263
+ masks = []
264
+ resized_image_size = image_size
265
+ for region in indices:
266
+ mask, resized_image_size = self.image_processor.get_masks(
267
+ image_size, region
268
+ )
269
+ masks.append(mask)
270
+ return masks, resized_image_size
271
+
272
+ def __call__(
273
+ self,
274
+ images: ImageInput = None,
275
+ text: Union[
276
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
277
+ ] = None,
278
+ videos: VideoInput = None,
279
+ regions: Optional[List[Union[BBOX, np.ndarray]]] = None,
280
+ **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
281
+ ) -> BatchFeature:
282
+ """
283
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
284
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
285
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
286
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
287
+
288
+ Args:
289
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
290
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
291
+ tensor. Both channels-first and channels-last formats are supported.
292
+ text (`str`, `List[str]`, `List[List[str]]`):
293
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
294
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
295
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
296
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
297
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
298
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
299
+ regions:
300
+ either bboxes: List[Tuple[int, int, int, int]]
301
+ or masks: List[np.ndarray[width, height]]
302
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
303
+ If set, will return tensors of a particular framework. Acceptable values are:
304
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
305
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
306
+ - `'np'`: Return NumPy `np.ndarray` objects.
307
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
308
+
309
+ Returns:
310
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
311
+
312
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
313
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
314
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
315
+ `None`).
316
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
317
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
318
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
319
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
320
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
321
+ """
322
+
323
+ output_kwargs = self._merge_kwargs(
324
+ Qwen2_5_VLProcessorKwargs,
325
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
326
+ **kwargs,
327
+ )
328
+ obj_ptrs = None
329
+ if images is not None:
330
+ image_inputs = self.image_processor(
331
+ images=images, videos=None, **output_kwargs["images_kwargs"]
332
+ )
333
+ image_grid_thw = image_inputs["image_grid_thw"]
334
+
335
+ for image in images:
336
+ assert isinstance(
337
+ image, Image.Image
338
+ ), "only supporting a single image per row for now"
339
+
340
+ if regions is not None:
341
+ obj_ptrs = [
342
+ [
343
+ (
344
+ self.image_processor.get_patch_pointers(image.size, region)
345
+ if region is not None
346
+ else np.array([])
347
+ )
348
+ for region in image_region
349
+ ]
350
+ for image, image_region in zip(images, regions)
351
+ ]
352
+ else:
353
+ image_inputs = {}
354
+ image_grid_thw = None
355
+
356
+ assert videos is None, "video inputs are not supported yet" # TODO
357
+ if videos is not None:
358
+ videos_inputs = self.image_processor(
359
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
360
+ )
361
+ video_grid_thw = videos_inputs["video_grid_thw"]
362
+
363
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
364
+ if isinstance(fps, (int, float)):
365
+ second_per_grid_ts = [
366
+ self.image_processor.temporal_patch_size / fps
367
+ ] * len(video_grid_thw)
368
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
369
+ second_per_grid_ts = [
370
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
371
+ ]
372
+ else:
373
+ raise ValueError(
374
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
375
+ )
376
+ videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
377
+
378
+ else:
379
+ videos_inputs = {}
380
+ video_grid_thw = None
381
+
382
+ if not isinstance(text, list):
383
+ text = [text]
384
+
385
+ if image_grid_thw is not None:
386
+ merge_length = self.image_processor.merge_size**2
387
+ index = 0
388
+ for i in range(len(text)):
389
+ while self.image_token in text[i]:
390
+ text[i] = text[i].replace(
391
+ self.image_token,
392
+ "<|placeholder|>"
393
+ * (image_grid_thw[index].prod() // merge_length),
394
+ 1,
395
+ )
396
+ index += 1
397
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
398
+
399
+ if obj_ptrs is not None:
400
+ assert regions is not None
401
+ for i in range(len(text)):
402
+ ptrs = obj_ptrs[i]
403
+ region = regions[i]
404
+ assert len(ptrs) == text[i].count(self.region_token)
405
+ index = 0
406
+ while self.region_token in text[i]:
407
+ ptrs_str = "".join([f"<|copy_{j}|>" for j in ptrs[index]])
408
+ region_str = self.image_processor.normalize_bbox(
409
+ image.size, region[index]
410
+ )
411
+ out_str = "<region>" + ptrs_str + "</region>"
412
+ if self.prepend_raw_region_to_text:
413
+ out_str = "<region>" + region_str + ptrs_str + "</region>"
414
+
415
+ text[i] = text[i].replace(
416
+ self.region_token,
417
+ out_str,
418
+ 1,
419
+ )
420
+ index += 1
421
+
422
+ # text[i] = text[i].replace("<|placeholder|>", self.region_token)
423
+
424
+ if video_grid_thw is not None:
425
+ # TODO: support video inputs
426
+ merge_length = self.image_processor.merge_size**2
427
+ index = 0
428
+ for i in range(len(text)):
429
+ while self.video_token in text[i]:
430
+ text[i] = text[i].replace(
431
+ self.video_token,
432
+ "<patch>"
433
+ + "<|placeholder|>"
434
+ * (video_grid_thw[index].prod() // merge_length)
435
+ + "</patch>",
436
+ 1,
437
+ )
438
+ index += 1
439
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
440
+
441
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
442
+
443
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
444
+
445
+
446
+ def get_processor(model_name: str, **kwargs):
447
+ processor = Qwen2_5_VLPointerProcessor.from_pretrained(model_name, **kwargs)
448
+ processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(
449
+ model_name, **kwargs
450
+ )
451
+ # max_position_tokens = processor.tokenizer.model_max_length
452
+ # new_tokens = [f"<|copy_{i}|>" for i in range(max_position_tokens)] # too slow
453
+ processor.tokenizer.orig_vocab_size = len(processor.tokenizer)
454
+ new_tokens = [f"<|copy_{i}|>" for i in range(30000)]
455
+ processor.tokenizer.add_tokens(new_tokens)
456
+ processor.copy_token_start = processor.tokenizer.convert_tokens_to_ids("<|copy_0|>")
457
+ return processor
458
+
459
+
460
+ # Create a data collator to encode text and image pairs
461
+ def collate_fn(examples, processor):
462
+ # Get the texts and images, and apply the chat template
463
+ examples, masks = zip(*examples)
464
+ texts = [
465
+ processor.apply_chat_template(example, tokenize=False) for example in examples
466
+ ] # Prepare texts for processing
467
+ image_inputs = [
468
+ process_vision_info(example)[0][0] for example in examples
469
+ ] # Process the images to extract inputs
470
+
471
+ # Tokenize the texts and process the images
472
+ batch = processor(
473
+ text=texts,
474
+ images=image_inputs,
475
+ videos=None,
476
+ regions=masks,
477
+ padding=True,
478
+ return_tensors="pt",
479
+ ) # Encode texts and images into tensors
480
+
481
+ # The labels are the input_ids, and we mask the padding tokens in the loss computation
482
+ labels = batch["input_ids"].clone() # Clone input IDs for labels
483
+ labels[labels == processor.tokenizer.pad_token_id] = (
484
+ -100
485
+ ) # Mask padding tokens in labels
486
+
487
+ # Ignore the image token index in the loss computation (model specific)
488
+ if isinstance(
489
+ processor, Qwen2VLImagePointerProcessor
490
+ ): # Check if the processor is Qwen2VLProcessor
491
+ image_tokens = [
492
+ 151652,
493
+ 151653,
494
+ 151655,
495
+ ] # Specific image token IDs for Qwen2VLProcessor
496
+ else:
497
+ image_tokens = [
498
+ processor.tokenizer.convert_tokens_to_ids(processor.image_token)
499
+ ] # Convert image token to ID
500
+
501
+ # Mask image token IDs in the labels
502
+ for image_token_id in image_tokens:
503
+ labels[labels == image_token_id] = -100 # Mask image token IDs in labels
504
+
505
+ batch["labels"] = labels # Add labels to the batch
506
+
507
+ return batch # Return the prepared batch
508
+
509
+
510
+ if __name__ == "__main__":
511
+ # processor = Qwen2VLImagePointerProcessor.from_pretrained(
512
+ # "Qwen/Qwen2.5-VL-7B-Instruct"
513
+ # )
514
+
515
+ # image_size = [1036, 756]
516
+ # regions = [[0, 20, 25, 120], [512, 600, 800, 800], [0, 0, 1023, 740]]
517
+ # processor.test(image_size, regions)
518
+
519
+ model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
520
+ processor = get_processor(model_name)
521
+
522
+ messages = [
523
+ {
524
+ "role": "user",
525
+ "content": [
526
+ {
527
+ "type": "image",
528
+ "image": "https://example---/demo.jpeg",
529
+ },
530
+ {"type": "text", "text": "Describe this image."},
531
+ ],
532
+ },
533
+ {
534
+ "role": "assistant",
535
+ "content": [
536
+ {
537
+ "type": "text",
538
+ "text": "<think>Theres a cat at <|region|>, a dog at <|region|>.</think>A calico cat hanging out with a golden retriever.",
539
+ }
540
+ ],
541
+ },
542
+ ]
543
+ image = Image.new("RGB", (800, 500), "black")
544
+ text = processor.apply_chat_template(
545
+ messages, tokenize=False, add_generation_prompt=True
546
+ )
547
+ bboxes = [[0, 10, 100, 200], [300, 0, 600, 250]]
548
+ inputs = processor(
549
+ text=[text],
550
+ images=[image],
551
+ videos=None,
552
+ regions=[bboxes],
553
+ padding=True,
554
+ return_tensors="pt",
555
+ )
556
+ text = processor.tokenizer.decode(inputs.input_ids[0])
557
+ print(text)
558
+ masks, image_size = processor.extract_masks(image.size, text)
559
+ import ipdb; ipdb.set_trace() # noqa # fmt: skip
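The pointer construction above boils down to mapping a box in original-image pixels onto the merged-patch grid produced by `smart_resize`. A back-of-the-envelope sketch of that index math (exact grid sizes depend on the processor's `min_pixels`/`max_pixels` budget, so the numbers are illustrative):

```python
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

width, height = 800, 500
patch_size, merge_size = 14, 2

resized_h, resized_w = smart_resize(height, width, factor=patch_size * merge_size)
grid_w_m = resized_w // patch_size // merge_size  # merged patches per row
grid_h_m = resized_h // patch_size // merge_size  # merged patches per column

# A merged patch at (row, col) becomes the flat index encoded as the <|copy_{index}|> token.
row, col = 3, 5
index = row * grid_w_m + col
print(grid_h_m, grid_w_m, index)
```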
modeling_v1.py ADDED
@@ -0,0 +1,577 @@
1
+ import math
2
+ from typing import Optional, Union, Tuple, List
3
+ from dataclasses import dataclass
4
+
5
+ import torch
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+ from torch.nn import CrossEntropyLoss
9
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
10
+ Qwen2_5_VisionTransformerPretrainedModel,
11
+ Qwen2_5_VLModel,
12
+ Qwen2_5_VLForConditionalGeneration,
13
+ Qwen2_5_VLCausalLMOutputWithPast,
14
+ )
15
+
16
+ from .configuration_v1 import V1Config
17
+
18
+
19
+ def init_identity(layer, scale: float = 1):
20
+ if isinstance(layer, nn.Linear):
21
+ with torch.no_grad():
22
+ # Ensure weight matrix is square
23
+ rows, cols = layer.weight.shape
24
+ identity_matrix = (
25
+ torch.eye(rows, cols) * scale
26
+ ) # Creates an identity matrix
27
+ layer.weight.copy_(
28
+ identity_matrix
29
+ ) # Copy identity matrix into layer weights
30
+ if layer.bias is not None:
31
+ layer.bias.fill_(0) # Set bias to zero (or another value if needed)
32
+
33
+
34
+ @dataclass
35
+ class V1CausalLMOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast):
36
+ z_loss: torch.Tensor = None
37
+ gen_loss: torch.Tensor = None
38
+ copy_loss: torch.Tensor = None
39
+
40
+
41
+ class V1ForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
42
+ config_class = V1Config
43
+
44
+ def __init__(self, config):
45
+ super().__init__(config)
46
+ self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(
47
+ config.vision_config
48
+ )
49
+ self.model = Qwen2_5_VLModel(config)
50
+ self.copy_init_scale = 1 / math.sqrt(self.config.hidden_size)
51
+
52
+ # self.tokenizer_vocab_size = (
53
+ # config.tokenizer_vocab_size
54
+ # ) # Qwen2.5-VL: different from embedding_size==vocab_size. 151665 vs. 152064
55
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
56
+ self.rope_deltas = None # cache rope_deltas here
57
+
58
+ if self.config.do_copy:
59
+ if self.config.tie_copy_heads:
60
+ self._copy_head = nn.Linear(config.hidden_size, config.copy_hidden_size)
61
+ else:
62
+ self._copy_q_head = nn.Linear(
63
+ config.hidden_size, config.copy_hidden_size
64
+ )
65
+ self._copy_k_head = nn.Linear(
66
+ config.hidden_size, config.copy_hidden_size
67
+ )
68
+ if self.config.use_gate:
69
+ self.gate = nn.Linear(config.hidden_size, 1, bias=False)
70
+
71
+ # Initialize weights and apply final processing
72
+ self.post_init()
73
+
74
+ @torch.no_grad()
75
+ def after_loading(self):
76
+ if self.config.do_copy:
77
+ self.init_heads()
78
+ if self.config.use_gate:
79
+ self.lm_head.weight.data = self.lm_head.weight.data * 2
80
+ self.gate.weight.data.fill_(0)
81
+
82
+ @property
83
+ def copy_q_head(self):
84
+ return self._copy_head if self.config.tie_copy_heads else self._copy_q_head
85
+
86
+ @property
87
+ def copy_k_head(self):
88
+ return self._copy_head if self.config.tie_copy_heads else self._copy_k_head
89
+
90
+ def init_heads(self):
91
+ if hasattr(self, "_copy_head"):
92
+ init_identity(self._copy_head, self.copy_init_scale)
93
+ if hasattr(self, "_copy_k_head"):
94
+ init_identity(self._copy_k_head, self.copy_init_scale)
95
+ if hasattr(self, "_copy_q_head"):
96
+ init_identity(self._copy_q_head, self.copy_init_scale)
97
+
98
+ def copy_representations(
99
+ self,
100
+ inputs_embeds: torch.FloatTensor,
101
+ input_ids: torch.LongTensor,
102
+ copy_values: Optional[torch.FloatTensor] = None,
103
+ ):
104
+ if copy_values is None:
105
+ mask = input_ids == self.config.image_token_id
106
+ copy_values, _ = self.extract_image_tokens(inputs_embeds, mask) # initial
107
+ assert copy_values is not None
108
+ copy_values = copy_values.to(inputs_embeds.device)
109
+ input_ids = input_ids.to(inputs_embeds.device)
110
+
111
+ input_ids = input_ids.clone()
112
+ input_ids = input_ids - self.config.copy_token_start
113
+ copy_mask = input_ids >= 0
114
+ input_ids[~copy_mask] = 0
115
+
116
+ assert copy_values is not None
117
+ extracted = copy_values.gather(
118
+ 1, input_ids[..., None].repeat(1, 1, copy_values.shape[-1])
119
+ )
120
+ copy_mask = copy_mask.to(extracted.dtype)[..., None]
121
+ return copy_mask * extracted + (1 - copy_mask) * inputs_embeds
122
+
123
+ def extract_image_tokens(self, features: torch.FloatTensor, mask: torch.Tensor):
124
+ out_feat, out_mask = extract_image_tokens_right_pad(features, mask)
125
+ return out_feat, out_mask
126
+
127
+ def forward(
128
+ self,
129
+ input_ids: torch.LongTensor = None,
130
+ attention_mask: Optional[torch.Tensor] = None,
131
+ position_ids: Optional[torch.LongTensor] = None,
132
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
133
+ inputs_embeds: Optional[torch.FloatTensor] = None,
134
+ labels: Optional[torch.LongTensor] = None,
135
+ use_cache: Optional[bool] = None,
136
+ output_attentions: Optional[bool] = None,
137
+ output_hidden_states: Optional[bool] = None,
138
+ return_dict: Optional[bool] = None,
139
+ pixel_values: Optional[torch.Tensor] = None,
140
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
141
+ image_grid_thw: Optional[torch.LongTensor] = None,
142
+ video_grid_thw: Optional[torch.LongTensor] = None,
143
+ rope_deltas: Optional[torch.LongTensor] = None,
144
+ cache_position: Optional[torch.LongTensor] = None,
145
+ second_per_grid_ts: Optional[torch.Tensor] = None,
146
+ ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
147
+ r"""
148
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
149
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
150
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
151
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
152
+
153
+ Returns:
154
+
155
+ Example:
156
+
157
+ ```python
158
+ >>> from PIL import Image
159
+ >>> import requests
160
+ >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
161
+
162
+ >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
163
+ >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
164
+
165
+ >>> messages = [
166
+ {
167
+ "role": "user",
168
+ "content": [
169
+ {"type": "image"},
170
+ {"type": "text", "text": "What is shown in this image?"},
171
+ ],
172
+ },
173
+ ]
174
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
175
+ >>> image = Image.open(requests.get(url, stream=True).raw)
176
+
177
+ >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
178
+ >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
179
+
180
+ >>> # Generate
181
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
182
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
183
+ "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
184
+ ```"""
185
+
186
+ output_attentions = (
187
+ output_attentions
188
+ if output_attentions is not None
189
+ else self.config.output_attentions
190
+ )
191
+ output_hidden_states = (
192
+ output_hidden_states
193
+ if output_hidden_states is not None
194
+ else self.config.output_hidden_states
195
+ )
196
+ return_dict = (
197
+ return_dict if return_dict is not None else self.config.use_return_dict
198
+ )
199
+
200
+ input_ids = input_ids.clone()
201
+ input_ids_with_ptrs = input_ids.clone()
202
+ input_ids[input_ids >= self.config.copy_token_start] = (
203
+ self.config.region_token_id
204
+ )
205
+
206
+ if inputs_embeds is None:
207
+ inputs_embeds = self.model.embed_tokens(input_ids)
208
+ if pixel_values is not None:
209
+ pixel_values = pixel_values.type(self.visual.dtype)
210
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
211
+
212
+ mask = input_ids == self.config.image_token_id
213
+ mask_unsqueezed = mask.unsqueeze(-1)
214
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
215
+ image_mask = mask_expanded.to(inputs_embeds.device)
216
+
217
+ image_embeds = image_embeds.to(
218
+ inputs_embeds.device, inputs_embeds.dtype
219
+ )
220
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
221
+
222
+ if pixel_values_videos is not None:
223
+ raise NotImplementedError("video inputs are not supported yet.")
224
+ pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
225
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
226
+ n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
227
+ n_video_features = video_embeds.shape[0]
228
+ if n_video_tokens != n_video_features:
229
+ raise ValueError(
230
+ f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
231
+ )
232
+
233
+ mask = input_ids == self.config.video_token_id
234
+ mask_unsqueezed = mask.unsqueeze(-1)
235
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
236
+ video_mask = mask_expanded.to(inputs_embeds.device)
237
+
238
+ video_embeds = video_embeds.to(
239
+ inputs_embeds.device, inputs_embeds.dtype
240
+ )
241
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
242
+
243
+ if attention_mask is not None:
244
+ attention_mask = attention_mask.to(inputs_embeds.device)
245
+
246
+ if self.config.do_copy:
247
+ copy_keys, copy_keys_mask = None, None
248
+ copy_values, copy_values_mask = None, None
249
+
250
+ has_cache = bool(past_key_values)
251
+ if has_cache:
252
+ copy_keys, copy_values = past_key_values[len(past_key_values) - 2]
253
+ copy_keys_mask, copy_values_mask = past_key_values[
254
+ len(past_key_values) - 1
255
+ ]
256
+ # we add channel dim to the mask for consistency in tensor shape in cache
257
+ copy_keys_mask = copy_keys_mask[..., 0]
258
+ copy_values_mask = copy_values_mask[..., 0]
259
+
260
+ inputs_embeds = self.copy_representations(
261
+ inputs_embeds, input_ids_with_ptrs, copy_values
262
+ )
263
+
264
+ # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
265
+ if position_ids is None and (
266
+ attention_mask is None or attention_mask.ndim == 2
267
+ ):
268
+ # calculate RoPE index once per generation in the pre-fill stage only
269
+ if (
270
+ (cache_position is not None and cache_position[0] == 0)
271
+ or self.rope_deltas is None
272
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
273
+ ):
274
+ position_ids, rope_deltas = self.get_rope_index(
275
+ input_ids,
276
+ image_grid_thw,
277
+ video_grid_thw,
278
+ second_per_grid_ts,
279
+ attention_mask,
280
+ )
281
+ self.rope_deltas = rope_deltas
282
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
283
+ else:
284
+ batch_size, seq_length, _ = inputs_embeds.shape
285
+ delta = (
286
+ (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
287
+ if cache_position is not None
288
+ else 0
289
+ )
290
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
291
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
292
+ if cache_position is not None: # otherwise `deltas` is an int `0`
293
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
294
+ position_ids = position_ids.add(delta)
295
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
296
+
297
+ outputs = self.model(
298
+ input_ids=None,
299
+ position_ids=position_ids,
300
+ attention_mask=attention_mask,
301
+ past_key_values=past_key_values,
302
+ inputs_embeds=inputs_embeds,
303
+ use_cache=use_cache,
304
+ output_attentions=output_attentions,
305
+ output_hidden_states=output_hidden_states,
306
+ return_dict=return_dict,
307
+ cache_position=cache_position,
308
+ )
309
+
310
+ hidden_states = outputs[0]
311
+
312
+ gen_logits = self.lm_head(hidden_states)
313
+
314
+ if self.config.do_copy:
315
+ assert (
316
+ self.config.copy_extraction_layer == -1
317
+ ), f"copy_extraction_layer should be -1: {self.config.copy_extraction_layer}"
318
+ copy_hidden_states = hidden_states
319
+ copy_q_states = copy_hidden_states
320
+ if self.config.normalize_copy_states:
321
+ copy_q_states = F.normalize(copy_q_states, 2, -1)
322
+ copy_q_states = self.copy_q_head(copy_q_states)
323
+
324
+ present_key_values = outputs.past_key_values
325
+
326
+ if not has_cache:
327
+ mask = input_ids == self.config.image_token_id
328
+ copy_k_states = (
329
+ inputs_embeds
330
+ if self.config.use_embeddings_as_keys
331
+ else copy_hidden_states
332
+ )
333
+ if self.config.normalize_copy_states:
334
+ copy_k_states = F.normalize(copy_k_states, 2, -1)
335
+ copy_k_states, copy_k_mask = self.extract_image_tokens(
336
+ self.copy_k_head(copy_k_states), mask
337
+ )
338
+ copy_v_states, copy_v_mask = self.extract_image_tokens(
339
+ inputs_embeds.detach(), mask
340
+ )
341
+
342
+ # we add channel dim to the mask for consistency in tensor shape in cache
343
+ copy_memories = [
344
+ (copy_k_states.detach(), copy_v_states.detach()),
345
+ (copy_k_mask[..., None], copy_v_mask[..., None]),
346
+ ]
347
+
348
+ if use_cache:
349
+ # only update at the first iteration
350
+ start = len(present_key_values)
351
+ for i, mem in enumerate(copy_memories):
352
+ present_key_values.update(*mem, start + i)
353
+ else:
354
+ copy_k_states = copy_keys
355
+ copy_k_mask = copy_keys_mask
356
+
357
+ assert copy_k_states is not None
358
+ assert copy_k_mask is not None
359
+ assert (
360
+ copy_k_states.shape[1] > 0
361
+ ), f"zero image tokens on batch elements: {copy_k_mask.sum(dim=1)}"
362
+
363
+ copy_logits = (copy_q_states @ copy_k_states.transpose(-1, -2)).to(
364
+ gen_logits.device
365
+ ) * self.copy_init_scale
366
+
367
+ if hasattr(self, "gate"):
368
+ gate = torch.sigmoid(self.gate(hidden_states))
369
+ gen_logits = gen_logits * (1 - gate)
370
+ copy_logits = copy_logits * gate
371
+
372
+ copy_logits = copy_logits.masked_fill(
373
+ ~copy_k_mask[:, None, :].to(copy_logits.device),
374
+ torch.finfo(copy_logits.dtype).min,
375
+ )
376
+ logits = torch.cat(
377
+ [gen_logits[..., : self.config.copy_token_start], copy_logits], dim=-1
378
+ )
379
+ else:
380
+ logits = gen_logits
381
+ loss = None
382
+ z_loss = None
383
+ gen_loss = None
384
+ if labels is not None:
385
+ gen_logits = gen_logits.float()
386
+ shift_gen_logits = gen_logits[:, :-1, :].contiguous().float()
387
+ shift_labels = labels[:, 1:].contiguous()
388
+ gen_loss_fct = CrossEntropyLoss(reduction="none")
389
+ gen_logits_flat = shift_gen_logits.view(-1, shift_gen_logits.shape[-1])
390
+ gen_labels_flat = shift_labels.view(-1)
391
+
392
+ gen_loss_all = gen_loss_fct(gen_logits_flat, gen_labels_flat)
393
+ gen_loss = gen_loss_all.mean()
394
+
395
+ loss = gen_loss
396
+
397
+ if self.config.z_loss_weight > 0:
398
+ valid_mask = shift_labels >= 0
399
+ # top-k approx z_loss for better memory usage
400
+ top_logits, _ = torch.topk(
401
+ shift_gen_logits, k=self.config.z_loss_top_k, dim=-1
402
+ )
403
+ lse = torch.logsumexp(top_logits, dim=-1)
404
+ z_loss = lse[valid_mask].pow(2).mean() * self.config.z_loss_weight
405
+
406
+ # z_loss = (
407
+ # torch.logsumexp(shift_logits, dim=-1).pow(2)[valid_mask].mean()
408
+ # * self.config.z_loss_weight
409
+ # )
410
+ loss = loss + z_loss
411
+ z_loss = z_loss.detach()
412
+
413
+ return V1CausalLMOutputWithPast(
414
+ loss=loss,
415
+ z_loss=z_loss,
416
+ gen_loss=gen_loss,
417
+ copy_loss=None,
418
+ logits=logits,
419
+ # copy_logits=copy_logits,
420
+ # gen_logits=gen_logits,
421
+ past_key_values=outputs.past_key_values,
422
+ hidden_states=outputs.hidden_states,
423
+ attentions=outputs.attentions,
424
+ rope_deltas=self.rope_deltas,
425
+ )
426
+
427
+ loss = None
428
+ z_loss = None
429
+ gen_loss = None
430
+ copy_loss = None
431
+ if labels is not None:
432
+ if self.config.separate_copy_loss:
433
+ # Shift labels and logits for next-token prediction
434
+ shift_gen_logits = gen_logits[:, :-1, :].contiguous().float()
435
+ shift_copy_logits = copy_logits[:, :-1, :].contiguous().float()
436
+ shift_labels = labels[:, 1:].contiguous()
437
+ shift_logits = shift_copy_logits
438
+
439
+ # Build masks
440
+ gen_mask = shift_labels < self.config.copy_token_start
441
+ copy_mask = shift_labels >= self.config.copy_token_start
442
+
443
+ # Generation loss
444
+ if gen_mask.any():
445
+ gen_loss_fct = CrossEntropyLoss(reduction="none")
446
+
447
+ G = shift_gen_logits.shape[-1]
448
+ gen_logits_flat = shift_gen_logits.view(-1, G)
449
+ gen_labels_flat = shift_labels.view(-1)
450
+ gen_mask_flat = gen_mask.view(-1)
451
+ # mask logits
452
+ gen_logits_flat_masked = gen_logits_flat[gen_mask_flat]
453
+ gen_labels_flat_masked = gen_labels_flat[gen_mask_flat]
454
+
455
+ gen_loss_all = gen_loss_fct(
456
+ gen_logits_flat_masked, gen_labels_flat_masked
457
+ )
458
+ gen_loss = gen_loss_all.mean()
459
+
460
+ # Copy loss (adjust label indices to match copy_logits range)
461
+ if copy_mask.any():
462
+ copy_loss_fct = CrossEntropyLoss(reduction="none")
463
+ C = shift_copy_logits.shape[-1]
464
+ copy_logits_flat = shift_copy_logits.view(-1, C)
465
+ copy_labels_flat = (
466
+ shift_labels.view(-1) - self.config.copy_token_start
467
+ )
468
+ copy_mask_flat = copy_mask.view(-1)
469
+ copy_logits_flat_masked = copy_logits_flat[copy_mask_flat]
470
+ copy_labels_flat_masked = copy_labels_flat[copy_mask_flat]
471
+ copy_loss_all = copy_loss_fct(
472
+ copy_logits_flat_masked, copy_labels_flat_masked
473
+ )
474
+ copy_loss = copy_loss_all.mean()
475
+ else:
476
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
477
+ logits = logits.float()
478
+ # Shift so that tokens < n predict n
479
+ shift_logits = logits[..., :-1, :].contiguous()
480
+ shift_labels = labels[..., 1:].contiguous()
481
+ # Flatten the tokens
482
+ loss_fct = CrossEntropyLoss(label_smoothing=self.config.label_smoothing)
483
+ total_vocab_size = logits.shape[-1] # gen + copy
484
+ shift_logits = shift_logits.view(-1, total_vocab_size)
485
+ shift_labels = shift_labels.view(-1)
486
+ # Enable model parallelism
487
+ shift_labels = shift_labels.to(shift_logits.device)
488
+ gen_loss = loss_fct(shift_logits, shift_labels)
489
+
490
+ loss = 0.0
491
+ if gen_loss is not None:
492
+ loss += gen_loss
493
+ if copy_loss is not None:
494
+ loss += copy_loss
495
+
496
+ if self.config.z_loss_weight > 0:
497
+ valid_mask = shift_labels >= 0
498
+ # top-k approx z_loss for better memory usage
499
+ top_logits, _ = torch.topk(
500
+ shift_logits, k=self.config.z_loss_top_k, dim=-1
501
+ )
502
+ lse = torch.logsumexp(top_logits, dim=-1)
503
+ z_loss = lse[valid_mask].pow(2).mean() * self.config.z_loss_weight
504
+
505
+ # z_loss = (
506
+ # torch.logsumexp(shift_logits, dim=-1).pow(2)[valid_mask].mean()
507
+ # * self.config.z_loss_weight
508
+ # )
509
+ loss = loss + z_loss
510
+ z_loss = z_loss.detach()
511
+
512
+ if gen_loss is not None:
513
+ gen_loss = gen_loss.detach()
514
+ if copy_loss is not None:
515
+ copy_loss = copy_loss.detach()
516
+
517
+ if self.config.use_cfg:
518
+ # expand as max_size for logit processors
519
+ extended_vocab_size = self.config.vocab_size + self.config.copy_token_num
520
+ B, L, V = logits.shape
521
+ pads = torch.full(
522
+ (B, L, extended_vocab_size - V),
523
+ torch.finfo(gen_logits.dtype).min,
524
+ device=logits.device,
525
+ ).to(logits.dtype)
526
+ logits = torch.cat([logits, pads], dim=-1)
527
+ # logits = logits.clamp_min(-1e4)
528
+
529
+ if not return_dict:
530
+ output = (logits,) + outputs[1:]
531
+ return (loss,) + output if loss is not None else output
532
+
533
+ logits = logits.float()
534
+ return V1CausalLMOutputWithPast(
535
+ loss=loss,
536
+ z_loss=z_loss,
537
+ gen_loss=gen_loss,
538
+ copy_loss=copy_loss,
539
+ logits=logits,
540
+ # copy_logits=copy_logits,
541
+ # gen_logits=gen_logits,
542
+ past_key_values=present_key_values,
543
+ hidden_states=outputs.hidden_states,
544
+ attentions=outputs.attentions,
545
+ rope_deltas=self.rope_deltas,
546
+ )
547
+
548
+
549
+ def extract_image_tokens_right_pad(features: torch.FloatTensor, mask: torch.Tensor):
550
+ X, M = features, mask.long() # bool is not supported for sort in CUDA
551
+ B, L, _ = X.shape
552
+ device = X.device
553
+ M = M.to(device)
554
+
555
+ # Compute number of valid elements per batch
556
+ valid_counts = M.sum(dim=1) # Shape: [B]
557
+ # Replace `.item()` with `max()` and `clamp_min()` for Torch Dynamo compatibility
558
+ R = valid_counts.max().clamp_min(1) # Ensures at least 1 for tensor compatibility
559
+ # Create index tensors for selection
560
+ sorted_indices = M.argsort(dim=1, descending=True) # Move True values to front
561
+ batch_indices = torch.arange(B, device=device).unsqueeze(1).expand(B, L)
562
+
563
+ # Gather sorted X based on mask sorting
564
+ X_sorted = X[batch_indices, sorted_indices] # Shape: [B, L, C]
565
+ X_selected = X_sorted[:, :R, :] # Select the top valid elements per batch
566
+
567
+ # Create new mask M2 using `torch.arange`
568
+ M2 = torch.arange(L, device=device).expand(B, L) < valid_counts.unsqueeze(1)
569
+ M2 = M2[:, :R] # Trim to selected size
570
+
571
+ # Set out-of-bound values to zero
572
+ X_selected = torch.where(M2.unsqueeze(-1), X_selected, torch.zeros_like(X_selected))
573
+
574
+ return X_selected, M2
575
+
576
+
577
+ __all__ = ["V1ForConditionalGeneration"]
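
Aside (illustration only, not part of this commit): a minimal sketch of how `extract_image_tokens_right_pad` above behaves on a toy batch — valid image-token features are packed to the front and right-padded to the longest valid count in the batch. The plain `modeling_v1` import assumes the snippet runs from the repository root; the returned shapes and mask are deterministic, while the relative order of equally-masked entries after `argsort` is not guaranteed.

```python
import torch

from modeling_v1 import extract_image_tokens_right_pad  # assumes the repo root is on sys.path

B, L, C = 2, 5, 3
features = torch.arange(B * L * C, dtype=torch.float32).reshape(B, L, C)
mask = torch.tensor(
    [
        [True, False, True, False, False],  # 2 valid image tokens
        [False, True, True, True, False],   # 3 valid image tokens
    ]
)

packed, packed_mask = extract_image_tokens_right_pad(features, mask)
print(packed.shape)   # torch.Size([2, 3, 3]) -- padded to the max valid count (3)
print(packed_mask)    # tensor([[ True,  True, False],
                      #         [ True,  True,  True]])
```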
preprocessor_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "merge_size": 2,
19
  "min_pixels": 3136,
20
  "patch_size": 14,
21
- "processor_class": "Qwen2_5_VLPointerProcessor",
22
  "resample": 3,
23
  "rescale_factor": 0.00392156862745098,
24
  "size": {
 
18
  "merge_size": 2,
19
  "min_pixels": 3136,
20
  "patch_size": 14,
21
+ "processor_class": "V1Processor",
22
  "resample": 3,
23
  "rescale_factor": 0.00392156862745098,
24
  "size": {
processor.py ADDED
@@ -0,0 +1,536 @@
1
+ from typing import Tuple, List, Optional, Union
2
+ import re
3
+ import math
4
+ from collections import defaultdict
5
+
6
+ from PIL import Image
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from qwen_vl_utils import process_vision_info
11
+ from transformers.feature_extraction_utils import BatchFeature
12
+ from transformers.image_utils import ImageInput, VideoInput
13
+ from transformers.processing_utils import (
14
+ Unpack,
15
+ )
16
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
17
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
18
+ smart_resize,
19
+ Qwen2VLImageProcessor,
20
+ )
21
+ from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import (
22
+ Qwen2_5_VLProcessorKwargs,
23
+ Qwen2_5_VLProcessor,
24
+ )
25
+
26
+ """
27
+ Qwen2.5-VL does not use AnyRes to my relief.
28
+ Things to take into account:
29
+ - smart_resize
30
+ - temporal dimension
31
+ - grid_t = patches.shape[0] // self.temporal_patch_size
32
+ - grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
33
+ - merge_size (2)
34
+
35
+
36
+ Usage:
37
+
38
+ model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
39
+
40
+
41
+ processor = V1Processor.from_pretrained(model_name)
42
+ processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(model_name)
43
+
44
+ messages = [
45
+ {
46
+ "role": "user",
47
+ "content": [
48
+ {
49
+ "type": "image",
50
+ "image": "https://example---/demo.jpeg",
51
+ },
52
+ {"type": "text", "text": "Describe this image."},
53
+ ],
54
+ },
55
+ {
56
+ 'role': 'assistant',
57
+ 'content': [
58
+ {
59
+ 'type': 'text', 'text': "<think>There's a cat at <|region|>, a dog at <|region|>.</think>A calico cat hanging out with a golden retriever."
60
+ }
61
+ ]
62
+ }
63
+ ]
64
+
65
+ # Preparation for inference
66
+ text = processor.apply_chat_template(
67
+ messages, tokenize=False, add_generation_prompt=True
68
+ )
69
+ regions = [
70
+ [0, 10, 100, 200],
71
+ [300, 0, 600, 250]
72
+ ]
73
+ image_inputs, video_inputs = process_vision_info(messages)
74
+ inputs = processor(
75
+ text=[text],
76
+ images=image_inputs,
77
+ videos=video_inputs,
78
+ regions=[regions],
79
+ padding=True,
80
+ return_tensors="pt",
81
+ )
82
+ inputs = inputs.to("cuda")
83
+
84
+
85
+ # Qwen2VLImageProcessor in a nutshell
86
+ '(tl tp) c (hlm hm hp) (wlm wm wp) -> (tl hlm wlm hm wm) (c tp hp wp)'
87
+ """
88
+
89
+
90
+ BBOX = Tuple[int, int, int, int]
91
+
92
+
93
+ class PointerProcessor:
94
+ @staticmethod
95
+ def normalize_bbox(image_size: Tuple[int, int], bbox: BBOX):
96
+ w, h = image_size
97
+ bbox = [
98
+ bbox[0] / w,
99
+ bbox[1] / h,
100
+ bbox[2] / w,
101
+ bbox[3] / h,
102
+ ]
103
+ return "[{}]".format(", ".join([f"{v:.2f}" for v in bbox]))
104
+
105
+ def get_mask(self, image_size: Tuple[int, int], indices: List[int]):
106
+ width, height = image_size
107
+ resized_height, resized_width = smart_resize(
108
+ height,
109
+ width,
110
+ factor=self.patch_size * self.merge_size,
111
+ min_pixels=self.min_pixels,
112
+ max_pixels=self.max_pixels,
113
+ )
114
+
115
+ # grid_h = resized_height // self.patch_size // self.merge_size
116
+ grid_w_m = resized_width // self.patch_size // self.merge_size
117
+
118
+ mask = torch.zeros(resized_height, resized_width)
119
+ for index in indices:
120
+ index_h = index // grid_w_m
121
+ index_w = index % grid_w_m
122
+ bbox = (
123
+ max(index_w * self.patch_size * self.merge_size, 0),
124
+ max(index_h * self.patch_size * self.merge_size, 0),
125
+ min((index_w + 1) * self.patch_size * self.merge_size, resized_width),
126
+ min((index_h + 1) * self.patch_size * self.merge_size, resized_height),
127
+ )
128
+ x1, y1, x2, y2 = bbox
129
+ mask[y1:y2, x1:x2] = 1
130
+ # mask = mask.t() # to width, height
131
+ return mask, (resized_width, resized_height)
132
+
133
+ def get_patch_pointers(
134
+ self, image_size: Tuple[int, int], region: Union[BBOX, np.ndarray]
135
+ ):
136
+ if isinstance(region, np.ndarray):
137
+ return self.get_mask_patch_pointers(image_size, region)
138
+ else:
139
+ return self.get_bbox_patch_pointers(image_size, region)
140
+
141
+ def get_bbox_patch_pointers(self, image_size: Tuple[int, int], bbox: BBOX):
142
+ factor = self.merge_size
143
+ # factor = 1
144
+ width, height = image_size
145
+ resized_height, resized_width = smart_resize(
146
+ height,
147
+ width,
148
+ factor=self.patch_size * self.merge_size,
149
+ min_pixels=self.min_pixels,
150
+ max_pixels=self.max_pixels,
151
+ )
152
+ x0, y0, x1, y1 = bbox
153
+ resized_bbox = [
154
+ max(x0 / width * resized_width, 0),
155
+ max(y0 / height * resized_height, 0),
156
+ min(x1 / width * resized_width, resized_width),
157
+ min(y1 / height * resized_height, resized_height),
158
+ ]
159
+ # patch_bbox = [v / self.patch_size / self.merge_size for v in resized_bbox]
160
+ patch_bbox = [v / self.patch_size / factor for v in resized_bbox]
161
+ x0, y0, x1, y1 = patch_bbox
162
+ boundaries = [
163
+ math.floor(x0),
164
+ math.floor(y0),
165
+ math.ceil(x1),
166
+ math.ceil(y1),
167
+ ]
168
+ x0, y0, x1, y1 = boundaries
169
+
170
+ # t, h, w
171
+ grid_w = resized_width // self.patch_size
172
+ grid_w_m = grid_w // factor
173
+ rows, cols = np.meshgrid(np.arange(y0, y1), np.arange(x0, x1), indexing="ij")
174
+ grid_indices = np.column_stack((rows.ravel(), cols.ravel()))
175
+ indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
176
+ base_ids = list(indices)
177
+ ids = np.array(base_ids)
178
+ return ids
179
+
180
+ def get_mask_patch_pointers(self, image_size: Tuple[int, int], mask: np.ndarray):
181
+ # mask size: w h
182
+ width, height = image_size
183
+ resized_height, resized_width = smart_resize(
184
+ height,
185
+ width,
186
+ factor=self.patch_size * self.merge_size,
187
+ min_pixels=self.min_pixels,
188
+ max_pixels=self.max_pixels,
189
+ )
190
+ grid_w_m = resized_width // self.patch_size // self.merge_size
191
+ grid_h_m = resized_height // self.patch_size // self.merge_size
192
+
193
+ m = torch.from_numpy(mask).float()
194
+ m = F.interpolate(
195
+ m[None, None], (grid_h_m, grid_w_m), mode="bilinear", antialias=True
196
+ )[0, 0]
197
+
198
+ grid_indices = m.nonzero(as_tuple=False)
199
+ indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
200
+ ids = indices.numpy()
201
+ return ids
202
+
203
+ def renormalize(self, tensor):
204
+ # crude approximation: averages the per-channel mean/std into scalars
205
+ mean = np.array(self.image_mean).mean()
206
+ std = np.array(self.image_std).mean()
207
+ return tensor * std + mean
208
+
209
+ class Qwen2VLImagePointerProcessor(Qwen2VLImageProcessor, PointerProcessor):
210
+ pass
211
+
212
+
213
+ class V1Processor(Qwen2_5_VLProcessor):
214
+ image_processor_class = "Qwen2VLImagePointerProcessor"
215
+
216
+ def __init__(
217
+ self,
218
+ image_processor=None,
219
+ tokenizer=None,
220
+ chat_template=None,
221
+ prepend_raw_region_to_text: bool = True,
222
+ separate_copy_loss: bool = False,
223
+ **kwargs,
224
+ ):
225
+ super().__init__(
226
+ image_processor=image_processor,
227
+ tokenizer=tokenizer,
228
+ chat_template=chat_template,
229
+ **kwargs,
230
+ )
231
+
232
+ self.region_token = "<|region|>"
233
+ self.copy_token_start = None
234
+ self.prepend_raw_region_to_text = prepend_raw_region_to_text
235
+ self.separate_copy_loss = separate_copy_loss
236
+ self.copy_start_token = "<|box_start|>"
237
+ self.copy_end_token = "<|box_end|>"
238
+
239
+ # def extract_masks(self, image_size: Tuple[int, int], text: str):
240
+ # # first, gather region indices from text
241
+ # region_pattern = re.compile(r"<region>(.*?)</region>")
242
+ # regions = region_pattern.findall(text)
243
+
244
+ # indices = []
245
+ # copy_pattern = re.compile(r"<\|copy_(\d+)\|>")
246
+
247
+ # for region in regions:
248
+ # # Extract all numbers inside <|copy_X|> tags within the region
249
+ # numbers = [int(match) for match in copy_pattern.findall(region)]
250
+ # indices.append(numbers)
251
+
252
+ # # Then, convert region indices into masks
253
+ # masks = []
254
+ # resized_image_size = image_size
255
+ # for region in indices:
256
+ # mask, resized_image_size = self.image_processor.get_mask(
257
+ # image_size, region
258
+ # )
259
+ # masks.append(mask)
260
+ # return masks, resized_image_size
261
+ #
262
+ def extract_masks(self, image_size: Tuple[int, int], text: str):
263
+ # Match full detect(...) blocks and extract their content
264
+ # detect_pattern = r"detect\([^)]+objects\s*=\s*\[(.*?)\]\)"
265
+ detect_pattern = r'detect\(\s*query\s*=\s*"([^"]+)"\s*,\s*objects\s*=\s*\["((?:[^"\\]|\\.)*)"\]\s*\)'
266
+ obj_region_pattern = r"<obj(\d+)><region>\[.*?\](.*?)</region>"
267
+ copy_pattern = r"<\|copy_(\d+)\|>"
268
+
269
+ # results = defaultdict(list)
270
+ results = {}
271
+
272
+ for detect_match in re.finditer(detect_pattern, text, re.DOTALL):
273
+ query_str = detect_match.group(1)
274
+ objects_content = detect_match.group(2)
275
+
276
+ for obj_match in re.finditer(
277
+ obj_region_pattern, objects_content, re.DOTALL
278
+ ):
279
+ obj_index = int(obj_match.group(1))
280
+ region_content = obj_match.group(2)
281
+ copy_ids = [int(m) for m in re.findall(copy_pattern, region_content)]
282
+ obj_key = f"<obj{obj_index}>"
283
+ results[obj_key] = (query_str, copy_ids)
284
+
285
+ results = dict(results)
286
+
287
+ masks = {}
288
+ resized_image_size = image_size
289
+ for k, (desc, region) in results.items():
290
+ mask, resized_image_size = self.image_processor.get_mask(image_size, region)
291
+ masks[k] = (desc, mask)
292
+ return masks, resized_image_size
293
+
294
+ def __call__(
295
+ self,
296
+ images: ImageInput = None,
297
+ text: Union[
298
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
299
+ ] = None,
300
+ videos: VideoInput = None,
301
+ regions: Optional[List[dict[str, Union[BBOX, np.ndarray]]]] = None,
302
+ **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
303
+ ) -> BatchFeature:
304
+ """
305
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
306
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
307
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
308
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
309
+
310
+ Args:
311
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
312
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
313
+ tensor. Both channels-first and channels-last formats are supported.
314
+ text (`str`, `List[str]`, `List[List[str]]`):
315
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
316
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
317
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
318
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
319
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
320
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
321
+ regions:
322
+ either bboxes: List[dict[str, Tuple[int, int, int, int]]]
323
+ or masks: List[dict[str, np.ndarray[width, height]]]
324
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
325
+ If set, will return tensors of a particular framework. Acceptable values are:
326
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
327
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
328
+ - `'np'`: Return NumPy `np.ndarray` objects.
329
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
330
+
331
+ Returns:
332
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
333
+
334
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
335
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
336
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
337
+ `None`).
338
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
339
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
340
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
341
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
342
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
343
+ """
344
+
345
+ output_kwargs = self._merge_kwargs(
346
+ Qwen2_5_VLProcessorKwargs,
347
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
348
+ **kwargs,
349
+ )
350
+ obj_ptrs = None
351
+ if images is not None:
352
+ image_inputs = self.image_processor(
353
+ images=images, videos=None, **output_kwargs["images_kwargs"]
354
+ )
355
+ image_grid_thw = image_inputs["image_grid_thw"]
356
+
357
+ for image in images:
358
+ assert isinstance(
359
+ image, Image.Image
360
+ ), "only supporting a single image per row for now"
361
+
362
+ if regions is not None:
363
+ obj_ptrs = [
364
+ {
365
+ name: (
366
+ self.image_processor.get_patch_pointers(image.size, region)
367
+ if region is not None
368
+ else np.array([])
369
+ )
370
+ for name, region in image_region.items()
371
+ }
372
+ for image, image_region in zip(images, regions)
373
+ ]
374
+ else:
375
+ image_inputs = {}
376
+ image_grid_thw = None
377
+
378
+ assert videos is None, "video inputs are not supported yet" # TODO
379
+ if videos is not None:
380
+ videos_inputs = self.image_processor(
381
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
382
+ )
383
+ video_grid_thw = videos_inputs["video_grid_thw"]
384
+
385
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
386
+ if isinstance(fps, (int, float)):
387
+ second_per_grid_ts = [
388
+ self.image_processor.temporal_patch_size / fps
389
+ ] * len(video_grid_thw)
390
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
391
+ second_per_grid_ts = [
392
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
393
+ ]
394
+ else:
395
+ raise ValueError(
396
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
397
+ )
398
+ videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
399
+
400
+ else:
401
+ videos_inputs = {}
402
+ video_grid_thw = None
403
+
404
+ if not isinstance(text, list):
405
+ text = [text]
406
+
407
+ if image_grid_thw is not None:
408
+ merge_length = self.image_processor.merge_size**2
409
+ index = 0
410
+ for i in range(len(text)):
411
+ while self.image_token in text[i]:
412
+ text[i] = text[i].replace(
413
+ self.image_token,
414
+ "<|placeholder|>"
415
+ * (image_grid_thw[index].prod() // merge_length),
416
+ 1,
417
+ )
418
+ index += 1
419
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
420
+
421
+ if obj_ptrs is not None:
422
+ assert regions is not None
423
+ for i in range(len(text)):
424
+ image_ptrs = obj_ptrs[i]
425
+ image_region = regions[i]
426
+
427
+ for name, region in image_region.items():
428
+ region_ptr = image_ptrs[name]
429
+
430
+ assert name in text[i], f"object {name} not found in: {text[i]}"
431
+
432
+ ptrs_str = "".join([f"<|copy_{j}|>" for j in region_ptr])
433
+ region_str = self.image_processor.normalize_bbox(
434
+ images[i].size, region
435
+ )
436
+ if self.separate_copy_loss:
437
+ ptrs_str = (
438
+ self.copy_start_token + ptrs_str + self.copy_end_token
439
+ )
440
+ out_str = "<region>" + ptrs_str + "</region>"
441
+ if self.prepend_raw_region_to_text:
442
+ out_str = "<region>" + region_str + ptrs_str + "</region>"
443
+
444
+ text[i] = text[i].replace(name, out_str)
445
+
446
+ for name in image_region.keys():
447
+ assert name not in text[i]
448
+
449
+ if video_grid_thw is not None:
450
+ # TODO: support video inputs
451
+ raise NotImplementedError("video inputs are not yet supported")
452
+ merge_length = self.image_processor.merge_size**2
453
+ index = 0
454
+ for i in range(len(text)):
455
+ while self.video_token in text[i]:
456
+ text[i] = text[i].replace(
457
+ self.video_token,
458
+ "<patch>"
459
+ + "<|placeholder|>"
460
+ * (video_grid_thw[index].prod() // merge_length)
461
+ + "</patch>",
462
+ 1,
463
+ )
464
+ index += 1
465
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
466
+
467
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
468
+
469
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
470
+
471
+
472
+ def get_processor(model_name: str, **kwargs):
474
+ processor = V1Processor.from_pretrained(model_name, **kwargs)
475
+ processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(
476
+ model_name, **kwargs
477
+ )
478
+ # max_position_tokens = processor.tokenizer.model_max_length
479
+ # new_tokens = [f"<|copy_{i}|>" for i in range(max_position_tokens)] # too slow
480
+ processor.tokenizer.orig_vocab_size = len(processor.tokenizer)
481
+ new_tokens = [f"<|copy_{i}|>" for i in range(30000)]
482
+ processor.tokenizer.add_tokens(new_tokens)
483
+ processor.copy_token_start = processor.tokenizer.convert_tokens_to_ids("<|copy_0|>")
484
+ return processor
485
+
486
+
487
+ # Create a data collator to encode text and image pairs
488
+ def collate_fn(examples, processor):
489
+ convs = [row["conversation"] for row in examples]
490
+ regions = [row["region"] for row in examples]
491
+ image_sizes = [row["image_size"] for row in examples]
492
+
493
+ texts = [
494
+ processor.apply_chat_template(conv, tokenize=False, add_generation_prompt=False)
495
+ for conv in convs
496
+ ] # Prepare texts for processing
497
+ image_inputs = [
498
+ process_vision_info(conv)[0][0] for conv in convs
499
+ ] # Process the images to extract inputs
500
+ image_inputs = [
501
+ image.resize(image_size) for image, image_size in zip(image_inputs, image_sizes)
502
+ ]
503
+
504
+ # Tokenize the texts and process the images
505
+ batch = processor(
506
+ text=texts,
507
+ images=image_inputs,
508
+ videos=None,
509
+ regions=regions,
510
+ padding=True,
511
+ return_tensors="pt",
512
+ ) # Encode texts and images into tensors
513
+
514
+ # The labels are the input_ids, and we mask the padding tokens in the loss computation
515
+ labels = batch["input_ids"].clone() # Clone input IDs for labels
516
+ labels[labels == processor.tokenizer.pad_token_id] = (
517
+ -100
518
+ ) # Mask padding tokens in labels
519
+
520
+ # Ignore the image token index in the loss computation (model specific)
521
+ image_tokens = [
522
+ 151652,
523
+ 151653,
524
+ 151655,
525
+ ] # Specific image token IDs for Qwen2VLProcessor
526
+
527
+ # Mask image token IDs in the labels
528
+ for image_token_id in image_tokens:
529
+ labels[labels == image_token_id] = -100 # Mask image token IDs in labels
530
+
531
+ batch["labels"] = labels # Add labels to the batch
532
+
533
+ return batch # Return the prepared batch
534
+
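
Aside (illustration only, not repository code): the sketch below mirrors the bbox-to-pointer mapping in `PointerProcessor.get_bbox_patch_pointers` and the `<region>…</region>` string that `V1Processor.__call__` splices into the text. It assumes the image is already at its `smart_resize` resolution (dimensions are multiples of `patch_size * merge_size = 28`), so the rescaling step is omitted, as is the optional normalized-bbox prefix controlled by `prepend_raw_region_to_text`.

```python
import math

import numpy as np

patch_size, merge_size = 14, 2                # values from preprocessor_config.json
width, height = 560, 280                      # already multiples of 28
grid_w_m = width // patch_size // merge_size  # 20 merged-patch columns

x0, y0, x1, y1 = 30, 40, 90, 100              # bbox in pixel coordinates
cell = patch_size * merge_size                # each <|copy_i|> covers a 28x28 cell
cx0, cy0 = math.floor(x0 / cell), math.floor(y0 / cell)  # -> 1, 1
cx1, cy1 = math.ceil(x1 / cell), math.ceil(y1 / cell)    # -> 4, 4

rows, cols = np.meshgrid(np.arange(cy0, cy1), np.arange(cx0, cx1), indexing="ij")
ids = rows.ravel() * grid_w_m + cols.ravel()
print(ids)  # [21 22 23 41 42 43 61 62 63]

# V1Processor.__call__ would then replace the <objN> placeholder with roughly:
region_str = "<region>" + "".join(f"<|copy_{i}|>" for i in ids) + "</region>"
```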
processor_config.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
  "prepend_raw_region_to_text": true,
3
- "processor_class": "Qwen2_5_VLPointerProcessor"
4
  }
 
1
  {
2
  "prepend_raw_region_to_text": true,
3
+ "processor_class": "V1Processor"
4
  }
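
Aside (usage sketch, not part of the diff): with `processor_class` now set to `V1Processor` and the custom classes shipped in-repo, loading is expected to go through the `Auto*` entry points with `trust_remote_code=True`. The repository id below is a placeholder, and the exact auto class depends on how the checkpoint registers itself; adjust accordingly.

```python
from transformers import AutoModelForCausalLM, AutoProcessor

repo_id = "path/or/hub-id-of-this-checkpoint"  # placeholder

processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```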