kjunh committed on
Commit b7d5f5c · 1 Parent(s): a157f67

Support Transformers AutoModel

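For orientation, a minimal sketch of the loading path this commit is aiming at: the new `__init__.py` registers the V1 classes with the `Auto*` factories, and `config.json` gains an `auto_map` entry. The package name and checkpoint path below are placeholders, not taken from the commit.

```python
# Hedged sketch: `v1_model` and "./checkpoint-dir" stand in for however this
# repository is actually packaged and wherever the weights live.
import v1_model  # noqa: F401  (importing runs __init__.py, which calls the Auto*.register hooks)

from transformers import AutoModelForCausalLM, AutoProcessor

model = AutoModelForCausalLM.from_pretrained("./checkpoint-dir")  # resolves to V1ForConditionalGeneration
processor = AutoProcessor.from_pretrained("./checkpoint-dir")     # resolves to V1Processor
```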
.gitignore ADDED
@@ -0,0 +1,5 @@
+backup/
+__pycache__/
+*.pyc
+*.swo
+*.swp
__init__.py ADDED
@@ -0,0 +1,23 @@
+from transformers import (
+    AutoModelForCausalLM,
+    AutoProcessor,
+    AutoImageProcessor,
+    AutoConfig,
+)
+
+from .processor import (
+    Qwen2VLImagePointerProcessor,
+    get_processor,
+    V1Processor,
+    collate_fn,
+)
+from .modeling_v1 import V1ForConditionalGeneration
+from .configuration_v1 import V1Config
+
+print("Registering V1 model and processor with Transformers")
+AutoConfig.register("v1", V1Config)
+AutoModelForCausalLM.register(
+    V1Config, V1ForConditionalGeneration
+)
+AutoProcessor.register(V1Config, V1Processor)
+AutoImageProcessor.register(V1Config, Qwen2VLImagePointerProcessor)
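A quick way to confirm the registration above took effect, assuming the package has been imported so the `register` calls have run:

```python
from transformers import AutoConfig

# AutoConfig.for_model maps a registered model_type string to its config class,
# so after importing this package it should hand back a V1Config instance.
config = AutoConfig.for_model("v1")
print(type(config).__name__)  # "V1Config"
```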
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "Qwen2_5_VL_PGNForConditionalGeneration"
+    "V1ForConditionalGeneration"
   ],
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
@@ -18,7 +18,7 @@
   "label_smoothing": 0.0,
   "max_position_embeddings": 128000,
   "max_window_layers": 28,
-  "model_type": "qwen2_5_vl_pgn",
+  "model_type": "v1",
   "normalize_copy_states": false,
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
@@ -75,5 +75,9 @@
   "vision_token_id": 151654,
   "vocab_size": 152064,
   "z_loss_top_k": 40,
-  "z_loss_weight": 1e-05
+  "z_loss_weight": 1e-05,
+  "auto_map": {
+    "AutoConfig": "configuration_v1.V1Config",
+    "AutoModelForConditionalGeneration": "modeling_v1.V1ForConditionalGeneration"
+  }
 }
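The new `auto_map` block also wires the checkpoint up for remote-code loading: with `trust_remote_code=True`, `AutoConfig` pulls `configuration_v1.V1Config` straight from the repository. A hedged sketch, with the repo id left as a placeholder:

```python
from transformers import AutoConfig

# "<user>/<this-repo>" is a placeholder; trust_remote_code executes configuration_v1.py from the repo.
config = AutoConfig.from_pretrained("<user>/<this-repo>", trust_remote_code=True)
print(type(config).__name__)  # "V1Config"
```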
configuration_v1.py ADDED
@@ -0,0 +1,236 @@
1
+ from typing import Optional
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.modeling_rope_utils import rope_config_validation
5
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
6
+ Qwen2_5_VLVisionConfig,
7
+ )
8
+
9
+
10
+ class V1Config(PretrainedConfig):
11
+ r"""
12
+ This is the configuration class to store the configuration of a [`V1ForConditionalGeneration`], a Qwen2.5-VL backbone
+ extended with a pointer/copy head. It is used to instantiate a V1 model according to the specified arguments, defining
+ the model architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of
+ the Qwen2.5-VL family, e.g. [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
16
+
17
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
18
+ documentation from [`PretrainedConfig`] for more information.
19
+
20
+
21
+ Args:
22
+ vocab_size (`int`, *optional*, defaults to 152064):
23
+ Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
24
+ `inputs_ids` passed when calling [`Qwen2_5_VLModel`]
25
+ hidden_size (`int`, *optional*, defaults to 8192):
26
+ Dimension of the hidden representations.
27
+ intermediate_size (`int`, *optional*, defaults to 29568):
28
+ Dimension of the MLP representations.
29
+ num_hidden_layers (`int`, *optional*, defaults to 80):
30
+ Number of hidden layers in the Transformer encoder.
31
+ num_attention_heads (`int`, *optional*, defaults to 64):
32
+ Number of attention heads for each attention layer in the Transformer encoder.
33
+ num_key_value_heads (`int`, *optional*, defaults to 8):
34
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
35
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
36
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
37
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
38
+ by meanpooling all the original heads within that group. For more details checkout [this
39
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
40
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
41
+ The non-linear activation function (function or string) in the decoder.
42
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
43
+ The maximum sequence length that this model might ever be used with.
44
+ initializer_range (`float`, *optional*, defaults to 0.02):
45
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
46
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
47
+ The epsilon used by the rms normalization layers.
48
+ use_cache (`bool`, *optional*, defaults to `True`):
49
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
50
+ relevant if `config.is_decoder=True`.
51
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
52
+ Whether the model's input and output word embeddings should be tied.
53
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
54
+ The base period of the RoPE embeddings.
55
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
56
+ Whether to use sliding window attention.
57
+ sliding_window (`int`, *optional*, defaults to 4096):
58
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
59
+ max_window_layers (`int`, *optional*, defaults to 80):
60
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
61
+ attention_dropout (`float`, *optional*, defaults to 0.0):
62
+ The dropout ratio for the attention probabilities.
63
+ vision_config (`Dict`, *optional*):
64
+ The config for the visual encoder initialization.
65
+ rope_scaling (`Dict`, *optional*):
66
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
67
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
68
+ accordingly.
69
+ Expected contents:
70
+ `rope_type` (`str`):
71
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
72
+ 'llama3'], with 'default' being the original RoPE implementation.
73
+ `factor` (`float`, *optional*):
74
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
75
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
76
+ original maximum pre-trained length.
77
+ `original_max_position_embeddings` (`int`, *optional*):
78
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
79
+ pretraining.
80
+ `attention_factor` (`float`, *optional*):
81
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
82
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
83
+ `factor` field to infer the suggested value.
84
+ `beta_fast` (`float`, *optional*):
85
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
86
+ ramp function. If unspecified, it defaults to 32.
87
+ `beta_slow` (`float`, *optional*):
88
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
89
+ ramp function. If unspecified, it defaults to 1.
90
+ `short_factor` (`List[float]`, *optional*):
91
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
92
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
93
+ size divided by the number of attention heads divided by 2
94
+ `long_factor` (`List[float]`, *optional*):
95
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
96
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
97
+ size divided by the number of attention heads divided by 2
98
+ `low_freq_factor` (`float`, *optional*):
99
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
100
+ `high_freq_factor` (`float`, *optional*):
101
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
102
+
103
+ ```python
104
+ >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
105
+
106
+ >>> # Initializing a Qwen2_5_VL style configuration
107
+ >>> configuration = Qwen2_5_VLConfig()
108
+
109
+ >>> # Initializing a model from the Qwen2-VL-7B style configuration
110
+ >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
111
+
112
+ >>> # Accessing the model configuration
113
+ >>> configuration = model.config
114
+ ```"""
115
+
116
+ model_type = "v1"
117
+ sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
118
+ keys_to_ignore_at_inference = ["past_key_values"]
119
+ # Default tensor parallel plan for base model `Qwen2_5_VL`
120
+ base_model_tp_plan = {
121
+ "layers.*.self_attn.q_proj": "colwise",
122
+ "layers.*.self_attn.k_proj": "colwise",
123
+ "layers.*.self_attn.v_proj": "colwise",
124
+ "layers.*.self_attn.o_proj": "rowwise",
125
+ "layers.*.mlp.gate_proj": "colwise",
126
+ "layers.*.mlp.up_proj": "colwise",
127
+ "layers.*.mlp.down_proj": "rowwise",
128
+ }
129
+ base_model_pp_plan = {
130
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
131
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
132
+ "norm": (["hidden_states"], ["hidden_states"]),
133
+ }
134
+
135
+ def __init__(
136
+ self,
137
+ vocab_size=152064,
138
+ hidden_size=8192,
139
+ intermediate_size=29568,
140
+ num_hidden_layers=80,
141
+ num_attention_heads=64,
142
+ num_key_value_heads=8,
143
+ hidden_act="silu",
144
+ max_position_embeddings=32768,
145
+ initializer_range=0.02,
146
+ rms_norm_eps=1e-05,
147
+ use_cache=True,
148
+ tie_word_embeddings=False,
149
+ rope_theta=1000000.0,
150
+ use_sliding_window=False,
151
+ sliding_window=4096,
152
+ max_window_layers=80,
153
+ attention_dropout=0.0,
154
+ vision_config=None,
155
+ rope_scaling=None,
156
+ region_token_id: int = 151662, # <|fim_pad|>
157
+ copy_token_start: int = 151665,
158
+ copy_token_num: int = 30000,
159
+ copy_scaler: float = 0.1,
160
+ use_embeddings_as_keys: bool = False,
161
+ normalize_copy_states: bool = False,
162
+ copy_extraction_layer: int = -1,
163
+ tie_copy_heads: bool = False,
164
+ use_cfg: bool = False,
165
+ copy_hidden_size: Optional[int] = None,
166
+ z_loss_weight: float = 1e-5,
167
+ z_loss_top_k: int = 40,
168
+ use_gate: bool = False,
169
+ label_smoothing: bool = False,
170
+ separate_copy_loss: bool = False,
171
+ do_copy: bool = True,
172
+ **kwargs,
173
+ ):
174
+ if isinstance(vision_config, dict):
175
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
176
+ elif vision_config is None:
177
+ self.vision_config = self.sub_configs["vision_config"]()
178
+
179
+ self.vocab_size = vocab_size
180
+ self.max_position_embeddings = max_position_embeddings
181
+ self.hidden_size = hidden_size
182
+ self.intermediate_size = intermediate_size
183
+ self.num_hidden_layers = num_hidden_layers
184
+ self.num_attention_heads = num_attention_heads
185
+ self.use_sliding_window = use_sliding_window
186
+ self.sliding_window = sliding_window
187
+ self.max_window_layers = max_window_layers
188
+
189
+ # for backward compatibility
190
+ if num_key_value_heads is None:
191
+ num_key_value_heads = num_attention_heads
192
+
193
+ self.num_key_value_heads = num_key_value_heads
194
+ self.hidden_act = hidden_act
195
+ self.initializer_range = initializer_range
196
+ self.rms_norm_eps = rms_norm_eps
197
+ self.use_cache = use_cache
198
+ self.rope_theta = rope_theta
199
+ self.attention_dropout = attention_dropout
200
+ self.rope_scaling = rope_scaling
201
+
202
+ # Validate the correctness of rotary position embeddings parameters
203
+ # BC: if there is a 'type' field, move it to 'rope_type'.
204
+ # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
205
+ # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
206
+ # TODO: @raushan update config in the hub
207
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
208
+ if self.rope_scaling["type"] == "mrope":
209
+ self.rope_scaling["type"] = "default"
210
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
211
+ rope_config_validation(self, ignore_keys={"mrope_section"})
212
+
213
+ self.region_token_id = region_token_id
214
+ self.copy_token_start = copy_token_start
215
+ self.copy_token_num = copy_token_num
216
+ self.copy_scaler = copy_scaler
217
+ self.use_embeddings_as_keys = use_embeddings_as_keys
218
+ self.normalize_copy_states = normalize_copy_states
219
+ self.copy_extraction_layer = copy_extraction_layer
220
+ self.tie_copy_heads = tie_copy_heads
221
+ self.use_cfg = use_cfg
222
+
223
+ if copy_hidden_size is None:
224
+ copy_hidden_size = self.hidden_size
225
+ self.copy_hidden_size = copy_hidden_size
226
+ self.z_loss_weight = z_loss_weight
227
+ self.z_loss_top_k = z_loss_top_k
228
+ self.use_gate = use_gate
229
+ self.label_smoothing = label_smoothing
230
+ self.separate_copy_loss = separate_copy_loss
231
+ self.do_copy = do_copy
232
+
233
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
234
+
235
+
236
+ __all__ = ["V1Config"]
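On top of the Qwen2.5-VL arguments, the constructor adds several pointer/copy-specific knobs. A minimal construction sketch (run from the repository root so the import resolves; the overrides below simply echo the defaults and are illustrative only):

```python
from configuration_v1 import V1Config

config = V1Config(
    do_copy=True,             # keep the pointer/copy head enabled
    copy_token_start=151665,  # token id of <|copy_0|>
    copy_token_num=30000,     # number of <|copy_i|> tokens added to the tokenizer
    z_loss_weight=1e-5,       # weight of the top-k logsumexp regularizer
)
print(config.model_type, config.copy_hidden_size)  # "v1", falls back to hidden_size
```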
grounding.py ADDED
@@ -0,0 +1,559 @@
1
+ from typing import Tuple, List, Optional, Union
2
+ import re
3
+ import math
4
+
5
+ from PIL import Image
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from qwen_vl_utils import process_vision_info
10
+ from transformers.feature_extraction_utils import BatchFeature
11
+ from transformers.image_utils import ImageInput, VideoInput
12
+ from transformers.processing_utils import (
13
+ Unpack,
14
+ )
15
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
16
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
17
+ smart_resize,
18
+ Qwen2VLImageProcessor,
19
+ )
20
+ from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import (
21
+ Qwen2_5_VLProcessorKwargs,
22
+ Qwen2_5_VLProcessor,
23
+ )
24
+
25
+
26
+ """
27
+ Qwen2.5-VL does not use AnyRes to my relief.
28
+ Things to take into account:
29
+ - smart_resize
30
+ - temporal dimension
31
+ - grid_t = patches.shape[0] // self.temporal_patch_size
32
+ - grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
33
+ - merge_size (2)
34
+
35
+
36
+ Usage:
37
+
38
+ model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
39
+
40
+
41
+ processor = Qwen2_5_VLPointerProcessor.from_pretrained(model_name)
42
+ processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(model_name)
43
+
44
+ messages = [
45
+ {
46
+ "role": "user",
47
+ "content": [
48
+ {
49
+ "type": "image",
50
+ "image": "https://example---/demo.jpeg",
51
+ },
52
+ {"type": "text", "text": "Describe this image."},
53
+ ],
54
+ },
55
+ {
56
+ 'role': 'assistant',
57
+ 'content': [
58
+ {
59
+ 'type': 'text', 'text': '<think>Theres a cat at <|region|>, a dog at <|region|>.</think>A calico cat hanging out with a golden retriever.'
60
+ }
61
+ ]
62
+ }
63
+ ]
64
+
65
+ # Preparation for inference
66
+ text = processor.apply_chat_template(
67
+ messages, tokenize=False, add_generation_prompt=True
68
+ )
69
+ regions = [
70
+ [0, 10, 100, 200],
71
+ [300, 0, 600, 250]
72
+ ]
73
+ image_inputs, video_inputs = process_vision_info(messages)
74
+ inputs = processor(
75
+ text=[text],
76
+ images=image_inputs,
77
+ videos=video_inputs,
78
+ regions=[regions],
79
+ padding=True,
80
+ return_tensors="pt",
81
+ )
82
+ inputs = inputs.to("cuda")
83
+
84
+
85
+ # Qwen2VLImageProcessor in a nutshell
86
+ '(tl tp) c (hlm hm hp) (wlm wm wp) -> (tl hlm wlm hm wm) (c tp hp wp)'
87
+ """
88
+
89
+
90
+ BBOX = Tuple[int, int, int, int]
91
+
92
+
93
+ class PointerProcessor:
94
+ @staticmethod
95
+ def normalize_bbox(image_size: Tuple[int, int], bbox: BBOX):
96
+ w, h = image_size
97
+ bbox = [
98
+ bbox[0] / w,
99
+ bbox[1] / h,
100
+ bbox[2] / w,
101
+ bbox[3] / h,
102
+ ]
103
+ return "[{}]".format(", ".join([f"{v:.2f}" for v in bbox]))
104
+
105
+ def get_masks(self, image_size: Tuple[int, int], indices: List[int]):
106
+ width, height = image_size
107
+ resized_height, resized_width = smart_resize(
108
+ height,
109
+ width,
110
+ factor=self.patch_size * self.merge_size,
111
+ min_pixels=self.min_pixels,
112
+ max_pixels=self.max_pixels,
113
+ )
114
+
115
+ # grid_h = resized_height // self.patch_size // self.merge_size
116
+ grid_w_m = resized_width // self.patch_size // self.merge_size
117
+
118
+ mask = torch.zeros(resized_height, resized_width)
119
+ for index in indices:
120
+ index_h = index // grid_w_m
121
+ index_w = index % grid_w_m
122
+ bbox = (
123
+ max(index_w * self.patch_size * self.merge_size, 0),
124
+ max(index_h * self.patch_size * self.merge_size, 0),
125
+ min((index_w + 1) * self.patch_size * self.merge_size, resized_width),
126
+ min((index_h + 1) * self.patch_size * self.merge_size, resized_height),
127
+ )
128
+ x1, y1, x2, y2 = bbox
129
+ mask[y1:y2, x1:x2] = 1
130
+ # mask = mask.t() # to width, height
131
+ return mask, (resized_width, resized_height)
132
+
133
+ def get_patch_pointers(
134
+ self, image_size: Tuple[int, int], region: Union[BBOX, np.ndarray]
135
+ ):
136
+ if isinstance(region, np.ndarray):
137
+ return self.get_mask_patch_pointers(image_size, region)
138
+ else:
139
+ return self.get_bbox_patch_pointers(image_size, region)
140
+
141
+ def get_bbox_patch_pointers(self, image_size: Tuple[int, int], bbox: BBOX):
142
+ factor = self.merge_size
143
+ # factor = 1
144
+ width, height = image_size
145
+ resized_height, resized_width = smart_resize(
146
+ height,
147
+ width,
148
+ factor=self.patch_size * self.merge_size,
149
+ min_pixels=self.min_pixels,
150
+ max_pixels=self.max_pixels,
151
+ )
152
+ x0, y0, x1, y1 = bbox
153
+ resized_bbox = [
154
+ max(x0 / width * resized_width, 0),
155
+ max(y0 / height * resized_height, 0),
156
+ min(x1 / width * resized_width, resized_width),
157
+ min(y1 / height * resized_height, resized_height),
158
+ ]
159
+ # patch_bbox = [v / self.patch_size / self.merge_size for v in resized_bbox]
160
+ patch_bbox = [v / self.patch_size / factor for v in resized_bbox]
161
+ x0, y0, x1, y1 = patch_bbox
162
+ boundaries = [
163
+ math.floor(x0),
164
+ math.floor(y0),
165
+ math.ceil(x1),
166
+ math.ceil(y1),
167
+ ]
168
+ x0, y0, x1, y1 = boundaries
169
+
170
+ # t, h, w
171
+ grid_w = resized_width // self.patch_size
172
+ grid_w_m = grid_w // factor
173
+ rows, cols = np.meshgrid(np.arange(y0, y1), np.arange(x0, x1), indexing="ij")
174
+ grid_indices = np.column_stack((rows.ravel(), cols.ravel()))
175
+ indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
176
+ base_ids = list(indices)
177
+ # reorder
178
+ # t, hl, wl, hm, wm
179
+ # ids_map = torch.arange(grid_h * grid_w).reshape(grid_h, grid_w)
180
+ # ids_map = rearrange(
181
+ # ids_map,
182
+ # "(hl hm) (wl wm) -> (hl wl) (hm wm)",
183
+ # hm=self.merge_size,
184
+ # wm=self.merge_size,
185
+ # ).reshape(-1)
186
+ # inv_map = ids_map.argsort()
187
+ # ids = inv_map[base_ids].numpy()
188
+ ids = np.array(base_ids)
189
+ # ids.sort()
190
+ return ids
191
+
192
+ def get_mask_patch_pointers(self, image_size: Tuple[int, int], mask: np.ndarray):
193
+ # mask size: w h
194
+ width, height = image_size
195
+ resized_height, resized_width = smart_resize(
196
+ height,
197
+ width,
198
+ factor=self.patch_size * self.merge_size,
199
+ min_pixels=self.min_pixels,
200
+ max_pixels=self.max_pixels,
201
+ )
202
+ grid_w_m = resized_width // self.patch_size // self.merge_size
203
+ grid_h_m = resized_height // self.patch_size // self.merge_size
204
+
205
+ m = torch.from_numpy(mask).float()
206
+ m = F.interpolate(
207
+ m[None, None], (grid_h_m, grid_w_m), mode="bilinear", antialias=True
208
+ )[0, 0]
209
+ # m = m > 0 # upper bound
210
+
211
+ grid_indices = m.nonzero(as_tuple=False)
212
+ indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
213
+ ids = indices.numpy()
214
+ return ids
215
+
216
+ def renormalize(self, tensor):
217
+ # crude - non-accurate implementation for the lazy
218
+ mean = np.array(self.image_mean).mean()
219
+ std = np.array(self.image_std).mean()
220
+ return tensor * std + mean
221
+
222
+
223
+ class Qwen2VLImagePointerProcessor(Qwen2VLImageProcessor, PointerProcessor):
224
+ pass
225
+
226
+
227
+ class Qwen2_5_VLPointerProcessor(Qwen2_5_VLProcessor):
228
+ image_processor_class = "Qwen2VLImagePointerProcessor"
229
+
230
+ def __init__(
231
+ self,
232
+ image_processor=None,
233
+ tokenizer=None,
234
+ chat_template=None,
235
+ prepend_raw_region_to_text: bool = True,
236
+ **kwargs,
237
+ ):
238
+ super().__init__(
239
+ image_processor=image_processor,
240
+ tokenizer=tokenizer,
241
+ chat_template=chat_template,
242
+ **kwargs,
243
+ )
244
+
245
+ self.region_token = "<|region|>"
246
+ self.copy_token_start = None
247
+ self.prepend_raw_region_to_text = prepend_raw_region_to_text
248
+
249
+ def extract_masks(self, image_size: Tuple[int, int], text: str):
250
+ # first, gather region indices from text
251
+ region_pattern = re.compile(r"<region>(.*?)</region>")
252
+ regions = region_pattern.findall(text)
253
+
254
+ indices = []
255
+ copy_pattern = re.compile(r"<\|copy_(\d+)\|>")
256
+
257
+ for region in regions:
258
+ # Extract all numbers inside <|copy_X|> tags within the region
259
+ numbers = [int(match) for match in copy_pattern.findall(region)]
260
+ indices.append(numbers)
261
+
262
+ # Then, convert region indices into masks
263
+ masks = []
264
+ resized_image_size = image_size
265
+ for region in indices:
266
+ mask, resized_image_size = self.image_processor.get_masks(
267
+ image_size, region
268
+ )
269
+ masks.append(mask)
270
+ return masks, resized_image_size
271
+
272
+ def __call__(
273
+ self,
274
+ images: ImageInput = None,
275
+ text: Union[
276
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
277
+ ] = None,
278
+ videos: VideoInput = None,
279
+ regions: Optional[List[Union[BBOX, np.ndarray]]] = None,
280
+ **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
281
+ ) -> BatchFeature:
282
+ """
283
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
284
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
285
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
286
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
287
+
288
+ Args:
289
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
290
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
291
+ tensor. Both channels-first and channels-last formats are supported.
292
+ text (`str`, `List[str]`, `List[List[str]]`):
293
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
294
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
295
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
296
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
297
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
298
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
299
+ regions:
300
+ either bboxes: List[Tuple[int, int, int, int]]
301
+ or masks: List[np.ndarray[width, height]]
302
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
303
+ If set, will return tensors of a particular framework. Acceptable values are:
304
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
305
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
306
+ - `'np'`: Return NumPy `np.ndarray` objects.
307
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
308
+
309
+ Returns:
310
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
311
+
312
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
313
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
314
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
315
+ `None`).
316
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
317
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
318
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
319
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
320
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
321
+ """
322
+
323
+ output_kwargs = self._merge_kwargs(
324
+ Qwen2_5_VLProcessorKwargs,
325
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
326
+ **kwargs,
327
+ )
328
+ obj_ptrs = None
329
+ if images is not None:
330
+ image_inputs = self.image_processor(
331
+ images=images, videos=None, **output_kwargs["images_kwargs"]
332
+ )
333
+ image_grid_thw = image_inputs["image_grid_thw"]
334
+
335
+ for image in images:
336
+ assert isinstance(
337
+ image, Image.Image
338
+ ), "only supporting a single image per row for now"
339
+
340
+ if regions is not None:
341
+ obj_ptrs = [
342
+ [
343
+ (
344
+ self.image_processor.get_patch_pointers(image.size, region)
345
+ if region is not None
346
+ else np.array([])
347
+ )
348
+ for region in image_region
349
+ ]
350
+ for image, image_region in zip(images, regions)
351
+ ]
352
+ else:
353
+ image_inputs = {}
354
+ image_grid_thw = None
355
+
356
+ assert videos is None, "video inputs are not supported yet" # TODO
357
+ if videos is not None:
358
+ videos_inputs = self.image_processor(
359
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
360
+ )
361
+ video_grid_thw = videos_inputs["video_grid_thw"]
362
+
363
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
364
+ if isinstance(fps, (int, float)):
365
+ second_per_grid_ts = [
366
+ self.image_processor.temporal_patch_size / fps
367
+ ] * len(video_grid_thw)
368
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
369
+ second_per_grid_ts = [
370
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
371
+ ]
372
+ else:
373
+ raise ValueError(
374
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
375
+ )
376
+ videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
377
+
378
+ else:
379
+ videos_inputs = {}
380
+ video_grid_thw = None
381
+
382
+ if not isinstance(text, list):
383
+ text = [text]
384
+
385
+ if image_grid_thw is not None:
386
+ merge_length = self.image_processor.merge_size**2
387
+ index = 0
388
+ for i in range(len(text)):
389
+ while self.image_token in text[i]:
390
+ text[i] = text[i].replace(
391
+ self.image_token,
392
+ "<|placeholder|>"
393
+ * (image_grid_thw[index].prod() // merge_length),
394
+ 1,
395
+ )
396
+ index += 1
397
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
398
+
399
+ if obj_ptrs is not None:
400
+ assert regions is not None
401
+ for i in range(len(text)):
402
+ ptrs = obj_ptrs[i]
403
+ region = regions[i]
404
+ assert len(ptrs) == text[i].count(self.region_token)
405
+ index = 0
406
+ while self.region_token in text[i]:
407
+ ptrs_str = "".join([f"<|copy_{j}|>" for j in ptrs[index]])
408
+ region_str = self.image_processor.normalize_bbox(
409
+ image.size, region[index]
410
+ )
411
+ out_str = "<region>" + ptrs_str + "</region>"
412
+ if self.prepend_raw_region_to_text:
413
+ out_str = "<region>" + region_str + ptrs_str + "</region>"
414
+
415
+ text[i] = text[i].replace(
416
+ self.region_token,
417
+ out_str,
418
+ 1,
419
+ )
420
+ index += 1
421
+
422
+ # text[i] = text[i].replace("<|placeholder|>", self.region_token)
423
+
424
+ if video_grid_thw is not None:
425
+ # TODO: support video inputs
426
+ merge_length = self.image_processor.merge_size**2
427
+ index = 0
428
+ for i in range(len(text)):
429
+ while self.video_token in text[i]:
430
+ text[i] = text[i].replace(
431
+ self.video_token,
432
+ "<patch>"
433
+ + "<|placeholder|>"
434
+ * (video_grid_thw[index].prod() // merge_length)
435
+ + "</patch>",
436
+ 1,
437
+ )
438
+ index += 1
439
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
440
+
441
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
442
+
443
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
444
+
445
+
446
+ def get_processor(model_name: str, **kwargs):
447
+ processor = Qwen2_5_VLPointerProcessor.from_pretrained(model_name, **kwargs)
448
+ processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(
449
+ model_name, **kwargs
450
+ )
451
+ # max_position_tokens = processor.tokenizer.model_max_length
452
+ # new_tokens = [f"<|copy_{i}|>" for i in range(max_position_tokens)] # too slow
453
+ processor.tokenizer.orig_vocab_size = len(processor.tokenizer)
454
+ new_tokens = [f"<|copy_{i}|>" for i in range(30000)]
455
+ processor.tokenizer.add_tokens(new_tokens)
456
+ processor.copy_token_start = processor.tokenizer.convert_tokens_to_ids("<|copy_0|>")
457
+ return processor
458
+
459
+
460
+ # Create a data collator to encode text and image pairs
461
+ def collate_fn(examples, processor):
462
+ # Get the texts and images, and apply the chat template
463
+ examples, masks = zip(*examples)
464
+ texts = [
465
+ processor.apply_chat_template(example, tokenize=False) for example in examples
466
+ ] # Prepare texts for processing
467
+ image_inputs = [
468
+ process_vision_info(example)[0][0] for example in examples
469
+ ] # Process the images to extract inputs
470
+
471
+ # Tokenize the texts and process the images
472
+ batch = processor(
473
+ text=texts,
474
+ images=image_inputs,
475
+ videos=None,
476
+ regions=masks,
477
+ padding=True,
478
+ return_tensors="pt",
479
+ ) # Encode texts and images into tensors
480
+
481
+ # The labels are the input_ids, and we mask the padding tokens in the loss computation
482
+ labels = batch["input_ids"].clone() # Clone input IDs for labels
483
+ labels[labels == processor.tokenizer.pad_token_id] = (
484
+ -100
485
+ ) # Mask padding tokens in labels
486
+
487
+ # Ignore the image token index in the loss computation (model specific)
488
+ if isinstance(
489
+ processor, Qwen2VLImagePointerProcessor
490
+ ): # Check if the processor is Qwen2VLProcessor
491
+ image_tokens = [
492
+ 151652,
493
+ 151653,
494
+ 151655,
495
+ ] # Specific image token IDs for Qwen2VLProcessor
496
+ else:
497
+ image_tokens = [
498
+ processor.tokenizer.convert_tokens_to_ids(processor.image_token)
499
+ ] # Convert image token to ID
500
+
501
+ # Mask image token IDs in the labels
502
+ for image_token_id in image_tokens:
503
+ labels[labels == image_token_id] = -100 # Mask image token IDs in labels
504
+
505
+ batch["labels"] = labels # Add labels to the batch
506
+
507
+ return batch # Return the prepared batch
508
+
509
+
510
+ if __name__ == "__main__":
511
+ # processor = Qwen2VLImagePointerProcessor.from_pretrained(
512
+ # "Qwen/Qwen2.5-VL-7B-Instruct"
513
+ # )
514
+
515
+ # image_size = [1036, 756]
516
+ # regions = [[0, 20, 25, 120], [512, 600, 800, 800], [0, 0, 1023, 740]]
517
+ # processor.test(image_size, regions)
518
+
519
+ model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
520
+ processor = get_processor(model_name)
521
+
522
+ messages = [
523
+ {
524
+ "role": "user",
525
+ "content": [
526
+ {
527
+ "type": "image",
528
+ "image": "https://example---/demo.jpeg",
529
+ },
530
+ {"type": "text", "text": "Describe this image."},
531
+ ],
532
+ },
533
+ {
534
+ "role": "assistant",
535
+ "content": [
536
+ {
537
+ "type": "text",
538
+ "text": "<think>Theres a cat at <|region|>, a dog at <|region|>.</think>A calico cat hanging out with a golden retriever.",
539
+ }
540
+ ],
541
+ },
542
+ ]
543
+ image = Image.new("RGB", (800, 500), "black")
544
+ text = processor.apply_chat_template(
545
+ messages, tokenize=False, add_generation_prompt=True
546
+ )
547
+ bboxes = [[0, 10, 100, 200], [300, 0, 600, 250]]
548
+ inputs = processor(
549
+ text=[text],
550
+ images=[image],
551
+ videos=None,
552
+ regions=[bboxes],
553
+ padding=True,
554
+ return_tensors="pt",
555
+ )
556
+ text = processor.tokenizer.decode(inputs.input_ids[0])
557
+ print(text)
558
+ masks, image_size = processor.extract_masks(image.size, text)
559
+ import ipdb; ipdb.set_trace() # noqa # fmt: skip
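The pointer construction above boils down to mapping a box in original-image pixels onto the merged-patch grid produced by `smart_resize`. A back-of-the-envelope sketch of that index math (exact grid sizes depend on the processor's `min_pixels`/`max_pixels` budget, so the numbers are illustrative):

```python
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

width, height = 800, 500
patch_size, merge_size = 14, 2

resized_h, resized_w = smart_resize(height, width, factor=patch_size * merge_size)
grid_w_m = resized_w // patch_size // merge_size  # merged patches per row
grid_h_m = resized_h // patch_size // merge_size  # merged patches per column

# A merged patch at (row, col) becomes the flat index encoded as the <|copy_{index}|> token.
row, col = 3, 5
index = row * grid_w_m + col
print(grid_h_m, grid_w_m, index)
```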
modeling_v1.py ADDED
@@ -0,0 +1,577 @@
1
+ import math
2
+ from typing import Optional, Union, Tuple, List
3
+ from dataclasses import dataclass
4
+
5
+ import torch
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+ from torch.nn import CrossEntropyLoss
9
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
10
+ Qwen2_5_VisionTransformerPretrainedModel,
11
+ Qwen2_5_VLModel,
12
+ Qwen2_5_VLForConditionalGeneration,
13
+ Qwen2_5_VLCausalLMOutputWithPast,
14
+ )
15
+
16
+ from .configuration_v1 import V1Config
17
+
18
+
19
+ def init_identity(layer, scale: float = 1):
20
+ if isinstance(layer, nn.Linear):
21
+ with torch.no_grad():
22
+ # Ensure weight matrix is square
23
+ rows, cols = layer.weight.shape
24
+ identity_matrix = (
25
+ torch.eye(rows, cols) * scale
26
+ ) # Creates an identity matrix
27
+ layer.weight.copy_(
28
+ identity_matrix
29
+ ) # Copy identity matrix into layer weights
30
+ if layer.bias is not None:
31
+ layer.bias.fill_(0) # Set bias to zero (or another value if needed)
32
+
33
+
34
+ @dataclass
35
+ class V1CausalLMOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast):
36
+ z_loss: torch.Tensor = None
37
+ gen_loss: torch.Tensor = None
38
+ copy_loss: torch.Tensor = None
39
+
40
+
41
+ class V1ForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
42
+ config_class = V1Config
43
+
44
+ def __init__(self, config):
45
+ super().__init__(config)
46
+ self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(
47
+ config.vision_config
48
+ )
49
+ self.model = Qwen2_5_VLModel(config)
50
+ self.copy_init_scale = 1 / math.sqrt(self.config.hidden_size)
51
+
52
+ # self.tokenizer_vocab_size = (
53
+ # config.tokenizer_vocab_size
54
+ # ) # Qwen2.5-VL: different from embedding_size==vocab_size. 151665 vs. 152064
55
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
56
+ self.rope_deltas = None # cache rope_deltas here
57
+
58
+ if self.config.do_copy:
59
+ if self.config.tie_copy_heads:
60
+ self._copy_head = nn.Linear(config.hidden_size, config.copy_hidden_size)
61
+ else:
62
+ self._copy_q_head = nn.Linear(
63
+ config.hidden_size, config.copy_hidden_size
64
+ )
65
+ self._copy_k_head = nn.Linear(
66
+ config.hidden_size, config.copy_hidden_size
67
+ )
68
+ if self.config.use_gate:
69
+ self.gate = nn.Linear(config.hidden_size, 1, bias=False)
70
+
71
+ # Initialize weights and apply final processing
72
+ self.post_init()
73
+
74
+ @torch.no_grad()
75
+ def after_loading(self):
76
+ if self.config.do_copy:
77
+ self.init_heads()
78
+ if self.config.use_gate:
79
+ self.lm_head.weight.data = self.lm_head.weight.data * 2
80
+ self.gate.weight.data.fill_(0)
81
+
82
+ @property
83
+ def copy_q_head(self):
84
+ return self._copy_head if self.config.tie_copy_heads else self._copy_q_head
85
+
86
+ @property
87
+ def copy_k_head(self):
88
+ return self._copy_head if self.config.tie_copy_heads else self._copy_k_head
89
+
90
+ def init_heads(self):
91
+ if hasattr(self, "_copy_head"):
92
+ init_identity(self._copy_head, self.copy_init_scale)
93
+ if hasattr(self, "_copy_k_head"):
94
+ init_identity(self._copy_k_head, self.copy_init_scale)
95
+ if hasattr(self, "_copy_q_head"):
96
+ init_identity(self._copy_q_head, self.copy_init_scale)
97
+
98
+ def copy_representations(
99
+ self,
100
+ inputs_embeds: torch.FloatTensor,
101
+ input_ids: torch.LongTensor,
102
+ copy_values: Optional[torch.FloatTensor] = None,
103
+ ):
104
+ if copy_values is None:
105
+ mask = input_ids == self.config.image_token_id
106
+ copy_values, _ = self.extract_image_tokens(inputs_embeds, mask) # initial
107
+ assert copy_values is not None
108
+ copy_values = copy_values.to(inputs_embeds.device)
109
+ input_ids = input_ids.to(inputs_embeds.device)
110
+
111
+ input_ids = input_ids.clone()
112
+ input_ids = input_ids - self.config.copy_token_start
113
+ copy_mask = input_ids >= 0
114
+ input_ids[~copy_mask] = 0
115
+
116
+ assert copy_values is not None
117
+ extracted = copy_values.gather(
118
+ 1, input_ids[..., None].repeat(1, 1, copy_values.shape[-1])
119
+ )
120
+ copy_mask = copy_mask.to(extracted.dtype)[..., None]
121
+ return copy_mask * extracted + (1 - copy_mask) * inputs_embeds
122
+
123
+ def extract_image_tokens(self, features: torch.FloatTensor, mask: torch.Tensor):
124
+ out_feat, out_mask = extract_image_tokens_right_pad(features, mask)
125
+ return out_feat, out_mask
126
+
127
+ def forward(
128
+ self,
129
+ input_ids: torch.LongTensor = None,
130
+ attention_mask: Optional[torch.Tensor] = None,
131
+ position_ids: Optional[torch.LongTensor] = None,
132
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
133
+ inputs_embeds: Optional[torch.FloatTensor] = None,
134
+ labels: Optional[torch.LongTensor] = None,
135
+ use_cache: Optional[bool] = None,
136
+ output_attentions: Optional[bool] = None,
137
+ output_hidden_states: Optional[bool] = None,
138
+ return_dict: Optional[bool] = None,
139
+ pixel_values: Optional[torch.Tensor] = None,
140
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
141
+ image_grid_thw: Optional[torch.LongTensor] = None,
142
+ video_grid_thw: Optional[torch.LongTensor] = None,
143
+ rope_deltas: Optional[torch.LongTensor] = None,
144
+ cache_position: Optional[torch.LongTensor] = None,
145
+ second_per_grid_ts: Optional[torch.Tensor] = None,
146
+ ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
147
+ r"""
148
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
149
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
150
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
151
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
152
+
153
+ Returns:
154
+
155
+ Example:
156
+
157
+ ```python
158
+ >>> from PIL import Image
159
+ >>> import requests
160
+ >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
161
+
162
+ >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
163
+ >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
164
+
165
+ >>> messages = [
166
+ {
167
+ "role": "user",
168
+ "content": [
169
+ {"type": "image"},
170
+ {"type": "text", "text": "What is shown in this image?"},
171
+ ],
172
+ },
173
+ ]
174
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
175
+ >>> image = Image.open(requests.get(url, stream=True).raw)
176
+
177
+ >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
178
+ >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
179
+
180
+ >>> # Generate
181
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
182
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
183
+ "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
184
+ ```"""
185
+
186
+ output_attentions = (
187
+ output_attentions
188
+ if output_attentions is not None
189
+ else self.config.output_attentions
190
+ )
191
+ output_hidden_states = (
192
+ output_hidden_states
193
+ if output_hidden_states is not None
194
+ else self.config.output_hidden_states
195
+ )
196
+ return_dict = (
197
+ return_dict if return_dict is not None else self.config.use_return_dict
198
+ )
199
+
200
+ input_ids = input_ids.clone()
201
+ input_ids_with_ptrs = input_ids.clone()
202
+ input_ids[input_ids >= self.config.copy_token_start] = (
203
+ self.config.region_token_id
204
+ )
205
+
206
+ if inputs_embeds is None:
207
+ inputs_embeds = self.model.embed_tokens(input_ids)
208
+ if pixel_values is not None:
209
+ pixel_values = pixel_values.type(self.visual.dtype)
210
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
211
+
212
+ mask = input_ids == self.config.image_token_id
213
+ mask_unsqueezed = mask.unsqueeze(-1)
214
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
215
+ image_mask = mask_expanded.to(inputs_embeds.device)
216
+
217
+ image_embeds = image_embeds.to(
218
+ inputs_embeds.device, inputs_embeds.dtype
219
+ )
220
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
221
+
222
+ if pixel_values_videos is not None:
223
+ raise NotImplementedError("video inputs are not supported yet.")
224
+ pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
225
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
226
+ n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
227
+ n_video_features = video_embeds.shape[0]
228
+ if n_video_tokens != n_video_features:
229
+ raise ValueError(
230
+ f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
231
+ )
232
+
233
+ mask = input_ids == self.config.video_token_id
234
+ mask_unsqueezed = mask.unsqueeze(-1)
235
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
236
+ video_mask = mask_expanded.to(inputs_embeds.device)
237
+
238
+ video_embeds = video_embeds.to(
239
+ inputs_embeds.device, inputs_embeds.dtype
240
+ )
241
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
242
+
243
+ if attention_mask is not None:
244
+ attention_mask = attention_mask.to(inputs_embeds.device)
245
+
246
+ if self.config.do_copy:
247
+ copy_keys, copy_keys_mask = None, None
248
+ copy_values, copy_values_mask = None, None
249
+
250
+ has_cache = bool(past_key_values)
251
+ if has_cache:
252
+ copy_keys, copy_values = past_key_values[len(past_key_values) - 2]
253
+ copy_keys_mask, copy_values_mask = past_key_values[
254
+ len(past_key_values) - 1
255
+ ]
256
+ # we add channel dim to the mask for consistency in tensor shape in cache
257
+ copy_keys_mask = copy_keys_mask[..., 0]
258
+ copy_values_mask = copy_values_mask[..., 0]
259
+
260
+ inputs_embeds = self.copy_representations(
261
+ inputs_embeds, input_ids_with_ptrs, copy_values
262
+ )
263
+
264
+ # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
265
+ if position_ids is None and (
266
+ attention_mask is None or attention_mask.ndim == 2
267
+ ):
268
+ # calculate RoPE index once per generation in the pre-fill stage only
269
+ if (
270
+ (cache_position is not None and cache_position[0] == 0)
271
+ or self.rope_deltas is None
272
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
273
+ ):
274
+ position_ids, rope_deltas = self.get_rope_index(
275
+ input_ids,
276
+ image_grid_thw,
277
+ video_grid_thw,
278
+ second_per_grid_ts,
279
+ attention_mask,
280
+ )
281
+ self.rope_deltas = rope_deltas
282
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
283
+ else:
284
+ batch_size, seq_length, _ = inputs_embeds.shape
285
+ delta = (
286
+ (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
287
+ if cache_position is not None
288
+ else 0
289
+ )
290
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
291
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
292
+ if cache_position is not None: # otherwise `deltas` is an int `0`
293
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
294
+ position_ids = position_ids.add(delta)
295
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
296
+
297
+ outputs = self.model(
298
+ input_ids=None,
299
+ position_ids=position_ids,
300
+ attention_mask=attention_mask,
301
+ past_key_values=past_key_values,
302
+ inputs_embeds=inputs_embeds,
303
+ use_cache=use_cache,
304
+ output_attentions=output_attentions,
305
+ output_hidden_states=output_hidden_states,
306
+ return_dict=return_dict,
307
+ cache_position=cache_position,
308
+ )
309
+
310
+ hidden_states = outputs[0]
311
+
312
+ gen_logits = self.lm_head(hidden_states)
313
+
314
+ if self.config.do_copy:
315
+ assert (
316
+ self.config.copy_extraction_layer == -1
317
+ ), f"copy_extraction_layer should be -1: {self.config.copy_extraction_layer}"
318
+ copy_hidden_states = hidden_states
319
+ copy_q_states = copy_hidden_states
320
+ if self.config.normalize_copy_states:
321
+ copy_q_states = F.normalize(copy_q_states, 2, -1)
322
+ copy_q_states = self.copy_q_head(copy_q_states)
323
+
324
+ present_key_values = outputs.past_key_values
325
+
326
+ if not has_cache:
327
+ mask = input_ids == self.config.image_token_id
328
+ copy_k_states = (
329
+ inputs_embeds
330
+ if self.config.use_embeddings_as_keys
331
+ else copy_hidden_states
332
+ )
333
+ if self.config.normalize_copy_states:
334
+ copy_k_states = F.normalize(copy_k_states, 2, -1)
335
+ copy_k_states, copy_k_mask = self.extract_image_tokens(
336
+ self.copy_k_head(copy_k_states), mask
337
+ )
338
+ copy_v_states, copy_v_mask = self.extract_image_tokens(
339
+ inputs_embeds.detach(), mask
340
+ )
341
+
342
+ # we add channel dim to the mask for consistency in tensor shape in cache
343
+ copy_memories = [
344
+ (copy_k_states.detach(), copy_v_states.detach()),
345
+ (copy_k_mask[..., None], copy_v_mask[..., None]),
346
+ ]
347
+
348
+ if use_cache:
349
+ # only update at the first iteration
350
+ start = len(present_key_values)
351
+ for i, mem in enumerate(copy_memories):
352
+ present_key_values.update(*mem, start + i)
353
+ else:
354
+ copy_k_states = copy_keys
355
+ copy_k_mask = copy_keys_mask
356
+
357
+ assert copy_k_states is not None
358
+ assert copy_k_mask is not None
359
+ assert (
360
+ copy_k_states.shape[1] > 0
361
+ ), f"zero image tokens on batch elements: {copy_k_mask.sum(dim=1)}"
362
+
363
+ copy_logits = (copy_q_states @ copy_k_states.transpose(-1, -2)).to(
364
+ gen_logits.device
365
+ ) * self.copy_init_scale
366
+
367
+ if hasattr(self, "gate"):
368
+ gate = torch.sigmoid(self.gate(hidden_states))
369
+ gen_logits = gen_logits * (1 - gate)
370
+ copy_logits = copy_logits * gate
371
+
372
+ copy_logits = copy_logits.masked_fill(
373
+ ~copy_k_mask[:, None, :].to(copy_logits.device),
374
+ torch.finfo(copy_logits.dtype).min,
375
+ )
376
+ logits = torch.cat(
377
+ [gen_logits[..., : self.config.copy_token_start], copy_logits], dim=-1
378
+ )
379
+ else:
380
+ logits = gen_logits
381
+ loss = None
382
+ z_loss = None
383
+ gen_loss = None
384
+ if labels is not None:
385
+ gen_logits = gen_logits.float()
386
+ shift_gen_logits = gen_logits[:, :-1, :].contiguous().float()
387
+ shift_labels = labels[:, 1:].contiguous()
388
+ gen_loss_fct = CrossEntropyLoss(reduction="none")
389
+ gen_logits_flat = shift_gen_logits.view(-1, shift_gen_logits.shape[-1])
390
+ gen_labels_flat = shift_labels.view(-1)
391
+
392
+ gen_loss_all = gen_loss_fct(gen_logits_flat, gen_labels_flat)
393
+ gen_loss = gen_loss_all.mean()
394
+
395
+ loss = gen_loss
396
+
397
+ if self.config.z_loss_weight > 0:
398
+ valid_mask = shift_labels >= 0
399
+ # top-k approx z_loss for better memory usage
400
+ top_logits, _ = torch.topk(
401
+ shift_gen_logits, k=self.config.z_loss_top_k, dim=-1
402
+ )
403
+ lse = torch.logsumexp(top_logits, dim=-1)
404
+ z_loss = lse[valid_mask].pow(2).mean() * self.config.z_loss_weight
405
+
406
+ # z_loss = (
407
+ # torch.logsumexp(shift_logits, dim=-1).pow(2)[valid_mask].mean()
408
+ # * self.config.z_loss_weight
409
+ # )
410
+ loss = loss + z_loss
411
+ z_loss = z_loss.detach()
412
+
413
+ return V1CausalLMOutputWithPast(
414
+ loss=loss,
415
+ z_loss=z_loss,
416
+ gen_loss=gen_loss,
417
+ copy_loss=None,
418
+ logits=logits,
419
+ # copy_logits=copy_logits,
420
+ # gen_logits=gen_logits,
421
+ past_key_values=outputs.past_key_values,
422
+ hidden_states=outputs.hidden_states,
423
+ attentions=outputs.attentions,
424
+ rope_deltas=self.rope_deltas,
425
+ )
426
+
427
+ loss = None
428
+ z_loss = None
429
+ gen_loss = None
430
+ copy_loss = None
431
+ if labels is not None:
432
+ if self.config.separate_copy_loss:
433
+ # Shift labels and logits for next-token prediction
434
+ shift_gen_logits = gen_logits[:, :-1, :].contiguous().float()
435
+ shift_copy_logits = copy_logits[:, :-1, :].contiguous().float()
436
+ shift_labels = labels[:, 1:].contiguous()
437
+ shift_logits = shift_copy_logits
438
+
439
+ # Build masks
440
+ gen_mask = shift_labels < self.config.copy_token_start
441
+ copy_mask = shift_labels >= self.config.copy_token_start
442
+
443
+ # Generation loss
444
+ if gen_mask.any():
445
+ gen_loss_fct = CrossEntropyLoss(reduction="none")
446
+
447
+ G = shift_gen_logits.shape[-1]
448
+ gen_logits_flat = shift_gen_logits.view(-1, G)
449
+ gen_labels_flat = shift_labels.view(-1)
450
+ gen_mask_flat = gen_mask.view(-1)
451
+ # mask logits
452
+ gen_logits_flat_masked = gen_logits_flat[gen_mask_flat]
453
+ gen_labels_flat_masked = gen_labels_flat[gen_mask_flat]
454
+
455
+ gen_loss_all = gen_loss_fct(
456
+ gen_logits_flat_masked, gen_labels_flat_masked
457
+ )
458
+ gen_loss = gen_loss_all.mean()
459
+
460
+ # Copy loss (adjust label indices to match copy_logits range)
461
+ if copy_mask.any():
462
+ copy_loss_fct = CrossEntropyLoss(reduction="none")
463
+ C = shift_copy_logits.shape[-1]
464
+ copy_logits_flat = shift_copy_logits.view(-1, C)
465
+ copy_labels_flat = (
466
+ shift_labels.view(-1) - self.config.copy_token_start
467
+ )
468
+ copy_mask_flat = copy_mask.view(-1)
469
+ copy_logits_flat_masked = copy_logits_flat[copy_mask_flat]
470
+ copy_labels_flat_masked = copy_labels_flat[copy_mask_flat]
471
+ copy_loss_all = copy_loss_fct(
472
+ copy_logits_flat_masked, copy_labels_flat_masked
473
+ )
474
+ copy_loss = copy_loss_all.mean()
475
+ else:
476
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
477
+ logits = logits.float()
478
+ # Shift so that tokens < n predict n
479
+ shift_logits = logits[..., :-1, :].contiguous()
480
+ shift_labels = labels[..., 1:].contiguous()
481
+ # Flatten the tokens
482
+ loss_fct = CrossEntropyLoss(label_smoothing=self.config.label_smoothing)
483
+ total_vocab_size = logits.shape[-1] # gen + copy
484
+ shift_logits = shift_logits.view(-1, total_vocab_size)
485
+ shift_labels = shift_labels.view(-1)
486
+ # Enable model parallelism
487
+ shift_labels = shift_labels.to(shift_logits.device)
488
+ gen_loss = loss_fct(shift_logits, shift_labels)
489
+
490
+ loss = 0.0
491
+ if gen_loss is not None:
492
+ loss += gen_loss
493
+ if copy_loss is not None:
494
+ loss += copy_loss
495
+
496
+ if self.config.z_loss_weight > 0:
497
+ valid_mask = shift_labels >= 0
498
+ # top-k approx z_loss for better memory usage
499
+ top_logits, _ = torch.topk(
500
+ shift_logits, k=self.config.z_loss_top_k, dim=-1
501
+ )
502
+ lse = torch.logsumexp(top_logits, dim=-1)
503
+ z_loss = lse[valid_mask].pow(2).mean() * self.config.z_loss_weight
504
+
505
+ # z_loss = (
506
+ # torch.logsumexp(shift_logits, dim=-1).pow(2)[valid_mask].mean()
507
+ # * self.config.z_loss_weight
508
+ # )
509
+ loss = loss + z_loss
510
+ z_loss = z_loss.detach()
511
+
512
+ if gen_loss is not None:
513
+ gen_loss = gen_loss.detach()
514
+ if copy_loss is not None:
515
+ copy_loss = copy_loss.detach()
516
+
517
+ if self.config.use_cfg:
518
+ # expand as max_size for logit processors
519
+ extended_vocab_size = self.config.vocab_size + self.config.copy_token_num
520
+ B, L, V = logits.shape
521
+ pads = torch.full(
522
+ (B, L, extended_vocab_size - V),
523
+ torch.finfo(gen_logits.dtype).min,
524
+ device=logits.device,
525
+ ).to(logits.dtype)
526
+ logits = torch.cat([logits, pads], dim=-1)
527
+ # logits = logits.clamp_min(-1e4)
528
+
529
+ if not return_dict:
530
+ output = (logits,) + outputs[1:]
531
+ return (loss,) + output if loss is not None else output
532
+
533
+ logits = logits.float()
534
+ return V1CausalLMOutputWithPast(
535
+ loss=loss,
536
+ z_loss=z_loss,
537
+ gen_loss=gen_loss,
538
+ copy_loss=copy_loss,
539
+ logits=logits,
540
+ # copy_logits=copy_logits,
541
+ # gen_logits=gen_logits,
542
+ past_key_values=present_key_values,
543
+ hidden_states=outputs.hidden_states,
544
+ attentions=outputs.attentions,
545
+ rope_deltas=self.rope_deltas,
546
+ )
547
+
548
+
549
+ def extract_image_tokens_right_pad(features: torch.FloatTensor, mask: torch.Tensor):
550
+ X, M = features, mask.long() # bool is not supported for sort in CUDA
551
+ B, L, _ = X.shape
552
+ device = X.device
553
+ M = M.to(device)
554
+
555
+ # Compute number of valid elements per batch
556
+ valid_counts = M.sum(dim=1) # Shape: [B]
557
+ # Replace `.item()` with `max()` and `clamp_min()` for Torch Dynamo compatibility
558
+ R = valid_counts.max().clamp_min(1) # Ensures at least 1 for tensor compatibility
559
+ # Create index tensors for selection
560
+ sorted_indices = M.argsort(dim=1, descending=True) # Move True values to front
561
+ batch_indices = torch.arange(B, device=device).unsqueeze(1).expand(B, L)
562
+
563
+ # Gather sorted X based on mask sorting
564
+ X_sorted = X[batch_indices, sorted_indices] # Shape: [B, L, C]
565
+ X_selected = X_sorted[:, :R, :] # Select the top valid elements per batch
566
+
567
+ # Create new mask M2 using `torch.arange`
568
+ M2 = torch.arange(L, device=device).expand(B, L) < valid_counts.unsqueeze(1)
569
+ M2 = M2[:, :R] # Trim to selected size
570
+
571
+ # Set out-of-bound values to zero
572
+ X_selected = torch.where(M2.unsqueeze(-1), X_selected, torch.zeros_like(X_selected))
573
+
574
+ return X_selected, M2
575
+
576
+
577
+ __all__ = ["V1ForConditionalGeneration"]
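
Aside (illustration only, not part of this commit): a minimal sketch of how `extract_image_tokens_right_pad` above behaves on a toy batch — valid image-token features are packed to the front and right-padded to the longest valid count in the batch. The plain `modeling_v1` import assumes the snippet runs from the repository root; the returned shapes and mask are deterministic, while the relative order of equally-masked entries after `argsort` is not guaranteed.

```python
import torch

from modeling_v1 import extract_image_tokens_right_pad  # assumes the repo root is on sys.path

B, L, C = 2, 5, 3
features = torch.arange(B * L * C, dtype=torch.float32).reshape(B, L, C)
mask = torch.tensor(
    [
        [True, False, True, False, False],  # 2 valid image tokens
        [False, True, True, True, False],   # 3 valid image tokens
    ]
)

packed, packed_mask = extract_image_tokens_right_pad(features, mask)
print(packed.shape)   # torch.Size([2, 3, 3]) -- padded to the max valid count (3)
print(packed_mask)    # tensor([[ True,  True, False],
                      #         [ True,  True,  True]])
```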
preprocessor_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "merge_size": 2,
19
  "min_pixels": 3136,
20
  "patch_size": 14,
21
- "processor_class": "Qwen2_5_VLPointerProcessor",
22
  "resample": 3,
23
  "rescale_factor": 0.00392156862745098,
24
  "size": {
 
18
  "merge_size": 2,
19
  "min_pixels": 3136,
20
  "patch_size": 14,
21
+ "processor_class": "V1Processor",
22
  "resample": 3,
23
  "rescale_factor": 0.00392156862745098,
24
  "size": {
processor.py ADDED
@@ -0,0 +1,536 @@
1
+ from typing import Tuple, List, Optional, Union
2
+ import re
3
+ import math
4
+ from collections import defaultdict
5
+
6
+ from PIL import Image
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from qwen_vl_utils import process_vision_info
11
+ from transformers.feature_extraction_utils import BatchFeature
12
+ from transformers.image_utils import ImageInput, VideoInput
13
+ from transformers.processing_utils import (
14
+ Unpack,
15
+ )
16
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
17
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
18
+ smart_resize,
19
+ Qwen2VLImageProcessor,
20
+ )
21
+ from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import (
22
+ Qwen2_5_VLProcessorKwargs,
23
+ Qwen2_5_VLProcessor,
24
+ )
25
+
26
+ """
27
+ Qwen2.5-VL does not use AnyRes to my relief.
28
+ Things to take into account:
29
+ - smart_resize
30
+ - temporal dimension
31
+ - grid_t = patches.shape[0] // self.temporal_patch_size
32
+ - grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
33
+ - merge_size (2)
34
+
35
+
36
+ Usage:
37
+
38
+ model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
39
+
40
+
41
+ processor = V1Processor.from_pretrained(model_name)
42
+ processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(model_name)
43
+
44
+ messages = [
45
+ {
46
+ "role": "user",
47
+ "content": [
48
+ {
49
+ "type": "image",
50
+ "image": "https://example---/demo.jpeg",
51
+ },
52
+ {"type": "text", "text": "Describe this image."},
53
+ ],
54
+ },
55
+ {
56
+ 'role': 'assistant',
57
+ 'content': [
58
+ {
59
+ 'type': 'text', 'text': "<think>There's a cat at <|region|>, a dog at <|region|>.</think>A calico cat hanging out with a golden retriever."
60
+ }
61
+ ]
62
+ }
63
+ ]
64
+
65
+ # Preparation for inference
66
+ text = processor.apply_chat_template(
67
+ messages, tokenize=False, add_generation_prompt=True
68
+ )
69
+ regions = [
70
+ [0, 10, 100, 200],
71
+ [300, 0, 600, 250]
72
+ ]
73
+ image_inputs, video_inputs = process_vision_info(messages)
74
+ inputs = processor(
75
+ text=[text],
76
+ images=image_inputs,
77
+ videos=video_inputs,
78
+ regions=[regions],
79
+ padding=True,
80
+ return_tensors="pt",
81
+ )
82
+ inputs = inputs.to("cuda")
83
+
84
+
85
+ # Qwen2VLImageProcessor in a nutshell
86
+ '(tl tp) c (hlm hm hp) (wlm wm wp) -> (tl hlm wlm hm wm) (c tp hp wp)'
87
+ """
88
+
89
+
90
+ BBOX = Tuple[int, int, int, int]
91
+
92
+
93
+ class PointerProcessor:
94
+ @staticmethod
95
+ def normalize_bbox(image_size: Tuple[int, int], bbox: BBOX):
96
+ w, h = image_size
97
+ bbox = [
98
+ bbox[0] / w,
99
+ bbox[1] / h,
100
+ bbox[2] / w,
101
+ bbox[3] / h,
102
+ ]
103
+ return "[{}]".format(", ".join([f"{v:.2f}" for v in bbox]))
104
+
105
+ def get_mask(self, image_size: Tuple[int, int], indices: List[int]):
106
+ width, height = image_size
107
+ resized_height, resized_width = smart_resize(
108
+ height,
109
+ width,
110
+ factor=self.patch_size * self.merge_size,
111
+ min_pixels=self.min_pixels,
112
+ max_pixels=self.max_pixels,
113
+ )
114
+
115
+ # grid_h = resized_height // self.patch_size // self.merge_size
116
+ grid_w_m = resized_width // self.patch_size // self.merge_size
117
+
118
+ mask = torch.zeros(resized_height, resized_width)
119
+ for index in indices:
120
+ index_h = index // grid_w_m
121
+ index_w = index % grid_w_m
122
+ bbox = (
123
+ max(index_w * self.patch_size * self.merge_size, 0),
124
+ max(index_h * self.patch_size * self.merge_size, 0),
125
+ min((index_w + 1) * self.patch_size * self.merge_size, resized_width),
126
+ min((index_h + 1) * self.patch_size * self.merge_size, resized_height),
127
+ )
128
+ x1, y1, x2, y2 = bbox
129
+ mask[y1:y2, x1:x2] = 1
130
+ # mask = mask.t() # to width, height
131
+ return mask, (resized_width, resized_height)
132
+
133
+ def get_patch_pointers(
134
+ self, image_size: Tuple[int, int], region: Union[BBOX, np.ndarray]
135
+ ):
136
+ if isinstance(region, np.ndarray):
137
+ return self.get_mask_patch_pointers(image_size, region)
138
+ else:
139
+ return self.get_bbox_patch_pointers(image_size, region)
140
+
141
+ def get_bbox_patch_pointers(self, image_size: Tuple[int, int], bbox: BBOX):
142
+ factor = self.merge_size
143
+ # factor = 1
144
+ width, height = image_size
145
+ resized_height, resized_width = smart_resize(
146
+ height,
147
+ width,
148
+ factor=self.patch_size * self.merge_size,
149
+ min_pixels=self.min_pixels,
150
+ max_pixels=self.max_pixels,
151
+ )
152
+ x0, y0, x1, y1 = bbox
153
+ resized_bbox = [
154
+ max(x0 / width * resized_width, 0),
155
+ max(y0 / height * resized_height, 0),
156
+ min(x1 / width * resized_width, resized_width),
157
+ min(y1 / height * resized_height, resized_height),
158
+ ]
159
+ # patch_bbox = [v / self.patch_size / self.merge_size for v in resized_bbox]
160
+ patch_bbox = [v / self.patch_size / factor for v in resized_bbox]
161
+ x0, y0, x1, y1 = patch_bbox
162
+ boundaries = [
163
+ math.floor(x0),
164
+ math.floor(y0),
165
+ math.ceil(x1),
166
+ math.ceil(y1),
167
+ ]
168
+ x0, y0, x1, y1 = boundaries
169
+
170
+ # t, h, w
171
+ grid_w = resized_width // self.patch_size
172
+ grid_w_m = grid_w // factor
173
+ rows, cols = np.meshgrid(np.arange(y0, y1), np.arange(x0, x1), indexing="ij")
174
+ grid_indices = np.column_stack((rows.ravel(), cols.ravel()))
175
+ indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
176
+ base_ids = list(indices)
177
+ ids = np.array(base_ids)
178
+ return ids
179
+
180
+ def get_mask_patch_pointers(self, image_size: Tuple[int, int], mask: np.ndarray):
181
+ # mask size: w h
182
+ width, height = image_size
183
+ resized_height, resized_width = smart_resize(
184
+ height,
185
+ width,
186
+ factor=self.patch_size * self.merge_size,
187
+ min_pixels=self.min_pixels,
188
+ max_pixels=self.max_pixels,
189
+ )
190
+ grid_w_m = resized_width // self.patch_size // self.merge_size
191
+ grid_h_m = resized_height // self.patch_size // self.merge_size
192
+
193
+ m = torch.from_numpy(mask).float()
194
+ m = F.interpolate(
195
+ m[None, None], (grid_h_m, grid_w_m), mode="bilinear", antialias=True
196
+ )[0, 0]
197
+
198
+ grid_indices = m.nonzero(as_tuple=False)
199
+ indices = grid_indices[:, 0] * grid_w_m + grid_indices[:, 1]
200
+ ids = indices.numpy()
201
+ return ids
202
+
203
+ def renormalize(self, tensor):
204
+ # crude approximation: averages the per-channel mean/std into scalars
205
+ mean = np.array(self.image_mean).mean()
206
+ std = np.array(self.image_std).mean()
207
+ return tensor * std + mean
208
+
209
+ class Qwen2VLImagePointerProcessor(Qwen2VLImageProcessor, PointerProcessor):
210
+ pass
211
+
212
+
213
+ class V1Processor(Qwen2_5_VLProcessor):
214
+ image_processor_class = "Qwen2VLImagePointerProcessor"
215
+
216
+ def __init__(
217
+ self,
218
+ image_processor=None,
219
+ tokenizer=None,
220
+ chat_template=None,
221
+ prepend_raw_region_to_text: bool = True,
222
+ separate_copy_loss: bool = False,
223
+ **kwargs,
224
+ ):
225
+ super().__init__(
226
+ image_processor=image_processor,
227
+ tokenizer=tokenizer,
228
+ chat_template=chat_template,
229
+ **kwargs,
230
+ )
231
+
232
+ self.region_token = "<|region|>"
233
+ self.copy_token_start = None
234
+ self.prepend_raw_region_to_text = prepend_raw_region_to_text
235
+ self.separate_copy_loss = separate_copy_loss
236
+ self.copy_start_token = "<|box_start|>"
237
+ self.copy_end_token = "<|box_end|>"
238
+
239
+ # def extract_masks(self, image_size: Tuple[int, int], text: str):
240
+ # # first, gather region indices from text
241
+ # region_pattern = re.compile(r"<region>(.*?)</region>")
242
+ # regions = region_pattern.findall(text)
243
+
244
+ # indices = []
245
+ # copy_pattern = re.compile(r"<\|copy_(\d+)\|>")
246
+
247
+ # for region in regions:
248
+ # # Extract all numbers inside <|copy_X|> tags within the region
249
+ # numbers = [int(match) for match in copy_pattern.findall(region)]
250
+ # indices.append(numbers)
251
+
252
+ # # Then, convert region indices into masks
253
+ # masks = []
254
+ # resized_image_size = image_size
255
+ # for region in indices:
256
+ # mask, resized_image_size = self.image_processor.get_mask(
257
+ # image_size, region
258
+ # )
259
+ # masks.append(mask)
260
+ # return masks, resized_image_size
261
+ #
262
+ def extract_masks(self, image_size: Tuple[int, int], text: str):
263
+ # Match full detect(...) blocks and extract their content
264
+ # detect_pattern = r"detect\([^)]+objects\s*=\s*\[(.*?)\]\)"
265
+ detect_pattern = r'detect\(\s*query\s*=\s*"([^"]+)"\s*,\s*objects\s*=\s*\["((?:[^"\\]|\\.)*)"\]\s*\)'
266
+ obj_region_pattern = r"<obj(\d+)><region>\[.*?\](.*?)</region>"
267
+ copy_pattern = r"<\|copy_(\d+)\|>"
268
+
269
+ # results = defaultdict(list)
270
+ results = {}
271
+
272
+ for detect_match in re.finditer(detect_pattern, text, re.DOTALL):
273
+ query_str = detect_match.group(1)
274
+ objects_content = detect_match.group(2)
275
+
276
+ for obj_match in re.finditer(
277
+ obj_region_pattern, objects_content, re.DOTALL
278
+ ):
279
+ obj_index = int(obj_match.group(1))
280
+ region_content = obj_match.group(2)
281
+ copy_ids = [int(m) for m in re.findall(copy_pattern, region_content)]
282
+ obj_key = f"<obj{obj_index}>"
283
+ results[obj_key] = (query_str, copy_ids)
284
+
285
+ results = dict(results)
286
+
287
+ masks = {}
288
+ resized_image_size = image_size
289
+ for k, (desc, region) in results.items():
290
+ mask, resized_image_size = self.image_processor.get_mask(image_size, region)
291
+ masks[k] = (desc, mask)
292
+ return masks, resized_image_size
293
+
294
+ def __call__(
295
+ self,
296
+ images: ImageInput = None,
297
+ text: Union[
298
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
299
+ ] = None,
300
+ videos: VideoInput = None,
301
+ regions: Optional[List[dict[str, Union[BBOX, np.ndarray]]]] = None,
302
+ **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
303
+ ) -> BatchFeature:
304
+ """
305
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
306
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
307
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
308
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
309
+
310
+ Args:
311
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
312
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
313
+ tensor. Both channels-first and channels-last formats are supported.
314
+ text (`str`, `List[str]`, `List[List[str]]`):
315
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
316
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
317
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
318
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
319
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
320
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
321
+ regions:
322
+ either bboxes: List[dict[str, Tuple[int, int, int, int]]]
323
+ or masks: List[dict[str, np.ndarray[width, height]]]
324
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
325
+ If set, will return tensors of a particular framework. Acceptable values are:
326
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
327
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
328
+ - `'np'`: Return NumPy `np.ndarray` objects.
329
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
330
+
331
+ Returns:
332
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
333
+
334
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
335
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
336
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
337
+ `None`).
338
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
339
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
340
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
341
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
342
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
343
+ """
344
+
345
+ output_kwargs = self._merge_kwargs(
346
+ Qwen2_5_VLProcessorKwargs,
347
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
348
+ **kwargs,
349
+ )
350
+ obj_ptrs = None
351
+ if images is not None:
352
+ image_inputs = self.image_processor(
353
+ images=images, videos=None, **output_kwargs["images_kwargs"]
354
+ )
355
+ image_grid_thw = image_inputs["image_grid_thw"]
356
+
357
+ for image in images:
358
+ assert isinstance(
359
+ image, Image.Image
360
+ ), "only supporting a single image per row for now"
361
+
362
+ if regions is not None:
363
+ obj_ptrs = [
364
+ {
365
+ name: (
366
+ self.image_processor.get_patch_pointers(image.size, region)
367
+ if region is not None
368
+ else np.array([])
369
+ )
370
+ for name, region in image_region.items()
371
+ }
372
+ for image, image_region in zip(images, regions)
373
+ ]
374
+ else:
375
+ image_inputs = {}
376
+ image_grid_thw = None
377
+
378
+ assert videos is None, "video inputs are not supported yet" # TODO
379
+ if videos is not None:
380
+ videos_inputs = self.image_processor(
381
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
382
+ )
383
+ video_grid_thw = videos_inputs["video_grid_thw"]
384
+
385
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
386
+ if isinstance(fps, (int, float)):
387
+ second_per_grid_ts = [
388
+ self.image_processor.temporal_patch_size / fps
389
+ ] * len(video_grid_thw)
390
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
391
+ second_per_grid_ts = [
392
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
393
+ ]
394
+ else:
395
+ raise ValueError(
396
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
397
+ )
398
+ videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
399
+
400
+ else:
401
+ videos_inputs = {}
402
+ video_grid_thw = None
403
+
404
+ if not isinstance(text, list):
405
+ text = [text]
406
+
407
+ if image_grid_thw is not None:
408
+ merge_length = self.image_processor.merge_size**2
409
+ index = 0
410
+ for i in range(len(text)):
411
+ while self.image_token in text[i]:
412
+ text[i] = text[i].replace(
413
+ self.image_token,
414
+ "<|placeholder|>"
415
+ * (image_grid_thw[index].prod() // merge_length),
416
+ 1,
417
+ )
418
+ index += 1
419
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
420
+
421
+ if obj_ptrs is not None:
422
+ assert regions is not None
423
+ for i in range(len(text)):
424
+ image_ptrs = obj_ptrs[i]
425
+ image_region = regions[i]
426
+
427
+ for name, region in image_region.items():
428
+ region_ptr = image_ptrs[name]
429
+
430
+ assert name in text[i], f"object {name} not found in: {text[i]}"
431
+
432
+ ptrs_str = "".join([f"<|copy_{j}|>" for j in region_ptr])
433
+ region_str = self.image_processor.normalize_bbox(
434
+ images[i].size, region
435
+ )
436
+ if self.separate_copy_loss:
437
+ ptrs_str = (
438
+ self.copy_start_token + ptrs_str + self.copy_end_token
439
+ )
440
+ out_str = "<region>" + ptrs_str + "</region>"
441
+ if self.prepend_raw_region_to_text:
442
+ out_str = "<region>" + region_str + ptrs_str + "</region>"
443
+
444
+ text[i] = text[i].replace(name, out_str)
445
+
446
+ for name in image_region.keys():
447
+ assert name not in text[i]
448
+
449
+ if video_grid_thw is not None:
450
+ # TODO: support video inputs
451
+ raise NotImplementedError("video inputs are not yet supported")
452
+ merge_length = self.image_processor.merge_size**2
453
+ index = 0
454
+ for i in range(len(text)):
455
+ while self.video_token in text[i]:
456
+ text[i] = text[i].replace(
457
+ self.video_token,
458
+ "<patch>"
459
+ + "<|placeholder|>"
460
+ * (video_grid_thw[index].prod() // merge_length)
461
+ + "</patch>",
462
+ 1,
463
+ )
464
+ index += 1
465
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
466
+
467
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
468
+
469
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
470
+
471
+
472
+ def get_processor(model_name: str, **kwargs):
474
+ processor = V1Processor.from_pretrained(model_name, **kwargs)
475
+ processor.image_processor = Qwen2VLImagePointerProcessor.from_pretrained(
476
+ model_name, **kwargs
477
+ )
478
+ # max_position_tokens = processor.tokenizer.model_max_length
479
+ # new_tokens = [f"<|copy_{i}|>" for i in range(max_position_tokens)] # too slow
480
+ processor.tokenizer.orig_vocab_size = len(processor.tokenizer)
481
+ new_tokens = [f"<|copy_{i}|>" for i in range(30000)]
482
+ processor.tokenizer.add_tokens(new_tokens)
483
+ processor.copy_token_start = processor.tokenizer.convert_tokens_to_ids("<|copy_0|>")
484
+ return processor
485
+
486
+
487
+ # Create a data collator to encode text and image pairs
488
+ def collate_fn(examples, processor):
489
+ convs = [row["conversation"] for row in examples]
490
+ regions = [row["region"] for row in examples]
491
+ image_sizes = [row["image_size"] for row in examples]
492
+
493
+ texts = [
494
+ processor.apply_chat_template(conv, tokenize=False, add_generation_prompt=False)
495
+ for conv in convs
496
+ ] # Prepare texts for processing
497
+ image_inputs = [
498
+ process_vision_info(conv)[0][0] for conv in convs
499
+ ] # Process the images to extract inputs
500
+ image_inputs = [
501
+ image.resize(image_size) for image, image_size in zip(image_inputs, image_sizes)
502
+ ]
503
+
504
+ # Tokenize the texts and process the images
505
+ batch = processor(
506
+ text=texts,
507
+ images=image_inputs,
508
+ videos=None,
509
+ regions=regions,
510
+ padding=True,
511
+ return_tensors="pt",
512
+ ) # Encode texts and images into tensors
513
+
514
+ # The labels are the input_ids, and we mask the padding tokens in the loss computation
515
+ labels = batch["input_ids"].clone() # Clone input IDs for labels
516
+ labels[labels == processor.tokenizer.pad_token_id] = (
517
+ -100
518
+ ) # Mask padding tokens in labels
519
+
520
+ # Ignore the image token index in the loss computation (model specific)
521
+ image_tokens = [
522
+ 151652,
523
+ 151653,
524
+ 151655,
525
+ ] # Specific image token IDs for Qwen2VLProcessor
526
+
527
+ # Mask image token IDs in the labels
528
+ for image_token_id in image_tokens:
529
+ labels[labels == image_token_id] = -100 # Mask image token IDs in labels
530
+
531
+ batch["labels"] = labels # Add labels to the batch
532
+
533
+ return batch # Return the prepared batch
534
+
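
Aside (illustration only, not repository code): the sketch below mirrors the bbox-to-pointer mapping in `PointerProcessor.get_bbox_patch_pointers` and the `<region>…</region>` string that `V1Processor.__call__` splices into the text. It assumes the image is already at its `smart_resize` resolution (dimensions are multiples of `patch_size * merge_size = 28`), so the rescaling step is omitted, as is the optional normalized-bbox prefix controlled by `prepend_raw_region_to_text`.

```python
import math

import numpy as np

patch_size, merge_size = 14, 2                # values from preprocessor_config.json
width, height = 560, 280                      # already multiples of 28
grid_w_m = width // patch_size // merge_size  # 20 merged-patch columns

x0, y0, x1, y1 = 30, 40, 90, 100              # bbox in pixel coordinates
cell = patch_size * merge_size                # each <|copy_i|> covers a 28x28 cell
cx0, cy0 = math.floor(x0 / cell), math.floor(y0 / cell)  # -> 1, 1
cx1, cy1 = math.ceil(x1 / cell), math.ceil(y1 / cell)    # -> 4, 4

rows, cols = np.meshgrid(np.arange(cy0, cy1), np.arange(cx0, cx1), indexing="ij")
ids = rows.ravel() * grid_w_m + cols.ravel()
print(ids)  # [21 22 23 41 42 43 61 62 63]

# V1Processor.__call__ would then replace the <objN> placeholder with roughly:
region_str = "<region>" + "".join(f"<|copy_{i}|>" for i in ids) + "</region>"
```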
processor_config.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
  "prepend_raw_region_to_text": true,
3
- "processor_class": "Qwen2_5_VLPointerProcessor"
4
  }
 
1
  {
2
  "prepend_raw_region_to_text": true,
3
+ "processor_class": "V1Processor"
4
  }
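
Aside (usage sketch, not part of the diff): with `processor_class` now set to `V1Processor` and the custom classes shipped in-repo, loading is expected to go through the `Auto*` entry points with `trust_remote_code=True`. The repository id below is a placeholder, and the exact auto class depends on how the checkpoint registers itself; adjust accordingly.

```python
from transformers import AutoModelForCausalLM, AutoProcessor

repo_id = "path/or/hub-id-of-this-checkpoint"  # placeholder

processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```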