Upload 7 files (#2)

Browse files

- Upload 7 files (4842d25c2522c3297af5b90c6e5807a972082c84)

Files changed (7) hide show

config.json +2 -1
configuration_ernie_45t_vl.py +13 -11
generation_config.json +1 -0
modeling_ernie_45t_vl.py +118 -39
processing_ernie_45t_vl.py +7 -298
tokenization_ernie_45t_vl.py +323 -0
tokenizer_config.json +2 -2

config.json CHANGED Viewed

@@ -37,11 +37,12 @@
   "freq_allocation": 20,
   "spatial_conv_size": 2,
   "temporal_conv_size": 2,
-  "moe_use_aux_free": true,
   "moe_num_experts": [64, 64],
   "moe_intermediate_size": [3584, 1536],
   "torch_dtype": "bfloat16",
   "tie_word_embeddings": false,
   "vision_config": {
     "attn_implementation": "eager",
     "depth": 32,

   "freq_allocation": 20,
   "spatial_conv_size": 2,
   "temporal_conv_size": 2,
+  "moe_use_aux_free": false,
   "moe_num_experts": [64, 64],
   "moe_intermediate_size": [3584, 1536],
   "torch_dtype": "bfloat16",
   "tie_word_embeddings": false,
+  "moe_multimodal_dispatch_use_allgather": "v2-alltoall-unpad-text",
   "vision_config": {
     "attn_implementation": "eager",
     "depth": 32,

configuration_ernie_45t_vl.py CHANGED Viewed

@@ -539,17 +539,19 @@ class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
         "activation_function": "hidden_act",
     }
     base_model_tp_plan = {
-        "ernie.layers.*.self_attn.qkv_proj": "colwise",
-        "ernie.layers.*.self_attn.o_proj": "rowwise",
-        "ernie.layers.*.mlp_text.experts.*.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp_text.experts.*.down_proj": "rowwise",
-        "ernie.layers.*.mlp_text.gate": "colwise_rep",
-        "ernie.layers.*.mlp.experts.*.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp.experts.*.down_proj": "rowwise",
-        "ernie.layers.*.mlp.gate": "colwise_rep",
-        "ernie.layers.*.mlp.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp.down_proj": "rowwise",
-        "lm_head": "colwise_rep",
     }
     def __init__(

         "activation_function": "hidden_act",
     }
     base_model_tp_plan = {
+        "model.layers.*.self_attn.q_proj": "colwise_rep",
+        "model.layers.*.self_attn.k_proj": "colwise_rep",
+        "model.layers.*.self_attn.v_proj": "colwise_rep",
+        "model.layers.*.self_attn.o_proj": "rowwise_rep",
+        "model.layers.*.mlp.experts.*.gate_proj": "colwise",
+        "model.layers.*.mlp.experts.*.up_proj": "colwise",
+        "model.layers.*.mlp.experts.*.down_proj": "rowwise",
+        "model.layers.*.mlp_text.experts.*.gate_proj": "colwise",
+        "model.layers.*.mlp_text.experts.*.up_proj": "colwise",
+        "model.layers.*.mlp_text.experts.*.down_proj": "rowwise",
+        "model.layers.*.mlp.gate_proj": "colwise",
+        "model.layers.*.mlp.up_proj": "colwise",
+        "model.layers.*.mlp.down_proj": "rowwise"
     }
     def __init__(

generation_config.json CHANGED Viewed

@@ -1,6 +1,7 @@
 {
     "top_p": 0.8,
     "temperature": 0.2,
     "pad_token_id": 0,
     "bos_token_id": 1,
     "eos_token_id": 2,

 {
     "top_p": 0.8,
     "temperature": 0.2,
+    "do_sample": true,
     "pad_token_id": 0,
     "bos_token_id": 1,
     "eos_token_id": 2,

modeling_ernie_45t_vl.py CHANGED Viewed

@@ -27,6 +27,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from transformers.activations import ACT2FN
 from transformers.generation import GenerationMixin
@@ -321,6 +322,7 @@ class Ernie4_5_Attention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.num_key_value_heads = config.num_key_value_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.is_gqa = (
             self.num_key_value_heads is not None
@@ -373,7 +375,10 @@ class Ernie4_5_Attention(nn.Module):
             freq_allocation=self.freq_allocation,
         )
         self.config = config
-        self.attn_func = self.core_attn
     def forward(
         self,
@@ -446,6 +451,47 @@ class Ernie4_5_Attention(nn.Module):
         )
         return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
     def core_attn(
         self,
         q,
@@ -493,19 +539,13 @@ class Ernie4_5_Attention(nn.Module):
         if getattr(self.config, "scale_qk_coeff", 1.0) != 1.0:
             product = product * getattr(self.config, "scale_qk_coeff", 1.0)
-        if attention_mask is not None:
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-            attention_mask = attention_mask.to(torch.float32)
-            product = product + attention_mask
-            weights = F.softmax(product, dim=-1)
-        else:
-            seq_len = product.size(-1)
-            mask = torch.triu(
-                torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
-                diagonal=1,
-            )
-            product = product.masked_fill(mask, float("-inf"))
-            weights = F.softmax(product, dim=-1)
         weights = weights.to(origin_dtype)
@@ -1508,16 +1548,8 @@ class MOELayer(nn.Module):
             )
             assert self.gate.config.moe_use_aux_free
-        try:
-            self.world_size = torch.distributed.get_world_size()
-            self.rank = torch.distributed.get_rank()
-        except:
-            self.world_size = 1
-            self.rank = 0
-        if self.world_size < 1:
-            self.world_size = 1
-        if self.rank < 0:
-            self.rank = 0
         self.multimodal_experts = (
             isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1
@@ -1803,7 +1835,7 @@ class MOEAllGatherLayerV2(MOELayer):
         enable_reverse_token_drop=False,
         all_to_all_dropout=0,
         group_experts=False,
-        use_expert_out_alltoall=True,  #
         use_expert_alltoall_overlap=False,
         use_padding=True,
         dense_token_type=3,  # considered as dense tokens (no moe)
@@ -2729,7 +2761,6 @@ class Ernie4_5_PretrainedModel(PreTrainedModel):
     config_class = Ernie4_5_MoEConfig
     base_model_prefix = "ernie"
     _no_split_modules = ["Ernie4_5_DecoderLayer"]
-    # _keep_in_fp32_modules = ["mlp.gate", "e_score_correction_bias"]
 class Ernie4_5_Model(Ernie4_5_PretrainedModel):
@@ -2876,7 +2907,6 @@ class Ernie4_5_Model(Ernie4_5_PretrainedModel):
             past_key_value = (
                 past_key_values[idx] if past_key_values is not None else None
             )
             layer_outputs = decoder_layer(
                 hidden_states,
                 attention_mask,
@@ -3224,15 +3254,61 @@ class Ernie4_5_MoeForCausalLM(Ernie4_5_PretrainedModel, GenerationMixin):
         """
         return self.model
-    def prepare_attention_mask_for_generation(
-        self, input_ids, pad_token_id, eos_token_id
-    ):
-        """Avoid using attention_mask with flash_attn on generation."""
-        if self.config.use_flash_attention:
-            return None
-        return super().prepare_attention_mask_for_generation(
-            input_ids, pad_token_id, eos_token_id
-        )
 class VisionMlp(nn.Module):
@@ -3943,7 +4019,10 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
                 image_type_ids[:, -1:] if image_type_ids is not None else None
             )
-        attention_mask = kwargs.get("attention_mask", None)
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
@@ -4077,7 +4156,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
         if images is not None and image_features is not None:
             inputs_embeds = self.vision_mapping_forward(
-                token_type_ids,
                 token_type_ids_w_video,
                 input_ids,
                 mm_input_ids,
@@ -4091,7 +4170,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
         outputs = self.model(
             position_ids=position_ids,
-            attention_mask=None,
             token_type_ids=token_type_ids,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
 from transformers.activations import ACT2FN
 from transformers.generation import GenerationMixin
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.is_gqa = (
             self.num_key_value_heads is not None
             freq_allocation=self.freq_allocation,
         )
         self.config = config
+        if self.config.use_flash_attention:
+            self.attn_func = self._flash_attention_wrapper
+        else:
+            self.attn_func = self.core_attn
     def forward(
         self,
         )
         return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+    def _flash_attention_wrapper(
+        self,
+        q,
+        k,
+        v,
+        attention_mask=None,
+        attn_mask_start_row_indices=None,
+        seq_length=None,
+    ):
+        """Run attention through PyTorch SDPA restricted to the flash backend.
+        ``attention_mask``, ``attn_mask_start_row_indices`` and ``seq_length`` are
+        accepted so this method can be installed as ``self.attn_func`` in place of
+        ``core_attn``, but they are not used: SDPA is always called with
+        ``attn_mask=None`` and masking is expressed only through ``is_causal``.
+        Args:
+            q (torch.Tensor): Query tensor; dims 1 and 2 are swapped before SDPA,
+                so presumably (batch, seq, heads, head_dim) - confirm with caller.
+            k (torch.Tensor): Key tensor, same layout as ``q``.
+            v (torch.Tensor): Value tensor, same layout as ``q``.
+            attention_mask (Optional[torch.Tensor]): Ignored.
+            attn_mask_start_row_indices (Optional[torch.Tensor]): Ignored.
+            seq_length (Optional[int]): Ignored.
+        Returns:
+            Tuple[torch.Tensor, None]: Attention output with the last two dims
+            flattened into one, and ``None`` in place of attention weights
+            (SDPA does not return them).
+        """
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+        # F.scaled_dot_product_attention applies dropout whenever dropout_p > 0,
+        # independent of train/eval mode, so it must be switched off explicitly
+        # outside of training to keep inference deterministic.
+        dropout_p = self.config.attention_probs_dropout_prob if self.training else 0.0
+        # Softmax scale: 1 / (scale_qk_coeff * sqrt(head_dim)); coeff defaults to 1.0.
+        scale = 1 / (getattr(self.config, "scale_qk_coeff", 1.0) * self.head_dim**0.5)
+        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+            out = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=None,
+                dropout_p=dropout_p,
+                # Causal masking only when query and key lengths match; with a
+                # shorter query (e.g. cached decoding) no mask is applied.
+                is_causal=q.shape[-2] == k.shape[-2],
+                scale=scale,
+                enable_gqa=self.is_gqa,
+            )
+        out = out.transpose(1, 2)
+        out = out.contiguous().view(out.size(0), out.size(1), -1)
+        return out, None
     def core_attn(
         self,
         q,
         if getattr(self.config, "scale_qk_coeff", 1.0) != 1.0:
             product = product * getattr(self.config, "scale_qk_coeff", 1.0)
+        seq_len = product.size(-1)
+        mask = torch.triu(
+            torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
+            diagonal=1,
+        )
+        product = product.masked_fill(mask, float("-inf"))
+        weights = F.softmax(product, dim=-1)
         weights = weights.to(origin_dtype)
             )
             assert self.gate.config.moe_use_aux_free
+        self.world_size = 1
+        self.rank = 0
         self.multimodal_experts = (
             isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1
         enable_reverse_token_drop=False,
         all_to_all_dropout=0,
         group_experts=False,
+        use_expert_out_alltoall=True,
         use_expert_alltoall_overlap=False,
         use_padding=True,
         dense_token_type=3,  # considered as dense tokens (no moe)
     config_class = Ernie4_5_MoEConfig
     base_model_prefix = "ernie"
     _no_split_modules = ["Ernie4_5_DecoderLayer"]
 class Ernie4_5_Model(Ernie4_5_PretrainedModel):
             past_key_value = (
                 past_key_values[idx] if past_key_values is not None else None
             )
             layer_outputs = decoder_layer(
                 hidden_states,
                 attention_mask,
         """
         return self.model
+    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder=False):
+        """
+        Carry the generation state forward by one decoding step.
+        Args:
+            outputs (Any): Model outputs of the step that just ran; either a tuple
+                whose second item is the cache, or a CausalLMOutputWithCrossAttentions.
+            model_kwargs (dict): Keyword arguments for the next forward call.
+                Modified in place and also returned.
+            is_encoder_decoder (bool): When True the attention mask is left untouched.
+        Returns:
+            dict: ``model_kwargs`` with the cache refreshed and every per-token
+            entry that is present extended by one position.
+        Raises:
+            ValueError: If ``rope_3d`` is enabled on the config but
+                ``position_ids`` is missing from ``model_kwargs``.
+        """
+        # update cache (a tensor in slot 1 of a tuple is not a cache, so it is skipped)
+        if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], torch.Tensor):
+            model_kwargs["past_key_values"] = outputs[1]
+        if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs:
+            model_kwargs["past_key_values"] = outputs.past_key_values
+        # update token_type_ids by repeating the last value for the new token
+        if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None:
+            token_type_ids = model_kwargs["token_type_ids"]
+            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1:]], dim=-1)
+        if not is_encoder_decoder and model_kwargs.get("attention_mask", None) is not None:
+            # update attention mask: the new token is always attended to
+            attention_mask = model_kwargs["attention_mask"]
+            model_kwargs["attention_mask"] = torch.cat(
+                [
+                    attention_mask,
+                    torch.ones((attention_mask.shape[0], 1), dtype=torch.int64, device=attention_mask.device),
+                ],
+                dim=-1,
+            )
+        # update role_ids by repeating the last value for the new token
+        if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None:
+            role_ids = model_kwargs["role_ids"]
+            model_kwargs["role_ids"] = torch.cat([role_ids, role_ids[:, -1:]], dim=-1)
+        # getattr with a default matches how other optional config fields
+        # (e.g. scale_qk_coeff) are read in this file.
+        if getattr(self.config, "rope_3d", False):
+            # Explicit raise instead of assert so the check survives `python -O`.
+            if "position_ids" not in model_kwargs:
+                raise ValueError("position_ids must be provided if rope_3d is on")
+            position_ids = model_kwargs["position_ids"]
+            # Per-row maximum along dim 1, kept as a size-1 slice so it can be
+            # concatenated back. NOTE(review): any trailing dims are presumably
+            # the rope axes - confirm against the caller.
+            max_position = position_ids.max(dim=1, keepdim=True)[0]
+            model_kwargs["position_ids"] = torch.cat(
+                [position_ids, max_position + 1],
+                dim=1
+            )
+        return model_kwargs
 class VisionMlp(nn.Module):
                 image_type_ids[:, -1:] if image_type_ids is not None else None
             )
+        if self.config.use_flash_attention:
+            attention_mask = None
+        else:
+            attention_mask = kwargs.get("attention_mask", None)
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
         if images is not None and image_features is not None:
             inputs_embeds = self.vision_mapping_forward(
+                token_type_ids[..., :-1],
                 token_type_ids_w_video,
                 input_ids,
                 mm_input_ids,
         outputs = self.model(
             position_ids=position_ids,
+            attention_mask=attention_mask,
             token_type_ids=token_type_ids,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,

processing_ernie_45t_vl.py CHANGED Viewed

@@ -17,7 +17,6 @@
 import copy
 import io
 import os
-import re
 import math
 import random
 import requests
@@ -27,15 +26,13 @@ import hashlib
 import threading
 import uuid
 import decord
-from shutil import copyfile
-from typing import Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from PIL.ExifTags import TAGS
 from collections import defaultdict
-from typing import Any, Dict, List, Union
 from pathlib import Path
 from tempfile import NamedTemporaryFile as ntf
@@ -46,13 +43,8 @@ except:
     # moviepy 2.0
     import moviepy as mp
-import sentencepiece as spm
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import (
-    PaddingStrategy,
-    TextInput,
-)
-from transformers.utils import logging
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
@@ -618,298 +610,15 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)
-class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
-    """
-    Ernie4_5_VLTokenizer
-    """
-    vocab_files_names = {
-        "vocab_file": "tokenizer.model",
-    }
-    # Model input names expected by the tokenizer
-    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
-    # Padding side (where to add padding tokens)
-    padding_side = "right"
-    def __init__(
-        self,
-        vocab_file,
-        bos_token="<s>",
-        cls_token="<cls>",
-        eos_token="</s>",
-        mask_token="<mask:0>",
-        pad_token="<pad>",
-        sep_token="<sep>",
-        unk_token="<unk>",
-        additional_special_tokens=None,
-        **kwargs,
-    ):
-        """
-        Initialize the Ernie4_5_VLTokenizer
-        Args:
-            vocab_file (str): Path to the tokenizer vocabulary model.
-            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
-            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
-            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
-            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
-            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
-            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
-            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
-            additional_special_tokens (List[str], optional): Additional special tokens to use.
-                Defaults to `["<mask:1>", "<mask:7>"]`.
-            **kwargs (dict): Additional keyword arguments passed along to the superclass.
-        """
-        # Store vocabulary file path
-        self.vocab_file = vocab_file
-        # Initialize SentencePiece processor
-        self.sp_model = spm.SentencePieceProcessor()
-        # Load the vocabulary model
-        self.sp_model.Load(vocab_file)
-        # Set default additional special tokens if none provided
-        if additional_special_tokens is None:
-            additional_special_tokens = ["<mask:1>", "<mask:7>"]
-        super().__init__(
-            bos_token=bos_token,
-            cls_token=cls_token,
-            eos_token=eos_token,
-            mask_token=mask_token,
-            pad_token=pad_token,
-            sep_token=sep_token,
-            unk_token=unk_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-    @property
-    def space_token(self):
-        """Return the space token"""
-        return "<mask:1>"
-    @property
-    def space_token_id(self):
-        """Return the ID of the space token"""
-        return self.sp_model.piece_to_id("<mask:1>")
-    @property
-    def gend_token(self):
-        """Return the gender token"""
-        return "<mask:7>"
-    @property
-    def gend_token_id(self):
-        """Return the ID of the gender token"""
-        return self.sp_model.piece_to_id("<mask:7>")
-    @property
-    def im_start_id(self):
-        """Return the ID of the image start token"""
-        return self.sp_model.piece_to_id("<|im_start|>")
-    @property
-    def im_end_id(self):
-        """Return the ID of the image end token"""
-        return self.sp_model.piece_to_id("<|im_end|>")
-    @property
-    def vocab_size(self):
-        """Return the size of the vocabulary"""
-        return self.sp_model.vocab_size()
-    def get_vocab(self):
-        """Return the vocabulary as a dictionary mapping tokens to IDs"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-    def _tokenize(self, text):
-        """Tokenize the input text into pieces"""
-        return self.sp_model.encode_as_pieces(text)
-    def _convert_token_to_id(self, token):
-        """Convert a token to its corresponding ID"""
-        return self.sp_model.piece_to_id(token)
-    def _convert_id_to_token(self, id):
-        """Convert an ID to its corresponding token"""
-        return self.sp_model.id_to_piece(id)
-    def convert_tokens_to_string(self, tokens):
-        """Convert a sequence of tokens back to a string"""
-        current_sub_tokens = []
-        out_string = ""
-        for token in tokens:
-            # Handle special tokens differently
-            if token in self.all_special_tokens:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-        # Add any remaining sub-tokens
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string
-    def prepare_for_model(self, *args, **kwargs):
-        """Prepare the tokenized inputs for the model"""
-        # Remove add_special_tokens if present (not supported)
-        if "add_special_tokens" in kwargs:
-            kwargs.pop("add_special_tokens")
-        return super().prepare_for_model(*args, **kwargs)
-    def save_vocabulary(
-        self, save_directory, filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
-        """
-        Save the vocabulary and special tokens file to a directory.
-        Args:
-            save_directory (`str`): The directory to save the vocabulary to
-            filename_prefix (`str`, optional): Prefix to add to the filename
-        Returns:
-            `Tuple(str)`: Paths to the saved files
-        """
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        # Construct output vocabulary file path
-        out_vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "")
-            + self.vocab_files_names["vocab_file"],
-        )
-        # Copy or create vocabulary file
-        if os.path.abspath(self.vocab_file) != os.path.abspath(
-            out_vocab_file
-        ) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-        return (out_vocab_file,)
-    def _decode(self, *args, **kwargs):
-        """Decode token_id back to text"""
-        # Remove some parameters that aren't used
-        kwargs.pop("clean_up_tokenization_spaces", None)
-        kwargs.pop("spaces_between_special_tokens", None)
-        # Call parent decode method with specific parameters
-        return super()._decode(
-            *args,
-            **kwargs,
-            clean_up_tokenization_spaces=False,
-            spaces_between_special_tokens=False,
-        )
-    def _pad(
-        self,
-        encoded_inputs: Dict,
-        max_length: Optional[int] = None,
-        padding_strategy=PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """Pad the encoded inputs to the specified length"""
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-        if return_attention_mask:
-            required_input = encoded_inputs[self.model_input_names[0]]
-            if padding_strategy == PaddingStrategy.LONGEST:
-                max_length = len(required_input)
-            # Adjust max_length if needed for multiple of padding
-            if (
-                max_length is not None
-                and pad_to_multiple_of is not None
-                and (max_length % pad_to_multiple_of != 0)
-            ):
-                max_length = (
-                    (max_length // pad_to_multiple_of) + 1
-                ) * pad_to_multiple_of
-            # Check if padding is needed
-            needs_to_be_padded = (
-                padding_strategy != PaddingStrategy.DO_NOT_PAD
-                and len(required_input) != max_length
-            )
-            # Handle attention mask if present
-            if (
-                "attention_mask" in encoded_inputs
-                and encoded_inputs["attention_mask"] is not None
-            ):
-                attention_mask = encoded_inputs.pop("attention_mask")
-                if isinstance(attention_mask, torch.Tensor):
-                    attention_mask = attention_mask.numpy()
-                elif isinstance(attention_mask, list):
-                    attention_mask = np.array(attention_mask)
-                elif not isinstance(attention_mask, np.ndarray):
-                    raise ValueError(
-                        f"Unexpected type {type(attention_mask)} of attention_mask, "
-                    )
-            else:
-                # Create default attention mask if none provided
-                attention_mask = np.tril(
-                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
-                )
-                attention_mask = np.expand_dims(attention_mask, axis=0)
-            # Perform padding if needed
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if self.padding_side == "right":
-                    if attention_mask.ndim == 1:
-                        pad_width = [(0, difference)]
-                    else:
-                        pad_width = [(0, 0), (0, difference), (0, difference)]
-                elif self.padding_side == "left":
-                    if attention_mask.ndim == 1:
-                        pad_width = [(difference, 0)]
-                    else:
-                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
-                else:
-                    raise ValueError(
-                        "Invalid padding strategy:" + str(self.padding_side)
-                    )
-                attention_mask = np.pad(
-                    attention_mask,
-                    pad_width=pad_width,
-                    mode="constant",
-                    constant_values=0,
-                )
-        # Call parent padding method
-        encoded_inputs = super()._pad(
-            encoded_inputs,
-            max_length,
-            padding_strategy=padding_strategy,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_attention_mask=False,
-        )
-        # Add attention mask back if needed
-        if return_attention_mask:
-            encoded_inputs["attention_mask"] = attention_mask.tolist()
-        return encoded_inputs
 RAW_VIDEO_DIR = "./download_tmp/raw_video/"
 RAW_IMAGE_DIR = "./download_tmp/raw_images/"
 EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
 TMP_DIR = "./download_tmp/upload_tmp/"
 FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
 def is_gif(data: bytes) -> bool:
@@ -1811,4 +1520,4 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
         return list(tokenizer_input_names) + list(image_processor_input_names)
-__all__ = ["Ernie_45T_VLImageProcessor", "Ernie4_5_VLTokenizer", "Ernie_45T_VLProcessor"]

 import copy
 import io
 import os
 import math
 import random
 import requests
 import threading
 import uuid
 import decord
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from PIL.ExifTags import TAGS
 from collections import defaultdict
 from pathlib import Path
 from tempfile import NamedTemporaryFile as ntf
     # moviepy 2.0
     import moviepy as mp
+from .tokenization_ernie_45t_vl import Ernie4_5_VLTokenizer
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
         return BatchFeature(data=data, tensor_type=return_tensors)
 RAW_VIDEO_DIR = "./download_tmp/raw_video/"
 RAW_IMAGE_DIR = "./download_tmp/raw_images/"
 EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
 TMP_DIR = "./download_tmp/upload_tmp/"
 FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
+if not os.path.exists(FONT_PATH):
+    ttf = requests.get("https://paddlenlp.bj.bcebos.com/vision-language-models/materials/Roboto-Regular.ttf")
+    open(FONT_PATH, "wb").write(ttf.content)
 def is_gif(data: bytes) -> bool:
         return list(tokenizer_input_names) + list(image_processor_input_names)
+__all__ = ["Ernie_45T_VLImageProcessor", "Ernie_45T_VLProcessor"]

tokenization_ernie_45t_vl.py ADDED Viewed

	@@ -0,0 +1,323 @@

+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Ernie_45T_VL."""
+import os
+from shutil import copyfile
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+import torch
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import (
+    PaddingStrategy,
+    TextInput,
+)
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
+    """
+    Ernie4_5_VLTokenizer
+    """
+    vocab_files_names = {
+        "vocab_file": "tokenizer.model",
+    }
+    # Model input names expected by the tokenizer
+    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+    # Padding side (where to add padding tokens)
+    padding_side = "right"
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        cls_token="<cls>",
+        eos_token="</s>",
+        mask_token="<mask:0>",
+        pad_token="<pad>",
+        sep_token="<sep>",
+        unk_token="<unk>",
+        additional_special_tokens=None,
+        **kwargs,
+    ):
+        """
+        Initialize the Ernie4_5_VLTokenizer
+        Args:
+            vocab_file (str): Path to the tokenizer vocabulary model.
+            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
+            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
+            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
+            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
+            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
+            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
+            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
+            additional_special_tokens (List[str], optional): Additional special tokens to use.
+                Defaults to `["<mask:1>", "<mask:7>"]`.
+            **kwargs (dict): Additional keyword arguments passed along to the superclass.
+        """
+        # Store vocabulary file path
+        self.vocab_file = vocab_file
+        # Initialize SentencePiece processor
+        self.sp_model = spm.SentencePieceProcessor()
+        # Load the vocabulary model
+        self.sp_model.Load(vocab_file)
+        # Set default additional special tokens if none provided
+        if additional_special_tokens is None:
+            additional_special_tokens = ["<mask:1>", "<mask:7>"]
+        super().__init__(
+            bos_token=bos_token,
+            cls_token=cls_token,
+            eos_token=eos_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+    @property
+    def space_token(self):
+        """Return the space token"""
+        return "<mask:1>"
+    @property
+    def space_token_id(self):
+        """Return the ID of the space token"""
+        return self.sp_model.piece_to_id("<mask:1>")
+    @property
+    def gend_token(self):
+        """Return the gender token"""
+        return "<mask:7>"
+    @property
+    def gend_token_id(self):
+        """Return the ID of the gender token"""
+        return self.sp_model.piece_to_id("<mask:7>")
+    @property
+    def im_start_id(self):
+        """Return the ID of the image start token"""
+        return self.sp_model.piece_to_id("<|im_start|>")
+    @property
+    def im_end_id(self):
+        """Return the ID of the image end token"""
+        return self.sp_model.piece_to_id("<|im_end|>")
+    @property
+    def vocab_size(self):
+        """Return the size of the vocabulary"""
+        return self.sp_model.vocab_size()
+    def get_vocab(self):
+        """Return the vocabulary as a dictionary mapping tokens to IDs"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text):
+        """Tokenize the input text into pieces"""
+        return self.sp_model.encode_as_pieces(text)
+    def _convert_token_to_id(self, token):
+        """Convert a token to its corresponding ID"""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, id):
+        """Convert an ID to its corresponding token"""
+        return self.sp_model.id_to_piece(id)
+    def convert_tokens_to_string(self, tokens):
+        """Convert a sequence of tokens back to a string"""
+        current_sub_tokens = []
+        out_string = ""
+        for token in tokens:
+            # A special token flushes the buffered pieces through the SentencePiece decoder and is then appended verbatim
+            if token in self.all_special_tokens:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+        # Add any remaining sub-tokens
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+    def prepare_for_model(self, *args, **kwargs):
+        """Prepare the tokenized inputs for the model"""
+        # Remove add_special_tokens if present (not supported)
+        if "add_special_tokens" in kwargs:
+            kwargs.pop("add_special_tokens")
+        return super().prepare_for_model(*args, **kwargs)
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`): The directory to save the vocabulary to
+            filename_prefix (`str`, optional): Prefix to add to the filename
+        Returns:
+            `Tuple(str)`: Paths to the saved files
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        # Construct output vocabulary file path
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + self.vocab_files_names["vocab_file"],
+        )
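+        # Copy the source model file to the target path (skipped when both are the same file); if the source file is missing, write out the loaded model's serialized proto instead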
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def _decode(self, *args, **kwargs):
+        """Decode token IDs back to text"""
+        # Discard any caller-supplied values for these two options; both are forced to False in the call below
+        kwargs.pop("clean_up_tokenization_spaces", None)
+        kwargs.pop("spaces_between_special_tokens", None)
+        # Call parent decode method with specific parameters
+        return super()._decode(
+            *args,
+            **kwargs,
+            clean_up_tokenization_spaces=False,
+            spaces_between_special_tokens=False,
+        )
+    def _pad(
+        self,
+        encoded_inputs: Dict,
+        max_length: Optional[int] = None,
+        padding_strategy=PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs
+    ) -> dict:
+        """Pad the encoded inputs to the specified length"""
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+        if return_attention_mask:
+            required_input = encoded_inputs[self.model_input_names[0]]
+            if padding_strategy == PaddingStrategy.LONGEST:
+                max_length = len(required_input)
+            # Round max_length up to the next multiple of pad_to_multiple_of
+            if (
+                max_length is not None
+                and pad_to_multiple_of is not None
+                and (max_length % pad_to_multiple_of != 0)
+            ):
+                max_length = (
+                    (max_length // pad_to_multiple_of) + 1
+                ) * pad_to_multiple_of
+            # Check if padding is needed
+            needs_to_be_padded = (
+                padding_strategy != PaddingStrategy.DO_NOT_PAD
+                and len(required_input) != max_length
+            )
+            # Handle attention mask if present
+            if (
+                "attention_mask" in encoded_inputs
+                and encoded_inputs["attention_mask"] is not None
+            ):
+                attention_mask = encoded_inputs.pop("attention_mask")
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = attention_mask.numpy()
+                elif isinstance(attention_mask, list):
+                    attention_mask = np.array(attention_mask)
+                elif not isinstance(attention_mask, np.ndarray):
+                    raise ValueError(
+                        f"Unexpected type {type(attention_mask)} of attention_mask, "
+                    )
+            else:
+                # No mask supplied: build a lower-triangular mask of shape (1, seq_len, seq_len)
+                attention_mask = np.tril(
+                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
+                )
+                attention_mask = np.expand_dims(attention_mask, axis=0)
+            # Perform padding if needed
+            if needs_to_be_padded:
+                difference = max_length - len(required_input)
+                if self.padding_side == "right":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(0, difference)]
+                    else:
+                        pad_width = [(0, 0), (0, difference), (0, difference)]
+                elif self.padding_side == "left":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(difference, 0)]
+                    else:
+                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
+                else:
+                    raise ValueError(
+                        "Invalid padding strategy:" + str(self.padding_side)
+                    )
+                attention_mask = np.pad(
+                    attention_mask,
+                    pad_width=pad_width,
+                    mode="constant",
+                    constant_values=0,
+                )
+        # Let the parent class pad the remaining fields; it is told not to build an attention mask
+        encoded_inputs = super()._pad(
+            encoded_inputs,
+            max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=False,
+        )
+        # Add attention mask back if needed
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = attention_mask.tolist()
+        return encoded_inputs
+__all__ = ["Ernie4_5_VLTokenizer"]

tokenizer_config.json CHANGED Viewed

@@ -14,9 +14,9 @@
     "tokenizer_class": "Ernie4_5_VLTokenizer",
     "auto_map": {
         "AutoTokenizer": [
-            "processing_ernie_45t_vl.Ernie4_5_VLTokenizer",
             null
         ]
     },
     "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type == 'image_url' -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type == 'video_url' -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- '' -}}\n            
{%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
-}

     "tokenizer_class": "Ernie4_5_VLTokenizer",
     "auto_map": {
         "AutoTokenizer": [
+            "tokenization_ernie_45t_vl.Ernie4_5_VLTokenizer",
             null
         ]
     },
     "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type == 'image_url' -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type == 'video_url' -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- '' -}}\n            
{%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
+}