liaojc committed · verified
Commit d52e35d · 1 Parent(s): 711fbdc
config.json CHANGED
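This change flips the aux-free MoE routing flag off and adds a multimodal expert-dispatch mode. As a quick sanity check, a minimal sketch of reading the two affected keys after the commit (the repo path is a placeholder; trust_remote_code is needed because the config class ships with the repo):

from transformers import AutoConfig

# Placeholder path: a local checkout (or hub id) of this repository.
config = AutoConfig.from_pretrained("path/to/this/repo", trust_remote_code=True)

# The two keys touched by this commit:
print(config.moe_use_aux_free)                       # False after this change
print(config.moe_multimodal_dispatch_use_allgather)  # "v2-alltoall-unpad-text"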
@@ -37,11 +37,12 @@
   "freq_allocation": 20,
   "spatial_conv_size": 2,
   "temporal_conv_size": 2,
-  "moe_use_aux_free": true,
+  "moe_use_aux_free": false,
   "moe_num_experts": [64, 64],
   "moe_intermediate_size": [3584, 1536],
   "torch_dtype": "bfloat16",
   "tie_word_embeddings": false,
+  "moe_multimodal_dispatch_use_allgather": "v2-alltoall-unpad-text",
   "vision_config": {
     "attn_implementation": "eager",
     "depth": 32,
configuration_ernie_45t_vl.py CHANGED
@@ -539,17 +539,19 @@ class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
         "activation_function": "hidden_act",
     }
     base_model_tp_plan = {
-        "ernie.layers.*.self_attn.qkv_proj": "colwise",
-        "ernie.layers.*.self_attn.o_proj": "rowwise",
-        "ernie.layers.*.mlp_text.experts.*.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp_text.experts.*.down_proj": "rowwise",
-        "ernie.layers.*.mlp_text.gate": "colwise_rep",
-        "ernie.layers.*.mlp.experts.*.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp.experts.*.down_proj": "rowwise",
-        "ernie.layers.*.mlp.gate": "colwise_rep",
-        "ernie.layers.*.mlp.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp.down_proj": "rowwise",
-        "lm_head": "colwise_rep",
+        "model.layers.*.self_attn.q_proj": "colwise_rep",
+        "model.layers.*.self_attn.k_proj": "colwise_rep",
+        "model.layers.*.self_attn.v_proj": "colwise_rep",
+        "model.layers.*.self_attn.o_proj": "rowwise_rep",
+        "model.layers.*.mlp.experts.*.gate_proj": "colwise",
+        "model.layers.*.mlp.experts.*.up_proj": "colwise",
+        "model.layers.*.mlp.experts.*.down_proj": "rowwise",
+        "model.layers.*.mlp_text.experts.*.gate_proj": "colwise",
+        "model.layers.*.mlp_text.experts.*.up_proj": "colwise",
+        "model.layers.*.mlp_text.experts.*.down_proj": "rowwise",
+        "model.layers.*.mlp.gate_proj": "colwise",
+        "model.layers.*.mlp.up_proj": "colwise",
+        "model.layers.*.mlp.down_proj": "rowwise"
     }

     def __init__(
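The rewritten base_model_tp_plan keys glob over module paths (now rooted at "model." with split q/k/v and gate/up projections) and map each match to a tensor-parallel partition strategy. A minimal sketch of how such a plan resolves a concrete module name, assuming fnmatch-style globbing (the resolve_strategy helper is illustrative, not part of transformers):

import fnmatch
from typing import Optional

# Excerpt of the new plan: module-name globs -> shard strategy.
tp_plan = {
    "model.layers.*.self_attn.q_proj": "colwise_rep",
    "model.layers.*.mlp.experts.*.down_proj": "rowwise",
}

def resolve_strategy(module_name: str) -> Optional[str]:
    """Illustrative lookup: the first matching pattern wins."""
    for pattern, strategy in tp_plan.items():
        if fnmatch.fnmatch(module_name, pattern):
            return strategy
    return None

print(resolve_strategy("model.layers.3.self_attn.q_proj"))         # colwise_rep
print(resolve_strategy("model.layers.3.mlp.experts.7.down_proj"))  # rowwise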
generation_config.json CHANGED
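The one-line change below enables sampling by default, so the top_p and temperature already present in this file actually take effect during generate(). A hedged usage sketch (model and tokenizer are assumed to be loaded from this repo; passing the parameters explicitly just mirrors the file's defaults):

inputs = tokenizer("Describe the picture.", return_tensors="pt")
out = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,    # newly the checkpoint default
    top_p=0.8,         # mirrors generation_config.json
    temperature=0.2,   # mirrors generation_config.json
)
print(tokenizer.decode(out[0], skip_special_tokens=True))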
@@ -1,6 +1,7 @@
 {
   "top_p": 0.8,
   "temperature": 0.2,
+  "do_sample": true,
   "pad_token_id": 0,
   "bos_token_id": 1,
   "eos_token_id": 2,
modeling_ernie_45t_vl.py CHANGED
@@ -27,6 +27,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel

 from transformers.activations import ACT2FN
 from transformers.generation import GenerationMixin
@@ -321,6 +322,7 @@ class Ernie4_5_Attention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.is_gqa = (
             self.num_key_value_heads is not None
@@ -373,7 +375,10 @@ class Ernie4_5_Attention(nn.Module):
             freq_allocation=self.freq_allocation,
         )
         self.config = config
-        self.attn_func = self.core_attn
+        if self.config.use_flash_attention:
+            self.attn_func = self._flash_attention_wrapper
+        else:
+            self.attn_func = self.core_attn

     def forward(
         self,
@@ -446,6 +451,47 @@ class Ernie4_5_Attention(nn.Module):
         )
         return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

+    def _flash_attention_wrapper(
+        self,
+        q,
+        k,
+        v,
+        attention_mask=None,
+        attn_mask_start_row_indices=None,
+        seq_length=None,
+    ):
+        """Wrapper for flash attention implementation.
+        Args:
+            q (torch.Tensor): Query tensor
+            k (torch.Tensor): Key tensor
+            v (torch.Tensor): Value tensor
+            attention_mask (Optional[torch.Tensor]): Attention mask
+            attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices
+            seq_length (Optional[int]): Sequence length
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Attention output and weights
+        """
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+            out = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=None,
+                dropout_p=self.config.attention_probs_dropout_prob,
+                is_causal=q.shape[-2] == k.shape[-2],
+                scale=1
+                / (getattr(self.config, "scale_qk_coeff", 1.0) * self.head_dim**0.5),
+                enable_gqa=self.is_gqa,
+            )
+        out = out.transpose(1, 2)
+        out = out.contiguous().view(out.size(0), out.size(1), -1)
+
+        return out, None
+
     def core_attn(
         self,
         q,
@@ -493,19 +539,13 @@ class Ernie4_5_Attention(nn.Module):
         if getattr(self.config, "scale_qk_coeff", 1.0) != 1.0:
             product = product * getattr(self.config, "scale_qk_coeff", 1.0)

-        if attention_mask is not None:
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-            attention_mask = attention_mask.to(torch.float32)
-            product = product + attention_mask
-            weights = F.softmax(product, dim=-1)
-        else:
-            seq_len = product.size(-1)
-            mask = torch.triu(
-                torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
-                diagonal=1,
-            )
-            product = product.masked_fill(mask, float("-inf"))
-            weights = F.softmax(product, dim=-1)
+        seq_len = product.size(-1)
+        mask = torch.triu(
+            torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
+            diagonal=1,
+        )
+        product = product.masked_fill(mask, float("-inf"))
+        weights = F.softmax(product, dim=-1)

         weights = weights.to(origin_dtype)

@@ -1508,16 +1548,8 @@ class MOELayer(nn.Module):
         )
         assert self.gate.config.moe_use_aux_free

-        try:
-            self.world_size = torch.distributed.get_world_size()
-            self.rank = torch.distributed.get_rank()
-        except:
-            self.world_size = 1
-            self.rank = 0
-        if self.world_size < 1:
-            self.world_size = 1
-        if self.rank < 0:
-            self.rank = 0
+        self.world_size = 1
+        self.rank = 0

         self.multimodal_experts = (
             isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1
@@ -1803,7 +1835,7 @@ class MOEAllGatherLayerV2(MOELayer):
         enable_reverse_token_drop=False,
         all_to_all_dropout=0,
         group_experts=False,
-        use_expert_out_alltoall=True,  #
+        use_expert_out_alltoall=True,
         use_expert_alltoall_overlap=False,
         use_padding=True,
         dense_token_type=3,  # considerd as dense tokens (no moe)
@@ -2729,7 +2761,6 @@ class Ernie4_5_PretrainedModel(PreTrainedModel):
     config_class = Ernie4_5_MoEConfig
     base_model_prefix = "ernie"
     _no_split_modules = ["Ernie4_5_DecoderLayer"]
-    # _keep_in_fp32_modules = ["mlp.gate", "e_score_correction_bias"]


 class Ernie4_5_Model(Ernie4_5_PretrainedModel):
@@ -2876,7 +2907,6 @@ class Ernie4_5_Model(Ernie4_5_PretrainedModel):
             past_key_value = (
                 past_key_values[idx] if past_key_values is not None else None
             )
-
             layer_outputs = decoder_layer(
                 hidden_states,
                 attention_mask,
@@ -3224,15 +3254,61 @@ class Ernie4_5_MoeForCausalLM(Ernie4_5_PretrainedModel, GenerationMixin):
         """
         return self.model

-    def prepare_attention_mask_for_generation(
-        self, input_ids, pad_token_id, eos_token_id
-    ):
-        """Avoid using attention_mask with flash_attn on generation."""
-        if self.config.use_flash_attention:
-            return None
-        return super().prepare_attention_mask_for_generation(
-            input_ids, pad_token_id, eos_token_id
-        )
+    # @staticmethod
+    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder=False):
+        """
+        Updates model kwargs for generation.
+
+        Args:
+            outputs (Any): Model outputs.
+            model_kwargs (dict): Current model kwargs.
+            is_encoder_decoder (bool): Whether using encoder-decoder architecture.
+
+        Returns:
+            dict: Updated model kwargs.
+        """
+        # update cache
+        if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], torch.Tensor):
+            model_kwargs["past_key_values"] = outputs[1]
+
+        if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs:
+            model_kwargs["past_key_values"] = outputs.past_key_values
+
+        # update token_type_ids with last value
+        if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None:
+            token_type_ids = model_kwargs["token_type_ids"]
+            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1:]], dim=-1)
+
+        if not is_encoder_decoder and model_kwargs.get("attention_mask", None) is not None:
+            # update attention mask
+            attention_mask = model_kwargs["attention_mask"]
+            model_kwargs["attention_mask"] = torch.cat(
+                [
+                    attention_mask,
+                    torch.ones((attention_mask.shape[0], 1), dtype=torch.int64, device=attention_mask.device),
+                ],
+                dim=-1,
+            )
+
+        # update role_ids
+        if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None:
+            role_ids = model_kwargs["role_ids"]
+            model_kwargs["role_ids"] = torch.cat([role_ids, role_ids[:, -1:]], dim=-1)
+
+        if self.config.get('rope_3d', False):
+            assert "position_ids" in model_kwargs, "position_ids must be provided if rope_3d is on"
+            position_ids = model_kwargs["position_ids"]
+            bsz = position_ids.shape[0]
+
+            max_position = position_ids.max(dim=1, keepdim=True)[0]  # [batch_size, 1, hidden_dim]
+            new_positions = max_position + 1
+
+            model_kwargs["position_ids"] = torch.cat(
+                [position_ids, new_positions],
+                dim=1
+            )
+
+        return model_kwargs


 class VisionMlp(nn.Module):
@@ -3943,7 +4019,10 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
             image_type_ids[:, -1:] if image_type_ids is not None else None
         )

-        attention_mask = kwargs.get("attention_mask", None)
+        if self.config.use_flash_attention:
+            attention_mask = None
+        else:
+            attention_mask = kwargs.get("attention_mask", None)

         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
@@ -4077,7 +4156,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):

         if images is not None and image_features is not None:
             inputs_embeds = self.vision_mapping_forward(
-                token_type_ids,
+                token_type_ids[..., :-1],
                 token_type_ids_w_video,
                 input_ids,
                 mm_input_ids,
@@ -4091,7 +4170,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):

         outputs = self.model(
             position_ids=position_ids,
-            attention_mask=None,
+            attention_mask=attention_mask,
             token_type_ids=token_type_ids,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
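The heart of the modeling change is the new _flash_attention_wrapper, which pins PyTorch's scaled-dot-product attention to the flash backend. A self-contained sketch of the same pattern (shapes are illustrative; enable_gqa needs a recent PyTorch, and the flash backend needs a CUDA device with fp16/bf16 tensors):

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

batch, seq, n_heads, n_kv_heads, head_dim = 2, 128, 8, 2, 64
q = torch.randn(batch, seq, n_heads, head_dim, device="cuda", dtype=torch.bfloat16)
k = torch.randn(batch, seq, n_kv_heads, head_dim, device="cuda", dtype=torch.bfloat16)
v = torch.randn(batch, seq, n_kv_heads, head_dim, device="cuda", dtype=torch.bfloat16)

# SDPA expects (batch, heads, seq, head_dim); the wrapper transposes the same way.
q, k, v = (t.transpose(1, 2) for t in (q, k, v))

with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    out = F.scaled_dot_product_attention(
        q, k, v,
        is_causal=True,   # square q/k (prefill) -> causal mask, as in the wrapper
        enable_gqa=True,  # grouped-query attention: kv heads are repeated
    )

# Back to (batch, seq, hidden), matching the wrapper's output layout.
out = out.transpose(1, 2).contiguous().view(batch, seq, n_heads * head_dim)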
processing_ernie_45t_vl.py CHANGED
@@ -17,7 +17,6 @@
 import copy
 import io
 import os
-import re
 import math
 import random
 import requests
@@ -27,15 +26,13 @@ import hashlib
 import threading
 import uuid
 import decord
-from shutil import copyfile
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from PIL.ExifTags import TAGS
 from collections import defaultdict
-from typing import Any, Dict, List, Union
 from pathlib import Path
 from tempfile import NamedTemporaryFile as ntf
@@ -46,13 +43,8 @@ except:
     # moviepy 2.0
     import moviepy as mp

-import sentencepiece as spm
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import (
-    PaddingStrategy,
-    TextInput,
-)
-from transformers.utils import logging
+from .tokenization_ernie_45t_vl import Ernie4_5_VLTokenizer
+
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
@@ -618,298 +610,15 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)


-class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
-    """
-    Ernie4_5_VLTokenizer
-    """
-    [... ~280 further deleted lines: the full tokenizer class body, moved verbatim to the new tokenization_ernie_45t_vl.py shown below ...]
-
 RAW_VIDEO_DIR = "./download_tmp/raw_video/"
 RAW_IMAGE_DIR = "./download_tmp/raw_images/"
 EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
 TMP_DIR = "./download_tmp/upload_tmp/"

 FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
+if not os.path.exists(FONT_PATH):
+    ttf = requests.get("https://paddlenlp.bj.bcebos.com/vision-language-models/materials/Roboto-Regular.ttf")
+    open(FONT_PATH, "wb").write(ttf.content)


 def is_gif(data: bytes) -> bool:
@@ -1811,4 +1520,4 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
     return list(tokenizer_input_names) + list(image_processor_input_names)


-__all__ = ["Ernie_45T_VLImageProcessor", "Ernie4_5_VLTokenizer", "Ernie_45T_VLProcessor"]
+__all__ = ["Ernie_45T_VLImageProcessor", "Ernie_45T_VLProcessor"]
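The new import-time fallback above downloads Roboto-Regular.ttf when it is missing next to the module. A slightly more defensive variant (purely illustrative, not part of the commit) would use a timeout and check the HTTP status before writing to disk:

import os
import requests

FONT_URL = "https://paddlenlp.bj.bcebos.com/vision-language-models/materials/Roboto-Regular.ttf"

def ensure_font(font_path: str) -> None:
    """Download the font once, failing loudly on network errors."""
    if os.path.exists(font_path):
        return
    resp = requests.get(FONT_URL, timeout=30)
    resp.raise_for_status()  # avoid silently writing an error page to disk
    with open(font_path, "wb") as f:
        f.write(resp.content)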
tokenization_ernie_45t_vl.py ADDED
@@ -0,0 +1,323 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes for Ernie_45T_VL."""
+
+import os
+from shutil import copyfile
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+import torch
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import (
+    PaddingStrategy,
+    TextInput,
+)
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+
+class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
+    """
+    Ernie4_5_VLTokenizer
+    """
+
+    vocab_files_names = {
+        "vocab_file": "tokenizer.model",
+    }
+    # Model input names expected by the tokenizer
+    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+    # Padding side (where to add padding tokens)
+    padding_side = "right"
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        cls_token="<cls>",
+        eos_token="</s>",
+        mask_token="<mask:0>",
+        pad_token="<pad>",
+        sep_token="<sep>",
+        unk_token="<unk>",
+        additional_special_tokens=None,
+        **kwargs,
+    ):
+        """
+        Initialize the Ernie4_5_VLTokenizer
+
+        Args:
+            vocab_file (str): Path to the tokenizer vocabulary model.
+            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
+            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
+            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
+            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
+            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
+            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
+            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
+            additional_special_tokens (List[str], optional): Additional special tokens to use.
+                Defaults to `["<mask:1>", "<mask:7>"]`.
+            **kwargs (dict): Additional keyword arguments passed along to the superclass.
+        """
+
+        # Store vocabulary file path
+        self.vocab_file = vocab_file
+        # Initialize SentencePiece processor
+        self.sp_model = spm.SentencePieceProcessor()
+        # Load the vocabulary model
+        self.sp_model.Load(vocab_file)
+
+        # Set default additional special tokens if none provided
+        if additional_special_tokens is None:
+            additional_special_tokens = ["<mask:1>", "<mask:7>"]
+        super().__init__(
+            bos_token=bos_token,
+            cls_token=cls_token,
+            eos_token=eos_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+
+    @property
+    def space_token(self):
+        """Return the space token"""
+        return "<mask:1>"
+
+    @property
+    def space_token_id(self):
+        """Return the ID of the space token"""
+        return self.sp_model.piece_to_id("<mask:1>")
+
+    @property
+    def gend_token(self):
+        """Return the gender token"""
+        return "<mask:7>"
+
+    @property
+    def gend_token_id(self):
+        """Return the ID of the gender token"""
+        return self.sp_model.piece_to_id("<mask:7>")
+
+    @property
+    def im_start_id(self):
+        """Return the ID of the image start token"""
+        return self.sp_model.piece_to_id("<|im_start|>")
+
+    @property
+    def im_end_id(self):
+        """Return the ID of the image end token"""
+        return self.sp_model.piece_to_id("<|im_end|>")
+
+    @property
+    def vocab_size(self):
+        """Return the size of the vocabulary"""
+        return self.sp_model.vocab_size()
+
+    def get_vocab(self):
+        """Return the vocabulary as a dictionary mapping tokens to IDs"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        """Tokenize the input text into pieces"""
+        return self.sp_model.encode_as_pieces(text)
+
+    def _convert_token_to_id(self, token):
+        """Convert a token to its corresponding ID"""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, id):
+        """Convert an ID to its corresponding token"""
+        return self.sp_model.id_to_piece(id)
+
+    def convert_tokens_to_string(self, tokens):
+        """Convert a sequence of tokens back to a string"""
+        current_sub_tokens = []
+        out_string = ""
+
+        for token in tokens:
+            # Handle special tokens differently
+            if token in self.all_special_tokens:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+
+        # Add any remaining sub-tokens
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+
+    def prepare_for_model(self, *args, **kwargs):
+        """Prepare the tokenized inputs for the model"""
+        # Remove add_special_tokens if present (not supported)
+        if "add_special_tokens" in kwargs:
+            kwargs.pop("add_special_tokens")
+        return super().prepare_for_model(*args, **kwargs)
+
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`): The directory to save the vocabulary to
+            filename_prefix (`str`, optional): Prefix to add to the filename
+
+        Returns:
+            `Tuple(str)`: Paths to the saved files
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+
+        # Construct output vocabulary file path
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + self.vocab_files_names["vocab_file"],
+        )
+
+        # Copy or create vocabulary file
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def _decode(self, *args, **kwargs):
+        """Decode token_id back to text"""
+        # Remove some parameters that aren't used
+        kwargs.pop("clean_up_tokenization_spaces", None)
+        kwargs.pop("spaces_between_special_tokens", None)
+
+        # Call parent decode method with specific parameters
+        return super()._decode(
+            *args,
+            **kwargs,
+            clean_up_tokenization_spaces=False,
+            spaces_between_special_tokens=False,
+        )
+
+    def _pad(
+        self,
+        encoded_inputs: Dict,
+        max_length: Optional[int] = None,
+        padding_strategy=PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs
+    ) -> dict:
+        """Pad the encoded inputs to the specified length"""
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+        if return_attention_mask:
+            required_input = encoded_inputs[self.model_input_names[0]]
+            if padding_strategy == PaddingStrategy.LONGEST:
+                max_length = len(required_input)
+
+            # Adjust max_length if needed for multiple of padding
+            if (
+                max_length is not None
+                and pad_to_multiple_of is not None
+                and (max_length % pad_to_multiple_of != 0)
+            ):
+                max_length = (
+                    (max_length // pad_to_multiple_of) + 1
+                ) * pad_to_multiple_of
+
+            # Check if padding is needed
+            needs_to_be_padded = (
+                padding_strategy != PaddingStrategy.DO_NOT_PAD
+                and len(required_input) != max_length
+            )
+
+            # Handle attention mask if present
+            if (
+                "attention_mask" in encoded_inputs
+                and encoded_inputs["attention_mask"] is not None
+            ):
+                attention_mask = encoded_inputs.pop("attention_mask")
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = attention_mask.numpy()
+                elif isinstance(attention_mask, list):
+                    attention_mask = np.array(attention_mask)
+                elif not isinstance(attention_mask, np.ndarray):
+                    raise ValueError(
+                        f"Unexpected type {type(attention_mask)} of attention_mask, "
+                    )
+            else:
+                # Create default attention mask if none provided
+                attention_mask = np.tril(
+                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
+                )
+                attention_mask = np.expand_dims(attention_mask, axis=0)
+
+            # Perform padding if needed
+            if needs_to_be_padded:
+                difference = max_length - len(required_input)
+                if self.padding_side == "right":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(0, difference)]
+                    else:
+                        pad_width = [(0, 0), (0, difference), (0, difference)]
+                elif self.padding_side == "left":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(difference, 0)]
+                    else:
+                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
+                else:
+                    raise ValueError(
+                        "Invalid padding strategy:" + str(self.padding_side)
+                    )
+
+                attention_mask = np.pad(
+                    attention_mask,
+                    pad_width=pad_width,
+                    mode="constant",
+                    constant_values=0,
+                )
+
+        # Call parent padding method
+        encoded_inputs = super()._pad(
+            encoded_inputs,
+            max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=False,
+        )
+
+        # Add attention mask back if needed
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = attention_mask.tolist()
+
+        return encoded_inputs
+
+
+__all__ = ["Ernie4_5_VLTokenizer"]
+
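A brief sketch of using the relocated class directly (assumes the SentencePiece model tokenizer.model from this repo sits in the working directory next to the module):

from tokenization_ernie_45t_vl import Ernie4_5_VLTokenizer

tok = Ernie4_5_VLTokenizer(vocab_file="tokenizer.model")
pieces = tok.tokenize("Hello, ERNIE!")
print(pieces)
print(tok.convert_tokens_to_string(pieces))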
tokenizer_config.json CHANGED
@@ -14,9 +14,9 @@
   "tokenizer_class": "Ernie4_5_VLTokenizer",
   "auto_map": {
     "AutoTokenizer": [
-      "processing_ernie_45t_vl.Ernie4_5_VLTokenizer",
+      "tokenization_ernie_45t_vl.Ernie4_5_VLTokenizer",
       null
     ]
   },
   "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type == 'image_url' -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type == 'video_url' -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- '' -}}\n            {%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
-}
+}
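With auto_map now pointing at the new module, AutoTokenizer resolves the class from tokenization_ernie_45t_vl.py instead of the processing file. A hedged loading sketch (the repo path is a placeholder):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "path/to/this/repo",     # placeholder: local checkout or hub id
    trust_remote_code=True,  # required to import the repo's tokenizer module
)
print(tok("Hello, ERNIE!")["input_ids"])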