Guilherme34 committed on
Commit
e7b2eac
·
verified ·
1 Parent(s): b04a533

Upload tokenization_minicpmo_fast.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization_minicpmo_fast.py +110 -0
tokenization_minicpmo_fast.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The OpenBMB Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from transformers import Qwen2TokenizerFast
17
+
18
+
19
class MiniCPMOTokenizerFast(Qwen2TokenizerFast):
    """Qwen2 fast tokenizer that also carries MiniCPM-o marker strings.

    On top of ``Qwen2TokenizerFast`` this class stores the literal marker
    strings used to delimit image and audio related spans as plain instance
    attributes, and exposes read-only ``*_id`` properties that look each
    marker up with ``convert_tokens_to_ids``.

    NOTE(review): nothing in this class registers the markers with the
    vocabulary; they are presumably provided by the tokenizer files that
    are loaded -- confirm against the checkpoint.
    """

    def __init__(self, *args, **kwargs):
        """Build the base tokenizer, then attach the marker strings.

        Args:
            *args: Positional arguments forwarded unchanged to
                ``Qwen2TokenizerFast.__init__``. Accepting them keeps the
                subclass constructible wherever the base class is called
                positionally instead of raising ``TypeError``.
            **kwargs: Keyword arguments forwarded unchanged to
                ``Qwen2TokenizerFast.__init__``.
        """
        super().__init__(*args, **kwargs)

        # image
        self.im_start = "<image>"
        self.im_end = "</image>"
        self.ref_start = "<ref>"
        self.ref_end = "</ref>"
        self.box_start = "<box>"
        self.box_end = "</box>"
        self.quad_start = "<quad>"
        self.quad_end = "</quad>"
        self.slice_start = "<slice>"
        self.slice_end = "</slice>"
        self.im_id_start = "<image_id>"
        self.im_id_end = "</image_id>"

        # audio
        self.audio_start = "<|audio_start|>"
        self.audio_end = "<|audio_end|>"
        self.spk_start = "<|spk_bos|>"
        self.spk_end = "<|spk_eos|>"
        self.tts_start = "<|tts_bos|>"
        self.tts_end = "<|tts_eos|>"

    @property
    def eos_id(self):
        """Alias for ``self.eos_token_id``."""
        return self.eos_token_id

    @property
    def bos_id(self):
        """Alias for ``self.bos_token_id``."""
        return self.bos_token_id

    @property
    def unk_id(self):
        """Alias for ``self.unk_token_id``."""
        return self.unk_token_id

    @property
    def im_start_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.im_start``."""
        return self.convert_tokens_to_ids(self.im_start)

    @property
    def im_end_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.im_end``."""
        return self.convert_tokens_to_ids(self.im_end)

    @property
    def slice_start_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.slice_start``."""
        return self.convert_tokens_to_ids(self.slice_start)

    @property
    def slice_end_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.slice_end``."""
        return self.convert_tokens_to_ids(self.slice_end)

    @property
    def im_id_start_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.im_id_start``."""
        return self.convert_tokens_to_ids(self.im_id_start)

    @property
    def im_id_end_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.im_id_end``."""
        return self.convert_tokens_to_ids(self.im_id_end)

    @property
    def audio_start_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.audio_start``."""
        return self.convert_tokens_to_ids(self.audio_start)

    @property
    def audio_end_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.audio_end``."""
        return self.convert_tokens_to_ids(self.audio_end)

    @property
    def spk_start_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.spk_start``."""
        return self.convert_tokens_to_ids(self.spk_start)

    @property
    def spk_end_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.spk_end``."""
        return self.convert_tokens_to_ids(self.spk_end)

    @property
    def tts_start_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.tts_start``."""
        return self.convert_tokens_to_ids(self.tts_start)

    @property
    def tts_end_id(self):
        """Result of ``convert_tokens_to_ids`` for ``self.tts_end``."""
        return self.convert_tokens_to_ids(self.tts_end)

    @staticmethod
    def escape(text: str) -> str:
        """Return ``text`` unchanged; no escaping is applied."""
        return text

    @staticmethod
    def unescape(text: str) -> str:
        """Return ``text`` unchanged; no unescaping is applied."""
        return text