simonJJJ committed on
Commit
7595aae
·
1 Parent(s): 0288b7e

Update tokenization_qwen.py

Browse files
Files changed (1) hide show
  1. tokenization_qwen.py +19 -7
tokenization_qwen.py CHANGED
@@ -27,12 +27,6 @@ logger = logging.getLogger(__name__)
27
 
28
 
29
  VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
30
- FONT_PATH = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
31
- if FONT_PATH is None:
32
- if not os.path.exists("SimSun.ttf"):
33
- ttf = requests.get("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/SimSun.ttf")
34
- open("SimSun.ttf", "wb").write(ttf.content)
35
- FONT_PATH = "SimSun.ttf"
36
 
37
  PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
38
  ENDOFTEXT = "<|endoftext|>"
@@ -175,6 +169,24 @@ class QWenTokenizer(PreTrainedTokenizer):
175
  self.im_start_id = self.special_tokens[IMSTART]
176
  self.im_end_id = self.special_tokens[IMEND]
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def __len__(self) -> int:
179
  return self.tokenizer.n_vocab
180
 
@@ -503,7 +515,7 @@ class VisImage:
503
  class Visualizer:
504
  def __init__(self, img_rgb, metadata=None, scale=1.0):
505
  self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
506
- self.font_path = FONT_PATH
507
  self.output = VisImage(self.img, scale=scale)
508
  self.cpu_device = torch.device("cpu")
509
 
 
27
 
28
 
29
  VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
 
 
 
 
 
 
30
 
31
  PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
32
  ENDOFTEXT = "<|endoftext|>"
 
169
  self.im_start_id = self.special_tokens[IMSTART]
170
  self.im_end_id = self.special_tokens[IMEND]
171
 
172
+ def __getstate__(self):
173
+ # for pickle lovers
174
+ state = self.__dict__.copy()
175
+ del state['tokenizer']
176
+ return state
177
+
178
+ def __setstate__(self, state):
179
+ # tokenizer is not python native; don't pass it; rebuild it
180
+ self.__dict__.update(state)
181
+ enc = tiktoken.Encoding(
182
+ "Qwen",
183
+ pat_str=PAT_STR,
184
+ mergeable_ranks=self.mergeable_ranks,
185
+ special_tokens=self.special_tokens,
186
+ )
187
+ self.tokenizer = enc
188
+
189
+
190
  def __len__(self) -> int:
191
  return self.tokenizer.n_vocab
192
 
 
515
  class Visualizer:
516
  def __init__(self, img_rgb, metadata=None, scale=1.0):
517
  self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
518
+ self.font_path = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
519
  self.output = VisImage(self.img, scale=scale)
520
  self.cpu_device = torch.device("cpu")
521