duzx16 committed on
Commit
71189e7
1 Parent(s): eb3e683

Fix tokenization space

Browse files
Files changed (1) hide show
  1. tokenization_chatglm.py +5 -1
tokenization_chatglm.py CHANGED
@@ -66,7 +66,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
66
  model_input_names = ["input_ids", "attention_mask", "position_ids"]
67
 
68
  def __init__(self, vocab_file, padding_side="left", **kwargs):
69
- super().__init__(padding_side=padding_side, **kwargs)
70
  self.name = "GLMTokenizer"
71
 
72
  self.vocab_file = vocab_file
@@ -83,6 +83,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
83
  assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
84
  return self.tokenizer.special_tokens[token]
85
 
 
 
 
 
86
  @property
87
  def pad_token(self) -> str:
88
  return "<unk>"
 
66
  model_input_names = ["input_ids", "attention_mask", "position_ids"]
67
 
68
  def __init__(self, vocab_file, padding_side="left", **kwargs):
69
+ super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=False, **kwargs)
70
  self.name = "GLMTokenizer"
71
 
72
  self.vocab_file = vocab_file
 
83
  assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
84
  return self.tokenizer.special_tokens[token]
85
 
86
+ @property
87
+ def unk_token(self) -> str:
88
+ return "<unk>"
89
+
90
  @property
91
  def pad_token(self) -> str:
92
  return "<unk>"