Encountered text corresponding to disallowed special token '<|im_start|>'
#11
by
Cornmonster
- opened
To reproduce:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
s = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHi there."
"<|im_end|>\n<|im_start|>assistant\nHello, how may I help you?<|im_end|>\n<|im_start|>"
"user\nWrite a poem.<|im_end|>\n<|im_start|>assistant\n")
tokenizer(s)
File ~/.cache/huggingface/modules/transformers_modules/Qwen/Qwen-7B-Chat/f2e5005d0d03224a30e2dca1c943341a454648c1/tokenization_qwen.py:196, in QWenTokenizer.tokenize(self, text, **kwargs)
193 tokens = []
194 text = unicodedata.normalize("NFC", text)
--> 196 for t in self.tokenizer.encode(text, **kwargs):
197 tokens.append(self.decoder[t])
199 return tokens
File ~/Documents/miniconda3/envs/ml/lib/python3.10/site-packages/tiktoken/core.py:112, in Encoding.encode(self, text, allowed_special, disallowed_special)
110 disallowed_special = frozenset(disallowed_special)
111 if match := _special_token_regex(disallowed_special).search(text):
--> 112 raise_disallowed_special_token(match.group())
114 return self._core_bpe.encode(text, allowed_special)
File ~/Documents/miniconda3/envs/ml/lib/python3.10/site-packages/tiktoken/core.py:322, in raise_disallowed_special_token(token)
321 def raise_disallowed_special_token(token: str) -> NoReturn:
--> 322 raise ValueError(
323 f"Encountered text corresponding to disallowed special token {token!r}.\n"
324 "If you want this text to be encoded as a special token, "
325 f"pass it to `allowed_special`, e.g. `allowed_special={{{token!r}, ...}}`.\n"
326 f"If you want this text to be encoded as normal text, disable the check for this token "
327 f"by passing `disallowed_special=(enc.special_tokens_set - {{{token!r}}})`.\n"
328 "To disable this check for all special tokens, pass `disallowed_special=()`.\n"
329 )
ValueError: Encountered text corresponding to disallowed special token '<|im_start|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|im_start|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|im_start|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.
Please refer to the updated model card.
# Note: as a defense against attacks, our tokenizer rejects special tokens such as <|endoftext|> in the input text and throws an error if it encounters one.
# To disable this check, pass the `allowed_special` argument, which accepts either the string "all" or a `set` of special tokens.
# For example: tokens = tokenizer(text, allowed_special="all")
jklj077
changed discussion status to
closed