DRAFT: Add a fast tokenizer implementation and converter
#8 opened by chielo
Usage
import transformers, tokenizers
transformers.__version__, tokenizers.__version__
# >>> ('4.35.0', '0.14.1')
from transformers import AutoTokenizer
fast_tokenizer = AutoTokenizer.from_pretrained(
    "./", trust_remote_code=True, use_fast=True
)
slow_tokenizer = AutoTokenizer.from_pretrained(
    "./", trust_remote_code=True, use_fast=False
)
content = "是哪个星球的呢?"
history = [
{
"role": "user",
"content": "这是什么语言?“aburaka dabura ”",
"metadata": {"username": "Chielo"},
},
{"role": "assistant", "content": "这是来自外星的语言。"},
]
old_inputs = slow_tokenizer.build_chat_input(content, history=history)
new_inputs = fast_tokenizer.build_chat_input(content, history=history)
new_text = fast_tokenizer.build_chat_text(content, history=history)
old_input_ids = old_inputs["input_ids"][0].tolist()
new_inputs["input_ids"][0].tolist() == old_input_ids
# >>> True
fast_tokenizer.encode(new_text) == old_input_ids
# >>> True
fast_tokenizer.decode(old_input_ids) == slow_tokenizer.decode(old_input_ids)
# >>> True
new_text
# >>> "<|user|><!encode-sep!>{'username': 'Chielo'}\n<!encode-sep!>这是什么语言?“aburaka dabura ”<|assistant|><!encode-sep!>\n<!encode-sep!>这是来自外星的语言。<|user|><!encode-sep!>\n<!encode-sep!>是哪个星球的呢?<|assistant|>"
chielo changed pull request title from "Add a fast tokenizer implementation and converter" to "DRAFT: Add a fast tokenizer implementation and converter"
chielo changed pull request status to closed