DRAFT: Add a fast tokenizer implementation and converter

#8
by chielo - opened

Usage

# Usage example: checks that the fast tokenizer produces the same results as
# the slow one. Lines starting with "# >>>" record the value of the
# expression directly above them.
import transformers, tokenizers

# Library versions the recorded outputs below were produced with.
transformers.__version__, tokenizers.__version__
# >>> ('4.35.0', '0.14.1')

from transformers import AutoTokenizer

# Load the tokenizer from the current directory twice, differing only in
# use_fast, so the two implementations can be compared side by side.
fast_tokenizer = AutoTokenizer.from_pretrained(
    "./", trust_remote_code=True, use_fast=True
)
slow_tokenizer = AutoTokenizer.from_pretrained(
    "./", trust_remote_code=True, use_fast=False
)

# New user message, plus the earlier turns of the conversation. The first
# user turn also carries a "metadata" dict; the assistant turn does not.
content = "是哪个星球的呢?"
history = [
    {
        "role": "user",
        "content": "这是什么语言?“aburaka    dabura   ”",
        "metadata": {"username": "Chielo"},
    },
    {"role": "assistant", "content": "这是来自外星的语言。"},
]

# Build the chat input with both tokenizers; the fast tokenizer is also asked
# for the chat as plain text via build_chat_text.
old_inputs = slow_tokenizer.build_chat_input(content, history=history)
new_inputs = fast_tokenizer.build_chat_input(content, history=history)
new_text = fast_tokenizer.build_chat_text(content, history=history)

# Reference token ids: first row of the slow tokenizer's "input_ids".
old_input_ids = old_inputs["input_ids"][0].tolist()

# Check 1: fast build_chat_input yields the same ids as the slow tokenizer.
new_inputs["input_ids"][0].tolist() == old_input_ids
# >>> True

# Check 2: encoding the chat text with the fast tokenizer yields the same ids.
fast_tokenizer.encode(new_text) == old_input_ids
# >>> True

# Check 3: both tokenizers decode the reference ids to the same string.
fast_tokenizer.decode(old_input_ids) == slow_tokenizer.decode(old_input_ids)
# >>> True

# The chat text itself; in the recorded output each role tag is followed by
# "<!encode-sep!>"-delimited segments (metadata line, then message content).
new_text
# >>> "<|user|><!encode-sep!>{'username': 'Chielo'}\n<!encode-sep!>这是什么语言?“aburaka    dabura   ”<|assistant|><!encode-sep!>\n<!encode-sep!>这是来自外星的语言。<|user|><!encode-sep!>\n<!encode-sep!>是哪个星球的呢?<|assistant|>"
chielo changed pull request title from "Add a fast tokenizer implementation and converter" to "DRAFT: Add a fast tokenizer implementation and converter"
chielo changed pull request status to closed

Sign up or log in to comment