from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
from transformers import PreTrainedTokenizerFast
tokenizer = Tokenizer(models.BPE())
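
# ByteLevel pre-tokenizer, decoder, and post-processor: any UTF-8 input
# (Chinese text, emoji, ...) can be encoded at the byte level without unknown tokens.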
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
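
# BPE trainer: keep the full byte-level alphabet and reserve the chat-format and
# control special tokens.
# NOTE: vocab_size=0 looks like a placeholder; set the intended vocabulary size
# before a real training run.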
trainer = trainers.BpeTrainer(
    vocab_size=0,
    min_frequency=2,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=[
        "<|begin_of_sequence|>",
        "<|end_of_sequence|>",
        "<|im_start|>",
        "<|im_sep|>",
        "<|im_end|>",
        "<|semantic|>",
        "<|pad|>",
    ],
)
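
# NOTE: the empty list below is a placeholder corpus; pass an iterator over the
# actual training text to learn real BPE merges.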
tokenizer.train_from_iterator([], trainer=trainer)
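
# Quick sanity check: encode and decode a mixed English/Chinese/emoji string
# that also contains a special token.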
print(len(tokenizer.get_vocab()))
x = tokenizer.encode(
    "Hello, how are you? dfgnviadfjoiviouajeiodfjv 你好世界 🈶<|semantic|>"
).ids
print(x, len(x))
print(tokenizer.decode(x, skip_special_tokens=True))
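
# Wrap the trained tokenizer in a Hugging Face fast tokenizer and register
# the padding / BOS / EOS special tokens.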
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    pad_token="<|pad|>",
    bos_token="<|begin_of_sequence|>",
    eos_token="<|end_of_sequence|>",
)
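
# Round-trip a longer mixed English/Chinese sample through the wrapped tokenizer.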
sequence = "All around, too, lay vast quantities of the costliest merchandise, and treasures were heaped in every cranny of the rocks, but all these things only added to the desolation of the scene. 测试中文, 你好世界 🈶<|semantic|>"
encoded = tokenizer(sequence).input_ids
print("Test encoding....")
print(f"\tSentence: {sequence}")
print(f"\tEncoded: {encoded}")
print(f"\tDecoded: {tokenizer.batch_decode(encoded)}")
print(f"\tDecoded: {tokenizer.decode(encoded)}")
tokenizer.push_to_hub("fishaudio/fish-speech-1", private=True)
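
# Optional sketch: load the pushed tokenizer back from the Hub to verify the upload
# (assumes you have access to the private "fishaudio/fish-speech-1" repo).
# from transformers import AutoTokenizer
# reloaded = AutoTokenizer.from_pretrained("fishaudio/fish-speech-1")
# print(reloaded("<|semantic|>").input_ids)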