Upload tokenizer
#1
by
ArthurZ
HF staff
- opened
No description provided.
This is the git diff to support this. Use LlamaTokenizer.from_pretrained("tokenizer.model.v3", legacy=False)
, then convert it to a fast tokenizer.
diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index 4b0a53b704..a1346d4c0c 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -1385,6 +1385,7 @@ class LlamaConverter(SpmConverter):
AddedToken(self.original_tokenizer.convert_ids_to_tokens(2), normalized=False, special=True),
]
)
+ tokenizer.add_special_tokens(list(proto.trainer_spec.control_symbols))
else:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
patrickvonplaten
changed pull request status to
merged