from .tokenization import ChineseSPTokenizer, make_tokenizer |
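
# Monkey patches for ChineseSPTokenizer: a fault-tolerant decode() wrapper and a
# get_vocab() helper, plus construction of the shared `tokenizer` instance.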
|
|
|
|
|
def DecodeIds(self, Ids, type_token=False):
    try:
        # `self.DecodeIds` resolves to the original ChineseSPTokenizer method;
        # this wrapper is only bound to `decode`, so there is no recursion here.
        decode_str = self.DecodeIds(Ids, type_token=type_token)
    except Exception as e:
        # Log the offending ids and fall back to an empty string rather than
        # letting a single bad sequence crash the caller.
        print("WARNING: failed to decode ids", Ids, e)
        decode_str = ""
    return decode_str
|
|
|
# Expose the fault-tolerant wrapper as the tokenizer's `decode` method.
ChineseSPTokenizer.decode = DecodeIds
|
|
|
|
|
# Build the ChineseSPTokenizer from the local `tokenizer.model` file; block
# symbols and task-mask tokens are enabled, decoder masks are not.
add_sentinel_token = 0
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
                           None, add_block_symbols=True, cache_dir="cache",
                           add_sentinel_token=add_sentinel_token, add_task_mask=True,
                           add_decoder_mask=False, fix_command_token=False)
|
|
|
# The tokenizer reports its size as `num_tokens`; mirror it under the more
# common `vocab_size` name so get_vocab() below can iterate over all ids.
tokenizer.vocab_size = tokenizer.num_tokens
|
|
|
|
|
|
|
|
|
def get_vocab(self, token_type="str"):
    """Return the vocabulary as a dict mapping token -> id.

    Ids that cannot be converted to a token are skipped.
    """
    vocab = {}
    for i in range(self.vocab_size):
        try:
            token = self.convert_ids_to_tokens([i])[0]
            if token is None:
                continue
            vocab[token] = i
        except Exception as e:
            # Skip ids the underlying tokenizer cannot convert, logging which
            # id failed and why.
            print("WARNING: could not convert id", i, e)
    return vocab
|
|
|
|
|
# Attach the vocabulary helper so tokenizer instances expose get_vocab() directly.
ChineseSPTokenizer.get_vocab = get_vocab
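

# Minimal usage sketch (illustrative only): it relies solely on the decode()
# and get_vocab() patches and the vocab_size attribute set above, and assumes
# "tokenizer.model" points at a valid model file. Run as a module
# (python -m ...) because of the relative import at the top.
if __name__ == "__main__":
    vocab = tokenizer.get_vocab()              # token -> id, unconvertible ids skipped
    print("recovered", len(vocab), "of", tokenizer.vocab_size, "vocab entries")
    sample_ids = sorted(vocab.values())[:10]   # a few known-good ids
    print(tokenizer.decode(sample_ids))        # fault-tolerant decode; "" on failure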
|
|
|
|
|
|
|
|