| """ 封装 sentencepiece.SentencePieceProcessor,以便符合transformers中的tokenizer标准 | |
| ## reference | |
| ## usage | |
| - grok | |
| """ | |
import sentencepiece as spm

from transformers import PreTrainedTokenizer


class SPTokenizerWrapper(PreTrainedTokenizer):
    """
    ## impl in PreTrainedTokenizer
    - convert_ids_to_tokens
    """
    def __init__(self, vocab_file):
        self.vocab_file = vocab_file
        # Load the SentencePiece model before calling super().__init__(),
        # since PreTrainedTokenizer may query the vocabulary during init.
        self.sp_model = spm.SentencePieceProcessor(model_file=self.vocab_file)
        super().__init__()
    @property
    def vocab_size(self):
        """Returns vocab size."""
        return self.sp_model.get_piece_size()
    def get_vocab(self):
        """Returns vocab as a dict."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        return vocab
    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.sp_model.id_to_piece(index)
    # def convert_ids_to_tokens(self, ids, skip_special_tokens=False):  # impl in PreTrainedTokenizer
    def encode(self, *args, **kwargs):
        # Drop transformers-specific kwargs that SentencePiece does not accept.
        kwargs.pop("add_special_tokens", None)
        kwargs.pop("allowed_special", None)
        return self.sp_model.Encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        kwargs.pop("skip_special_tokens", None)
        return self.sp_model.Decode(*args, **kwargs)
# PreTrainedTokenizer.convert_ids_to_tokens
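

if __name__ == "__main__":
    # A minimal usage sketch. The model path below is an assumption for
    # illustration, not part of this module; point it at any trained
    # SentencePiece *.model file.
    tokenizer = SPTokenizerWrapper("tokenizer.model")
    ids = tokenizer.encode("hello world")          # raw SentencePiece ids (list[int])
    print(ids)
    print(tokenizer.decode(ids))                   # round-trips to "hello world"
    print(tokenizer.convert_ids_to_tokens(ids))    # inherited from PreTrainedTokenizer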