| """ | |
| ## adapt to transformer tokenizer | |
| https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/tokenization_utils.py#L379 | |
| ## usage | |
| - grok | |
| ## 风险评估 | |
| - 可能会干扰 sentencepiece.SentencePieceProcessor的正常使用,比如 .vocab_size 原来是个方法,patch后是个property | |
| ## TODO | |
| 不用patch,改用wrapper。常见的 tokenizer通常是封装的 sentencepiece, | |
| """ | |
import sentencepiece


def vocab_size(self):
    """Returns vocab size"""
    return self.get_piece_size()


def get_vocab(self):
    """Returns vocab as a dict"""
    vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
    # vocab.update(self.added_tokens_encoder)
    return vocab


def _tokenize(self, text):
    """Returns a tokenized string."""
    return self.encode(text, out_type=str)

def _convert_token_to_id(self, token):
    """Converts a token (str) to an id using the vocab."""
    return self.piece_to_id(token)


def _convert_id_to_token(self, index):
    """Converts an index (integer) to a token (str) using the vocab."""
    token = self.IdToPiece(index)
    return token

def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
    """Copied from transformers.PreTrainedTokenizer.

    Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
    added tokens.

    Args:
        ids (`int` or `List[int]`):
            The token id (or token ids) to convert to tokens.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.

    Returns:
        `str` or `List[str]`: The decoded token(s).
    """
    self._added_tokens_decoder = {}  # added by xs: a plain SentencePieceProcessor has no added tokens
    if isinstance(ids, int):
        if ids in self._added_tokens_decoder:
            return self._added_tokens_decoder[ids].content
        else:
            return self._convert_id_to_token(ids)
    tokens = []
    for index in ids:
        index = int(index)
        # note: all_special_ids is not defined on SentencePieceProcessor itself,
        # so skip_special_tokens=True only works if the caller sets that attribute
        if skip_special_tokens and index in self.all_special_ids:
            continue
        if index in self._added_tokens_decoder:
            tokens.append(self._added_tokens_decoder[index].content)
        else:
            tokens.append(self._convert_id_to_token(index))
    return tokens

def encode(self, *args, **kwargs):
    """
    add_special_tokens is accepted and dropped for compatibility with HF tokenizers.
    """
    kwargs.pop("add_special_tokens", None)
    kwargs.pop("allowed_special", None)  # also drop the tiktoken-style kwarg
    return self.Encode(*args, **kwargs)

def decode(self, *args, **kwargs):
    kwargs.pop("skip_special_tokens", None)
    return self.Decode(*args, **kwargs)


sentencepiece.SentencePieceProcessor.vocab_size = property(vocab_size)  # a property after the patch, per the risk note above
sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
# sentencepiece.SentencePieceProcessor.tokenize = _tokenize
sentencepiece.SentencePieceProcessor.encode = encode
sentencepiece.SentencePieceProcessor.decode = decode
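

# --- Usage sketch -------------------------------------------------------------
# A minimal, hedged example of the patched interface. Importing this module is
# what applies the monkey patch; afterwards SentencePieceProcessor also exposes
# the HF-style vocab_size / get_vocab / convert_ids_to_tokens / encode / decode
# surface. "tokenizer.model" is a placeholder path, not a file shipped here --
# point it at any SentencePiece model file.
if __name__ == "__main__":
    sp = sentencepiece.SentencePieceProcessor(model_file="tokenizer.model")  # placeholder path
    print(sp.vocab_size)                                       # a property after the patch, not a method call
    ids = sp.encode("hello world", add_special_tokens=False)   # HF-style kwarg is accepted and ignored
    print(sp.convert_ids_to_tokens(ids))                       # ids -> piece strings, HF-style
    print(sp.decode(ids, skip_special_tokens=True))            # HF-style kwarg is accepted and ignored
    print(len(sp.get_vocab()))                                 # piece -> id dict, same size as vocab_size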