from __future__ import annotations

import abc
from typing import (
    List,
    Optional,
    Any,
)

import llama_cpp
class BaseLlamaTokenizer(abc.ABC):
    @abc.abstractmethod
    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize the text into tokens.

        Args:
            text: The text to tokenize.
            add_bos: Whether to add a beginning of sequence token.
            special: Whether to tokenize text literally or as special tokens."""
        raise NotImplementedError

    @abc.abstractmethod
    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        """Detokenize the tokens into text.

        Args:
            tokens: The tokens to detokenize.
            prev_tokens: If tokens is a continuation of a previous sequence, the previous tokens."""
        raise NotImplementedError
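
# Sketch of the incremental-detokenization contract defined above (assumed
# usage, not part of the original module): when streaming output, pass the
# tokens already emitted as `prev_tokens` so an implementation can resolve
# text that spans token boundaries. `stream_of_token_ids` is hypothetical.
#
#     out = b""
#     emitted: List[int] = []
#     for tok in stream_of_token_ids:
#         out += tokenizer.detokenize([tok], prev_tokens=emitted)
#         emitted.append(tok)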
class LlamaTokenizer(BaseLlamaTokenizer):
    def __init__(self, llama: llama_cpp.Llama):
        self._model = llama._model  # type: ignore

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        return self._model.tokenize(text, add_bos=add_bos, special=special)

    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        return self._model.detokenize(tokens)

    def encode(
        self, text: str, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        # Convenience wrapper: encode a str to UTF-8 bytes before tokenizing.
        return self.tokenize(
            text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
        )

    def decode(self, tokens: List[int]) -> str:
        # Convenience wrapper: detokenize to bytes, then decode as UTF-8.
        return self.detokenize(tokens).decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        # Load only the vocabulary from the model file, skipping the weights.
        return cls(llama_cpp.Llama(model_path=path, vocab_only=True))
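
# Usage sketch (illustrative, not part of the original module): build a
# vocab-only tokenizer from a local model file and round-trip some text.
# The file path is a hypothetical placeholder.
#
#     tokenizer = LlamaTokenizer.from_ggml_file("models/model.gguf")
#     ids = tokenizer.encode("Hello, world!")
#     text = tokenizer.decode(ids)  # approximately round-trips the input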
class LlamaHFTokenizer(BaseLlamaTokenizer):
    def __init__(self, hf_tokenizer: Any):
        self.hf_tokenizer = hf_tokenizer

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        return self.hf_tokenizer.encode(
            text.decode("utf-8", errors="ignore"), add_special_tokens=special
        )

    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        if prev_tokens is not None:
            # Decode the previous and combined sequences separately, then
            # return only the new suffix. This keeps the byte output stable
            # when a token's text depends on the tokens that precede it.
            text = self.hf_tokenizer.decode(prev_tokens + tokens).encode(
                "utf-8", errors="ignore"
            )
            prev_text = self.hf_tokenizer.decode(prev_tokens).encode(
                "utf-8", errors="ignore"
            )
            return text[len(prev_text) :]
        else:
            return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library is required to use the `LlamaHFTokenizer`. "
                "You can install it with `pip install transformers`."
            )
        hf_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path
        )
        return cls(hf_tokenizer)
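
# Usage sketch (illustrative, not part of the original module): wrap a
# Hugging Face tokenizer so token ids and bytes round-trip through
# `transformers`. The checkpoint name is a hypothetical placeholder.
#
#     hf = LlamaHFTokenizer.from_pretrained("some-org/some-model")
#     ids = hf.tokenize(b"Hello, world!")
#     text = hf.detokenize(ids)  # -> b"Hello, world!" (tokenizer-dependent)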