Better handling, since the xgen tokenizer breaks (raises KeyError) in convert_tokens_to_ids
Browse files
src/axolotl/prompt_tokenizers.py
CHANGED
|
@@ -48,16 +48,22 @@ class PromptTokenizingStrategy(abc.ABC):
|
|
| 48 |
|
| 49 |
@functools.lru_cache(maxsize=128)
|
| 50 |
def _get_user_token(self):
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
return False
|
| 55 |
|
| 56 |
@functools.lru_cache(maxsize=128)
|
| 57 |
def _get_assistant_token(self):
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
| 61 |
return False
|
| 62 |
|
| 63 |
def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
|
|
|
|
| 48 |
|
| 49 |
@functools.lru_cache(maxsize=128)
|
| 50 |
def _get_user_token(self):
|
| 51 |
+
try:
|
| 52 |
+
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
|
| 53 |
+
if isinstance(id_or_ids, (int,)):
|
| 54 |
+
return id_or_ids
|
| 55 |
+
except KeyError:
|
| 56 |
+
pass
|
| 57 |
return False
|
| 58 |
|
| 59 |
@functools.lru_cache(maxsize=128)
|
| 60 |
def _get_assistant_token(self):
|
| 61 |
+
try:
|
| 62 |
+
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
|
| 63 |
+
if isinstance(id_or_ids, (int,)):
|
| 64 |
+
return id_or_ids
|
| 65 |
+
except KeyError:
|
| 66 |
+
pass
|
| 67 |
return False
|
| 68 |
|
| 69 |
def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
|