better handling of empty input ids when tokenizing (#395)
Browse files
* better handling of empty input ids when tokenizing
* Add warning if tokenizer resulted in empty result
* fix len comparison for linter
src/axolotl/prompt_tokenizers.py
CHANGED
|
@@ -74,8 +74,11 @@ class PromptTokenizingStrategy(abc.ABC):
|
|
| 74 |
padding=False,
|
| 75 |
return_tensors=None,
|
| 76 |
)
|
|
|
|
|
|
|
| 77 |
if (
|
| 78 |
-
result["input_ids"]
|
|
|
|
| 79 |
and len(result["input_ids"]) < self.sequence_len
|
| 80 |
and add_eos_token
|
| 81 |
):
|
|
|
|
| 74 |
padding=False,
|
| 75 |
return_tensors=None,
|
| 76 |
)
|
| 77 |
+
if len(result["input_ids"]) == 0:
|
| 78 |
+
LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
|
| 79 |
if (
|
| 80 |
+
len(result["input_ids"]) > 0
|
| 81 |
+
and result["input_ids"][-1] != self.tokenizer.eos_token_id
|
| 82 |
and len(result["input_ids"]) < self.sequence_len
|
| 83 |
and add_eos_token
|
| 84 |
):
|