Update data.py for signature generation (#851)
Browse files
* Update data.py
A change to the conversation formatting type should also trigger regeneration of the preprocessed dataset, so the conversation type should be part of the signature.
* chore: lint
---------
Co-authored-by: Wing Lian <[email protected]>
src/axolotl/utils/data.py
CHANGED
|
@@ -99,7 +99,12 @@ def load_tokenized_prepared_datasets(
|
|
| 99 |
str(cfg.sequence_len)
|
| 100 |
+ "@"
|
| 101 |
+ "|".join(
|
| 102 |
-
sorted(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
)
|
| 104 |
+ "|"
|
| 105 |
+ tokenizer_name
|
|
|
|
| 99 |
str(cfg.sequence_len)
|
| 100 |
+ "@"
|
| 101 |
+ "|".join(
|
| 102 |
+
sorted(
|
| 103 |
+
[
|
| 104 |
+
f"{d.path}:{d.type}:{d.shards}:{d.conversation}"
|
| 105 |
+
for d in cfg.datasets
|
| 106 |
+
]
|
| 107 |
+
)
|
| 108 |
)
|
| 109 |
+ "|"
|
| 110 |
+ tokenizer_name
|