Tom Jobbins
committed on
Debug tokenization output: Add ability to output text only (no tokens), and/or specify num samples to see (#511)
- scripts/finetune.py +6 -1
- src/axolotl/common/cli.py +2 -0
- src/axolotl/utils/tokenization.py +8 -6
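The two new options are defined on the TrainerCliArgs dataclass (src/axolotl/common/cli.py below) and threaded through load_datasets into check_dataset_labels. A minimal sketch of how they combine, assuming TrainerCliArgs is constructed directly; the entrypoint that actually builds it is not part of this diff:

from axolotl.common.cli import TrainerCliArgs

# With debug enabled, load_datasets samples debug_num_examples random rows and
# prints them via check_dataset_labels; debug_text_only drops the per-token
# (label_id, mask, input_id) annotations and keeps just the decoded text.
cli_args = TrainerCliArgs(debug=True, debug_text_only=True, debug_num_examples=3)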
scripts/finetune.py
CHANGED
@@ -246,9 +246,14 @@ def load_datasets(
         LOG.info("check_dataset_labels...")
         check_dataset_labels(
             train_dataset.select(
-                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
+                [
+                    random.randrange(0, len(train_dataset) - 1)  # nosec
+                    for _ in range(cli_args.debug_num_examples)
+                ]
             ),
             tokenizer,
+            num_examples=cli_args.debug_num_examples,
+            text_only=cli_args.debug_text_only,
         )
 
     return TrainDatasetMeta(
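For reference, the sampling pattern above in isolation, with a toy Hugging Face Dataset standing in for train_dataset (a sketch, not part of the change; repeated indices are possible since randrange is drawn independently each time):

import random

from datasets import Dataset

train_dataset = Dataset.from_dict({"input_ids": [[1], [2], [3], [4], [5], [6]]})
debug_num_examples = 3

indices = [
    random.randrange(0, len(train_dataset) - 1)  # nosec - debug-only sampling
    for _ in range(debug_num_examples)
]
subset = train_dataset.select(indices)  # select() accepts repeated indices
print(len(subset))  # 3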
src/axolotl/common/cli.py
CHANGED
@@ -21,6 +21,8 @@ class TrainerCliArgs:
     """
 
     debug: bool = field(default=False)
+    debug_text_only: bool = field(default=False)
+    debug_num_examples: int = field(default=5)
     inference: bool = field(default=False)
     merge_lora: bool = field(default=False)
     prepare_ds_only: bool = field(default=False)
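The defaults keep the old behaviour (5 examples, annotations on) unless overridden. Because TrainerCliArgs is a plain dataclass, the new fields also surface as command-line options wherever it is fed to an argument parser; a sketch assuming transformers.HfArgumentParser, since the parser axolotl actually uses is not shown in this diff:

from transformers import HfArgumentParser

from axolotl.common.cli import TrainerCliArgs

parser = HfArgumentParser(TrainerCliArgs)
(cli_args,) = parser.parse_args_into_dataclasses(
    args=["--debug", "True", "--debug_text_only", "True", "--debug_num_examples", "3"]
)
print(cli_args.debug_text_only, cli_args.debug_num_examples)  # True 3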
src/axolotl/utils/tokenization.py
CHANGED
@@ -8,13 +8,13 @@ from termcolor import colored
 LOG = logging.getLogger("axolotl")
 
 
-def check_dataset_labels(dataset, tokenizer):
+def check_dataset_labels(dataset, tokenizer, num_examples=5, text_only=False):
     # the dataset is already shuffled, so let's just check the first 5 elements
-    for idx in range(5):
-        check_example_labels(dataset[idx], tokenizer)
+    for idx in range(num_examples):
+        check_example_labels(dataset[idx], tokenizer, text_only=text_only)
 
 
-def check_example_labels(example, tokenizer):
+def check_example_labels(example, tokenizer, text_only=False):
     # Get the input_ids, labels, and attention_mask from the dataset
     input_ids = example["input_ids"]
     labels = example["labels"]
@@ -29,8 +29,10 @@ def check_example_labels(example, tokenizer):
         decoded_input_token = tokenizer.decode(input_id)
         # Choose the color based on whether the label has the ignore value or not
         color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
-        colored_token = colored(decoded_input_token, color) + colored(
-            f"({label_id}, {mask}, {input_id})", "white"
+        colored_token = colored(decoded_input_token, color) + (
+            not text_only
+            and colored(f"({label_id}, {mask}, {input_id})", "white")
+            or ""
         )
         colored_tokens.append(colored_token)
 
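The new suffix logic in the last hunk uses the and/or conditional idiom: not text_only and colored(...) or "" yields the white (label_id, mask, input_id) annotation when text_only is False and an empty string otherwise (safe here because the annotation is never an empty string). A standalone sketch using a hypothetical token_repr helper, not a function from the codebase:

from termcolor import colored

def token_repr(decoded_input_token, label_id, mask, input_id, text_only=False):
    # Same coloring rule as check_example_labels: red for ignored labels (-100),
    # yellow for label 0, green otherwise.
    color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
    return colored(decoded_input_token, color) + (
        not text_only
        and colored(f"({label_id}, {mask}, {input_id})", "white")
        or ""
    )

print(token_repr("Hello", -100, 1, 15043))                  # token + (label, mask, id) suffix
print(token_repr("Hello", -100, 1, 15043, text_only=True))  # decoded token text only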