more gpt-neox long ctx fixes
src/axolotl/utils/callbacks.py
CHANGED
@@ -61,6 +61,7 @@ class SaveBetterTransformerModelCallback(
 
             model = BetterTransformer.reverse(kwargs["model"])
             model.save_pretrained(checkpoint_folder)
+            # FIXME - need to cleanup old checkpoints
 
             # since we're saving here, we don't need the trainer loop to attempt to save too b/c
             # the trainer will raise an exception since it can't save a BetterTransformer wrapped model
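Note on the FIXME above: because this callback saves the checkpoint itself and tells the trainer not to, the trainer's usual save_total_limit rotation presumably never runs, so checkpoint-<step> folders accumulate in output_dir. A minimal sketch of one possible cleanup, assuming the standard checkpoint-<step> naming; prune_old_checkpoints and keep_last are hypothetical and not part of this change:

import os
import re
import shutil


def prune_old_checkpoints(output_dir: str, keep_last: int = 2) -> None:
    """Delete all but the newest `keep_last` checkpoint-<step> directories."""
    pattern = re.compile(r"^checkpoint-(\d+)$")
    found = []
    for name in os.listdir(output_dir):
        match = pattern.match(name)
        if match and os.path.isdir(os.path.join(output_dir, name)):
            found.append((int(match.group(1)), name))
    # sort ascending by step so the most recent checkpoints survive the slice
    for _, name in sorted(found)[:-keep_last]:
        shutil.rmtree(os.path.join(output_dir, name))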
src/axolotl/utils/data.py
CHANGED
@@ -388,9 +388,13 @@ def load_prepare_datasets(
             index=cfg.dataset_shard_idx,
         )
 
-
-
-
+    if cfg.val_set_size:
+        dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
+        train_dataset = dataset["train"]
+        eval_dataset = dataset["test"]
+    else:
+        train_dataset = dataset
+        eval_dataset = None
 
     return train_dataset, eval_dataset
 
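For reference, datasets' Dataset.train_test_split with shuffle=False slices the evaluation split off the end of the prepared dataset; a float test_size is treated as a fraction of rows and an int as an absolute count. A self-contained sketch of the same pattern on a toy dataset (the toy column and the 0.04 value are illustrative, not taken from this commit):

from datasets import Dataset

# toy stand-in for the tokenized/packed dataset produced above
dataset = Dataset.from_dict({"input_ids": [[i] for i in range(100)]})

val_set_size = 0.04  # plays the role of cfg.val_set_size
if val_set_size:
    split = dataset.train_test_split(test_size=val_set_size, shuffle=False)
    train_dataset, eval_dataset = split["train"], split["test"]
else:
    train_dataset, eval_dataset = dataset, None

print(len(train_dataset), len(eval_dataset))  # 96 4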
src/axolotl/utils/models.py
CHANGED
@@ -300,6 +300,12 @@ def load_model(
     embeddings_len = math.ceil(len(tokenizer) / 32) * 32
     model.resize_token_embeddings(embeddings_len)
 
+    if cfg.sequence_len >= model.config.max_position_embeddings:
+        logging.warning(
+            f"increasing model.config.max_position_embeddings to {cfg.sequence_len}"
+        )
+        model.config.max_position_embeddings = cfg.sequence_len
+
     if not cfg.gptq and (
         (cfg.adapter == "lora" and load_in_8bit)
         or (cfg.adapter == "qlora" and cfg.load_in_4bit)
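The guard above only rewrites the config value: once cfg.sequence_len reaches or exceeds the model's original max_position_embeddings, it logs a warning and bumps the config so position-dependent setup downstream sees the longer context. A standalone sketch of the same check against a default GPT-NeoX config; maybe_extend_context is a hypothetical helper, not code from this repository:

import logging

from transformers import GPTNeoXConfig


def maybe_extend_context(config, sequence_len: int):
    # mirrors the guard added to load_model(): only ever grows the limit
    if sequence_len >= config.max_position_embeddings:
        logging.warning(
            f"increasing model.config.max_position_embeddings to {sequence_len}"
        )
        config.max_position_embeddings = sequence_len
    return config


config = GPTNeoXConfig()  # defaults to max_position_embeddings=2048
config = maybe_extend_context(config, sequence_len=8192)
print(config.max_position_embeddings)  # 8192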
src/axolotl/utils/validation.py
CHANGED
@@ -80,4 +80,11 @@ def validate_config(cfg):
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
-    # no 8bit
+    # no 8bit adamw w bf16
+
+    # GPT-NeoX
+    # evals broken when extending context len
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 162, in forward attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/optimum/bettertransformer/models/attention.py", line 74, in gpt2_wrapped_scaled_dot_product
+    # attention_mask = causal_mask + attention_mask
+    # RuntimeError: The size of tensor a (2048) must match the size of tensor b (8132) at non-singleton dimension 3
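The traceback recorded above is a broadcasting failure: optimum's BetterTransformer attention wrapper adds a cached causal mask, sized for the model's original 2048 positions, to an attention_mask built for the extended 8132-token eval batch. A minimal, tensor-only reproduction of that shape mismatch (the shapes are illustrative; this is not the optimum code path):

import torch

seq_len_cached = 2048  # size of the model's precomputed causal-bias buffer
seq_len_actual = 8132  # sequence length seen at eval after extending the context

# shapes roughly mirror [batch, heads, query_len, key_len]
causal_mask = torch.zeros(1, 1, seq_len_cached, seq_len_cached)
attention_mask = torch.zeros(1, 1, 1, seq_len_actual)

try:
    _ = causal_mask + attention_mask  # same op as `causal_mask + attention_mask` above
except RuntimeError as err:
    # prints: The size of tensor a (2048) must match the size of tensor b (8132)
    # at non-singleton dimension 3
    print(err)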
|