fix eval_steps to be a sane default (#797)
* fix eval_steps to be a sane default
* update docs for fractional eval_steps
- README.md +2 -2
- examples/cerebras/qlora.yml +1 -1
- examples/code-llama/13b/lora.yml +2 -2
- examples/code-llama/13b/qlora.yml +2 -2
- examples/code-llama/34b/lora.yml +2 -2
- examples/code-llama/34b/qlora.yml +2 -2
- examples/code-llama/7b/lora.yml +2 -2
- examples/code-llama/7b/qlora.yml +2 -2
- examples/falcon/config-7b-qlora.yml +1 -1
- examples/gptj/qlora.yml +1 -1
- examples/jeopardy-bot/config.yml +1 -1
- examples/llama-2/gptq-lora.yml +1 -1
- examples/llama-2/lora.yml +2 -2
- examples/llama-2/qlora.yml +2 -2
- examples/llama-2/relora.yml +2 -2
- examples/llama-2/tiny-llama.yml +2 -2
- examples/mistral/config.yml +2 -2
- examples/mistral/qlora.yml +1 -1
- examples/mpt-7b/config.yml +1 -1
- examples/pythia/lora.yml +2 -2
- examples/redpajama/config-3b.yml +1 -1
- examples/replit-3b/config-lora.yml +1 -1
- examples/xgen-7b/xgen-7b-8k-qlora.yml +1 -1
README.md
CHANGED
|
@@ -618,14 +618,14 @@ gradient_accumulation_steps: 1
|
|
| 618 |
# The number of samples to include in each batch. This is the number of samples sent to each GPU.
|
| 619 |
micro_batch_size: 2
|
| 620 |
eval_batch_size:
|
| 621 |
-
num_epochs:
|
| 622 |
warmup_steps: 100
|
| 623 |
learning_rate: 0.00003
|
| 624 |
lr_quadratic_warmup:
|
| 625 |
logging_steps:
|
| 626 |
save_strategy: # Set to `no` to skip checkpoint saves
|
| 627 |
save_steps: # Leave empty to save at each epoch
|
| 628 |
-
eval_steps: # Leave empty to eval at each epoch
|
| 629 |
save_total_limit: # Checkpoints saved at a time
|
| 630 |
# Maximum number of iterations to train for. It precedes num_epochs which means that
|
| 631 |
# if both are set, num_epochs will not be guaranteed.
|
|
|
|
| 618 |
# The number of samples to include in each batch. This is the number of samples sent to each GPU.
|
| 619 |
micro_batch_size: 2
|
| 620 |
eval_batch_size:
|
| 621 |
+
num_epochs: 4
|
| 622 |
warmup_steps: 100
|
| 623 |
learning_rate: 0.00003
|
| 624 |
lr_quadratic_warmup:
|
| 625 |
logging_steps:
|
| 626 |
save_strategy: # Set to `no` to skip checkpoint saves
|
| 627 |
save_steps: # Leave empty to save at each epoch
|
| 628 |
+
eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
|
| 629 |
save_total_limit: # Checkpoints saved at a time
|
| 630 |
# Maximum number of iterations to train for. It precedes num_epochs which means that
|
| 631 |
# if both are set, num_epochs will not be guaranteed.
|
examples/cerebras/qlora.yml
CHANGED
|
@@ -49,7 +49,7 @@ flash_attention:
|
|
| 49 |
gptq_groupsize:
|
| 50 |
gptq_model_v1:
|
| 51 |
warmup_steps: 10
|
| 52 |
-
eval_steps:
|
| 53 |
save_steps:
|
| 54 |
debug:
|
| 55 |
deepspeed:
|
|
|
|
| 49 |
gptq_groupsize:
|
| 50 |
gptq_model_v1:
|
| 51 |
warmup_steps: 10
|
| 52 |
+
eval_steps: 0.05
|
| 53 |
save_steps:
|
| 54 |
debug:
|
| 55 |
deepspeed:
|
examples/code-llama/13b/lora.yml
CHANGED
|
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
-
num_epochs:
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
eval_steps:
|
| 58 |
save_steps:
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
|
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
+
num_epochs: 4
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
eval_steps: 0.05
|
| 58 |
save_steps:
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
examples/code-llama/13b/qlora.yml
CHANGED
|
@@ -36,7 +36,7 @@ wandb_log_model:
|
|
| 36 |
|
| 37 |
gradient_accumulation_steps: 4
|
| 38 |
micro_batch_size: 2
|
| 39 |
-
num_epochs:
|
| 40 |
optimizer: paged_adamw_32bit
|
| 41 |
lr_scheduler: cosine
|
| 42 |
learning_rate: 0.0002
|
|
@@ -56,7 +56,7 @@ xformers_attention:
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
eval_steps:
|
| 60 |
save_steps:
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
|
|
|
| 36 |
|
| 37 |
gradient_accumulation_steps: 4
|
| 38 |
micro_batch_size: 2
|
| 39 |
+
num_epochs: 4
|
| 40 |
optimizer: paged_adamw_32bit
|
| 41 |
lr_scheduler: cosine
|
| 42 |
learning_rate: 0.0002
|
|
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
eval_steps: 0.05
|
| 60 |
save_steps:
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
examples/code-llama/34b/lora.yml
CHANGED
|
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
-
num_epochs:
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
eval_steps:
|
| 58 |
save_steps:
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
|
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
+
num_epochs: 4
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
eval_steps: 0.05
|
| 58 |
save_steps:
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
examples/code-llama/34b/qlora.yml
CHANGED
|
@@ -36,7 +36,7 @@ wandb_log_model:
|
|
| 36 |
|
| 37 |
gradient_accumulation_steps: 4
|
| 38 |
micro_batch_size: 2
|
| 39 |
-
num_epochs:
|
| 40 |
optimizer: paged_adamw_32bit
|
| 41 |
lr_scheduler: cosine
|
| 42 |
learning_rate: 0.0002
|
|
@@ -56,7 +56,7 @@ xformers_attention:
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
eval_steps:
|
| 60 |
save_steps:
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
|
|
|
| 36 |
|
| 37 |
gradient_accumulation_steps: 4
|
| 38 |
micro_batch_size: 2
|
| 39 |
+
num_epochs: 4
|
| 40 |
optimizer: paged_adamw_32bit
|
| 41 |
lr_scheduler: cosine
|
| 42 |
learning_rate: 0.0002
|
|
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
eval_steps: 0.05
|
| 60 |
save_steps:
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
examples/code-llama/7b/lora.yml
CHANGED
|
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
-
num_epochs:
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
eval_steps:
|
| 58 |
save_steps:
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
|
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
+
num_epochs: 4
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
eval_steps: 0.05
|
| 58 |
save_steps:
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
examples/code-llama/7b/qlora.yml
CHANGED
|
@@ -36,7 +36,7 @@ wandb_log_model:
|
|
| 36 |
|
| 37 |
gradient_accumulation_steps: 4
|
| 38 |
micro_batch_size: 2
|
| 39 |
-
num_epochs:
|
| 40 |
optimizer: paged_adamw_32bit
|
| 41 |
lr_scheduler: cosine
|
| 42 |
learning_rate: 0.0002
|
|
@@ -56,7 +56,7 @@ xformers_attention:
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
eval_steps:
|
| 60 |
save_steps:
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
|
|
|
| 36 |
|
| 37 |
gradient_accumulation_steps: 4
|
| 38 |
micro_batch_size: 2
|
| 39 |
+
num_epochs: 4
|
| 40 |
optimizer: paged_adamw_32bit
|
| 41 |
lr_scheduler: cosine
|
| 42 |
learning_rate: 0.0002
|
|
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
eval_steps: 0.05
|
| 60 |
save_steps:
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
examples/falcon/config-7b-qlora.yml
CHANGED
|
@@ -53,7 +53,7 @@ output_dir: ./qlora-out
|
|
| 53 |
# decrease if OOM, increase for max VRAM utilization
|
| 54 |
micro_batch_size: 1
|
| 55 |
gradient_accumulation_steps: 2
|
| 56 |
-
num_epochs:
|
| 57 |
# Optimizer for QLoRA
|
| 58 |
optimizer: paged_adamw_32bit
|
| 59 |
torchdistx_path:
|
|
|
|
| 53 |
# decrease if OOM, increase for max VRAM utilization
|
| 54 |
micro_batch_size: 1
|
| 55 |
gradient_accumulation_steps: 2
|
| 56 |
+
num_epochs: 4
|
| 57 |
# Optimizer for QLoRA
|
| 58 |
optimizer: paged_adamw_32bit
|
| 59 |
torchdistx_path:
|
examples/gptj/qlora.yml
CHANGED
|
@@ -46,7 +46,7 @@ flash_attention:
|
|
| 46 |
gptq_groupsize:
|
| 47 |
gptq_model_v1:
|
| 48 |
warmup_steps: 10
|
| 49 |
-
eval_steps:
|
| 50 |
save_steps:
|
| 51 |
debug:
|
| 52 |
deepspeed:
|
|
|
|
| 46 |
gptq_groupsize:
|
| 47 |
gptq_model_v1:
|
| 48 |
warmup_steps: 10
|
| 49 |
+
eval_steps: 0.05
|
| 50 |
save_steps:
|
| 51 |
debug:
|
| 52 |
deepspeed:
|
examples/jeopardy-bot/config.yml
CHANGED
|
@@ -24,7 +24,7 @@ wandb_log_model:
|
|
| 24 |
output_dir: ./jeopardy-bot-7b
|
| 25 |
gradient_accumulation_steps: 1
|
| 26 |
micro_batch_size: 1
|
| 27 |
-
num_epochs:
|
| 28 |
optimizer: adamw_bnb_8bit
|
| 29 |
torchdistx_path:
|
| 30 |
lr_scheduler: cosine
|
|
|
|
| 24 |
output_dir: ./jeopardy-bot-7b
|
| 25 |
gradient_accumulation_steps: 1
|
| 26 |
micro_batch_size: 1
|
| 27 |
+
num_epochs: 4
|
| 28 |
optimizer: adamw_bnb_8bit
|
| 29 |
torchdistx_path:
|
| 30 |
lr_scheduler: cosine
|
examples/llama-2/gptq-lora.yml
CHANGED
|
@@ -37,7 +37,7 @@ wandb_log_model:
|
|
| 37 |
output_dir: ./model-out
|
| 38 |
gradient_accumulation_steps: 1
|
| 39 |
micro_batch_size: 1
|
| 40 |
-
num_epochs:
|
| 41 |
optimizer: adamw_torch
|
| 42 |
adam_beta2: 0.95
|
| 43 |
adam_eps: 0.00001
|
|
|
|
| 37 |
output_dir: ./model-out
|
| 38 |
gradient_accumulation_steps: 1
|
| 39 |
micro_batch_size: 1
|
| 40 |
+
num_epochs: 4
|
| 41 |
optimizer: adamw_torch
|
| 42 |
adam_beta2: 0.95
|
| 43 |
adam_eps: 0.00001
|
examples/llama-2/lora.yml
CHANGED
|
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
-
num_epochs:
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
eval_steps:
|
| 58 |
eval_table_size:
|
| 59 |
eval_table_max_new_tokens: 128
|
| 60 |
save_steps:
|
|
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
+
num_epochs: 4
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
eval_steps: 0.05
|
| 58 |
eval_table_size:
|
| 59 |
eval_table_max_new_tokens: 128
|
| 60 |
save_steps:
|
examples/llama-2/qlora.yml
CHANGED
|
@@ -36,7 +36,7 @@ wandb_log_model:
|
|
| 36 |
|
| 37 |
gradient_accumulation_steps: 4
|
| 38 |
micro_batch_size: 2
|
| 39 |
-
num_epochs:
|
| 40 |
optimizer: paged_adamw_32bit
|
| 41 |
lr_scheduler: cosine
|
| 42 |
learning_rate: 0.0002
|
|
@@ -56,7 +56,7 @@ xformers_attention:
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
eval_steps:
|
| 60 |
eval_table_size:
|
| 61 |
save_steps:
|
| 62 |
debug:
|
|
|
|
| 36 |
|
| 37 |
gradient_accumulation_steps: 4
|
| 38 |
micro_batch_size: 2
|
| 39 |
+
num_epochs: 4
|
| 40 |
optimizer: paged_adamw_32bit
|
| 41 |
lr_scheduler: cosine
|
| 42 |
learning_rate: 0.0002
|
|
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
eval_steps: 0.05
|
| 60 |
eval_table_size:
|
| 61 |
save_steps:
|
| 62 |
debug:
|
examples/llama-2/relora.yml
CHANGED
|
@@ -40,7 +40,7 @@ wandb_log_model:
|
|
| 40 |
|
| 41 |
gradient_accumulation_steps: 4
|
| 42 |
micro_batch_size: 4
|
| 43 |
-
num_epochs:
|
| 44 |
optimizer: adamw_bnb_8bit
|
| 45 |
lr_scheduler: cosine
|
| 46 |
learning_rate: 0.0002
|
|
@@ -60,7 +60,7 @@ xformers_attention:
|
|
| 60 |
flash_attention: true
|
| 61 |
|
| 62 |
warmup_steps: 10
|
| 63 |
-
eval_steps:
|
| 64 |
save_steps: 50
|
| 65 |
debug:
|
| 66 |
deepspeed:
|
|
|
|
| 40 |
|
| 41 |
gradient_accumulation_steps: 4
|
| 42 |
micro_batch_size: 4
|
| 43 |
+
num_epochs: 4
|
| 44 |
optimizer: adamw_bnb_8bit
|
| 45 |
lr_scheduler: cosine
|
| 46 |
learning_rate: 0.0002
|
|
|
|
| 60 |
flash_attention: true
|
| 61 |
|
| 62 |
warmup_steps: 10
|
| 63 |
+
eval_steps: 0.05
|
| 64 |
save_steps: 50
|
| 65 |
debug:
|
| 66 |
deepspeed:
|
examples/llama-2/tiny-llama.yml
CHANGED
|
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
-
num_epochs:
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
eval_steps:
|
| 58 |
eval_table_size:
|
| 59 |
save_steps:
|
| 60 |
debug:
|
|
|
|
| 34 |
|
| 35 |
gradient_accumulation_steps: 4
|
| 36 |
micro_batch_size: 2
|
| 37 |
+
num_epochs: 4
|
| 38 |
optimizer: adamw_bnb_8bit
|
| 39 |
lr_scheduler: cosine
|
| 40 |
learning_rate: 0.0002
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
eval_steps: 0.05
|
| 58 |
eval_table_size:
|
| 59 |
save_steps:
|
| 60 |
debug:
|
examples/mistral/config.yml
CHANGED
|
@@ -26,7 +26,7 @@ wandb_log_model:
|
|
| 26 |
|
| 27 |
gradient_accumulation_steps: 4
|
| 28 |
micro_batch_size: 2
|
| 29 |
-
num_epochs:
|
| 30 |
optimizer: adamw_bnb_8bit
|
| 31 |
lr_scheduler: cosine
|
| 32 |
learning_rate: 0.000005
|
|
@@ -46,7 +46,7 @@ xformers_attention:
|
|
| 46 |
flash_attention: true
|
| 47 |
|
| 48 |
warmup_steps: 10
|
| 49 |
-
eval_steps:
|
| 50 |
eval_table_size:
|
| 51 |
eval_table_max_new_tokens: 128
|
| 52 |
save_steps:
|
|
|
|
| 26 |
|
| 27 |
gradient_accumulation_steps: 4
|
| 28 |
micro_batch_size: 2
|
| 29 |
+
num_epochs: 4
|
| 30 |
optimizer: adamw_bnb_8bit
|
| 31 |
lr_scheduler: cosine
|
| 32 |
learning_rate: 0.000005
|
|
|
|
| 46 |
flash_attention: true
|
| 47 |
|
| 48 |
warmup_steps: 10
|
| 49 |
+
eval_steps: 0.05
|
| 50 |
eval_table_size:
|
| 51 |
eval_table_max_new_tokens: 128
|
| 52 |
save_steps:
|
examples/mistral/qlora.yml
CHANGED
|
@@ -63,7 +63,7 @@ xformers_attention:
|
|
| 63 |
flash_attention: true
|
| 64 |
|
| 65 |
warmup_steps: 10
|
| 66 |
-
eval_steps:
|
| 67 |
eval_table_size:
|
| 68 |
eval_table_max_new_tokens: 128
|
| 69 |
save_steps:
|
|
|
|
| 63 |
flash_attention: true
|
| 64 |
|
| 65 |
warmup_steps: 10
|
| 66 |
+
eval_steps: 0.05
|
| 67 |
eval_table_size:
|
| 68 |
eval_table_max_new_tokens: 128
|
| 69 |
save_steps:
|
examples/mpt-7b/config.yml
CHANGED
|
@@ -26,7 +26,7 @@ wandb_log_model:
|
|
| 26 |
output_dir: ./mpt-alpaca-7b
|
| 27 |
gradient_accumulation_steps: 1
|
| 28 |
micro_batch_size: 1
|
| 29 |
-
num_epochs:
|
| 30 |
optimizer: adamw_bnb_8bit
|
| 31 |
torchdistx_path:
|
| 32 |
lr_scheduler: cosine
|
|
|
|
| 26 |
output_dir: ./mpt-alpaca-7b
|
| 27 |
gradient_accumulation_steps: 1
|
| 28 |
micro_batch_size: 1
|
| 29 |
+
num_epochs: 4
|
| 30 |
optimizer: adamw_bnb_8bit
|
| 31 |
torchdistx_path:
|
| 32 |
lr_scheduler: cosine
|
examples/pythia/lora.yml
CHANGED
|
@@ -23,7 +23,7 @@ wandb_log_model:
|
|
| 23 |
output_dir: ./lora-alpaca-pythia
|
| 24 |
gradient_accumulation_steps: 1
|
| 25 |
micro_batch_size: 4
|
| 26 |
-
num_epochs:
|
| 27 |
learning_rate: 0.00001
|
| 28 |
train_on_inputs: false
|
| 29 |
group_by_length: false
|
|
@@ -33,5 +33,5 @@ early_stopping_patience:
|
|
| 33 |
resume_from_checkpoint:
|
| 34 |
local_rank:
|
| 35 |
weight_decay: 0.1
|
| 36 |
-
eval_steps:
|
| 37 |
logging_steps: 1
|
|
|
|
| 23 |
output_dir: ./lora-alpaca-pythia
|
| 24 |
gradient_accumulation_steps: 1
|
| 25 |
micro_batch_size: 4
|
| 26 |
+
num_epochs: 4
|
| 27 |
learning_rate: 0.00001
|
| 28 |
train_on_inputs: false
|
| 29 |
group_by_length: false
|
|
|
|
| 33 |
resume_from_checkpoint:
|
| 34 |
local_rank:
|
| 35 |
weight_decay: 0.1
|
| 36 |
+
eval_steps: 0.05
|
| 37 |
logging_steps: 1
|
examples/redpajama/config-3b.yml
CHANGED
|
@@ -27,7 +27,7 @@ wandb_log_model:
|
|
| 27 |
output_dir: ./redpajama-alpaca-3b
|
| 28 |
batch_size: 4
|
| 29 |
micro_batch_size: 1
|
| 30 |
-
num_epochs:
|
| 31 |
optimizer: adamw_bnb_8bit
|
| 32 |
torchdistx_path:
|
| 33 |
lr_scheduler: cosine
|
|
|
|
| 27 |
output_dir: ./redpajama-alpaca-3b
|
| 28 |
batch_size: 4
|
| 29 |
micro_batch_size: 1
|
| 30 |
+
num_epochs: 4
|
| 31 |
optimizer: adamw_bnb_8bit
|
| 32 |
torchdistx_path:
|
| 33 |
lr_scheduler: cosine
|
examples/replit-3b/config-lora.yml
CHANGED
|
@@ -26,7 +26,7 @@ wandb_log_model:
|
|
| 26 |
output_dir: ./lora-replit
|
| 27 |
batch_size: 8
|
| 28 |
micro_batch_size: 1
|
| 29 |
-
num_epochs:
|
| 30 |
optimizer:
|
| 31 |
torchdistx_path:
|
| 32 |
lr_scheduler:
|
|
|
|
| 26 |
output_dir: ./lora-replit
|
| 27 |
batch_size: 8
|
| 28 |
micro_batch_size: 1
|
| 29 |
+
num_epochs: 4
|
| 30 |
optimizer:
|
| 31 |
torchdistx_path:
|
| 32 |
lr_scheduler:
|
examples/xgen-7b/xgen-7b-8k-qlora.yml
CHANGED
|
@@ -51,7 +51,7 @@ output_dir: ./qlora-out
|
|
| 51 |
# decrease if OOM, increase for max VRAM utilization
|
| 52 |
micro_batch_size: 1
|
| 53 |
gradient_accumulation_steps: 1
|
| 54 |
-
num_epochs:
|
| 55 |
# Optimizer for QLoRA
|
| 56 |
optimizer: paged_adamw_32bit
|
| 57 |
torchdistx_path:
|
|
|
|
| 51 |
# decrease if OOM, increase for max VRAM utilization
|
| 52 |
micro_batch_size: 1
|
| 53 |
gradient_accumulation_steps: 1
|
| 54 |
+
num_epochs: 4
|
| 55 |
# Optimizer for QLoRA
|
| 56 |
optimizer: paged_adamw_32bit
|
| 57 |
torchdistx_path:
|