Merge pull request #130 from OpenAccess-AI-Collective/gas
Swap batch size for gradient accumulation steps to decouple the effective batch size from the number of GPUs.
- README.md +1 -1
- configs/cerebras_1_3B_alpaca.yml +1 -1
- configs/galactica_1_3B.yml +1 -1
- configs/gpt_neox_20b.yml +1 -1
- configs/llama_13B_alpaca.yml +1 -1
- configs/llama_65B_alpaca.yml +1 -1
- configs/llama_7B_4bit.yml +1 -1
- configs/llama_7B_alpaca.yml +1 -1
- configs/llama_7B_jeopardy.yml +1 -1
- configs/pythia_1_2B_alpaca.yml +1 -1
- configs/quickstart.yml +1 -1
- configs/sample.yml +2 -1
- configs/stability_3b.yml +1 -1
- configs/vicuna_13B_4bit_reflect.yml +1 -1
- examples/gptq-lora-7b/config.yml +1 -1
- examples/mpt-7b/config.yml +1 -1
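The change is identical in every file below: the fixed `batch_size` key is dropped and `gradient_accumulation_steps` is set explicitly, so the number of samples per optimizer step follows from the per-device micro batch, the accumulation steps, and however many GPUs are in use. A minimal sketch of that relationship (the `world_size` argument and the `effective_batch_size` helper are illustrative assumptions, not code from this repo):

```python
# Relationship between the config keys touched in this PR.
# micro_batch_size and gradient_accumulation_steps appear in the YAML configs;
# world_size (number of GPUs / data-parallel processes) is an assumed stand-in
# for whatever the training loop actually queries.

def effective_batch_size(micro_batch_size: int,
                         gradient_accumulation_steps: int,
                         world_size: int) -> int:
    """Samples contributing to one optimizer step under gradient accumulation."""
    return micro_batch_size * gradient_accumulation_steps * world_size


if __name__ == "__main__":
    # With gradient_accumulation_steps pinned in the config (as in these diffs),
    # the effective batch simply scales with the GPU count instead of forcing
    # the accumulation steps to be re-derived per machine.
    for gpus in (1, 2, 4, 8):
        print(gpus, effective_batch_size(micro_batch_size=2,
                                         gradient_accumulation_steps=1,
                                         world_size=gpus))
```

With the old `batch_size` key, the accumulation steps had to be re-derived whenever the GPU count changed; pinning `gradient_accumulation_steps` lets the same config run unmodified on any number of devices, which is presumably the decoupling the commit message refers to.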
README.md CHANGED
@@ -265,7 +265,7 @@ wandb_log_model: # 'checkpoint'
 output_dir: ./completed-model

 # training hyperparameters
-
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 eval_batch_size: 2
 num_epochs: 3
configs/cerebras_1_3B_alpaca.yml CHANGED
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.0003
configs/galactica_1_3B.yml CHANGED
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 3
 learning_rate: 0.00003
configs/gpt_neox_20b.yml CHANGED
@@ -25,7 +25,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./gpt4all-neox-20b
-
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00003
configs/llama_13B_alpaca.yml CHANGED
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-13b-sharegpt
-
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 warmup_steps: 1000
 save_steps:
configs/llama_65B_alpaca.yml CHANGED
@@ -29,7 +29,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 warmup_steps: 1000
 save_steps:
configs/llama_7B_4bit.yml CHANGED
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
configs/llama_7B_alpaca.yml CHANGED
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 5
 learning_rate: 0.00003
configs/llama_7B_jeopardy.yml CHANGED
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-
+gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 2
 optimizer: adamw_bnb_8bit
configs/pythia_1_2B_alpaca.yml CHANGED
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00001
configs/quickstart.yml CHANGED
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 warmup_steps: 100
configs/sample.yml CHANGED
@@ -53,7 +53,8 @@ wandb_log_model:
 # where to save the finsihed model to
 output_dir: ./completed-model
 # training hyperparameters
-
+gradient_accumulation_steps: 1
+batch_size:
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
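Since configs/sample.yml is the annotated template, it keeps a blank `batch_size:` alongside the new key. If you still think in terms of a target effective batch size, one way to back out a value for `gradient_accumulation_steps` is sketched below (the helper name, the flooring, and the GPU count are illustrative assumptions, not part of this PR):

```python
def accumulation_steps_for_target(target_batch: int,
                                  micro_batch_size: int,
                                  num_gpus: int) -> int:
    """Rough choice of gradient_accumulation_steps for a desired
    samples-per-optimizer-step, given the per-device micro batch size."""
    return max(1, target_batch // (micro_batch_size * num_gpus))

# e.g. aiming for ~64 samples per step with micro_batch_size: 2 on 4 GPUs
print(accumulation_steps_for_target(64, micro_batch_size=2, num_gpus=4))  # -> 8
```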
configs/stability_3b.yml CHANGED
@@ -22,7 +22,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./stable-alpaca-3b
-
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
configs/vicuna_13B_4bit_reflect.yml CHANGED
@@ -30,7 +30,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-reflect
-
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 learning_rate: 0.00003
examples/gptq-lora-7b/config.yml CHANGED
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-7b-lora-int4
-
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
examples/mpt-7b/config.yml CHANGED
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./mpt-alpaca-7b
-
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit