Commit 1ea7410 · Parent(s): 508087f

git add configs/

- configs/example_finetuning.py +49 -0
- configs/example_pretraining.py +48 -0

configs/example_finetuning.py
ADDED
@@ -0,0 +1,49 @@
+# -----------------------------------------------------------------------------
+# I/O
+
+# training data directory; train.bin and val.bin are expected. You should prepare them using tokenize.py
+data_dir = 'directory_containing_train.bin/val.bin'
+out_dir = 'output_directory' # output directory
+log_dir = os.path.join(out_dir, 'logs') # logs will be written to out_dir/logs
+
+# -----------------------------------------------------------------------------
+# model parameters
+meta_vocab_size = 1024
+block_size = 256
+n_layer = 24
+n_head = 16
+n_embd = 1024 # 350M, medium
+bias = False
+
+# -----------------------------------------------------------------------------
+# learning parameters
+max_iters = 1000000 # total number of training iterations
+eval_interval = 5000
+log_interval = 1
+eval_iters = 100
+eval_only = False # if True, script exits right after the first eval
+always_save_checkpoint = True # if True, always save a checkpoint after each eval
+init_from = 'resume' # 'scratch' or 'resume' or 'gpt2*'
+ckpt_path = 'model.pt'
+gradient_accumulation_steps = 16 # used to simulate larger batch sizes, should be a multiple of the number of GPUs
+batch_size = 16
+
+# adamw optimizer
+learning_rate = 1e-4 # max learning rate
+dropout = 0.1
+weight_decay = 0.1
+beta1 = 0.9
+beta2 = 0.95
+grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
+
+# learning rate decay settings
+decay_lr = False
+warmup_iters = 2000
+lr_decay_iters = 1000000
+min_lr = 1e-4 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
+# DDP settings
+backend = 'nccl' # 'nccl', 'gloo', etc.
+# system
+device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1'
+dtype = 'float32' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16'; the latter automatically enables a GradScaler
+compile = True # use PyTorch 2.0 to compile the model to be faster
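With these settings the effective batch is gradient_accumulation_steps * batch_size = 256 sequences, i.e. 256 * block_size = 65,536 tokens per optimizer step. Below is a minimal sketch of how such a config could be loaded and sanity-checked, assuming a nanoGPT-style train.py that execs the config file into its globals; the exec mechanism and the script name are assumptions, not part of this commit.

import os
import torch  # the config references os.path.join and torch.cuda, so the host script must import these first

# hypothetical usage: python train.py configs/example_finetuning.py
# assumed override mechanism (not shown in this commit): the training script
# execs the config file, so its assignments become plain module-level variables
exec(open('configs/example_finetuning.py').read())

tokens_per_iter = gradient_accumulation_steps * batch_size * block_size
print(f"effective batch: {gradient_accumulation_steps * batch_size} sequences")  # 256
print(f"tokens per optimizer step: {tokens_per_iter:,}")                         # 65,536
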
configs/example_pretraining.py
ADDED
@@ -0,0 +1,48 @@
+# -----------------------------------------------------------------------------
+# I/O
+
+# training data directory; train.bin and val.bin are expected. You should prepare them using tokenize.py
+data_dir = 'directory_containing_train.bin/val.bin'
+out_dir = 'output_directory' # output directory
+log_dir = os.path.join(out_dir, 'logs') # logs will be written to out_dir/logs
+
+# -----------------------------------------------------------------------------
+# model parameters
+meta_vocab_size = 1024
+block_size = 256
+n_layer = 24
+n_head = 16
+n_embd = 1024 # 350M, medium
+bias = False # do we use bias inside LayerNorm and Linear layers?
+
+# -----------------------------------------------------------------------------
+# learning parameters
+max_iters = 1000000 # total number of training iterations
+eval_interval = 100000
+log_interval = 1
+eval_iters = 100
+eval_only = False # if True, script exits right after the first eval
+always_save_checkpoint = True # if True, always save a checkpoint after each eval
+init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
+gradient_accumulation_steps = 16 # used to simulate larger batch sizes, should be a multiple of the number of GPUs
+batch_size = 16
+
+# adamw optimizer
+learning_rate = 1e-3 # max learning rate
+dropout = 0.0
+weight_decay = 0
+beta1 = 0.9
+beta2 = 0.999
+grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
+
+# learning rate decay settings
+decay_lr = True # whether to decay the learning rate
+warmup_iters = 5000 # how many steps to warm up for
+lr_decay_iters = 1000000 # should be ~= max_iters per Chinchilla
+min_lr = 1e-4 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
+# DDP settings
+backend = 'nccl' # 'nccl', 'gloo', etc.
+# system
+device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on MacBooks
+dtype = 'float32' # if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16'; the latter automatically enables a GradScaler
+compile = True # use PyTorch 2.0 to compile the model to be faster
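Unlike the fine-tuning config, this one enables learning rate decay. The four knobs warmup_iters, lr_decay_iters, learning_rate, and min_lr conventionally parameterize a linear warmup followed by a cosine decay down to min_lr; the sketch below shows that common (nanoGPT-style) interpretation. The get_lr helper is hypothetical, and the actual training script may implement its schedule differently.

import math

# assumed warmup + cosine schedule, using the values from the config above
learning_rate = 1e-3    # max learning rate
min_lr = 1e-4           # floor reached at the end of the decay
warmup_iters = 5000
lr_decay_iters = 1000000

def get_lr(it):
    # 1) linear warmup from 0 to learning_rate over warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 3) after lr_decay_iters, stay flat at min_lr
    if it > lr_decay_iters:
        return min_lr
    # 2) cosine decay from learning_rate down to min_lr in between
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

# get_lr(0) == 0.0; get_lr(5000) ≈ 1e-3 (peak); get_lr(1000000) ≈ 1e-4 (floor)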