File size: 4,978 Bytes
c98613b
 
84da9dd
c98613b
 
 
 
84da9dd
c98613b
 
 
 
f03dcd7
8a2ebd6
7d2cb3f
c98613b
 
 
 
 
8a2ebd6
c98613b
 
024aefe
c98613b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab16584
c98613b
 
 
 
 
c60dfe3
c98613b
 
c13c3d0
c98613b
 
 
 
 
e9d9da3
c98613b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e9d9da3
 
 
 
 
 
 
 
 
 
 
c98613b
 
 
 
 
 
 
 
 
 
 
e9d9da3
 
 
 
 
 
 
 
 
 
c98613b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Name of the model to pretrain; pick one of the names in ``litgpt.config``.
# Documented as mutually exclusive with ``model_config``. (type: Optional[str], default: null)
# NOTE(review): ``model_config`` is also set in this file — confirm the installed litgpt accepts both.
model_name: "tangled-alpha-0.10-core"

# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_name``. (type: Optional[Config], default: null)
model_config:
  name: "tangled-alpha-0.10-core"
  block_size: 131072
  vocab_size: 131072
  padded_vocab_size: 131072
  n_layer: 32
  n_head: 12
  n_embd: 768
  n_query_groups: 4
  rotary_percentage: 1.0
  # Canonical lowercase booleans, matching ``tie_embeddings`` and the ``eval`` flags in this file.
  parallel_residual: false
  bias: false
  norm_class_name: "RMSNorm"
  mlp_class_name: "LLaMAMLP"
  intermediate_size: 2048  # n_embd * 2.666 (768 * 2.666 ~= 2047.5, rounded to 2048)
  # Written with an explicit decimal point: plain YAML 1.1 loaders read a bare ``1e-5`` as a string.
  norm_eps: 1.0e-5
  rope_base: 4300  # https://arxiv.org/pdf/2405.14591
  head_size: 64  # n_embd / n_head (768 / 12)

# Where checkpoints and logs are written. When running in a Lightning Studio Job, look for it
# under /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
out_dir: "../out/pretrain-core-0/"

# Pretraining precision; one of "bf16-true", "bf16-mixed", "32-true".
# (type: Optional[str], default: null)
# Disabled alternative kept for reference:
# precision: bf16-mixed
precision: "bf16-true"

# Optional checkpoint directory to initialize the model from (useful for continued
# pretraining). Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
initial_checkpoint_dir: null

# How to resume after an interruption: a checkpoint directory path, ``True`` for the latest
# checkpoint in ``out_dir`` (an error is raised if none is found), or ``'auto'`` for the latest
# checkpoint without erroring when none exists.
# (type: Union[bool, Literal["auto"], Path], default: False)
resume: "auto"

# Data module configuration. When omitted, ``litgpt.data.TinyLlama`` is used.
data:
  class_path: LitData
  init_args:
    data_path: "../core-data-0-0-1073741824-1025-16000/"
    num_workers: 32

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details.
train:
  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
  save_interval: 50

  # Number of iterations between logging calls (type: int, default: 1)
  log_interval: 1

  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
  global_batch_size: 512

  # Number of samples per data-parallel rank (type: int, default: 4)
  micro_batch_size: 8

  # Number of iterations with learning rate warmup active (type: int, default: 2000)
  lr_warmup_steps: 2000

  # Number of epochs to train on; explicitly unset (type: Optional[int], default: null)
  epochs: null

  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
  max_tokens: 11186775175

  # Limits the number of optimizer steps to run; explicitly unset (type: Optional[int], default: null)
  max_steps: null

  # Limits the length of samples. Off by default (type: Optional[int], default: null)
  max_seq_length: 1025

  # Whether to tie the embedding weights with the language modeling head weights.
  # (type: Optional[bool], default: False)
  tie_embeddings: false

  # Undocumented upstream; presumably the gradient-clipping norm — confirm against
  # ``litgpt.args.TrainArgs``. (type: Optional[float], default: 1.0)
  max_norm: 1.0

  # Undocumented upstream; presumably the minimum learning rate — confirm against
  # ``litgpt.args.TrainArgs``. (type: float, default: 4e-05)
  # Written with an explicit decimal point: plain YAML 1.1 loaders read a bare ``1e-5`` as a string.
  min_lr: 1.0e-5

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details.
eval:
  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
  interval: 50

  # Number of tokens to generate; explicitly unset (type: Optional[int], default: null)
  max_new_tokens: null

  # Number of iterations (type: int, default: 100)
  max_iters: 100

  # Whether to evaluate on the validation set at the beginning of the training
  initial_validation: false

  # Whether to evaluate on the validation set at the end of the training
  final_validation: true

# Optimizer-related arguments
# optimizer:
#   class_path: torch.optim.AdamW
#   init_args:
#     # (type: float, default: 0.001)
#     lr: 3e-4
#     # (type: float, default: 0.01)
#     weight_decay: 0.01
#     # (type: tuple, default: (0.9,0.999))
#     betas:
#       - 0.9
#       - 0.999

# optimizer:
#   class_path: sophia_opt.SophiaG
#   init_args:
#     lr: 3e-4
#     betas:
#       - 0.9
#       - 0.95
#     rho: 0.05
#     weight_decay: 0.1

# Active optimizer configuration; the commented-out ``optimizer`` blocks above are disabled alternatives.
optimizer:
  class_path: sophia_opt.SophiaG
  init_args:
    # Floats carry an explicit decimal point so that plain YAML 1.1 loaders do not read
    # exponent-only forms such as ``1e-4`` as strings.
    lr: 1.0e-4
    betas:
      - 0.965
      - 0.99
    rho: 0.04
    weight_decay: 0.1

# Number of devices/GPUs to use; all GPUs are used by default.
# (type: Union[int, str], default: auto)
devices: "auto"

# Number of nodes to use. (type: int, default: 1)
num_nodes: 1

# Optional path to the tokenizer directory that was used for preprocessing the dataset.
# Only some data modules require this. (type: Optional[Path], default: null)
tokenizer_dir: "../tokenizer"

# Name of the logger that receives metrics.
# (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
logger_name: "wandb"

# Random seed used for reproducibility. (type: int, default: 42)
seed: 23