mtasic85 committed
Commit c13c3d0 · 1 Parent(s): 30f3d6e

pretrain core 1

Files changed (2):
  1. config-0.json +1 -1
  2. scripts/pretrain_core_model_0.yaml +12 -24
config-0.json CHANGED
@@ -21,7 +21,7 @@
   "rms_norm_eps": 1e-05,
   "rope_scaling": null,
   "rope_theta": 4300.0,
- "tie_word_embeddings": true,
+ "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.45.0.dev0",
   "use_cache": true,
scripts/pretrain_core_model_0.yaml CHANGED
@@ -61,7 +61,6 @@ train:
   global_batch_size: 512
 
   # Number of samples per data-parallel rank (type: int, default: 4)
- # micro_batch_size: 2
   micro_batch_size: 8
 
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
@@ -77,11 +76,10 @@ train:
   max_steps:
 
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
- # max_seq_length: 4096
   max_seq_length: 1024
 
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
- tie_embeddings: true
+ tie_embeddings: false
 
   # (type: Optional[float], default: 1.0)
   max_norm: 1.0
@@ -107,22 +105,17 @@ eval:
   final_validation: true
 
   # Optimizer-related arguments
-
- # optimizer:
- #   class_path: torch.optim.AdamW
- #   # class_path: torchao.prototype.low_bit_optim.AdamW8bit
- #   # class_path: torchao.prototype.low_bit_optim.AdamW4bit
- #   # class_path: bitsandbytes.optim.AdamW8bit
- #   # class_path: bitsandbytes.optim.PagedAdamW8bit
- #   init_args:
- #     # (type: float, default: 0.001)
- #     lr: 3e-4
- #     # (type: float, default: 0.01)
- #     weight_decay: 0.01
- #     # (type: tuple, default: (0.9,0.999))
- #     betas:
- #       - 0.9
- #       - 0.999
+ optimizer:
+   class_path: torch.optim.AdamW
+   init_args:
+     # (type: float, default: 0.001)
+     lr: 3e-4
+     # (type: float, default: 0.01)
+     weight_decay: 0.01
+     # (type: tuple, default: (0.9,0.999))
+     betas:
+       - 0.9
+       - 0.999
 
   # optimizer:
   #   class_path: sophia_opt.SophiaG
@@ -134,11 +127,6 @@ eval:
   #   rho: 0.05
   #   weight_decay: 0.1
 
- optimizer:
-   class_path: dolphinflow.DolphinFlow
-   init_args:
-     lr: 3e-4
-
   # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
   devices: auto
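
Note: the new optimizer block swaps the previous dolphinflow.DolphinFlow optimizer for torch.optim.AdamW, expressed in the class_path / init_args style the training YAML already uses for optimizers. A minimal sketch of the equivalent Python (the stand-in model is hypothetical; only the optimizer arguments come from the config):

import torch

# Stand-in module; the real training script passes the pretraining model's parameters.
model = torch.nn.Linear(16, 16)

# Rough equivalent of: class_path: torch.optim.AdamW with the init_args above.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,             # init_args.lr
    weight_decay=0.01,   # init_args.weight_decay
    betas=(0.9, 0.999),  # init_args.betas
)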