Ubuntu committed on
Commit
8bb6e1d
·
1 Parent(s): e183a4e
eight_bit_config.json DELETED
@@ -1,43 +0,0 @@
1
- {
2
- "_name_or_path": "hivemind/gpt-j-6B-8bit",
3
- "activation_function": "gelu_new",
4
- "architectures": [
5
- "GPTJForCausalLM"
6
- ],
7
- "attn_pdrop": 0.0,
8
- "bos_token_id": 50256,
9
- "eight_bit": true,
10
- "embd_pdrop": 0.0,
11
- "eos_token_id": 50256,
12
- "gradient_checkpointing": false,
13
- "initializer_range": 0.02,
14
- "layer_norm_epsilon": 1e-05,
15
- "model_type": "gptj",
16
- "n_embd": 4096,
17
- "n_head": 16,
18
- "n_inner": null,
19
- "n_layer": 28,
20
- "n_positions": 2048,
21
- "resid_pdrop": 0.0,
22
- "rotary": true,
23
- "rotary_dim": 64,
24
- "scale_attn_weights": true,
25
- "summary_activation": null,
26
- "summary_first_dropout": 0.1,
27
- "summary_proj_to_labels": true,
28
- "summary_type": "cls_index",
29
- "summary_use_proj": true,
30
- "task_specific_params": {
31
- "text-generation": {
32
- "do_sample": true,
33
- "max_length": 50,
34
- "temperature": 1.0
35
- }
36
- },
37
- "tie_word_embeddings": false,
38
- "tokenizer_class": "GPT2Tokenizer",
39
- "torch_dtype": "float32",
40
- "transformers_version": "4.25.1",
41
- "use_cache": true,
42
- "vocab_size": 50400
43
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eight_bit_fully_trained.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ec9e447dcae0d53bdde910dfc9e9e2523c4dd69e95088cbca51e74e84dca68a
3
- size 6316560184
 
 
 
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:776f348ae750a40c43367e039333caef551872f031a95a5bfc92ce2e5e2dafdb
3
  size 24673403925
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2c13d1f6aca9d6c804c0bc9298ed055733e600a46b1bdca00020324eba63c6a
3
  size 24673403925
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.033567287152883735,
5
- "global_step": 440,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -182,11 +182,57 @@
182
  "eval_samples_per_second": 3.319,
183
  "eval_steps_per_second": 0.424,
184
  "step": 440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  }
186
  ],
187
  "max_steps": 13108,
188
  "num_train_epochs": 1,
189
- "total_flos": 6059125112832.0,
190
  "trial_name": null,
191
  "trial_params": null
192
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.0411962160512664,
5
+ "global_step": 540,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
182
  "eval_samples_per_second": 3.319,
183
  "eval_steps_per_second": 0.424,
184
  "step": 440
185
+ },
186
+ {
187
+ "epoch": 0.04,
188
+ "eval_loss": 2.2626798152923584,
189
+ "eval_runtime": 94.7992,
190
+ "eval_samples_per_second": 3.302,
191
+ "eval_steps_per_second": 0.422,
192
+ "step": 460
193
+ },
194
+ {
195
+ "epoch": 0.04,
196
+ "eval_loss": 2.2764577865600586,
197
+ "eval_runtime": 95.9001,
198
+ "eval_samples_per_second": 3.264,
199
+ "eval_steps_per_second": 0.417,
200
+ "step": 480
201
+ },
202
+ {
203
+ "epoch": 0.04,
204
+ "learning_rate": 5e-05,
205
+ "loss": 2.1752,
206
+ "step": 500
207
+ },
208
+ {
209
+ "epoch": 0.04,
210
+ "eval_loss": 2.270517110824585,
211
+ "eval_runtime": 95.5609,
212
+ "eval_samples_per_second": 3.275,
213
+ "eval_steps_per_second": 0.419,
214
+ "step": 500
215
+ },
216
+ {
217
+ "epoch": 0.04,
218
+ "eval_loss": 2.2533695697784424,
219
+ "eval_runtime": 95.989,
220
+ "eval_samples_per_second": 3.261,
221
+ "eval_steps_per_second": 0.417,
222
+ "step": 520
223
+ },
224
+ {
225
+ "epoch": 0.04,
226
+ "eval_loss": 2.2560901641845703,
227
+ "eval_runtime": 95.7778,
228
+ "eval_samples_per_second": 3.268,
229
+ "eval_steps_per_second": 0.418,
230
+ "step": 540
231
  }
232
  ],
233
  "max_steps": 13108,
234
  "num_train_epochs": 1,
235
+ "total_flos": 7509129560064.0,
236
  "trial_name": null,
237
  "trial_params": null
238
  }