sravanthib committed on
Commit
05c5922
·
verified ·
1 Parent(s): 4ccef5d

Training completed

Browse files
Files changed (4) hide show
  1. README.md +4 -4
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +9 -16
README.md CHANGED
@@ -41,12 +41,12 @@ The following hyperparameters were used during training:
41
  - eval_batch_size: 8
42
  - seed: 42
43
  - distributed_type: multi-GPU
44
- - num_devices: 4
45
  - gradient_accumulation_steps: 10
46
- - total_train_batch_size: 80
47
- - total_eval_batch_size: 32
48
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
49
- - lr_scheduler_type: linear
50
  - lr_scheduler_warmup_ratio: 0.03
51
  - training_steps: 10
52
 
 
41
  - eval_batch_size: 8
42
  - seed: 42
43
  - distributed_type: multi-GPU
44
+ - num_devices: 8
45
  - gradient_accumulation_steps: 10
46
+ - total_train_batch_size: 160
47
+ - total_eval_batch_size: 64
48
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
49
+ - lr_scheduler_type: cosine
50
  - lr_scheduler_warmup_ratio: 0.03
51
  - training_steps: 10
52
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.0091324200913242,
3
- "total_flos": 6.970544231337165e+16,
4
- "train_loss": 4.553681945800781,
5
- "train_runtime": 133.9104,
6
- "train_samples_per_second": 5.974,
7
- "train_steps_per_second": 0.075
8
  }
 
1
  {
2
+ "epoch": 0.0182648401826484,
3
+ "total_flos": 8.713180396545638e+16,
4
+ "train_loss": 9.561991882324218,
5
+ "train_runtime": 174.8934,
6
+ "train_samples_per_second": 9.148,
7
+ "train_steps_per_second": 0.057
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.0091324200913242,
3
- "total_flos": 6.970544231337165e+16,
4
- "train_loss": 4.553681945800781,
5
- "train_runtime": 133.9104,
6
- "train_samples_per_second": 5.974,
7
- "train_steps_per_second": 0.075
8
  }
 
1
  {
2
+ "epoch": 0.0182648401826484,
3
+ "total_flos": 8.713180396545638e+16,
4
+ "train_loss": 9.561991882324218,
5
+ "train_runtime": 174.8934,
6
+ "train_samples_per_second": 9.148,
7
+ "train_steps_per_second": 0.057
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.0091324200913242,
6
  "eval_steps": 0,
7
  "global_step": 10,
8
  "is_hyper_param_search": false,
@@ -10,23 +10,16 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.0091324200913242,
14
- "grad_norm": 0.3158293664455414,
15
- "learning_rate": 0.0001,
16
- "loss": 4.5537,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.0091324200913242,
21
  "step": 10,
22
- "total_flos": 6.970544231337165e+16,
23
- "train_loss": 4.553681945800781,
24
- "train_runtime": 133.9104,
25
- "train_samples_per_second": 5.974,
26
- "train_steps_per_second": 0.075
27
  }
28
  ],
29
- "logging_steps": 10,
30
  "max_steps": 10,
31
  "num_input_tokens_seen": 0,
32
  "num_train_epochs": 1,
@@ -43,7 +36,7 @@
43
  "attributes": {}
44
  }
45
  },
46
- "total_flos": 6.970544231337165e+16,
47
  "train_batch_size": 2,
48
  "trial_name": null,
49
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.0182648401826484,
6
  "eval_steps": 0,
7
  "global_step": 10,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.0182648401826484,
 
 
 
 
 
 
 
14
  "step": 10,
15
+ "total_flos": 8.713180396545638e+16,
16
+ "train_loss": 9.561991882324218,
17
+ "train_runtime": 174.8934,
18
+ "train_samples_per_second": 9.148,
19
+ "train_steps_per_second": 0.057
20
  }
21
  ],
22
+ "logging_steps": 50,
23
  "max_steps": 10,
24
  "num_input_tokens_seen": 0,
25
  "num_train_epochs": 1,
 
36
  "attributes": {}
37
  }
38
  },
39
+ "total_flos": 8.713180396545638e+16,
40
  "train_batch_size": 2,
41
  "trial_name": null,
42
  "trial_params": null