Training completed

Files changed (4) hide show

README.md CHANGED Viewed

@@ -41,12 +41,12 @@ The following hyperparameters were used during training:
 - eval_batch_size: 8
 - seed: 42
 - distributed_type: multi-GPU
-- num_devices: 4
 - gradient_accumulation_steps: 10
-- total_train_batch_size: 80
-- total_eval_batch_size: 32
 - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
-- lr_scheduler_type: linear
 - lr_scheduler_warmup_ratio: 0.03
 - training_steps: 10

 - eval_batch_size: 8
 - seed: 42
 - distributed_type: multi-GPU
+- num_devices: 8
 - gradient_accumulation_steps: 10
+- total_train_batch_size: 160
+- total_eval_batch_size: 64
 - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.03
 - training_steps: 10

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "epoch": 0.0091324200913242,
-    "total_flos": 6.970544231337165e+16,
-    "train_loss": 4.553681945800781,
-    "train_runtime": 133.9104,
-    "train_samples_per_second": 5.974,
-    "train_steps_per_second": 0.075
 }

 {
+    "epoch": 0.0182648401826484,
+    "total_flos": 8.713180396545638e+16,
+    "train_loss": 9.561991882324218,
+    "train_runtime": 174.8934,
+    "train_samples_per_second": 9.148,
+    "train_steps_per_second": 0.057
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "epoch": 0.0091324200913242,
-    "total_flos": 6.970544231337165e+16,
-    "train_loss": 4.553681945800781,
-    "train_runtime": 133.9104,
-    "train_samples_per_second": 5.974,
-    "train_steps_per_second": 0.075
 }

 {
+    "epoch": 0.0182648401826484,
+    "total_flos": 8.713180396545638e+16,
+    "train_loss": 9.561991882324218,
+    "train_runtime": 174.8934,
+    "train_samples_per_second": 9.148,
+    "train_steps_per_second": 0.057
 }

trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0091324200913242,
   "eval_steps": 0,
   "global_step": 10,
   "is_hyper_param_search": false,
@@ -10,23 +10,16 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0091324200913242,
-      "grad_norm": 0.3158293664455414,
-      "learning_rate": 0.0001,
-      "loss": 4.5537,
-      "step": 10
-    },
-    {
-      "epoch": 0.0091324200913242,
       "step": 10,
-      "total_flos": 6.970544231337165e+16,
-      "train_loss": 4.553681945800781,
-      "train_runtime": 133.9104,
-      "train_samples_per_second": 5.974,
-      "train_steps_per_second": 0.075
     }
   ],
-  "logging_steps": 10,
   "max_steps": 10,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
@@ -43,7 +36,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 6.970544231337165e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.0182648401826484,
   "eval_steps": 0,
   "global_step": 10,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0182648401826484,
       "step": 10,
+      "total_flos": 8.713180396545638e+16,
+      "train_loss": 9.561991882324218,
+      "train_runtime": 174.8934,
+      "train_samples_per_second": 9.148,
+      "train_steps_per_second": 0.057
     }
   ],
+  "logging_steps": 50,
   "max_steps": 10,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
       "attributes": {}
     }
   },
+  "total_flos": 8.713180396545638e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null