li-muyang
/

zephyr-7b-sft-full

@@ -3,16 +3,11 @@ library_name: transformers
 license: apache-2.0
 base_model: mistralai/Mistral-7B-v0.1
 tags:
-- alignment-handbook
 - trl
 - sft
 - generated_from_trainer
-- trl
-- sft
-- alignment-handbook
-- generated_from_trainer
 datasets:
-- HuggingFaceH4/ultrachat_200k
 model-index:
 - name: zephyr-7b-sft-full
   results: []
@@ -23,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
 # zephyr-7b-sft-full
-This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/ultrachat_200k dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.9423
 ## Model description
@@ -59,6 +54,9 @@ The following hyperparameters were used during training:
 ### Training results
 ### Framework versions

 license: apache-2.0
 base_model: mistralai/Mistral-7B-v0.1
 tags:
 - trl
 - sft
 - generated_from_trainer
 datasets:
+- generator
 model-index:
 - name: zephyr-7b-sft-full
   results: []
 # zephyr-7b-sft-full
+This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.9420
 ## Model description
 ### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.9183        | 1.0   | 1084 | 0.9420          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,14 +1,9 @@
 {
     "epoch": 1.0,
-    "eval_loss": 0.9422639608383179,
-    "eval_runtime": 928.4763,
-    "eval_samples": 23109,
-    "eval_samples_per_second": 16.532,
-    "eval_steps_per_second": 0.258,
     "total_flos": 453935093514240.0,
-    "train_loss": 0.0,
-    "train_runtime": 0.0102,
     "train_samples": 207864,
-    "train_samples_per_second": 13581178.092,
-    "train_steps_per_second": 106152.682
 }

 {
     "epoch": 1.0,
     "total_flos": 453935093514240.0,
+    "train_loss": 0.9848188322408613,
+    "train_runtime": 36728.3484,
     "train_samples": 207864,
+    "train_samples_per_second": 3.776,
+    "train_steps_per_second": 0.03
 }

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 1.0,
     "total_flos": 453935093514240.0,
-    "train_loss": 0.0,
-    "train_runtime": 0.0102,
     "train_samples": 207864,
-    "train_samples_per_second": 13581178.092,
-    "train_steps_per_second": 106152.682
 }

 {
     "epoch": 1.0,
     "total_flos": 453935093514240.0,
+    "train_loss": 0.9848188322408613,
+    "train_runtime": 36728.3484,
     "train_samples": 207864,
+    "train_samples_per_second": 3.776,
+    "train_steps_per_second": 0.03
 }

trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_metric": null,
   "best_model_checkpoint": null,
   "epoch": 1.0,
-  "eval_steps": 500,
   "global_step": 1084,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
@@ -1527,14 +1527,22 @@
       "loss": 0.9183,
       "step": 1080
     },
     {
       "epoch": 1.0,
       "step": 1084,
       "total_flos": 453935093514240.0,
-      "train_loss": 0.0,
-      "train_runtime": 0.0102,
-      "train_samples_per_second": 13581178.092,
-      "train_steps_per_second": 106152.682
     }
   ],
   "logging_steps": 5,

   "best_metric": null,
   "best_model_checkpoint": null,
   "epoch": 1.0,
+  "eval_steps": 100.0,
   "global_step": 1084,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
       "loss": 0.9183,
       "step": 1080
     },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.9419716000556946,
+      "eval_runtime": 1011.6018,
+      "eval_samples_per_second": 15.174,
+      "eval_steps_per_second": 0.237,
+      "step": 1084
+    },
     {
       "epoch": 1.0,
       "step": 1084,
       "total_flos": 453935093514240.0,
+      "train_loss": 0.9848188322408613,
+      "train_runtime": 36728.3484,
+      "train_samples_per_second": 3.776,
+      "train_steps_per_second": 0.03
     }
   ],
   "logging_steps": 5,