YuchenLi01 committed on
Commit
c5cd2f2
·
verified ·
1 Parent(s): dc1137b

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: alignment-handbook/zephyr-7b-sft-full
3
+ library_name: transformers
4
+ model_name: ultrafeedbackSkyworkAgree_alignmentZephyr7BSftFull_sdpo_score_ebs64_lr1e-07_0
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - dpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for ultrafeedbackSkyworkAgree_alignmentZephyr7BSftFull_sdpo_score_ebs64_lr1e-07_0
13
+
14
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="YuchenLi01/ultrafeedbackSkyworkAgree_alignmentZephyr7BSftFull_sdpo_score_ebs64_lr1e-07_0", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yuchenl4/lmpref/runs/ultrafeedbackSkyworkAgree_alignmentZephyr7BSftFull_sdpo_score_ebs64_lr1e-07_0try1prJ7sto2sYHlSCIOJJTTEhdKWJiU3dhkNo0dSnqgrczqZp)
31
+
32
+ This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.12.0
37
+ - Transformers: 4.46.3
38
+ - PyTorch: 2.3.0
39
+ - Datasets: 3.1.0
40
+ - Tokenizers: 0.20.3
41
+
42
+ ## Citations
43
+
44
+ Cite DPO as:
45
+
46
+ ```bibtex
47
+ @inproceedings{rafailov2023direct,
48
+ title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
49
+ author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
50
+ year = 2023,
51
+ booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
52
+ url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
53
+ editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine},
54
+ }
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.5638467967928208,
5
+ "train_runtime": 32082.6297,
6
+ "train_samples": 45608,
7
+ "train_samples_per_second": 1.422,
8
+ "train_steps_per_second": 0.022
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.46.3"
6
+ }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbc706cd3380d0ba63b227c11859b89d4a29e436a262fa322e5b69b61897e6a6
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:961b26ebfb4b70fee969a499072dbadc0e5b980ab8278407269561cd99946597
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:548b1af57a5ee26dc3f83a46eba6cdc3d9649660943160f52fe7df12c948a3fc
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b468511ff6adc1a7bb835581d7f713f6d5a27f40bf0a97436a833067c658c7ed
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d08716c2fb898ee01ec61f32fd753ef8157569f532519be91a34e4a1f018b861
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:796bbafa99f0bd0d7a660d35bdde78d861ab57505eef01785075d37482b2d86c
3
  size 4540516344
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.5638467967928208,
5
+ "train_runtime": 32082.6297,
6
+ "train_samples": 45608,
7
+ "train_samples_per_second": 1.422,
8
+ "train_steps_per_second": 0.022
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 8,
6
+ "global_step": 713,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.001402524544179523,
13
+ "grad_norm": 13.312526408630161,
14
+ "learning_rate": 1.3888888888888888e-09,
15
+ "logits/chosen": -3.09375,
16
+ "logits/rejected": -3.0,
17
+ "logps/chosen": -410.0,
18
+ "logps/rejected": -408.0,
19
+ "loss": 0.6914,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.011220196353436185,
28
+ "eval_logits/chosen": -3.234375,
29
+ "eval_logits/rejected": -3.28125,
30
+ "eval_logps/chosen": -322.0,
31
+ "eval_logps/rejected": -276.0,
32
+ "eval_loss": 0.6922186017036438,
33
+ "eval_rewards/accuracies": 0.228723406791687,
34
+ "eval_rewards/chosen": -0.0003604888916015625,
35
+ "eval_rewards/margins": -0.001190185546875,
36
+ "eval_rewards/rejected": 0.000823974609375,
37
+ "eval_runtime": 64.4932,
38
+ "eval_samples_per_second": 23.057,
39
+ "eval_steps_per_second": 0.729,
40
+ "step": 8
41
+ },
42
+ {
43
+ "epoch": 0.014025245441795231,
44
+ "grad_norm": 10.597327421994228,
45
+ "learning_rate": 1.3888888888888889e-08,
46
+ "logits/chosen": -3.1875,
47
+ "logits/rejected": -3.125,
48
+ "logps/chosen": -310.0,
49
+ "logps/rejected": -286.0,
50
+ "loss": 0.6921,
51
+ "rewards/accuracies": 0.2222222238779068,
52
+ "rewards/chosen": 0.00128936767578125,
53
+ "rewards/margins": 0.00201416015625,
54
+ "rewards/rejected": -0.000728607177734375,
55
+ "step": 10
56
+ },
57
+ {
58
+ "epoch": 0.02244039270687237,
59
+ "eval_logits/chosen": -3.234375,
60
+ "eval_logits/rejected": -3.28125,
61
+ "eval_logps/chosen": -322.0,
62
+ "eval_logps/rejected": -276.0,
63
+ "eval_loss": 0.6920655369758606,
64
+ "eval_rewards/accuracies": 0.18617020547389984,
65
+ "eval_rewards/chosen": -0.0014801025390625,
66
+ "eval_rewards/margins": -0.0022125244140625,
67
+ "eval_rewards/rejected": 0.00074005126953125,
68
+ "eval_runtime": 66.7125,
69
+ "eval_samples_per_second": 22.29,
70
+ "eval_steps_per_second": 0.705,
71
+ "step": 16
72
+ },
73
+ {
74
+ "epoch": 0.028050490883590462,
75
+ "grad_norm": 13.836822232155031,
76
+ "learning_rate": 2.7777777777777777e-08,
77
+ "logits/chosen": -3.171875,
78
+ "logits/rejected": -3.25,
79
+ "logps/chosen": -324.0,
80
+ "logps/rejected": -255.0,
81
+ "loss": 0.6922,
82
+ "rewards/accuracies": 0.20000000298023224,
83
+ "rewards/chosen": -0.000904083251953125,
84
+ "rewards/margins": 0.0003910064697265625,
85
+ "rewards/rejected": -0.0012969970703125,
86
+ "step": 20
87
+ },
88
+ {
89
+ "epoch": 0.033660589060308554,
90
+ "eval_logits/chosen": -3.234375,
91
+ "eval_logits/rejected": -3.28125,
92
+ "eval_logps/chosen": -322.0,
93
+ "eval_logps/rejected": -276.0,
94
+ "eval_loss": 0.6919813752174377,
95
+ "eval_rewards/accuracies": 0.25,
96
+ "eval_rewards/chosen": -0.00095367431640625,
97
+ "eval_rewards/margins": -0.00067138671875,
98
+ "eval_rewards/rejected": -0.0002803802490234375,
99
+ "eval_runtime": 65.6733,
100
+ "eval_samples_per_second": 22.642,
101
+ "eval_steps_per_second": 0.716,
102
+ "step": 24
103
+ },
104
+ {
105
+ "epoch": 0.04207573632538569,
106
+ "grad_norm": 9.808894950380177,
107
+ "learning_rate": 4.166666666666667e-08,
108
+ "logits/chosen": -3.1875,
109
+ "logits/rejected": -3.1875,
110
+ "logps/chosen": -322.0,
111
+ "logps/rejected": -280.0,
112
+ "loss": 0.6917,
113
+ "rewards/accuracies": 0.3375000059604645,
114
+ "rewards/chosen": 0.00023365020751953125,
115
+ "rewards/margins": 0.0014190673828125,
116
+ "rewards/rejected": -0.001190185546875,
117
+ "step": 30
118
+ },
119
+ {
120
+ "epoch": 0.04488078541374474,
121
+ "eval_logits/chosen": -3.234375,
122
+ "eval_logits/rejected": -3.28125,
123
+ "eval_logps/chosen": -322.0,
124
+ "eval_logps/rejected": -276.0,
125
+ "eval_loss": 0.691659152507782,
126
+ "eval_rewards/accuracies": 0.24468085169792175,
127
+ "eval_rewards/chosen": -0.000888824462890625,
128
+ "eval_rewards/margins": 0.0002002716064453125,
129
+ "eval_rewards/rejected": -0.00109100341796875,
130
+ "eval_runtime": 65.5796,
131
+ "eval_samples_per_second": 22.675,
132
+ "eval_steps_per_second": 0.717,
133
+ "step": 32
134
+ },
135
+ {
136
+ "epoch": 0.056100981767180924,
137
+ "grad_norm": 10.03150100621376,
138
+ "learning_rate": 5.5555555555555555e-08,
139
+ "logits/chosen": -3.234375,
140
+ "logits/rejected": -3.234375,
141
+ "logps/chosen": -324.0,
142
+ "logps/rejected": -240.0,
143
+ "loss": 0.6919,
144
+ "rewards/accuracies": 0.1875,
145
+ "rewards/chosen": -0.002655029296875,
146
+ "rewards/margins": -0.001373291015625,
147
+ "rewards/rejected": -0.00128173828125,
148
+ "step": 40
149
+ },
150
+ {
151
+ "epoch": 0.056100981767180924,
152
+ "eval_logits/chosen": -3.234375,
153
+ "eval_logits/rejected": -3.28125,
154
+ "eval_logps/chosen": -322.0,
155
+ "eval_logps/rejected": -276.0,
156
+ "eval_loss": 0.6911664009094238,
157
+ "eval_rewards/accuracies": 0.3085106313228607,
158
+ "eval_rewards/chosen": -0.001922607421875,
159
+ "eval_rewards/margins": 0.00115203857421875,
160
+ "eval_rewards/rejected": -0.0030670166015625,
161
+ "eval_runtime": 65.5097,
162
+ "eval_samples_per_second": 22.699,
163
+ "eval_steps_per_second": 0.717,
164
+ "step": 40
165
+ },
166
+ {
167
+ "epoch": 0.06732117812061711,
168
+ "eval_logits/chosen": -3.234375,
169
+ "eval_logits/rejected": -3.265625,
170
+ "eval_logps/chosen": -322.0,
171
+ "eval_logps/rejected": -276.0,
172
+ "eval_loss": 0.6904101371765137,
173
+ "eval_rewards/accuracies": 0.4095744788646698,
174
+ "eval_rewards/chosen": -0.00360107421875,
175
+ "eval_rewards/margins": 0.0032958984375,
176
+ "eval_rewards/rejected": -0.00689697265625,
177
+ "eval_runtime": 65.8647,
178
+ "eval_samples_per_second": 22.577,
179
+ "eval_steps_per_second": 0.714,
180
+ "step": 48
181
+ },
182
+ {
183
+ "epoch": 0.07012622720897616,
184
+ "grad_norm": 11.024207801738104,
185
+ "learning_rate": 6.944444444444444e-08,
186
+ "logits/chosen": -3.03125,
187
+ "logits/rejected": -3.15625,
188
+ "logps/chosen": -342.0,
189
+ "logps/rejected": -306.0,
190
+ "loss": 0.6917,
191
+ "rewards/accuracies": 0.38749998807907104,
192
+ "rewards/chosen": -0.0007171630859375,
193
+ "rewards/margins": 0.004364013671875,
194
+ "rewards/rejected": -0.005096435546875,
195
+ "step": 50
196
+ },
197
+ {
198
+ "epoch": 0.0785413744740533,
199
+ "eval_logits/chosen": -3.234375,
200
+ "eval_logits/rejected": -3.265625,
201
+ "eval_logps/chosen": -322.0,
202
+ "eval_logps/rejected": -276.0,
203
+ "eval_loss": 0.6893861293792725,
204
+ "eval_rewards/accuracies": 0.5,
205
+ "eval_rewards/chosen": -0.007110595703125,
206
+ "eval_rewards/margins": 0.006195068359375,
207
+ "eval_rewards/rejected": -0.0133056640625,
208
+ "eval_runtime": 66.2121,
209
+ "eval_samples_per_second": 22.458,
210
+ "eval_steps_per_second": 0.71,
211
+ "step": 56
212
+ },
213
+ {
214
+ "epoch": 0.08415147265077139,
215
+ "grad_norm": 11.128580526610264,
216
+ "learning_rate": 8.333333333333334e-08,
217
+ "logits/chosen": -3.1875,
218
+ "logits/rejected": -3.15625,
219
+ "logps/chosen": -286.0,
220
+ "logps/rejected": -266.0,
221
+ "loss": 0.6902,
222
+ "rewards/accuracies": 0.512499988079071,
223
+ "rewards/chosen": -0.00726318359375,
224
+ "rewards/margins": 0.005401611328125,
225
+ "rewards/rejected": -0.0126953125,
226
+ "step": 60
227
+ },
228
+ {
229
+ "epoch": 0.08976157082748948,
230
+ "eval_logits/chosen": -3.234375,
231
+ "eval_logits/rejected": -3.265625,
232
+ "eval_logps/chosen": -322.0,
233
+ "eval_logps/rejected": -278.0,
234
+ "eval_loss": 0.6874608993530273,
235
+ "eval_rewards/accuracies": 0.542553186416626,
236
+ "eval_rewards/chosen": -0.0118408203125,
237
+ "eval_rewards/margins": 0.01043701171875,
238
+ "eval_rewards/rejected": -0.0223388671875,
239
+ "eval_runtime": 65.3339,
240
+ "eval_samples_per_second": 22.76,
241
+ "eval_steps_per_second": 0.719,
242
+ "step": 64
243
+ },
244
+ {
245
+ "epoch": 0.09817671809256662,
246
+ "grad_norm": 11.049642821906803,
247
+ "learning_rate": 9.722222222222221e-08,
248
+ "logits/chosen": -3.21875,
249
+ "logits/rejected": -3.234375,
250
+ "logps/chosen": -290.0,
251
+ "logps/rejected": -280.0,
252
+ "loss": 0.6868,
253
+ "rewards/accuracies": 0.6625000238418579,
254
+ "rewards/chosen": -0.0150146484375,
255
+ "rewards/margins": 0.01202392578125,
256
+ "rewards/rejected": -0.027099609375,
257
+ "step": 70
258
+ },
259
+ {
260
+ "epoch": 0.10098176718092566,
261
+ "eval_logits/chosen": -3.21875,
262
+ "eval_logits/rejected": -3.265625,
263
+ "eval_logps/chosen": -324.0,
264
+ "eval_logps/rejected": -280.0,
265
+ "eval_loss": 0.6850191354751587,
266
+ "eval_rewards/accuracies": 0.6117021441459656,
267
+ "eval_rewards/chosen": -0.0218505859375,
268
+ "eval_rewards/margins": 0.01531982421875,
269
+ "eval_rewards/rejected": -0.037109375,
270
+ "eval_runtime": 65.5283,
271
+ "eval_samples_per_second": 22.692,
272
+ "eval_steps_per_second": 0.717,
273
+ "step": 72
274
+ },
275
+ {
276
+ "epoch": 0.11220196353436185,
277
+ "grad_norm": 12.079775942720834,
278
+ "learning_rate": 9.996157197797842e-08,
279
+ "logits/chosen": -3.1875,
280
+ "logits/rejected": -3.234375,
281
+ "logps/chosen": -332.0,
282
+ "logps/rejected": -286.0,
283
+ "loss": 0.6839,
284
+ "rewards/accuracies": 0.7250000238418579,
285
+ "rewards/chosen": -0.0262451171875,
286
+ "rewards/margins": 0.02587890625,
287
+ "rewards/rejected": -0.052001953125,
288
+ "step": 80
289
+ },
290
+ {
291
+ "epoch": 0.11220196353436185,
292
+ "eval_logits/chosen": -3.21875,
293
+ "eval_logits/rejected": -3.25,
294
+ "eval_logps/chosen": -326.0,
295
+ "eval_logps/rejected": -282.0,
296
+ "eval_loss": 0.6817147135734558,
297
+ "eval_rewards/accuracies": 0.6595744490623474,
298
+ "eval_rewards/chosen": -0.03564453125,
299
+ "eval_rewards/margins": 0.0233154296875,
300
+ "eval_rewards/rejected": -0.058837890625,
301
+ "eval_runtime": 65.1964,
302
+ "eval_samples_per_second": 22.808,
303
+ "eval_steps_per_second": 0.721,
304
+ "step": 80
305
+ },
306
+ {
307
+ "epoch": 0.12342215988779803,
308
+ "eval_logits/chosen": -3.203125,
309
+ "eval_logits/rejected": -3.25,
310
+ "eval_logps/chosen": -326.0,
311
+ "eval_logps/rejected": -284.0,
312
+ "eval_loss": 0.6778491139411926,
313
+ "eval_rewards/accuracies": 0.664893627166748,
314
+ "eval_rewards/chosen": -0.0517578125,
315
+ "eval_rewards/margins": 0.033935546875,
316
+ "eval_rewards/rejected": -0.08544921875,
317
+ "eval_runtime": 65.8317,
318
+ "eval_samples_per_second": 22.588,
319
+ "eval_steps_per_second": 0.714,
320
+ "step": 88
321
+ },
322
+ {
323
+ "epoch": 0.12622720897615708,
324
+ "grad_norm": 13.482170939282454,
325
+ "learning_rate": 9.980555936859367e-08,
326
+ "logits/chosen": -3.171875,
327
+ "logits/rejected": -3.15625,
328
+ "logps/chosen": -338.0,
329
+ "logps/rejected": -304.0,
330
+ "loss": 0.6802,
331
+ "rewards/accuracies": 0.612500011920929,
332
+ "rewards/chosen": -0.05029296875,
333
+ "rewards/margins": 0.0245361328125,
334
+ "rewards/rejected": -0.07470703125,
335
+ "step": 90
336
+ },
337
+ {
338
+ "epoch": 0.13464235624123422,
339
+ "eval_logits/chosen": -3.203125,
340
+ "eval_logits/rejected": -3.234375,
341
+ "eval_logps/chosen": -328.0,
342
+ "eval_logps/rejected": -286.0,
343
+ "eval_loss": 0.6740643978118896,
344
+ "eval_rewards/accuracies": 0.6702127456665039,
345
+ "eval_rewards/chosen": -0.07177734375,
346
+ "eval_rewards/margins": 0.041015625,
347
+ "eval_rewards/rejected": -0.11279296875,
348
+ "eval_runtime": 66.7185,
349
+ "eval_samples_per_second": 22.288,
350
+ "eval_steps_per_second": 0.704,
351
+ "step": 96
352
+ },
353
+ {
354
+ "epoch": 0.1402524544179523,
355
+ "grad_norm": 11.204042194314384,
356
+ "learning_rate": 9.952993480848836e-08,
357
+ "logits/chosen": -3.15625,
358
+ "logits/rejected": -3.171875,
359
+ "logps/chosen": -344.0,
360
+ "logps/rejected": -316.0,
361
+ "loss": 0.6744,
362
+ "rewards/accuracies": 0.6625000238418579,
363
+ "rewards/chosen": -0.0732421875,
364
+ "rewards/margins": 0.03369140625,
365
+ "rewards/rejected": -0.10693359375,
366
+ "step": 100
367
+ },
368
+ {
369
+ "epoch": 0.1458625525946704,
370
+ "eval_logits/chosen": -3.203125,
371
+ "eval_logits/rejected": -3.234375,
372
+ "eval_logps/chosen": -330.0,
373
+ "eval_logps/rejected": -290.0,
374
+ "eval_loss": 0.6696762442588806,
375
+ "eval_rewards/accuracies": 0.6808510422706604,
376
+ "eval_rewards/chosen": -0.08447265625,
377
+ "eval_rewards/margins": 0.052490234375,
378
+ "eval_rewards/rejected": -0.13671875,
379
+ "eval_runtime": 65.3752,
380
+ "eval_samples_per_second": 22.746,
381
+ "eval_steps_per_second": 0.719,
382
+ "step": 104
383
+ },
384
+ {
385
+ "epoch": 0.15427769985974754,
386
+ "grad_norm": 10.547622004632332,
387
+ "learning_rate": 9.913536023162564e-08,
388
+ "logits/chosen": -3.109375,
389
+ "logits/rejected": -3.15625,
390
+ "logps/chosen": -338.0,
391
+ "logps/rejected": -284.0,
392
+ "loss": 0.6694,
393
+ "rewards/accuracies": 0.6625000238418579,
394
+ "rewards/chosen": -0.0966796875,
395
+ "rewards/margins": 0.034423828125,
396
+ "rewards/rejected": -0.130859375,
397
+ "step": 110
398
+ },
399
+ {
400
+ "epoch": 0.1570827489481066,
401
+ "eval_logits/chosen": -3.1875,
402
+ "eval_logits/rejected": -3.21875,
403
+ "eval_logps/chosen": -332.0,
404
+ "eval_logps/rejected": -292.0,
405
+ "eval_loss": 0.6651813387870789,
406
+ "eval_rewards/accuracies": 0.6808510422706604,
407
+ "eval_rewards/chosen": -0.09912109375,
408
+ "eval_rewards/margins": 0.0625,
409
+ "eval_rewards/rejected": -0.1611328125,
410
+ "eval_runtime": 66.8719,
411
+ "eval_samples_per_second": 22.237,
412
+ "eval_steps_per_second": 0.703,
413
+ "step": 112
414
+ },
415
+ {
416
+ "epoch": 0.16830294530154277,
417
+ "grad_norm": 11.442066252813966,
418
+ "learning_rate": 9.862278323974797e-08,
419
+ "logits/chosen": -3.25,
420
+ "logits/rejected": -3.25,
421
+ "logps/chosen": -332.0,
422
+ "logps/rejected": -266.0,
423
+ "loss": 0.6609,
424
+ "rewards/accuracies": 0.8500000238418579,
425
+ "rewards/chosen": -0.11376953125,
426
+ "rewards/margins": 0.0927734375,
427
+ "rewards/rejected": -0.2060546875,
428
+ "step": 120
429
+ },
430
+ {
431
+ "epoch": 0.16830294530154277,
432
+ "eval_logits/chosen": -3.1875,
433
+ "eval_logits/rejected": -3.21875,
434
+ "eval_logps/chosen": -334.0,
435
+ "eval_logps/rejected": -294.0,
436
+ "eval_loss": 0.6606339812278748,
437
+ "eval_rewards/accuracies": 0.707446813583374,
438
+ "eval_rewards/chosen": -0.11572265625,
439
+ "eval_rewards/margins": 0.07568359375,
440
+ "eval_rewards/rejected": -0.19140625,
441
+ "eval_runtime": 65.1545,
442
+ "eval_samples_per_second": 22.823,
443
+ "eval_steps_per_second": 0.721,
444
+ "step": 120
445
+ },
446
+ {
447
+ "epoch": 0.17952314165497896,
448
+ "eval_logits/chosen": -3.171875,
449
+ "eval_logits/rejected": -3.203125,
450
+ "eval_logps/chosen": -334.0,
451
+ "eval_logps/rejected": -298.0,
452
+ "eval_loss": 0.6557604074478149,
453
+ "eval_rewards/accuracies": 0.7234042286872864,
454
+ "eval_rewards/chosen": -0.126953125,
455
+ "eval_rewards/margins": 0.0888671875,
456
+ "eval_rewards/rejected": -0.2158203125,
457
+ "eval_runtime": 65.0762,
458
+ "eval_samples_per_second": 22.85,
459
+ "eval_steps_per_second": 0.722,
460
+ "step": 128
461
+ },
462
+ {
463
+ "epoch": 0.182328190743338,
464
+ "grad_norm": 10.846341171274817,
465
+ "learning_rate": 9.79934348266374e-08,
466
+ "logits/chosen": -2.984375,
467
+ "logits/rejected": -3.1875,
468
+ "logps/chosen": -390.0,
469
+ "logps/rejected": -352.0,
470
+ "loss": 0.6616,
471
+ "rewards/accuracies": 0.699999988079071,
472
+ "rewards/chosen": -0.11474609375,
473
+ "rewards/margins": 0.08251953125,
474
+ "rewards/rejected": -0.197265625,
475
+ "step": 130
476
+ },
477
+ {
478
+ "epoch": 0.19074333800841514,
479
+ "eval_logits/chosen": -3.171875,
480
+ "eval_logits/rejected": -3.203125,
481
+ "eval_logps/chosen": -334.0,
482
+ "eval_logps/rejected": -298.0,
483
+ "eval_loss": 0.6506365537643433,
484
+ "eval_rewards/accuracies": 0.7234042286872864,
485
+ "eval_rewards/chosen": -0.12890625,
486
+ "eval_rewards/margins": 0.10205078125,
487
+ "eval_rewards/rejected": -0.2314453125,
488
+ "eval_runtime": 65.5912,
489
+ "eval_samples_per_second": 22.671,
490
+ "eval_steps_per_second": 0.717,
491
+ "step": 136
492
+ },
493
+ {
494
+ "epoch": 0.19635343618513323,
495
+ "grad_norm": 11.36989829334774,
496
+ "learning_rate": 9.724882642178755e-08,
497
+ "logits/chosen": -3.140625,
498
+ "logits/rejected": -3.203125,
499
+ "logps/chosen": -314.0,
500
+ "logps/rejected": -296.0,
501
+ "loss": 0.6564,
502
+ "rewards/accuracies": 0.6625000238418579,
503
+ "rewards/chosen": -0.1533203125,
504
+ "rewards/margins": 0.09765625,
505
+ "rewards/rejected": -0.251953125,
506
+ "step": 140
507
+ },
508
+ {
509
+ "epoch": 0.20196353436185133,
510
+ "eval_logits/chosen": -3.171875,
511
+ "eval_logits/rejected": -3.203125,
512
+ "eval_logps/chosen": -334.0,
513
+ "eval_logps/rejected": -300.0,
514
+ "eval_loss": 0.6449890732765198,
515
+ "eval_rewards/accuracies": 0.7234042286872864,
516
+ "eval_rewards/chosen": -0.130859375,
517
+ "eval_rewards/margins": 0.11474609375,
518
+ "eval_rewards/rejected": -0.2451171875,
519
+ "eval_runtime": 65.0,
520
+ "eval_samples_per_second": 22.877,
521
+ "eval_steps_per_second": 0.723,
522
+ "step": 144
523
+ },
524
+ {
525
+ "epoch": 0.21037868162692847,
526
+ "grad_norm": 10.745243280453327,
527
+ "learning_rate": 9.63907462605873e-08,
528
+ "logits/chosen": -3.203125,
529
+ "logits/rejected": -3.140625,
530
+ "logps/chosen": -332.0,
531
+ "logps/rejected": -306.0,
532
+ "loss": 0.6413,
533
+ "rewards/accuracies": 0.737500011920929,
534
+ "rewards/chosen": -0.1513671875,
535
+ "rewards/margins": 0.09423828125,
536
+ "rewards/rejected": -0.24609375,
537
+ "step": 150
538
+ },
539
+ {
540
+ "epoch": 0.2131837307152875,
541
+ "eval_logits/chosen": -3.15625,
542
+ "eval_logits/rejected": -3.1875,
543
+ "eval_logps/chosen": -336.0,
544
+ "eval_logps/rejected": -304.0,
545
+ "eval_loss": 0.6383941173553467,
546
+ "eval_rewards/accuracies": 0.7234042286872864,
547
+ "eval_rewards/chosen": -0.1435546875,
548
+ "eval_rewards/margins": 0.1337890625,
549
+ "eval_rewards/rejected": -0.27734375,
550
+ "eval_runtime": 65.4753,
551
+ "eval_samples_per_second": 22.711,
552
+ "eval_steps_per_second": 0.718,
553
+ "step": 152
554
+ },
555
+ {
556
+ "epoch": 0.2244039270687237,
557
+ "grad_norm": 11.27553371495036,
558
+ "learning_rate": 9.542125508973355e-08,
559
+ "logits/chosen": -3.03125,
560
+ "logits/rejected": -3.078125,
561
+ "logps/chosen": -368.0,
562
+ "logps/rejected": -282.0,
563
+ "loss": 0.6307,
564
+ "rewards/accuracies": 0.75,
565
+ "rewards/chosen": -0.12353515625,
566
+ "rewards/margins": 0.189453125,
567
+ "rewards/rejected": -0.314453125,
568
+ "step": 160
569
+ },
570
+ {
571
+ "epoch": 0.2244039270687237,
572
+ "eval_logits/chosen": -3.15625,
573
+ "eval_logits/rejected": -3.1875,
574
+ "eval_logps/chosen": -336.0,
575
+ "eval_logps/rejected": -306.0,
576
+ "eval_loss": 0.6307098269462585,
577
+ "eval_rewards/accuracies": 0.728723406791687,
578
+ "eval_rewards/chosen": -0.1435546875,
579
+ "eval_rewards/margins": 0.154296875,
580
+ "eval_rewards/rejected": -0.296875,
581
+ "eval_runtime": 65.6298,
582
+ "eval_samples_per_second": 22.657,
583
+ "eval_steps_per_second": 0.716,
584
+ "step": 160
585
+ },
586
+ {
587
+ "epoch": 0.23562412342215988,
588
+ "eval_logits/chosen": -3.140625,
589
+ "eval_logits/rejected": -3.171875,
590
+ "eval_logps/chosen": -332.0,
591
+ "eval_logps/rejected": -304.0,
592
+ "eval_loss": 0.6231443881988525,
593
+ "eval_rewards/accuracies": 0.728723406791687,
594
+ "eval_rewards/chosen": -0.1123046875,
595
+ "eval_rewards/margins": 0.1689453125,
596
+ "eval_rewards/rejected": -0.28125,
597
+ "eval_runtime": 65.3606,
598
+ "eval_samples_per_second": 22.751,
599
+ "eval_steps_per_second": 0.719,
600
+ "step": 168
601
+ },
602
+ {
603
+ "epoch": 0.23842917251051893,
604
+ "grad_norm": 10.59379621805741,
605
+ "learning_rate": 9.434268121818663e-08,
606
+ "logits/chosen": -3.234375,
607
+ "logits/rejected": -3.203125,
608
+ "logps/chosen": -292.0,
609
+ "logps/rejected": -268.0,
610
+ "loss": 0.627,
611
+ "rewards/accuracies": 0.7749999761581421,
612
+ "rewards/chosen": -0.18359375,
613
+ "rewards/margins": 0.16015625,
614
+ "rewards/rejected": -0.34375,
615
+ "step": 170
616
+ },
617
+ {
618
+ "epoch": 0.24684431977559607,
619
+ "eval_logits/chosen": -3.140625,
620
+ "eval_logits/rejected": -3.171875,
621
+ "eval_logps/chosen": -334.0,
622
+ "eval_logps/rejected": -308.0,
623
+ "eval_loss": 0.6149851679801941,
624
+ "eval_rewards/accuracies": 0.7393617033958435,
625
+ "eval_rewards/chosen": -0.126953125,
626
+ "eval_rewards/margins": 0.193359375,
627
+ "eval_rewards/rejected": -0.3203125,
628
+ "eval_runtime": 65.2812,
629
+ "eval_samples_per_second": 22.778,
630
+ "eval_steps_per_second": 0.72,
631
+ "step": 176
632
+ },
633
+ {
634
+ "epoch": 0.25245441795231416,
635
+ "grad_norm": 13.319281173183153,
636
+ "learning_rate": 9.315761492555401e-08,
637
+ "logits/chosen": -2.953125,
638
+ "logits/rejected": -3.03125,
639
+ "logps/chosen": -354.0,
640
+ "logps/rejected": -316.0,
641
+ "loss": 0.6139,
642
+ "rewards/accuracies": 0.699999988079071,
643
+ "rewards/chosen": -0.1259765625,
644
+ "rewards/margins": 0.1943359375,
645
+ "rewards/rejected": -0.3203125,
646
+ "step": 180
647
+ },
648
+ {
649
+ "epoch": 0.25806451612903225,
650
+ "eval_logits/chosen": -3.125,
651
+ "eval_logits/rejected": -3.15625,
652
+ "eval_logps/chosen": -336.0,
653
+ "eval_logps/rejected": -310.0,
654
+ "eval_loss": 0.6060777902603149,
655
+ "eval_rewards/accuracies": 0.7340425252914429,
656
+ "eval_rewards/chosen": -0.1337890625,
657
+ "eval_rewards/margins": 0.2197265625,
658
+ "eval_rewards/rejected": -0.353515625,
659
+ "eval_runtime": 65.7676,
660
+ "eval_samples_per_second": 22.61,
661
+ "eval_steps_per_second": 0.715,
662
+ "step": 184
663
+ },
664
+ {
665
+ "epoch": 0.2664796633941094,
666
+ "grad_norm": 13.439011203083904,
667
+ "learning_rate": 9.186890224133106e-08,
668
+ "logits/chosen": -3.03125,
669
+ "logits/rejected": -3.09375,
670
+ "logps/chosen": -356.0,
671
+ "logps/rejected": -314.0,
672
+ "loss": 0.5955,
673
+ "rewards/accuracies": 0.7250000238418579,
674
+ "rewards/chosen": -0.107421875,
675
+ "rewards/margins": 0.2236328125,
676
+ "rewards/rejected": -0.33203125,
677
+ "step": 190
678
+ },
679
+ {
680
+ "epoch": 0.26928471248246844,
681
+ "eval_logits/chosen": -3.125,
682
+ "eval_logits/rejected": -3.15625,
683
+ "eval_logps/chosen": -338.0,
684
+ "eval_logps/rejected": -316.0,
685
+ "eval_loss": 0.5989786386489868,
686
+ "eval_rewards/accuracies": 0.7446808218955994,
687
+ "eval_rewards/chosen": -0.1572265625,
688
+ "eval_rewards/margins": 0.248046875,
689
+ "eval_rewards/rejected": -0.404296875,
690
+ "eval_runtime": 67.4593,
691
+ "eval_samples_per_second": 22.043,
692
+ "eval_steps_per_second": 0.697,
693
+ "step": 192
694
+ },
695
+ {
696
+ "epoch": 0.2805049088359046,
697
+ "grad_norm": 15.24476795667356,
698
+ "learning_rate": 9.047963810993828e-08,
699
+ "logits/chosen": -2.96875,
700
+ "logits/rejected": -3.21875,
701
+ "logps/chosen": -340.0,
702
+ "logps/rejected": -332.0,
703
+ "loss": 0.5975,
704
+ "rewards/accuracies": 0.800000011920929,
705
+ "rewards/chosen": -0.220703125,
706
+ "rewards/margins": 0.318359375,
707
+ "rewards/rejected": -0.5390625,
708
+ "step": 200
709
+ },
710
+ {
711
+ "epoch": 0.2805049088359046,
712
+ "eval_logits/chosen": -3.125,
713
+ "eval_logits/rejected": -3.15625,
714
+ "eval_logps/chosen": -336.0,
715
+ "eval_logps/rejected": -318.0,
716
+ "eval_loss": 0.591622531414032,
717
+ "eval_rewards/accuracies": 0.728723406791687,
718
+ "eval_rewards/chosen": -0.1455078125,
719
+ "eval_rewards/margins": 0.275390625,
720
+ "eval_rewards/rejected": -0.421875,
721
+ "eval_runtime": 65.7395,
722
+ "eval_samples_per_second": 22.62,
723
+ "eval_steps_per_second": 0.715,
724
+ "step": 200
725
+ },
726
+ {
727
+ "epoch": 0.2917251051893408,
728
+ "eval_logits/chosen": -3.125,
729
+ "eval_logits/rejected": -3.15625,
730
+ "eval_logps/chosen": -334.0,
731
+ "eval_logps/rejected": -318.0,
732
+ "eval_loss": 0.5862420797348022,
733
+ "eval_rewards/accuracies": 0.7180851101875305,
734
+ "eval_rewards/chosen": -0.1279296875,
735
+ "eval_rewards/margins": 0.298828125,
736
+ "eval_rewards/rejected": -0.42578125,
737
+ "eval_runtime": 65.2479,
738
+ "eval_samples_per_second": 22.79,
739
+ "eval_steps_per_second": 0.72,
740
+ "step": 208
741
+ },
742
+ {
743
+ "epoch": 0.29453015427769985,
744
+ "grad_norm": 19.945956726222818,
745
+ "learning_rate": 8.899315895796999e-08,
746
+ "logits/chosen": -3.109375,
747
+ "logits/rejected": -3.171875,
748
+ "logps/chosen": -376.0,
749
+ "logps/rejected": -312.0,
750
+ "loss": 0.5887,
751
+ "rewards/accuracies": 0.800000011920929,
752
+ "rewards/chosen": -0.189453125,
753
+ "rewards/margins": 0.369140625,
754
+ "rewards/rejected": -0.55859375,
755
+ "step": 210
756
+ },
757
+ {
758
+ "epoch": 0.302945301542777,
759
+ "eval_logits/chosen": -3.125,
760
+ "eval_logits/rejected": -3.15625,
761
+ "eval_logps/chosen": -334.0,
762
+ "eval_logps/rejected": -318.0,
763
+ "eval_loss": 0.5828134417533875,
764
+ "eval_rewards/accuracies": 0.728723406791687,
765
+ "eval_rewards/chosen": -0.1171875,
766
+ "eval_rewards/margins": 0.310546875,
767
+ "eval_rewards/rejected": -0.427734375,
768
+ "eval_runtime": 65.4231,
769
+ "eval_samples_per_second": 22.729,
770
+ "eval_steps_per_second": 0.718,
771
+ "step": 216
772
+ },
773
+ {
774
+ "epoch": 0.3085553997194951,
775
+ "grad_norm": 13.610682023585936,
776
+ "learning_rate": 8.741303468150459e-08,
777
+ "logits/chosen": -3.140625,
778
+ "logits/rejected": -3.0625,
779
+ "logps/chosen": -262.0,
780
+ "logps/rejected": -272.0,
781
+ "loss": 0.5819,
782
+ "rewards/accuracies": 0.875,
783
+ "rewards/chosen": -0.13671875,
784
+ "rewards/margins": 0.3515625,
785
+ "rewards/rejected": -0.486328125,
786
+ "step": 220
787
+ },
788
+ {
789
+ "epoch": 0.3141654978962132,
790
+ "eval_logits/chosen": -3.125,
791
+ "eval_logits/rejected": -3.15625,
792
+ "eval_logps/chosen": -334.0,
793
+ "eval_logps/rejected": -320.0,
794
+ "eval_loss": 0.5790321826934814,
795
+ "eval_rewards/accuracies": 0.7446808218955994,
796
+ "eval_rewards/chosen": -0.126953125,
797
+ "eval_rewards/margins": 0.322265625,
798
+ "eval_rewards/rejected": -0.44921875,
799
+ "eval_runtime": 65.6686,
800
+ "eval_samples_per_second": 22.644,
801
+ "eval_steps_per_second": 0.716,
802
+ "step": 224
803
+ },
804
+ {
805
+ "epoch": 0.3225806451612903,
806
+ "grad_norm": 16.393253456859437,
807
+ "learning_rate": 8.574306007271956e-08,
808
+ "logits/chosen": -3.09375,
809
+ "logits/rejected": -3.203125,
810
+ "logps/chosen": -372.0,
811
+ "logps/rejected": -344.0,
812
+ "loss": 0.5694,
813
+ "rewards/accuracies": 0.762499988079071,
814
+ "rewards/chosen": -0.14453125,
815
+ "rewards/margins": 0.345703125,
816
+ "rewards/rejected": -0.490234375,
817
+ "step": 230
818
+ },
819
+ {
820
+ "epoch": 0.32538569424964936,
821
+ "eval_logits/chosen": -3.125,
822
+ "eval_logits/rejected": -3.15625,
823
+ "eval_logps/chosen": -340.0,
824
+ "eval_logps/rejected": -328.0,
825
+ "eval_loss": 0.5725518465042114,
826
+ "eval_rewards/accuracies": 0.7180851101875305,
827
+ "eval_rewards/chosen": -0.1748046875,
828
+ "eval_rewards/margins": 0.34765625,
829
+ "eval_rewards/rejected": -0.5234375,
830
+ "eval_runtime": 65.6729,
831
+ "eval_samples_per_second": 22.643,
832
+ "eval_steps_per_second": 0.716,
833
+ "step": 232
834
+ },
835
+ {
836
+ "epoch": 0.33660589060308554,
837
+ "grad_norm": 20.54581221908109,
838
+ "learning_rate": 8.398724570640106e-08,
839
+ "logits/chosen": -3.078125,
840
+ "logits/rejected": -3.0,
841
+ "logps/chosen": -386.0,
842
+ "logps/rejected": -390.0,
843
+ "loss": 0.5678,
844
+ "rewards/accuracies": 0.7875000238418579,
845
+ "rewards/chosen": -0.1826171875,
846
+ "rewards/margins": 0.443359375,
847
+ "rewards/rejected": -0.625,
848
+ "step": 240
849
+ },
850
+ {
851
+ "epoch": 0.33660589060308554,
852
+ "eval_logits/chosen": -3.125,
853
+ "eval_logits/rejected": -3.140625,
854
+ "eval_logps/chosen": -342.0,
855
+ "eval_logps/rejected": -332.0,
856
+ "eval_loss": 0.5683075785636902,
857
+ "eval_rewards/accuracies": 0.7180851101875305,
858
+ "eval_rewards/chosen": -0.1943359375,
859
+ "eval_rewards/margins": 0.3671875,
860
+ "eval_rewards/rejected": -0.5625,
861
+ "eval_runtime": 66.0149,
862
+ "eval_samples_per_second": 22.525,
863
+ "eval_steps_per_second": 0.712,
864
+ "step": 240
865
+ },
866
+ {
867
+ "epoch": 0.34782608695652173,
868
+ "eval_logits/chosen": -3.125,
869
+ "eval_logits/rejected": -3.140625,
870
+ "eval_logps/chosen": -338.0,
871
+ "eval_logps/rejected": -330.0,
872
+ "eval_loss": 0.5656094551086426,
873
+ "eval_rewards/accuracies": 0.7606382966041565,
874
+ "eval_rewards/chosen": -0.16796875,
875
+ "eval_rewards/margins": 0.384765625,
876
+ "eval_rewards/rejected": -0.5546875,
877
+ "eval_runtime": 65.715,
878
+ "eval_samples_per_second": 22.628,
879
+ "eval_steps_per_second": 0.715,
880
+ "step": 248
881
+ },
882
+ {
883
+ "epoch": 0.3506311360448808,
884
+ "grad_norm": 18.794252770652808,
885
+ "learning_rate": 8.214980830823428e-08,
886
+ "logits/chosen": -3.03125,
887
+ "logits/rejected": -3.140625,
888
+ "logps/chosen": -306.0,
889
+ "logps/rejected": -288.0,
890
+ "loss": 0.5712,
891
+ "rewards/accuracies": 0.7875000238418579,
892
+ "rewards/chosen": -0.2109375,
893
+ "rewards/margins": 0.373046875,
894
+ "rewards/rejected": -0.5859375,
895
+ "step": 250
896
+ },
897
+ {
898
+ "epoch": 0.3590462833099579,
899
+ "eval_logits/chosen": -3.125,
900
+ "eval_logits/rejected": -3.140625,
901
+ "eval_logps/chosen": -336.0,
902
+ "eval_logps/rejected": -330.0,
903
+ "eval_loss": 0.5647580027580261,
904
+ "eval_rewards/accuracies": 0.7819148898124695,
905
+ "eval_rewards/chosen": -0.142578125,
906
+ "eval_rewards/margins": 0.392578125,
907
+ "eval_rewards/rejected": -0.53515625,
908
+ "eval_runtime": 66.6553,
909
+ "eval_samples_per_second": 22.309,
910
+ "eval_steps_per_second": 0.705,
911
+ "step": 256
912
+ },
913
+ {
914
+ "epoch": 0.364656381486676,
915
+ "grad_norm": 12.474351601104692,
916
+ "learning_rate": 8.02351606280068e-08,
917
+ "logits/chosen": -3.09375,
918
+ "logits/rejected": -3.078125,
919
+ "logps/chosen": -356.0,
920
+ "logps/rejected": -378.0,
921
+ "loss": 0.5516,
922
+ "rewards/accuracies": 0.7749999761581421,
923
+ "rewards/chosen": -0.212890625,
924
+ "rewards/margins": 0.435546875,
925
+ "rewards/rejected": -0.6484375,
926
+ "step": 260
927
+ },
928
+ {
929
+ "epoch": 0.3702664796633941,
930
+ "eval_logits/chosen": -3.109375,
931
+ "eval_logits/rejected": -3.140625,
932
+ "eval_logps/chosen": -334.0,
933
+ "eval_logps/rejected": -326.0,
934
+ "eval_loss": 0.5650196671485901,
935
+ "eval_rewards/accuracies": 0.771276593208313,
936
+ "eval_rewards/chosen": -0.11376953125,
937
+ "eval_rewards/margins": 0.392578125,
938
+ "eval_rewards/rejected": -0.5078125,
939
+ "eval_runtime": 65.7734,
940
+ "eval_samples_per_second": 22.608,
941
+ "eval_steps_per_second": 0.715,
942
+ "step": 264
943
+ },
944
+ {
945
+ "epoch": 0.37868162692847124,
946
+ "grad_norm": 16.401892485604222,
947
+ "learning_rate": 7.824790084204426e-08,
948
+ "logits/chosen": -3.0625,
949
+ "logits/rejected": -3.140625,
950
+ "logps/chosen": -342.0,
951
+ "logps/rejected": -382.0,
952
+ "loss": 0.5561,
953
+ "rewards/accuracies": 0.75,
954
+ "rewards/chosen": -0.11962890625,
955
+ "rewards/margins": 0.5,
956
+ "rewards/rejected": -0.62109375,
957
+ "step": 270
958
+ },
959
+ {
960
+ "epoch": 0.3814866760168303,
961
+ "eval_logits/chosen": -3.109375,
962
+ "eval_logits/rejected": -3.140625,
963
+ "eval_logps/chosen": -336.0,
964
+ "eval_logps/rejected": -332.0,
965
+ "eval_loss": 0.5578521490097046,
966
+ "eval_rewards/accuracies": 0.7659574747085571,
967
+ "eval_rewards/chosen": -0.1416015625,
968
+ "eval_rewards/margins": 0.421875,
969
+ "eval_rewards/rejected": -0.5625,
970
+ "eval_runtime": 65.6246,
971
+ "eval_samples_per_second": 22.659,
972
+ "eval_steps_per_second": 0.716,
973
+ "step": 272
974
+ },
975
+ {
976
+ "epoch": 0.39270687237026647,
977
+ "grad_norm": 15.668844063971891,
978
+ "learning_rate": 7.619280151032996e-08,
979
+ "logits/chosen": -3.078125,
980
+ "logits/rejected": -3.125,
981
+ "logps/chosen": -308.0,
982
+ "logps/rejected": -310.0,
983
+ "loss": 0.5511,
984
+ "rewards/accuracies": 0.75,
985
+ "rewards/chosen": -0.2041015625,
986
+ "rewards/margins": 0.412109375,
987
+ "rewards/rejected": -0.6171875,
988
+ "step": 280
989
+ },
990
+ {
991
+ "epoch": 0.39270687237026647,
992
+ "eval_logits/chosen": -3.109375,
993
+ "eval_logits/rejected": -3.140625,
994
+ "eval_logps/chosen": -338.0,
995
+ "eval_logps/rejected": -336.0,
996
+ "eval_loss": 0.5538753867149353,
997
+ "eval_rewards/accuracies": 0.7659574747085571,
998
+ "eval_rewards/chosen": -0.15625,
999
+ "eval_rewards/margins": 0.447265625,
1000
+ "eval_rewards/rejected": -0.6015625,
1001
+ "eval_runtime": 66.0541,
1002
+ "eval_samples_per_second": 22.512,
1003
+ "eval_steps_per_second": 0.712,
1004
+ "step": 280
1005
+ },
1006
+ {
1007
+ "epoch": 0.40392706872370265,
1008
+ "eval_logits/chosen": -3.109375,
1009
+ "eval_logits/rejected": -3.125,
1010
+ "eval_logps/chosen": -340.0,
1011
+ "eval_logps/rejected": -340.0,
1012
+ "eval_loss": 0.5501448512077332,
1013
+ "eval_rewards/accuracies": 0.771276593208313,
1014
+ "eval_rewards/chosen": -0.18359375,
1015
+ "eval_rewards/margins": 0.462890625,
1016
+ "eval_rewards/rejected": -0.6484375,
1017
+ "eval_runtime": 65.3015,
1018
+ "eval_samples_per_second": 22.771,
1019
+ "eval_steps_per_second": 0.72,
1020
+ "step": 288
1021
+ },
1022
+ {
1023
+ "epoch": 0.4067321178120617,
1024
+ "grad_norm": 16.413044382075544,
1025
+ "learning_rate": 7.407479811482827e-08,
1026
+ "logits/chosen": -3.09375,
1027
+ "logits/rejected": -3.046875,
1028
+ "logps/chosen": -364.0,
1029
+ "logps/rejected": -338.0,
1030
+ "loss": 0.5385,
1031
+ "rewards/accuracies": 0.7875000238418579,
1032
+ "rewards/chosen": -0.1630859375,
1033
+ "rewards/margins": 0.45703125,
1034
+ "rewards/rejected": -0.62109375,
1035
+ "step": 290
1036
+ },
1037
+ {
1038
+ "epoch": 0.41514726507713884,
1039
+ "eval_logits/chosen": -3.109375,
1040
+ "eval_logits/rejected": -3.125,
1041
+ "eval_logps/chosen": -344.0,
1042
+ "eval_logps/rejected": -344.0,
1043
+ "eval_loss": 0.5482958555221558,
1044
+ "eval_rewards/accuracies": 0.75,
1045
+ "eval_rewards/chosen": -0.2197265625,
1046
+ "eval_rewards/margins": 0.466796875,
1047
+ "eval_rewards/rejected": -0.6875,
1048
+ "eval_runtime": 65.3898,
1049
+ "eval_samples_per_second": 22.741,
1050
+ "eval_steps_per_second": 0.719,
1051
+ "step": 296
1052
+ },
1053
+ {
1054
+ "epoch": 0.42075736325385693,
1055
+ "grad_norm": 19.32972616150208,
1056
+ "learning_rate": 7.189897720653835e-08,
1057
+ "logits/chosen": -3.015625,
1058
+ "logits/rejected": -3.109375,
1059
+ "logps/chosen": -338.0,
1060
+ "logps/rejected": -322.0,
1061
+ "loss": 0.5486,
1062
+ "rewards/accuracies": 0.8374999761581421,
1063
+ "rewards/chosen": -0.30859375,
1064
+ "rewards/margins": 0.455078125,
1065
+ "rewards/rejected": -0.765625,
1066
+ "step": 300
1067
+ },
1068
+ {
1069
+ "epoch": 0.426367461430575,
1070
+ "eval_logits/chosen": -3.109375,
1071
+ "eval_logits/rejected": -3.125,
1072
+ "eval_logps/chosen": -340.0,
1073
+ "eval_logps/rejected": -342.0,
1074
+ "eval_loss": 0.5460695028305054,
1075
+ "eval_rewards/accuracies": 0.7659574747085571,
1076
+ "eval_rewards/chosen": -0.1884765625,
1077
+ "eval_rewards/margins": 0.482421875,
1078
+ "eval_rewards/rejected": -0.671875,
1079
+ "eval_runtime": 65.6401,
1080
+ "eval_samples_per_second": 22.654,
1081
+ "eval_steps_per_second": 0.716,
1082
+ "step": 304
1083
+ },
1084
+ {
1085
+ "epoch": 0.43478260869565216,
1086
+ "grad_norm": 20.84038979708752,
1087
+ "learning_rate": 6.967056418974356e-08,
1088
+ "logits/chosen": -3.078125,
1089
+ "logits/rejected": -3.09375,
1090
+ "logps/chosen": -382.0,
1091
+ "logps/rejected": -366.0,
1092
+ "loss": 0.5263,
1093
+ "rewards/accuracies": 0.8500000238418579,
1094
+ "rewards/chosen": -0.27734375,
1095
+ "rewards/margins": 0.5234375,
1096
+ "rewards/rejected": -0.80078125,
1097
+ "step": 310
1098
+ },
1099
+ {
1100
+ "epoch": 0.4375876577840112,
1101
+ "eval_logits/chosen": -3.09375,
1102
+ "eval_logits/rejected": -3.125,
1103
+ "eval_logps/chosen": -344.0,
1104
+ "eval_logps/rejected": -346.0,
1105
+ "eval_loss": 0.5433906316757202,
1106
+ "eval_rewards/accuracies": 0.7659574747085571,
1107
+ "eval_rewards/chosen": -0.21484375,
1108
+ "eval_rewards/margins": 0.490234375,
1109
+ "eval_rewards/rejected": -0.703125,
1110
+ "eval_runtime": 65.6624,
1111
+ "eval_samples_per_second": 22.646,
1112
+ "eval_steps_per_second": 0.716,
1113
+ "step": 312
1114
+ },
1115
+ {
1116
+ "epoch": 0.4488078541374474,
1117
+ "grad_norm": 19.39801079549374,
1118
+ "learning_rate": 6.739491077279388e-08,
1119
+ "logits/chosen": -3.09375,
1120
+ "logits/rejected": -3.171875,
1121
+ "logps/chosen": -418.0,
1122
+ "logps/rejected": -354.0,
1123
+ "loss": 0.539,
1124
+ "rewards/accuracies": 0.737500011920929,
1125
+ "rewards/chosen": -0.26171875,
1126
+ "rewards/margins": 0.49609375,
1127
+ "rewards/rejected": -0.7578125,
1128
+ "step": 320
1129
+ },
1130
+ {
1131
+ "epoch": 0.4488078541374474,
1132
+ "eval_logits/chosen": -3.09375,
1133
+ "eval_logits/rejected": -3.125,
1134
+ "eval_logps/chosen": -346.0,
1135
+ "eval_logps/rejected": -350.0,
1136
+ "eval_loss": 0.5414044260978699,
1137
+ "eval_rewards/accuracies": 0.7553191781044006,
1138
+ "eval_rewards/chosen": -0.2490234375,
1139
+ "eval_rewards/margins": 0.48828125,
1140
+ "eval_rewards/rejected": -0.73828125,
1141
+ "eval_runtime": 65.8794,
1142
+ "eval_samples_per_second": 22.572,
1143
+ "eval_steps_per_second": 0.713,
1144
+ "step": 320
1145
+ },
1146
+ {
1147
+ "epoch": 0.4600280504908836,
1148
+ "eval_logits/chosen": -3.09375,
1149
+ "eval_logits/rejected": -3.125,
1150
+ "eval_logps/chosen": -336.0,
1151
+ "eval_logps/rejected": -342.0,
1152
+ "eval_loss": 0.5408417582511902,
1153
+ "eval_rewards/accuracies": 0.7872340679168701,
1154
+ "eval_rewards/chosen": -0.1474609375,
1155
+ "eval_rewards/margins": 0.51171875,
1156
+ "eval_rewards/rejected": -0.65625,
1157
+ "eval_runtime": 65.2872,
1158
+ "eval_samples_per_second": 22.776,
1159
+ "eval_steps_per_second": 0.72,
1160
+ "step": 328
1161
+ },
1162
+ {
1163
+ "epoch": 0.4628330995792426,
1164
+ "grad_norm": 25.683811989115487,
1165
+ "learning_rate": 6.507748211555935e-08,
1166
+ "logits/chosen": -3.0,
1167
+ "logits/rejected": -3.046875,
1168
+ "logps/chosen": -364.0,
1169
+ "logps/rejected": -356.0,
1170
+ "loss": 0.5491,
1171
+ "rewards/accuracies": 0.862500011920929,
1172
+ "rewards/chosen": -0.220703125,
1173
+ "rewards/margins": 0.578125,
1174
+ "rewards/rejected": -0.796875,
1175
+ "step": 330
1176
+ },
1177
+ {
1178
+ "epoch": 0.47124824684431976,
1179
+ "eval_logits/chosen": -3.09375,
1180
+ "eval_logits/rejected": -3.125,
1181
+ "eval_logps/chosen": -344.0,
1182
+ "eval_logps/rejected": -348.0,
1183
+ "eval_loss": 0.5370554327964783,
1184
+ "eval_rewards/accuracies": 0.771276593208313,
1185
+ "eval_rewards/chosen": -0.21875,
1186
+ "eval_rewards/margins": 0.51953125,
1187
+ "eval_rewards/rejected": -0.73828125,
1188
+ "eval_runtime": 65.8828,
1189
+ "eval_samples_per_second": 22.57,
1190
+ "eval_steps_per_second": 0.713,
1191
+ "step": 336
1192
+ },
1193
+ {
1194
+ "epoch": 0.47685834502103785,
1195
+ "grad_norm": 25.38527516497017,
1196
+ "learning_rate": 6.272384370442064e-08,
1197
+ "logits/chosen": -2.953125,
1198
+ "logits/rejected": -2.96875,
1199
+ "logps/chosen": -332.0,
1200
+ "logps/rejected": -362.0,
1201
+ "loss": 0.525,
1202
+ "rewards/accuracies": 0.7875000238418579,
1203
+ "rewards/chosen": -0.31640625,
1204
+ "rewards/margins": 0.439453125,
1205
+ "rewards/rejected": -0.75390625,
1206
+ "step": 340
1207
+ },
1208
+ {
1209
+ "epoch": 0.48246844319775595,
1210
+ "eval_logits/chosen": -3.09375,
1211
+ "eval_logits/rejected": -3.109375,
1212
+ "eval_logps/chosen": -350.0,
1213
+ "eval_logps/rejected": -354.0,
1214
+ "eval_loss": 0.5372266173362732,
1215
+ "eval_rewards/accuracies": 0.7606382966041565,
1216
+ "eval_rewards/chosen": -0.28125,
1217
+ "eval_rewards/margins": 0.51171875,
1218
+ "eval_rewards/rejected": -0.79296875,
1219
+ "eval_runtime": 65.5252,
1220
+ "eval_samples_per_second": 22.694,
1221
+ "eval_steps_per_second": 0.717,
1222
+ "step": 344
1223
+ },
1224
+ {
1225
+ "epoch": 0.4908835904628331,
1226
+ "grad_norm": 20.41249212586205,
1227
+ "learning_rate": 6.033964798631775e-08,
1228
+ "logits/chosen": -3.0,
1229
+ "logits/rejected": -3.046875,
1230
+ "logps/chosen": -348.0,
1231
+ "logps/rejected": -356.0,
1232
+ "loss": 0.5509,
1233
+ "rewards/accuracies": 0.699999988079071,
1234
+ "rewards/chosen": -0.4375,
1235
+ "rewards/margins": 0.3828125,
1236
+ "rewards/rejected": -0.8203125,
1237
+ "step": 350
1238
+ },
1239
+ {
1240
+ "epoch": 0.49368863955119213,
1241
+ "eval_logits/chosen": -3.09375,
1242
+ "eval_logits/rejected": -3.125,
1243
+ "eval_logps/chosen": -338.0,
1244
+ "eval_logps/rejected": -344.0,
1245
+ "eval_loss": 0.53655606508255,
1246
+ "eval_rewards/accuracies": 0.7978723645210266,
1247
+ "eval_rewards/chosen": -0.1572265625,
1248
+ "eval_rewards/margins": 0.53515625,
1249
+ "eval_rewards/rejected": -0.6953125,
1250
+ "eval_runtime": 65.8155,
1251
+ "eval_samples_per_second": 22.593,
1252
+ "eval_steps_per_second": 0.714,
1253
+ "step": 352
1254
+ },
1255
+ {
1256
+ "epoch": 0.5049088359046283,
1257
+ "grad_norm": 26.68036848508083,
1258
+ "learning_rate": 5.793062079395602e-08,
1259
+ "logits/chosen": -3.078125,
1260
+ "logits/rejected": -3.140625,
1261
+ "logps/chosen": -384.0,
1262
+ "logps/rejected": -362.0,
1263
+ "loss": 0.5342,
1264
+ "rewards/accuracies": 0.7875000238418579,
1265
+ "rewards/chosen": -0.169921875,
1266
+ "rewards/margins": 0.5078125,
1267
+ "rewards/rejected": -0.67578125,
1268
+ "step": 360
1269
+ },
1270
+ {
1271
+ "epoch": 0.5049088359046283,
1272
+ "eval_logits/chosen": -3.09375,
1273
+ "eval_logits/rejected": -3.125,
1274
+ "eval_logps/chosen": -338.0,
1275
+ "eval_logps/rejected": -346.0,
1276
+ "eval_loss": 0.5345030426979065,
1277
+ "eval_rewards/accuracies": 0.7978723645210266,
1278
+ "eval_rewards/chosen": -0.162109375,
1279
+ "eval_rewards/margins": 0.546875,
1280
+ "eval_rewards/rejected": -0.70703125,
1281
+ "eval_runtime": 65.1382,
1282
+ "eval_samples_per_second": 22.828,
1283
+ "eval_steps_per_second": 0.722,
1284
+ "step": 360
1285
+ },
1286
+ {
1287
+ "epoch": 0.5161290322580645,
1288
+ "eval_logits/chosen": -3.09375,
1289
+ "eval_logits/rejected": -3.109375,
1290
+ "eval_logps/chosen": -350.0,
1291
+ "eval_logps/rejected": -358.0,
1292
+ "eval_loss": 0.5322470664978027,
1293
+ "eval_rewards/accuracies": 0.7765957713127136,
1294
+ "eval_rewards/chosen": -0.2890625,
1295
+ "eval_rewards/margins": 0.53515625,
1296
+ "eval_rewards/rejected": -0.82421875,
1297
+ "eval_runtime": 65.4517,
1298
+ "eval_samples_per_second": 22.719,
1299
+ "eval_steps_per_second": 0.718,
1300
+ "step": 368
1301
+ },
1302
+ {
1303
+ "epoch": 0.5189340813464236,
1304
+ "grad_norm": 35.318750139832915,
1305
+ "learning_rate": 5.550254759477064e-08,
1306
+ "logits/chosen": -3.0625,
1307
+ "logits/rejected": -3.0625,
1308
+ "logps/chosen": -342.0,
1309
+ "logps/rejected": -386.0,
1310
+ "loss": 0.5392,
1311
+ "rewards/accuracies": 0.737500011920929,
1312
+ "rewards/chosen": -0.328125,
1313
+ "rewards/margins": 0.4609375,
1314
+ "rewards/rejected": -0.7890625,
1315
+ "step": 370
1316
+ },
1317
+ {
1318
+ "epoch": 0.5273492286115007,
1319
+ "eval_logits/chosen": -3.09375,
1320
+ "eval_logits/rejected": -3.109375,
1321
+ "eval_logps/chosen": -342.0,
1322
+ "eval_logps/rejected": -352.0,
1323
+ "eval_loss": 0.5290676951408386,
1324
+ "eval_rewards/accuracies": 0.7819148898124695,
1325
+ "eval_rewards/chosen": -0.2109375,
1326
+ "eval_rewards/margins": 0.5625,
1327
+ "eval_rewards/rejected": -0.7734375,
1328
+ "eval_runtime": 66.0225,
1329
+ "eval_samples_per_second": 22.523,
1330
+ "eval_steps_per_second": 0.712,
1331
+ "step": 376
1332
+ },
1333
+ {
1334
+ "epoch": 0.5329593267882188,
1335
+ "grad_norm": 25.630480455788526,
1336
+ "learning_rate": 5.3061259596673514e-08,
1337
+ "logits/chosen": -2.875,
1338
+ "logits/rejected": -2.984375,
1339
+ "logps/chosen": -334.0,
1340
+ "logps/rejected": -384.0,
1341
+ "loss": 0.5275,
1342
+ "rewards/accuracies": 0.824999988079071,
1343
+ "rewards/chosen": -0.32421875,
1344
+ "rewards/margins": 0.52734375,
1345
+ "rewards/rejected": -0.8515625,
1346
+ "step": 380
1347
+ },
1348
+ {
1349
+ "epoch": 0.5385694249649369,
1350
+ "eval_logits/chosen": -3.09375,
1351
+ "eval_logits/rejected": -3.125,
1352
+ "eval_logps/chosen": -342.0,
1353
+ "eval_logps/rejected": -350.0,
1354
+ "eval_loss": 0.5304205417633057,
1355
+ "eval_rewards/accuracies": 0.7872340679168701,
1356
+ "eval_rewards/chosen": -0.197265625,
1357
+ "eval_rewards/margins": 0.5546875,
1358
+ "eval_rewards/rejected": -0.75390625,
1359
+ "eval_runtime": 66.8954,
1360
+ "eval_samples_per_second": 22.229,
1361
+ "eval_steps_per_second": 0.703,
1362
+ "step": 384
1363
+ },
1364
+ {
1365
+ "epoch": 0.5469845722300141,
1366
+ "grad_norm": 19.177955351319696,
1367
+ "learning_rate": 5.061261974395087e-08,
1368
+ "logits/chosen": -3.0625,
1369
+ "logits/rejected": -3.078125,
1370
+ "logps/chosen": -340.0,
1371
+ "logps/rejected": -370.0,
1372
+ "loss": 0.5228,
1373
+ "rewards/accuracies": 0.824999988079071,
1374
+ "rewards/chosen": -0.21875,
1375
+ "rewards/margins": 0.578125,
1376
+ "rewards/rejected": -0.80078125,
1377
+ "step": 390
1378
+ },
1379
+ {
1380
+ "epoch": 0.5497896213183731,
1381
+ "eval_logits/chosen": -3.09375,
1382
+ "eval_logits/rejected": -3.125,
1383
+ "eval_logps/chosen": -346.0,
1384
+ "eval_logps/rejected": -356.0,
1385
+ "eval_loss": 0.5286569595336914,
1386
+ "eval_rewards/accuracies": 0.771276593208313,
1387
+ "eval_rewards/chosen": -0.2490234375,
1388
+ "eval_rewards/margins": 0.55859375,
1389
+ "eval_rewards/rejected": -0.80859375,
1390
+ "eval_runtime": 66.2711,
1391
+ "eval_samples_per_second": 22.438,
1392
+ "eval_steps_per_second": 0.709,
1393
+ "step": 392
1394
+ },
1395
+ {
1396
+ "epoch": 0.5610098176718092,
1397
+ "grad_norm": 22.280250814675583,
1398
+ "learning_rate": 4.81625086369435e-08,
1399
+ "logits/chosen": -3.0625,
1400
+ "logits/rejected": -3.125,
1401
+ "logps/chosen": -318.0,
1402
+ "logps/rejected": -350.0,
1403
+ "loss": 0.5174,
1404
+ "rewards/accuracies": 0.762499988079071,
1405
+ "rewards/chosen": -0.3125,
1406
+ "rewards/margins": 0.53125,
1407
+ "rewards/rejected": -0.84375,
1408
+ "step": 400
1409
+ },
1410
+ {
1411
+ "epoch": 0.5610098176718092,
1412
+ "eval_logits/chosen": -3.09375,
1413
+ "eval_logits/rejected": -3.125,
1414
+ "eval_logps/chosen": -346.0,
1415
+ "eval_logps/rejected": -358.0,
1416
+ "eval_loss": 0.5268398523330688,
1417
+ "eval_rewards/accuracies": 0.771276593208313,
1418
+ "eval_rewards/chosen": -0.251953125,
1419
+ "eval_rewards/margins": 0.56640625,
1420
+ "eval_rewards/rejected": -0.8203125,
1421
+ "eval_runtime": 65.9859,
1422
+ "eval_samples_per_second": 22.535,
1423
+ "eval_steps_per_second": 0.712,
1424
+ "step": 400
1425
+ },
1426
+ {
1427
+ "epoch": 0.5722300140252454,
1428
+ "eval_logits/chosen": -3.09375,
1429
+ "eval_logits/rejected": -3.125,
1430
+ "eval_logps/chosen": -344.0,
1431
+ "eval_logps/rejected": -356.0,
1432
+ "eval_loss": 0.5255098938941956,
1433
+ "eval_rewards/accuracies": 0.771276593208313,
1434
+ "eval_rewards/chosen": -0.2265625,
1435
+ "eval_rewards/margins": 0.578125,
1436
+ "eval_rewards/rejected": -0.8046875,
1437
+ "eval_runtime": 65.6155,
1438
+ "eval_samples_per_second": 22.662,
1439
+ "eval_steps_per_second": 0.716,
1440
+ "step": 408
1441
+ },
1442
+ {
1443
+ "epoch": 0.5750350631136045,
1444
+ "grad_norm": 18.983834453019576,
1445
+ "learning_rate": 4.571681040932457e-08,
1446
+ "logits/chosen": -3.046875,
1447
+ "logits/rejected": -3.09375,
1448
+ "logps/chosen": -350.0,
1449
+ "logps/rejected": -346.0,
1450
+ "loss": 0.5184,
1451
+ "rewards/accuracies": 0.862500011920929,
1452
+ "rewards/chosen": -0.27734375,
1453
+ "rewards/margins": 0.59375,
1454
+ "rewards/rejected": -0.8671875,
1455
+ "step": 410
1456
+ },
1457
+ {
1458
+ "epoch": 0.5834502103786816,
1459
+ "eval_logits/chosen": -3.09375,
1460
+ "eval_logits/rejected": -3.109375,
1461
+ "eval_logps/chosen": -350.0,
1462
+ "eval_logps/rejected": -362.0,
1463
+ "eval_loss": 0.524753212928772,
1464
+ "eval_rewards/accuracies": 0.7659574747085571,
1465
+ "eval_rewards/chosen": -0.27734375,
1466
+ "eval_rewards/margins": 0.5859375,
1467
+ "eval_rewards/rejected": -0.859375,
1468
+ "eval_runtime": 65.6143,
1469
+ "eval_samples_per_second": 22.663,
1470
+ "eval_steps_per_second": 0.716,
1471
+ "step": 416
1472
+ },
1473
+ {
1474
+ "epoch": 0.5890603085553997,
1475
+ "grad_norm": 25.592693204204966,
1476
+ "learning_rate": 4.3281398596891846e-08,
1477
+ "logits/chosen": -2.96875,
1478
+ "logits/rejected": -3.046875,
1479
+ "logps/chosen": -354.0,
1480
+ "logps/rejected": -374.0,
1481
+ "loss": 0.5095,
1482
+ "rewards/accuracies": 0.8374999761581421,
1483
+ "rewards/chosen": -0.236328125,
1484
+ "rewards/margins": 0.7421875,
1485
+ "rewards/rejected": -0.9765625,
1486
+ "step": 420
1487
+ },
1488
+ {
1489
+ "epoch": 0.5946704067321178,
1490
+ "eval_logits/chosen": -3.09375,
1491
+ "eval_logits/rejected": -3.109375,
1492
+ "eval_logps/chosen": -348.0,
1493
+ "eval_logps/rejected": -362.0,
1494
+ "eval_loss": 0.523405909538269,
1495
+ "eval_rewards/accuracies": 0.7819148898124695,
1496
+ "eval_rewards/chosen": -0.271484375,
1497
+ "eval_rewards/margins": 0.59765625,
1498
+ "eval_rewards/rejected": -0.8671875,
1499
+ "eval_runtime": 65.4312,
1500
+ "eval_samples_per_second": 22.726,
1501
+ "eval_steps_per_second": 0.718,
1502
+ "step": 424
1503
+ },
1504
+ {
1505
+ "epoch": 0.603085553997195,
1506
+ "grad_norm": 20.945961565616308,
1507
+ "learning_rate": 4.0862122031811584e-08,
1508
+ "logits/chosen": -3.09375,
1509
+ "logits/rejected": -2.96875,
1510
+ "logps/chosen": -398.0,
1511
+ "logps/rejected": -406.0,
1512
+ "loss": 0.5259,
1513
+ "rewards/accuracies": 0.875,
1514
+ "rewards/chosen": -0.2314453125,
1515
+ "rewards/margins": 0.66015625,
1516
+ "rewards/rejected": -0.890625,
1517
+ "step": 430
1518
+ },
1519
+ {
1520
+ "epoch": 0.605890603085554,
1521
+ "eval_logits/chosen": -3.09375,
1522
+ "eval_logits/rejected": -3.109375,
1523
+ "eval_logps/chosen": -346.0,
1524
+ "eval_logps/rejected": -360.0,
1525
+ "eval_loss": 0.5217041373252869,
1526
+ "eval_rewards/accuracies": 0.7819148898124695,
1527
+ "eval_rewards/chosen": -0.2412109375,
1528
+ "eval_rewards/margins": 0.60546875,
1529
+ "eval_rewards/rejected": -0.84765625,
1530
+ "eval_runtime": 66.0803,
1531
+ "eval_samples_per_second": 22.503,
1532
+ "eval_steps_per_second": 0.711,
1533
+ "step": 432
1534
+ },
1535
+ {
1536
+ "epoch": 0.6171107994389902,
1537
+ "grad_norm": 32.43796306320174,
1538
+ "learning_rate": 3.84647907961901e-08,
1539
+ "logits/chosen": -3.0625,
1540
+ "logits/rejected": -2.96875,
1541
+ "logps/chosen": -348.0,
1542
+ "logps/rejected": -366.0,
1543
+ "loss": 0.5131,
1544
+ "rewards/accuracies": 0.8500000238418579,
1545
+ "rewards/chosen": -0.279296875,
1546
+ "rewards/margins": 0.67578125,
1547
+ "rewards/rejected": -0.953125,
1548
+ "step": 440
1549
+ },
1550
+ {
1551
+ "epoch": 0.6171107994389902,
1552
+ "eval_logits/chosen": -3.09375,
1553
+ "eval_logits/rejected": -3.109375,
1554
+ "eval_logps/chosen": -346.0,
1555
+ "eval_logps/rejected": -360.0,
1556
+ "eval_loss": 0.520456075668335,
1557
+ "eval_rewards/accuracies": 0.792553186416626,
1558
+ "eval_rewards/chosen": -0.23828125,
1559
+ "eval_rewards/margins": 0.61328125,
1560
+ "eval_rewards/rejected": -0.85546875,
1561
+ "eval_runtime": 65.5788,
1562
+ "eval_samples_per_second": 22.675,
1563
+ "eval_steps_per_second": 0.717,
1564
+ "step": 440
1565
+ },
1566
+ {
1567
+ "epoch": 0.6283309957924264,
1568
+ "eval_logits/chosen": -3.078125,
1569
+ "eval_logits/rejected": -3.109375,
1570
+ "eval_logps/chosen": -344.0,
1571
+ "eval_logps/rejected": -360.0,
1572
+ "eval_loss": 0.5200462937355042,
1573
+ "eval_rewards/accuracies": 0.7978723645210266,
1574
+ "eval_rewards/chosen": -0.2236328125,
1575
+ "eval_rewards/margins": 0.62109375,
1576
+ "eval_rewards/rejected": -0.84765625,
1577
+ "eval_runtime": 65.4678,
1578
+ "eval_samples_per_second": 22.713,
1579
+ "eval_steps_per_second": 0.718,
1580
+ "step": 448
1581
+ },
1582
+ {
1583
+ "epoch": 0.6311360448807855,
1584
+ "grad_norm": 25.78934426006984,
1585
+ "learning_rate": 3.609516226870659e-08,
1586
+ "logits/chosen": -3.015625,
1587
+ "logits/rejected": -3.09375,
1588
+ "logps/chosen": -378.0,
1589
+ "logps/rejected": -394.0,
1590
+ "loss": 0.536,
1591
+ "rewards/accuracies": 0.8125,
1592
+ "rewards/chosen": -0.349609375,
1593
+ "rewards/margins": 0.60546875,
1594
+ "rewards/rejected": -0.953125,
1595
+ "step": 450
1596
+ },
1597
+ {
1598
+ "epoch": 0.6395511921458625,
1599
+ "eval_logits/chosen": -3.078125,
1600
+ "eval_logits/rejected": -3.109375,
1601
+ "eval_logps/chosen": -344.0,
1602
+ "eval_logps/rejected": -360.0,
1603
+ "eval_loss": 0.5196602940559387,
1604
+ "eval_rewards/accuracies": 0.7978723645210266,
1605
+ "eval_rewards/chosen": -0.22265625,
1606
+ "eval_rewards/margins": 0.625,
1607
+ "eval_rewards/rejected": -0.84765625,
1608
+ "eval_runtime": 66.1158,
1609
+ "eval_samples_per_second": 22.491,
1610
+ "eval_steps_per_second": 0.711,
1611
+ "step": 456
1612
+ },
1613
+ {
1614
+ "epoch": 0.6451612903225806,
1615
+ "grad_norm": 27.561675324577404,
1616
+ "learning_rate": 3.375892729781754e-08,
1617
+ "logits/chosen": -2.953125,
1618
+ "logits/rejected": -3.046875,
1619
+ "logps/chosen": -380.0,
1620
+ "logps/rejected": -392.0,
1621
+ "loss": 0.52,
1622
+ "rewards/accuracies": 0.800000011920929,
1623
+ "rewards/chosen": -0.2890625,
1624
+ "rewards/margins": 0.62890625,
1625
+ "rewards/rejected": -0.91796875,
1626
+ "step": 460
1627
+ },
1628
+ {
1629
+ "epoch": 0.6507713884992987,
1630
+ "eval_logits/chosen": -3.078125,
1631
+ "eval_logits/rejected": -3.109375,
1632
+ "eval_logps/chosen": -350.0,
1633
+ "eval_logps/rejected": -366.0,
1634
+ "eval_loss": 0.5189826488494873,
1635
+ "eval_rewards/accuracies": 0.7978723645210266,
1636
+ "eval_rewards/chosen": -0.287109375,
1637
+ "eval_rewards/margins": 0.62109375,
1638
+ "eval_rewards/rejected": -0.90625,
1639
+ "eval_runtime": 66.7501,
1640
+ "eval_samples_per_second": 22.277,
1641
+ "eval_steps_per_second": 0.704,
1642
+ "step": 464
1643
+ },
1644
+ {
1645
+ "epoch": 0.6591865357643759,
1646
+ "grad_norm": 28.5091931919782,
1647
+ "learning_rate": 3.146169653473842e-08,
1648
+ "logits/chosen": -2.953125,
1649
+ "logits/rejected": -3.0,
1650
+ "logps/chosen": -392.0,
1651
+ "logps/rejected": -370.0,
1652
+ "loss": 0.5136,
1653
+ "rewards/accuracies": 0.8374999761581421,
1654
+ "rewards/chosen": -0.2265625,
1655
+ "rewards/margins": 0.69921875,
1656
+ "rewards/rejected": -0.92578125,
1657
+ "step": 470
1658
+ },
1659
+ {
1660
+ "epoch": 0.6619915848527349,
1661
+ "eval_logits/chosen": -3.078125,
1662
+ "eval_logits/rejected": -3.109375,
1663
+ "eval_logps/chosen": -356.0,
1664
+ "eval_logps/rejected": -372.0,
1665
+ "eval_loss": 0.5202246904373169,
1666
+ "eval_rewards/accuracies": 0.7819148898124695,
1667
+ "eval_rewards/chosen": -0.349609375,
1668
+ "eval_rewards/margins": 0.61328125,
1669
+ "eval_rewards/rejected": -0.96484375,
1670
+ "eval_runtime": 65.2435,
1671
+ "eval_samples_per_second": 22.792,
1672
+ "eval_steps_per_second": 0.72,
1673
+ "step": 472
1674
+ },
1675
+ {
1676
+ "epoch": 0.6732117812061711,
1677
+ "grad_norm": 28.307915984199855,
1678
+ "learning_rate": 2.920898695902556e-08,
1679
+ "logits/chosen": -2.96875,
1680
+ "logits/rejected": -3.015625,
1681
+ "logps/chosen": -388.0,
1682
+ "logps/rejected": -386.0,
1683
+ "loss": 0.5402,
1684
+ "rewards/accuracies": 0.762499988079071,
1685
+ "rewards/chosen": -0.46875,
1686
+ "rewards/margins": 0.5703125,
1687
+ "rewards/rejected": -1.0390625,
1688
+ "step": 480
1689
+ },
1690
+ {
1691
+ "epoch": 0.6732117812061711,
1692
+ "eval_logits/chosen": -3.078125,
1693
+ "eval_logits/rejected": -3.109375,
1694
+ "eval_logps/chosen": -358.0,
1695
+ "eval_logps/rejected": -372.0,
1696
+ "eval_loss": 0.5193986296653748,
1697
+ "eval_rewards/accuracies": 0.7765957713127136,
1698
+ "eval_rewards/chosen": -0.357421875,
1699
+ "eval_rewards/margins": 0.6171875,
1700
+ "eval_rewards/rejected": -0.9765625,
1701
+ "eval_runtime": 65.2533,
1702
+ "eval_samples_per_second": 22.788,
1703
+ "eval_steps_per_second": 0.72,
1704
+ "step": 480
1705
+ },
1706
+ {
1707
+ "epoch": 0.6844319775596073,
1708
+ "eval_logits/chosen": -3.078125,
1709
+ "eval_logits/rejected": -3.109375,
1710
+ "eval_logps/chosen": -350.0,
1711
+ "eval_logps/rejected": -366.0,
1712
+ "eval_loss": 0.5179064869880676,
1713
+ "eval_rewards/accuracies": 0.7978723645210266,
1714
+ "eval_rewards/chosen": -0.287109375,
1715
+ "eval_rewards/margins": 0.625,
1716
+ "eval_rewards/rejected": -0.9140625,
1717
+ "eval_runtime": 65.4006,
1718
+ "eval_samples_per_second": 22.737,
1719
+ "eval_steps_per_second": 0.719,
1720
+ "step": 488
1721
+ },
1722
+ {
1723
+ "epoch": 0.6872370266479664,
1724
+ "grad_norm": 34.377948997484125,
1725
+ "learning_rate": 2.7006208629117678e-08,
1726
+ "logits/chosen": -3.046875,
1727
+ "logits/rejected": -3.125,
1728
+ "logps/chosen": -418.0,
1729
+ "logps/rejected": -362.0,
1730
+ "loss": 0.5,
1731
+ "rewards/accuracies": 0.7875000238418579,
1732
+ "rewards/chosen": -0.341796875,
1733
+ "rewards/margins": 0.66796875,
1734
+ "rewards/rejected": -1.0078125,
1735
+ "step": 490
1736
+ },
1737
+ {
1738
+ "epoch": 0.6956521739130435,
1739
+ "eval_logits/chosen": -3.078125,
1740
+ "eval_logits/rejected": -3.109375,
1741
+ "eval_logps/chosen": -348.0,
1742
+ "eval_logps/rejected": -366.0,
1743
+ "eval_loss": 0.5165696144104004,
1744
+ "eval_rewards/accuracies": 0.792553186416626,
1745
+ "eval_rewards/chosen": -0.265625,
1746
+ "eval_rewards/margins": 0.6328125,
1747
+ "eval_rewards/rejected": -0.89453125,
1748
+ "eval_runtime": 66.0312,
1749
+ "eval_samples_per_second": 22.52,
1750
+ "eval_steps_per_second": 0.712,
1751
+ "step": 496
1752
+ },
1753
+ {
1754
+ "epoch": 0.7012622720897616,
1755
+ "grad_norm": 24.33249271202181,
1756
+ "learning_rate": 2.485865168965695e-08,
1757
+ "logits/chosen": -2.96875,
1758
+ "logits/rejected": -3.09375,
1759
+ "logps/chosen": -386.0,
1760
+ "logps/rejected": -404.0,
1761
+ "loss": 0.5087,
1762
+ "rewards/accuracies": 0.824999988079071,
1763
+ "rewards/chosen": -0.25390625,
1764
+ "rewards/margins": 0.66796875,
1765
+ "rewards/rejected": -0.921875,
1766
+ "step": 500
1767
+ },
1768
+ {
1769
+ "epoch": 0.7068723702664796,
1770
+ "eval_logits/chosen": -3.078125,
1771
+ "eval_logits/rejected": -3.09375,
1772
+ "eval_logps/chosen": -348.0,
1773
+ "eval_logps/rejected": -366.0,
1774
+ "eval_loss": 0.5151299834251404,
1775
+ "eval_rewards/accuracies": 0.8031914830207825,
1776
+ "eval_rewards/chosen": -0.26171875,
1777
+ "eval_rewards/margins": 0.640625,
1778
+ "eval_rewards/rejected": -0.90234375,
1779
+ "eval_runtime": 65.7259,
1780
+ "eval_samples_per_second": 22.624,
1781
+ "eval_steps_per_second": 0.715,
1782
+ "step": 504
1783
+ },
1784
+ {
1785
+ "epoch": 0.7152875175315568,
1786
+ "grad_norm": 17.54957543271777,
1787
+ "learning_rate": 2.2771473666792496e-08,
1788
+ "logits/chosen": -2.90625,
1789
+ "logits/rejected": -3.03125,
1790
+ "logps/chosen": -380.0,
1791
+ "logps/rejected": -384.0,
1792
+ "loss": 0.5192,
1793
+ "rewards/accuracies": 0.862500011920929,
1794
+ "rewards/chosen": -0.23046875,
1795
+ "rewards/margins": 0.7109375,
1796
+ "rewards/rejected": -0.94140625,
1797
+ "step": 510
1798
+ },
1799
+ {
1800
+ "epoch": 0.7180925666199158,
1801
+ "eval_logits/chosen": -3.078125,
1802
+ "eval_logits/rejected": -3.09375,
1803
+ "eval_logps/chosen": -352.0,
1804
+ "eval_logps/rejected": -370.0,
1805
+ "eval_loss": 0.5147544145584106,
1806
+ "eval_rewards/accuracies": 0.8031914830207825,
1807
+ "eval_rewards/chosen": -0.298828125,
1808
+ "eval_rewards/margins": 0.63671875,
1809
+ "eval_rewards/rejected": -0.9375,
1810
+ "eval_runtime": 65.4306,
1811
+ "eval_samples_per_second": 22.726,
1812
+ "eval_steps_per_second": 0.718,
1813
+ "step": 512
1814
+ },
1815
+ {
1816
+ "epoch": 0.729312762973352,
1817
+ "grad_norm": 35.64036533963828,
1818
+ "learning_rate": 2.0749687081977334e-08,
1819
+ "logits/chosen": -3.03125,
1820
+ "logits/rejected": -3.0,
1821
+ "logps/chosen": -382.0,
1822
+ "logps/rejected": -378.0,
1823
+ "loss": 0.5057,
1824
+ "rewards/accuracies": 0.762499988079071,
1825
+ "rewards/chosen": -0.275390625,
1826
+ "rewards/margins": 0.71484375,
1827
+ "rewards/rejected": -0.98828125,
1828
+ "step": 520
1829
+ },
1830
+ {
1831
+ "epoch": 0.729312762973352,
1832
+ "eval_logits/chosen": -3.0625,
1833
+ "eval_logits/rejected": -3.09375,
1834
+ "eval_logps/chosen": -354.0,
1835
+ "eval_logps/rejected": -372.0,
1836
+ "eval_loss": 0.5146514177322388,
1837
+ "eval_rewards/accuracies": 0.7872340679168701,
1838
+ "eval_rewards/chosen": -0.328125,
1839
+ "eval_rewards/margins": 0.6328125,
1840
+ "eval_rewards/rejected": -0.9609375,
1841
+ "eval_runtime": 65.754,
1842
+ "eval_samples_per_second": 22.615,
1843
+ "eval_steps_per_second": 0.715,
1844
+ "step": 520
1845
+ },
1846
+ {
1847
+ "epoch": 0.7405329593267882,
1848
+ "eval_logits/chosen": -3.0625,
1849
+ "eval_logits/rejected": -3.09375,
1850
+ "eval_logps/chosen": -354.0,
1851
+ "eval_logps/rejected": -372.0,
1852
+ "eval_loss": 0.5139682292938232,
1853
+ "eval_rewards/accuracies": 0.792553186416626,
1854
+ "eval_rewards/chosen": -0.326171875,
1855
+ "eval_rewards/margins": 0.63671875,
1856
+ "eval_rewards/rejected": -0.9609375,
1857
+ "eval_runtime": 65.2389,
1858
+ "eval_samples_per_second": 22.793,
1859
+ "eval_steps_per_second": 0.72,
1860
+ "step": 528
1861
+ },
1862
+ {
1863
+ "epoch": 0.7433380084151473,
1864
+ "grad_norm": 26.831689294819764,
1865
+ "learning_rate": 1.8798147414005737e-08,
1866
+ "logits/chosen": -3.0,
1867
+ "logits/rejected": -2.875,
1868
+ "logps/chosen": -330.0,
1869
+ "logps/rejected": -372.0,
1870
+ "loss": 0.5183,
1871
+ "rewards/accuracies": 0.762499988079071,
1872
+ "rewards/chosen": -0.34765625,
1873
+ "rewards/margins": 0.5859375,
1874
+ "rewards/rejected": -0.93359375,
1875
+ "step": 530
1876
+ },
1877
+ {
1878
+ "epoch": 0.7517531556802244,
1879
+ "eval_logits/chosen": -3.0625,
1880
+ "eval_logits/rejected": -3.09375,
1881
+ "eval_logps/chosen": -352.0,
1882
+ "eval_logps/rejected": -370.0,
1883
+ "eval_loss": 0.5127372741699219,
1884
+ "eval_rewards/accuracies": 0.7978723645210266,
1885
+ "eval_rewards/chosen": -0.298828125,
1886
+ "eval_rewards/margins": 0.64453125,
1887
+ "eval_rewards/rejected": -0.9453125,
1888
+ "eval_runtime": 66.3192,
1889
+ "eval_samples_per_second": 22.422,
1890
+ "eval_steps_per_second": 0.709,
1891
+ "step": 536
1892
+ },
1893
+ {
1894
+ "epoch": 0.7573632538569425,
1895
+ "grad_norm": 46.59871665028949,
1896
+ "learning_rate": 1.692154143820063e-08,
1897
+ "logits/chosen": -3.03125,
1898
+ "logits/rejected": -2.96875,
1899
+ "logps/chosen": -348.0,
1900
+ "logps/rejected": -328.0,
1901
+ "loss": 0.5034,
1902
+ "rewards/accuracies": 0.7124999761581421,
1903
+ "rewards/chosen": -0.388671875,
1904
+ "rewards/margins": 0.515625,
1905
+ "rewards/rejected": -0.90234375,
1906
+ "step": 540
1907
+ },
1908
+ {
1909
+ "epoch": 0.7629733520336606,
1910
+ "eval_logits/chosen": -3.0625,
1911
+ "eval_logits/rejected": -3.09375,
1912
+ "eval_logps/chosen": -350.0,
1913
+ "eval_logps/rejected": -368.0,
1914
+ "eval_loss": 0.5124016404151917,
1915
+ "eval_rewards/accuracies": 0.7978723645210266,
1916
+ "eval_rewards/chosen": -0.271484375,
1917
+ "eval_rewards/margins": 0.65234375,
1918
+ "eval_rewards/rejected": -0.92578125,
1919
+ "eval_runtime": 68.1586,
1920
+ "eval_samples_per_second": 21.817,
1921
+ "eval_steps_per_second": 0.69,
1922
+ "step": 544
1923
+ },
1924
+ {
1925
+ "epoch": 0.7713884992987378,
1926
+ "grad_norm": 17.28019155088413,
1927
+ "learning_rate": 1.5124375970755755e-08,
1928
+ "logits/chosen": -3.078125,
1929
+ "logits/rejected": -3.078125,
1930
+ "logps/chosen": -356.0,
1931
+ "logps/rejected": -392.0,
1932
+ "loss": 0.5075,
1933
+ "rewards/accuracies": 0.800000011920929,
1934
+ "rewards/chosen": -0.390625,
1935
+ "rewards/margins": 0.62890625,
1936
+ "rewards/rejected": -1.015625,
1937
+ "step": 550
1938
+ },
1939
+ {
1940
+ "epoch": 0.7741935483870968,
1941
+ "eval_logits/chosen": -3.0625,
1942
+ "eval_logits/rejected": -3.09375,
1943
+ "eval_logps/chosen": -350.0,
1944
+ "eval_logps/rejected": -368.0,
1945
+ "eval_loss": 0.51242595911026,
1946
+ "eval_rewards/accuracies": 0.792553186416626,
1947
+ "eval_rewards/chosen": -0.28515625,
1948
+ "eval_rewards/margins": 0.65234375,
1949
+ "eval_rewards/rejected": -0.9375,
1950
+ "eval_runtime": 66.0347,
1951
+ "eval_samples_per_second": 22.518,
1952
+ "eval_steps_per_second": 0.712,
1953
+ "step": 552
1954
+ },
1955
+ {
1956
+ "epoch": 0.7854137447405329,
1957
+ "grad_norm": 18.119529412971673,
1958
+ "learning_rate": 1.3410967045263622e-08,
1959
+ "logits/chosen": -3.0,
1960
+ "logits/rejected": -2.90625,
1961
+ "logps/chosen": -348.0,
1962
+ "logps/rejected": -384.0,
1963
+ "loss": 0.5274,
1964
+ "rewards/accuracies": 0.800000011920929,
1965
+ "rewards/chosen": -0.29296875,
1966
+ "rewards/margins": 0.703125,
1967
+ "rewards/rejected": -0.99609375,
1968
+ "step": 560
1969
+ },
1970
+ {
1971
+ "epoch": 0.7854137447405329,
1972
+ "eval_logits/chosen": -3.0625,
1973
+ "eval_logits/rejected": -3.09375,
1974
+ "eval_logps/chosen": -354.0,
1975
+ "eval_logps/rejected": -372.0,
1976
+ "eval_loss": 0.5127307176589966,
1977
+ "eval_rewards/accuracies": 0.792553186416626,
1978
+ "eval_rewards/chosen": -0.31640625,
1979
+ "eval_rewards/margins": 0.64453125,
1980
+ "eval_rewards/rejected": -0.9609375,
1981
+ "eval_runtime": 65.7053,
1982
+ "eval_samples_per_second": 22.631,
1983
+ "eval_steps_per_second": 0.715,
1984
+ "step": 560
1985
+ },
1986
+ {
1987
+ "epoch": 0.7966339410939691,
1988
+ "eval_logits/chosen": -3.0625,
1989
+ "eval_logits/rejected": -3.09375,
1990
+ "eval_logps/chosen": -354.0,
1991
+ "eval_logps/rejected": -372.0,
1992
+ "eval_loss": 0.5127339959144592,
1993
+ "eval_rewards/accuracies": 0.7819148898124695,
1994
+ "eval_rewards/chosen": -0.3203125,
1995
+ "eval_rewards/margins": 0.64453125,
1996
+ "eval_rewards/rejected": -0.96875,
1997
+ "eval_runtime": 65.5474,
1998
+ "eval_samples_per_second": 22.686,
1999
+ "eval_steps_per_second": 0.717,
2000
+ "step": 568
2001
+ },
2002
+ {
2003
+ "epoch": 0.7994389901823282,
2004
+ "grad_norm": 28.16590781200549,
2005
+ "learning_rate": 1.1785429547422909e-08,
2006
+ "logits/chosen": -3.0,
2007
+ "logits/rejected": -3.03125,
2008
+ "logps/chosen": -318.0,
2009
+ "logps/rejected": -362.0,
2010
+ "loss": 0.5138,
2011
+ "rewards/accuracies": 0.699999988079071,
2012
+ "rewards/chosen": -0.462890625,
2013
+ "rewards/margins": 0.48828125,
2014
+ "rewards/rejected": -0.953125,
2015
+ "step": 570
2016
+ },
2017
+ {
2018
+ "epoch": 0.8078541374474053,
2019
+ "eval_logits/chosen": -3.0625,
2020
+ "eval_logits/rejected": -3.09375,
2021
+ "eval_logps/chosen": -354.0,
2022
+ "eval_logps/rejected": -372.0,
2023
+ "eval_loss": 0.5122042298316956,
2024
+ "eval_rewards/accuracies": 0.7872340679168701,
2025
+ "eval_rewards/chosen": -0.3125,
2026
+ "eval_rewards/margins": 0.6484375,
2027
+ "eval_rewards/rejected": -0.9609375,
2028
+ "eval_runtime": 65.369,
2029
+ "eval_samples_per_second": 22.748,
2030
+ "eval_steps_per_second": 0.719,
2031
+ "step": 576
2032
+ },
2033
+ {
2034
+ "epoch": 0.8134642356241234,
2035
+ "grad_norm": 20.331557503849197,
2036
+ "learning_rate": 1.0251667332818215e-08,
2037
+ "logits/chosen": -2.875,
2038
+ "logits/rejected": -2.96875,
2039
+ "logps/chosen": -364.0,
2040
+ "logps/rejected": -390.0,
2041
+ "loss": 0.5087,
2042
+ "rewards/accuracies": 0.824999988079071,
2043
+ "rewards/chosen": -0.4140625,
2044
+ "rewards/margins": 0.75390625,
2045
+ "rewards/rejected": -1.171875,
2046
+ "step": 580
2047
+ },
2048
+ {
2049
+ "epoch": 0.8190743338008415,
2050
+ "eval_logits/chosen": -3.0625,
2051
+ "eval_logits/rejected": -3.09375,
2052
+ "eval_logps/chosen": -352.0,
2053
+ "eval_logps/rejected": -370.0,
2054
+ "eval_loss": 0.5114827156066895,
2055
+ "eval_rewards/accuracies": 0.7978723645210266,
2056
+ "eval_rewards/chosen": -0.30078125,
2057
+ "eval_rewards/margins": 0.65234375,
2058
+ "eval_rewards/rejected": -0.953125,
2059
+ "eval_runtime": 65.3364,
2060
+ "eval_samples_per_second": 22.759,
2061
+ "eval_steps_per_second": 0.719,
2062
+ "step": 584
2063
+ },
2064
+ {
2065
+ "epoch": 0.8274894810659187,
2066
+ "grad_norm": 26.321821934426545,
2067
+ "learning_rate": 8.813363851505284e-09,
2068
+ "logits/chosen": -2.96875,
2069
+ "logits/rejected": -2.96875,
2070
+ "logps/chosen": -412.0,
2071
+ "logps/rejected": -410.0,
2072
+ "loss": 0.5026,
2073
+ "rewards/accuracies": 0.8500000238418579,
2074
+ "rewards/chosen": -0.46875,
2075
+ "rewards/margins": 0.55078125,
2076
+ "rewards/rejected": -1.0234375,
2077
+ "step": 590
2078
+ },
2079
+ {
2080
+ "epoch": 0.8302945301542777,
2081
+ "eval_logits/chosen": -3.0625,
2082
+ "eval_logits/rejected": -3.078125,
2083
+ "eval_logps/chosen": -352.0,
2084
+ "eval_logps/rejected": -370.0,
2085
+ "eval_loss": 0.5113134384155273,
2086
+ "eval_rewards/accuracies": 0.7978723645210266,
2087
+ "eval_rewards/chosen": -0.30078125,
2088
+ "eval_rewards/margins": 0.65234375,
2089
+ "eval_rewards/rejected": -0.953125,
2090
+ "eval_runtime": 65.7466,
2091
+ "eval_samples_per_second": 22.617,
2092
+ "eval_steps_per_second": 0.715,
2093
+ "step": 592
2094
+ },
2095
+ {
2096
+ "epoch": 0.8415147265077139,
2097
+ "grad_norm": 17.927505615509187,
2098
+ "learning_rate": 7.473973301917124e-09,
2099
+ "logits/chosen": -2.90625,
2100
+ "logits/rejected": -2.875,
2101
+ "logps/chosen": -322.0,
2102
+ "logps/rejected": -352.0,
2103
+ "loss": 0.4993,
2104
+ "rewards/accuracies": 0.887499988079071,
2105
+ "rewards/chosen": -0.26171875,
2106
+ "rewards/margins": 0.63671875,
2107
+ "rewards/rejected": -0.8984375,
2108
+ "step": 600
2109
+ },
2110
+ {
2111
+ "epoch": 0.8415147265077139,
2112
+ "eval_logits/chosen": -3.0625,
2113
+ "eval_logits/rejected": -3.078125,
2114
+ "eval_logps/chosen": -352.0,
2115
+ "eval_logps/rejected": -372.0,
2116
+ "eval_loss": 0.5112673044204712,
2117
+ "eval_rewards/accuracies": 0.7978723645210266,
2118
+ "eval_rewards/chosen": -0.3046875,
2119
+ "eval_rewards/margins": 0.65234375,
2120
+ "eval_rewards/rejected": -0.95703125,
2121
+ "eval_runtime": 65.3172,
2122
+ "eval_samples_per_second": 22.766,
2123
+ "eval_steps_per_second": 0.72,
2124
+ "step": 600
2125
+ },
2126
+ {
2127
+ "epoch": 0.85273492286115,
2128
+ "eval_logits/chosen": -3.0625,
2129
+ "eval_logits/rejected": -3.078125,
2130
+ "eval_logps/chosen": -354.0,
2131
+ "eval_logps/rejected": -372.0,
2132
+ "eval_loss": 0.5110276341438293,
2133
+ "eval_rewards/accuracies": 0.7872340679168701,
2134
+ "eval_rewards/chosen": -0.314453125,
2135
+ "eval_rewards/margins": 0.6484375,
2136
+ "eval_rewards/rejected": -0.9609375,
2137
+ "eval_runtime": 65.4306,
2138
+ "eval_samples_per_second": 22.726,
2139
+ "eval_steps_per_second": 0.718,
2140
+ "step": 608
2141
+ },
2142
+ {
2143
+ "epoch": 0.8555399719495091,
2144
+ "grad_norm": 23.917252124640086,
2145
+ "learning_rate": 6.236712335336131e-09,
2146
+ "logits/chosen": -3.0625,
2147
+ "logits/rejected": -2.90625,
2148
+ "logps/chosen": -292.0,
2149
+ "logps/rejected": -342.0,
2150
+ "loss": 0.5145,
2151
+ "rewards/accuracies": 0.7875000238418579,
2152
+ "rewards/chosen": -0.37109375,
2153
+ "rewards/margins": 0.578125,
2154
+ "rewards/rejected": -0.94921875,
2155
+ "step": 610
2156
+ },
2157
+ {
2158
+ "epoch": 0.8639551192145862,
2159
+ "eval_logits/chosen": -3.0625,
2160
+ "eval_logits/rejected": -3.078125,
2161
+ "eval_logps/chosen": -354.0,
2162
+ "eval_logps/rejected": -372.0,
2163
+ "eval_loss": 0.5110043883323669,
2164
+ "eval_rewards/accuracies": 0.7872340679168701,
2165
+ "eval_rewards/chosen": -0.31640625,
2166
+ "eval_rewards/margins": 0.6484375,
2167
+ "eval_rewards/rejected": -0.96484375,
2168
+ "eval_runtime": 65.4869,
2169
+ "eval_samples_per_second": 22.707,
2170
+ "eval_steps_per_second": 0.718,
2171
+ "step": 616
2172
+ },
2173
+ {
2174
+ "epoch": 0.8695652173913043,
2175
+ "grad_norm": 35.23685131659172,
2176
+ "learning_rate": 5.104552330854112e-09,
2177
+ "logits/chosen": -2.921875,
2178
+ "logits/rejected": -2.90625,
2179
+ "logps/chosen": -406.0,
2180
+ "logps/rejected": -434.0,
2181
+ "loss": 0.4857,
2182
+ "rewards/accuracies": 0.862500011920929,
2183
+ "rewards/chosen": -0.353515625,
2184
+ "rewards/margins": 0.76953125,
2185
+ "rewards/rejected": -1.125,
2186
+ "step": 620
2187
+ },
2188
+ {
2189
+ "epoch": 0.8751753155680224,
2190
+ "eval_logits/chosen": -3.0625,
2191
+ "eval_logits/rejected": -3.078125,
2192
+ "eval_logps/chosen": -354.0,
2193
+ "eval_logps/rejected": -372.0,
2194
+ "eval_loss": 0.5109130144119263,
2195
+ "eval_rewards/accuracies": 0.7872340679168701,
2196
+ "eval_rewards/chosen": -0.31640625,
2197
+ "eval_rewards/margins": 0.65234375,
2198
+ "eval_rewards/rejected": -0.96875,
2199
+ "eval_runtime": 65.4952,
2200
+ "eval_samples_per_second": 22.704,
2201
+ "eval_steps_per_second": 0.718,
2202
+ "step": 624
2203
+ },
2204
+ {
2205
+ "epoch": 0.8835904628330996,
2206
+ "grad_norm": 28.24616740578334,
2207
+ "learning_rate": 4.080212259372711e-09,
2208
+ "logits/chosen": -3.0625,
2209
+ "logits/rejected": -3.0625,
2210
+ "logps/chosen": -334.0,
2211
+ "logps/rejected": -356.0,
2212
+ "loss": 0.5163,
2213
+ "rewards/accuracies": 0.887499988079071,
2214
+ "rewards/chosen": -0.33984375,
2215
+ "rewards/margins": 0.73046875,
2216
+ "rewards/rejected": -1.0703125,
2217
+ "step": 630
2218
+ },
2219
+ {
2220
+ "epoch": 0.8863955119214586,
2221
+ "eval_logits/chosen": -3.0625,
2222
+ "eval_logits/rejected": -3.078125,
2223
+ "eval_logps/chosen": -354.0,
2224
+ "eval_logps/rejected": -372.0,
2225
+ "eval_loss": 0.5109508037567139,
2226
+ "eval_rewards/accuracies": 0.7872340679168701,
2227
+ "eval_rewards/chosen": -0.3203125,
2228
+ "eval_rewards/margins": 0.6484375,
2229
+ "eval_rewards/rejected": -0.96875,
2230
+ "eval_runtime": 65.7135,
2231
+ "eval_samples_per_second": 22.629,
2232
+ "eval_steps_per_second": 0.715,
2233
+ "step": 632
2234
+ },
2235
+ {
2236
+ "epoch": 0.8976157082748948,
2237
+ "grad_norm": 23.74887530831686,
2238
+ "learning_rate": 3.1661521537819257e-09,
2239
+ "logits/chosen": -3.03125,
2240
+ "logits/rejected": -2.96875,
2241
+ "logps/chosen": -384.0,
2242
+ "logps/rejected": -402.0,
2243
+ "loss": 0.5258,
2244
+ "rewards/accuracies": 0.8374999761581421,
2245
+ "rewards/chosen": -0.34375,
2246
+ "rewards/margins": 0.73828125,
2247
+ "rewards/rejected": -1.0859375,
2248
+ "step": 640
2249
+ },
2250
+ {
2251
+ "epoch": 0.8976157082748948,
2252
+ "eval_logits/chosen": -3.0625,
2253
+ "eval_logits/rejected": -3.078125,
2254
+ "eval_logps/chosen": -354.0,
2255
+ "eval_logps/rejected": -372.0,
2256
+ "eval_loss": 0.5108342170715332,
2257
+ "eval_rewards/accuracies": 0.7819148898124695,
2258
+ "eval_rewards/chosen": -0.32421875,
2259
+ "eval_rewards/margins": 0.65234375,
2260
+ "eval_rewards/rejected": -0.9765625,
2261
+ "eval_runtime": 65.4352,
2262
+ "eval_samples_per_second": 22.725,
2263
+ "eval_steps_per_second": 0.718,
2264
+ "step": 640
2265
+ },
2266
+ {
2267
+ "epoch": 0.908835904628331,
2268
+ "eval_logits/chosen": -3.0625,
2269
+ "eval_logits/rejected": -3.078125,
2270
+ "eval_logps/chosen": -354.0,
2271
+ "eval_logps/rejected": -374.0,
2272
+ "eval_loss": 0.510999321937561,
2273
+ "eval_rewards/accuracies": 0.7765957713127136,
2274
+ "eval_rewards/chosen": -0.328125,
2275
+ "eval_rewards/margins": 0.65234375,
2276
+ "eval_rewards/rejected": -0.98046875,
2277
+ "eval_runtime": 66.8594,
2278
+ "eval_samples_per_second": 22.241,
2279
+ "eval_steps_per_second": 0.703,
2280
+ "step": 648
2281
+ },
2282
+ {
2283
+ "epoch": 0.9116409537166901,
2284
+ "grad_norm": 23.848849763801983,
2285
+ "learning_rate": 2.3645672009984684e-09,
2286
+ "logits/chosen": -2.90625,
2287
+ "logits/rejected": -2.859375,
2288
+ "logps/chosen": -334.0,
2289
+ "logps/rejected": -372.0,
2290
+ "loss": 0.5206,
2291
+ "rewards/accuracies": 0.8500000238418579,
2292
+ "rewards/chosen": -0.4765625,
2293
+ "rewards/margins": 0.56640625,
2294
+ "rewards/rejected": -1.046875,
2295
+ "step": 650
2296
+ },
2297
+ {
2298
+ "epoch": 0.9200561009817672,
2299
+ "eval_logits/chosen": -3.0625,
2300
+ "eval_logits/rejected": -3.078125,
2301
+ "eval_logps/chosen": -354.0,
2302
+ "eval_logps/rejected": -374.0,
2303
+ "eval_loss": 0.5108804702758789,
2304
+ "eval_rewards/accuracies": 0.7765957713127136,
2305
+ "eval_rewards/chosen": -0.326171875,
2306
+ "eval_rewards/margins": 0.65234375,
2307
+ "eval_rewards/rejected": -0.98046875,
2308
+ "eval_runtime": 66.985,
2309
+ "eval_samples_per_second": 22.199,
2310
+ "eval_steps_per_second": 0.702,
2311
+ "step": 656
2312
+ },
2313
+ {
2314
+ "epoch": 0.9256661991584852,
2315
+ "grad_norm": 20.841266069644966,
2316
+ "learning_rate": 1.677382470052513e-09,
2317
+ "logits/chosen": -3.0,
2318
+ "logits/rejected": -2.984375,
2319
+ "logps/chosen": -412.0,
2320
+ "logps/rejected": -372.0,
2321
+ "loss": 0.4926,
2322
+ "rewards/accuracies": 0.7875000238418579,
2323
+ "rewards/chosen": -0.43359375,
2324
+ "rewards/margins": 0.6875,
2325
+ "rewards/rejected": -1.125,
2326
+ "step": 660
2327
+ },
2328
+ {
2329
+ "epoch": 0.9312762973352033,
2330
+ "eval_logits/chosen": -3.0625,
2331
+ "eval_logits/rejected": -3.078125,
2332
+ "eval_logps/chosen": -354.0,
2333
+ "eval_logps/rejected": -374.0,
2334
+ "eval_loss": 0.5108736753463745,
2335
+ "eval_rewards/accuracies": 0.7819148898124695,
2336
+ "eval_rewards/chosen": -0.328125,
2337
+ "eval_rewards/margins": 0.65234375,
2338
+ "eval_rewards/rejected": -0.98046875,
2339
+ "eval_runtime": 66.0163,
2340
+ "eval_samples_per_second": 22.525,
2341
+ "eval_steps_per_second": 0.712,
2342
+ "step": 664
2343
+ },
2344
+ {
2345
+ "epoch": 0.9396914446002805,
2346
+ "grad_norm": 42.65600681696292,
2347
+ "learning_rate": 1.1062482888836656e-09,
2348
+ "logits/chosen": -3.0625,
2349
+ "logits/rejected": -2.953125,
2350
+ "logps/chosen": -372.0,
2351
+ "logps/rejected": -374.0,
2352
+ "loss": 0.5,
2353
+ "rewards/accuracies": 0.875,
2354
+ "rewards/chosen": -0.349609375,
2355
+ "rewards/margins": 0.8828125,
2356
+ "rewards/rejected": -1.234375,
2357
+ "step": 670
2358
+ },
2359
+ {
2360
+ "epoch": 0.9424964936886395,
2361
+ "eval_logits/chosen": -3.0625,
2362
+ "eval_logits/rejected": -3.078125,
2363
+ "eval_logps/chosen": -354.0,
2364
+ "eval_logps/rejected": -374.0,
2365
+ "eval_loss": 0.5107899904251099,
2366
+ "eval_rewards/accuracies": 0.7819148898124695,
2367
+ "eval_rewards/chosen": -0.326171875,
2368
+ "eval_rewards/margins": 0.65625,
2369
+ "eval_rewards/rejected": -0.984375,
2370
+ "eval_runtime": 65.1766,
2371
+ "eval_samples_per_second": 22.815,
2372
+ "eval_steps_per_second": 0.721,
2373
+ "step": 672
2374
+ },
2375
+ {
2376
+ "epoch": 0.9537166900420757,
2377
+ "grad_norm": 22.14348589619,
2378
+ "learning_rate": 6.525362809492008e-10,
2379
+ "logits/chosen": -2.90625,
2380
+ "logits/rejected": -3.078125,
2381
+ "logps/chosen": -350.0,
2382
+ "logps/rejected": -390.0,
2383
+ "loss": 0.5101,
2384
+ "rewards/accuracies": 0.8125,
2385
+ "rewards/chosen": -0.26953125,
2386
+ "rewards/margins": 0.76171875,
2387
+ "rewards/rejected": -1.03125,
2388
+ "step": 680
2389
+ },
2390
+ {
2391
+ "epoch": 0.9537166900420757,
2392
+ "eval_logits/chosen": -3.0625,
2393
+ "eval_logits/rejected": -3.078125,
2394
+ "eval_logps/chosen": -354.0,
2395
+ "eval_logps/rejected": -372.0,
2396
+ "eval_loss": 0.5109755992889404,
2397
+ "eval_rewards/accuracies": 0.7872340679168701,
2398
+ "eval_rewards/chosen": -0.32421875,
2399
+ "eval_rewards/margins": 0.65234375,
2400
+ "eval_rewards/rejected": -0.9765625,
2401
+ "eval_runtime": 65.4357,
2402
+ "eval_samples_per_second": 22.725,
2403
+ "eval_steps_per_second": 0.718,
2404
+ "step": 680
2405
+ },
2406
+ {
2407
+ "epoch": 0.9649368863955119,
2408
+ "eval_logits/chosen": -3.0625,
2409
+ "eval_logits/rejected": -3.078125,
2410
+ "eval_logps/chosen": -354.0,
2411
+ "eval_logps/rejected": -374.0,
2412
+ "eval_loss": 0.510669469833374,
2413
+ "eval_rewards/accuracies": 0.7819148898124695,
2414
+ "eval_rewards/chosen": -0.32421875,
2415
+ "eval_rewards/margins": 0.65234375,
2416
+ "eval_rewards/rejected": -0.9765625,
2417
+ "eval_runtime": 65.5907,
2418
+ "eval_samples_per_second": 22.671,
2419
+ "eval_steps_per_second": 0.717,
2420
+ "step": 688
2421
+ },
2422
+ {
2423
+ "epoch": 0.967741935483871,
2424
+ "grad_norm": 18.79687876463209,
2425
+ "learning_rate": 3.173360711629325e-10,
2426
+ "logits/chosen": -2.921875,
2427
+ "logits/rejected": -3.0,
2428
+ "logps/chosen": -284.0,
2429
+ "logps/rejected": -380.0,
2430
+ "loss": 0.4995,
2431
+ "rewards/accuracies": 0.824999988079071,
2432
+ "rewards/chosen": -0.357421875,
2433
+ "rewards/margins": 0.64453125,
2434
+ "rewards/rejected": -1.0,
2435
+ "step": 690
2436
+ },
2437
+ {
2438
+ "epoch": 0.9761570827489481,
2439
+ "eval_logits/chosen": -3.0625,
2440
+ "eval_logits/rejected": -3.078125,
2441
+ "eval_logps/chosen": -354.0,
2442
+ "eval_logps/rejected": -372.0,
2443
+ "eval_loss": 0.5105814337730408,
2444
+ "eval_rewards/accuracies": 0.7872340679168701,
2445
+ "eval_rewards/chosen": -0.32421875,
2446
+ "eval_rewards/margins": 0.65234375,
2447
+ "eval_rewards/rejected": -0.9765625,
2448
+ "eval_runtime": 65.6777,
2449
+ "eval_samples_per_second": 22.641,
2450
+ "eval_steps_per_second": 0.716,
2451
+ "step": 696
2452
+ },
2453
+ {
2454
+ "epoch": 0.9817671809256662,
2455
+ "grad_norm": 27.90140864474871,
2456
+ "learning_rate": 1.014526690756634e-10,
2457
+ "logits/chosen": -3.046875,
2458
+ "logits/rejected": -3.09375,
2459
+ "logps/chosen": -368.0,
2460
+ "logps/rejected": -396.0,
2461
+ "loss": 0.5048,
2462
+ "rewards/accuracies": 0.8500000238418579,
2463
+ "rewards/chosen": -0.341796875,
2464
+ "rewards/margins": 0.859375,
2465
+ "rewards/rejected": -1.203125,
2466
+ "step": 700
2467
+ },
2468
+ {
2469
+ "epoch": 0.9873772791023843,
2470
+ "eval_logits/chosen": -3.0625,
2471
+ "eval_logits/rejected": -3.078125,
2472
+ "eval_logps/chosen": -354.0,
2473
+ "eval_logps/rejected": -372.0,
2474
+ "eval_loss": 0.5105742812156677,
2475
+ "eval_rewards/accuracies": 0.7872340679168701,
2476
+ "eval_rewards/chosen": -0.322265625,
2477
+ "eval_rewards/margins": 0.65625,
2478
+ "eval_rewards/rejected": -0.9765625,
2479
+ "eval_runtime": 64.9937,
2480
+ "eval_samples_per_second": 22.879,
2481
+ "eval_steps_per_second": 0.723,
2482
+ "step": 704
2483
+ },
2484
+ {
2485
+ "epoch": 0.9957924263674615,
2486
+ "grad_norm": 25.796305051934237,
2487
+ "learning_rate": 5.404535581693403e-12,
2488
+ "logits/chosen": -3.015625,
2489
+ "logits/rejected": -2.953125,
2490
+ "logps/chosen": -370.0,
2491
+ "logps/rejected": -408.0,
2492
+ "loss": 0.5149,
2493
+ "rewards/accuracies": 0.800000011920929,
2494
+ "rewards/chosen": -0.392578125,
2495
+ "rewards/margins": 0.63671875,
2496
+ "rewards/rejected": -1.03125,
2497
+ "step": 710
2498
+ },
2499
+ {
2500
+ "epoch": 0.9985974754558204,
2501
+ "eval_logits/chosen": -3.0625,
2502
+ "eval_logits/rejected": -3.078125,
2503
+ "eval_logps/chosen": -354.0,
2504
+ "eval_logps/rejected": -374.0,
2505
+ "eval_loss": 0.5107670426368713,
2506
+ "eval_rewards/accuracies": 0.7872340679168701,
2507
+ "eval_rewards/chosen": -0.32421875,
2508
+ "eval_rewards/margins": 0.65234375,
2509
+ "eval_rewards/rejected": -0.9765625,
2510
+ "eval_runtime": 65.9108,
2511
+ "eval_samples_per_second": 22.561,
2512
+ "eval_steps_per_second": 0.713,
2513
+ "step": 712
2514
+ },
2515
+ {
2516
+ "epoch": 1.0,
2517
+ "step": 713,
2518
+ "total_flos": 0.0,
2519
+ "train_loss": 0.5638467967928208,
2520
+ "train_runtime": 32082.6297,
2521
+ "train_samples_per_second": 1.422,
2522
+ "train_steps_per_second": 0.022
2523
+ }
2524
+ ],
2525
+ "logging_steps": 10,
2526
+ "max_steps": 713,
2527
+ "num_input_tokens_seen": 0,
2528
+ "num_train_epochs": 1,
2529
+ "save_steps": 8,
2530
+ "stateful_callbacks": {
2531
+ "TrainerControl": {
2532
+ "args": {
2533
+ "should_epoch_stop": false,
2534
+ "should_evaluate": false,
2535
+ "should_log": false,
2536
+ "should_save": true,
2537
+ "should_training_stop": true
2538
+ },
2539
+ "attributes": {}
2540
+ }
2541
+ },
2542
+ "total_flos": 0.0,
2543
+ "train_batch_size": 4,
2544
+ "trial_name": null,
2545
+ "trial_params": null
2546
+ }