AmberYifan commited on
Commit
6d9358d
·
verified ·
1 Parent(s): b4ea47a

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/global_step2496/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03d4407874468a31609f6ecbbe75bf8700a11c1b5018c0ef16daafbd696656aa
3
+ size 30462473157
last-checkpoint/global_step2496/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ccbc947add30ccdbea986c88513b17ecf225fdef4e5a4052ab5ccd4eb0377c8
3
+ size 30462473157
last-checkpoint/global_step2496/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a307b23c89338775b5aade32ea4fbde1f98601d778b1dedbd65a5087360d9f22
3
+ size 168021
last-checkpoint/global_step2496/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:672f5f5347aba43a2f3bd4f1def683a3231d360da3149605ba803dc97cb2d03e
3
+ size 168021
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step1248
 
1
+ global_step2496
last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02a87a068580e17b63efa87796e3cc03c3c80312c9971d47fd2fd4f6d5582a43
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fc827e0e29fe68064287084a789c280ee07071edfaf802494bfd45a4328d9b4
3
  size 4877660776
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c475f356780c768e46eacf970415ab7cc46714ef3732edce1110907dd8f21a3
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17eff878abe01ff745e70abc42b2798b23c5cca5a165c37c10a14930bca0d0c6
3
  size 4932751008
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a82ea2f8d60eb99e2c2e7b52258f8ad9af5e81e7251a460b3067da29fd499b58
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:614d95265342337ab7854c0fec6ee9677c78a2d9c93f081a8c779cc5a910acd8
3
  size 4330865200
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f1f53ced200b2511a937c79d9d4878ea6c7d89792cc4144436543f708ee49f1
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a018f26b180a313b65824cdc68761f2b6f1700c2d360dfc69b6300fcb5e461b
3
  size 1089994880
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b580656286e8a6f334aced7bdb46499a54f3bb95644a0167405da037afbd894d
3
  size 14768
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9698021f2d84167912e7be6ba48d3d2b8d6b20894f23319f36df078c03b33a64
3
  size 14768
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a763d1d109f11374f3725ac97283433a5c2264a51fd11d55a5af0441e79bbe2c
3
  size 14768
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a140d1d010220b1679bf6e519f8d3d518cb57331e0e7fb30008dc00e427811
3
  size 14768
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f509e07aeb2d18a9542d77802086a220c855eaadfa7372ba3c450b3c079e1739
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76861627b7d29ad5e6036c31e8897f76435dd322d24f5d6f9e3f8afd8fab8ced
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 1248,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1898,6 +1898,1897 @@
1898
  "eval_samples_per_second": 7.837,
1899
  "eval_steps_per_second": 0.985,
1900
  "step": 1248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1901
  }
1902
  ],
1903
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
  "eval_steps": 500,
6
+ "global_step": 2496,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1898
  "eval_samples_per_second": 7.837,
1899
  "eval_steps_per_second": 0.985,
1900
  "step": 1248
1901
+ },
1902
+ {
1903
+ "epoch": 1.001602564102564,
1904
+ "grad_norm": 1.5097157370383436e-06,
1905
+ "learning_rate": 3.7013950727218754e-07,
1906
+ "logits/chosen": 0.90234375,
1907
+ "logits/rejected": 1.2890625,
1908
+ "logps/chosen": -149.0,
1909
+ "logps/rejected": -280.0,
1910
+ "loss": 0.0,
1911
+ "rewards/accuracies": 1.0,
1912
+ "rewards/chosen": 3.4375,
1913
+ "rewards/margins": 19.375,
1914
+ "rewards/rejected": -15.875,
1915
+ "step": 1250
1916
+ },
1917
+ {
1918
+ "epoch": 1.0096153846153846,
1919
+ "grad_norm": 6.567245720525397e-06,
1920
+ "learning_rate": 3.6865538735529826e-07,
1921
+ "logits/chosen": 0.8359375,
1922
+ "logits/rejected": 1.046875,
1923
+ "logps/chosen": -135.0,
1924
+ "logps/rejected": -284.0,
1925
+ "loss": 0.0,
1926
+ "rewards/accuracies": 1.0,
1927
+ "rewards/chosen": 3.640625,
1928
+ "rewards/margins": 18.5,
1929
+ "rewards/rejected": -14.875,
1930
+ "step": 1260
1931
+ },
1932
+ {
1933
+ "epoch": 1.017628205128205,
1934
+ "grad_norm": 2.7660339460230295e-07,
1935
+ "learning_rate": 3.6717126743840897e-07,
1936
+ "logits/chosen": 0.75390625,
1937
+ "logits/rejected": 1.203125,
1938
+ "logps/chosen": -176.0,
1939
+ "logps/rejected": -296.0,
1940
+ "loss": 0.0,
1941
+ "rewards/accuracies": 1.0,
1942
+ "rewards/chosen": 3.71875,
1943
+ "rewards/margins": 19.25,
1944
+ "rewards/rejected": -15.5,
1945
+ "step": 1270
1946
+ },
1947
+ {
1948
+ "epoch": 1.0256410256410255,
1949
+ "grad_norm": 0.00038016083445366024,
1950
+ "learning_rate": 3.6568714752151974e-07,
1951
+ "logits/chosen": 0.90625,
1952
+ "logits/rejected": 1.203125,
1953
+ "logps/chosen": -158.0,
1954
+ "logps/rejected": -302.0,
1955
+ "loss": 0.0,
1956
+ "rewards/accuracies": 1.0,
1957
+ "rewards/chosen": 3.53125,
1958
+ "rewards/margins": 19.0,
1959
+ "rewards/rejected": -15.5,
1960
+ "step": 1280
1961
+ },
1962
+ {
1963
+ "epoch": 1.0336538461538463,
1964
+ "grad_norm": 1.642467149992957e-05,
1965
+ "learning_rate": 3.6420302760463045e-07,
1966
+ "logits/chosen": 0.79296875,
1967
+ "logits/rejected": 1.171875,
1968
+ "logps/chosen": -163.0,
1969
+ "logps/rejected": -292.0,
1970
+ "loss": 0.0,
1971
+ "rewards/accuracies": 1.0,
1972
+ "rewards/chosen": 3.46875,
1973
+ "rewards/margins": 19.375,
1974
+ "rewards/rejected": -15.9375,
1975
+ "step": 1290
1976
+ },
1977
+ {
1978
+ "epoch": 1.0416666666666667,
1979
+ "grad_norm": 1.456498508279097e-05,
1980
+ "learning_rate": 3.6271890768774116e-07,
1981
+ "logits/chosen": 0.83984375,
1982
+ "logits/rejected": 1.296875,
1983
+ "logps/chosen": -138.0,
1984
+ "logps/rejected": -276.0,
1985
+ "loss": 0.0,
1986
+ "rewards/accuracies": 1.0,
1987
+ "rewards/chosen": 3.390625,
1988
+ "rewards/margins": 18.875,
1989
+ "rewards/rejected": -15.4375,
1990
+ "step": 1300
1991
+ },
1992
+ {
1993
+ "epoch": 1.0496794871794872,
1994
+ "grad_norm": 0.2184132961337863,
1995
+ "learning_rate": 3.612347877708519e-07,
1996
+ "logits/chosen": 0.796875,
1997
+ "logits/rejected": 1.1015625,
1998
+ "logps/chosen": -126.0,
1999
+ "logps/rejected": -302.0,
2000
+ "loss": 0.0013,
2001
+ "rewards/accuracies": 1.0,
2002
+ "rewards/chosen": 3.59375,
2003
+ "rewards/margins": 20.25,
2004
+ "rewards/rejected": -16.625,
2005
+ "step": 1310
2006
+ },
2007
+ {
2008
+ "epoch": 1.0576923076923077,
2009
+ "grad_norm": 0.00027060412424146214,
2010
+ "learning_rate": 3.597506678539626e-07,
2011
+ "logits/chosen": 0.78125,
2012
+ "logits/rejected": 1.4375,
2013
+ "logps/chosen": -162.0,
2014
+ "logps/rejected": -294.0,
2015
+ "loss": 0.0,
2016
+ "rewards/accuracies": 1.0,
2017
+ "rewards/chosen": 3.78125,
2018
+ "rewards/margins": 19.0,
2019
+ "rewards/rejected": -15.1875,
2020
+ "step": 1320
2021
+ },
2022
+ {
2023
+ "epoch": 1.0657051282051282,
2024
+ "grad_norm": 0.0005949930902462416,
2025
+ "learning_rate": 3.582665479370733e-07,
2026
+ "logits/chosen": 1.015625,
2027
+ "logits/rejected": 1.1640625,
2028
+ "logps/chosen": -133.0,
2029
+ "logps/rejected": -280.0,
2030
+ "loss": 0.0001,
2031
+ "rewards/accuracies": 1.0,
2032
+ "rewards/chosen": 3.765625,
2033
+ "rewards/margins": 18.625,
2034
+ "rewards/rejected": -14.8125,
2035
+ "step": 1330
2036
+ },
2037
+ {
2038
+ "epoch": 1.0737179487179487,
2039
+ "grad_norm": 3.821536446255942e-05,
2040
+ "learning_rate": 3.5678242802018396e-07,
2041
+ "logits/chosen": 0.98828125,
2042
+ "logits/rejected": 1.0234375,
2043
+ "logps/chosen": -169.0,
2044
+ "logps/rejected": -284.0,
2045
+ "loss": 0.0,
2046
+ "rewards/accuracies": 1.0,
2047
+ "rewards/chosen": 3.8125,
2048
+ "rewards/margins": 19.0,
2049
+ "rewards/rejected": -15.25,
2050
+ "step": 1340
2051
+ },
2052
+ {
2053
+ "epoch": 1.0817307692307692,
2054
+ "grad_norm": 0.0003595307379009215,
2055
+ "learning_rate": 3.5529830810329473e-07,
2056
+ "logits/chosen": 0.703125,
2057
+ "logits/rejected": 1.109375,
2058
+ "logps/chosen": -163.0,
2059
+ "logps/rejected": -282.0,
2060
+ "loss": 0.0,
2061
+ "rewards/accuracies": 1.0,
2062
+ "rewards/chosen": 3.984375,
2063
+ "rewards/margins": 18.875,
2064
+ "rewards/rejected": -14.9375,
2065
+ "step": 1350
2066
+ },
2067
+ {
2068
+ "epoch": 1.0897435897435896,
2069
+ "grad_norm": 0.21495162145805216,
2070
+ "learning_rate": 3.5381418818640544e-07,
2071
+ "logits/chosen": 0.828125,
2072
+ "logits/rejected": 1.1796875,
2073
+ "logps/chosen": -150.0,
2074
+ "logps/rejected": -288.0,
2075
+ "loss": 0.0192,
2076
+ "rewards/accuracies": 1.0,
2077
+ "rewards/chosen": 3.421875,
2078
+ "rewards/margins": 19.625,
2079
+ "rewards/rejected": -16.25,
2080
+ "step": 1360
2081
+ },
2082
+ {
2083
+ "epoch": 1.0977564102564104,
2084
+ "grad_norm": 0.00022987129229484713,
2085
+ "learning_rate": 3.5233006826951616e-07,
2086
+ "logits/chosen": 1.0390625,
2087
+ "logits/rejected": 0.9921875,
2088
+ "logps/chosen": -118.0,
2089
+ "logps/rejected": -298.0,
2090
+ "loss": 0.0,
2091
+ "rewards/accuracies": 1.0,
2092
+ "rewards/chosen": 3.421875,
2093
+ "rewards/margins": 20.0,
2094
+ "rewards/rejected": -16.5,
2095
+ "step": 1370
2096
+ },
2097
+ {
2098
+ "epoch": 1.1057692307692308,
2099
+ "grad_norm": 0.0001366686827476735,
2100
+ "learning_rate": 3.5084594835262687e-07,
2101
+ "logits/chosen": 0.80078125,
2102
+ "logits/rejected": 0.98046875,
2103
+ "logps/chosen": -125.0,
2104
+ "logps/rejected": -284.0,
2105
+ "loss": 0.0,
2106
+ "rewards/accuracies": 1.0,
2107
+ "rewards/chosen": 3.546875,
2108
+ "rewards/margins": 19.875,
2109
+ "rewards/rejected": -16.375,
2110
+ "step": 1380
2111
+ },
2112
+ {
2113
+ "epoch": 1.1137820512820513,
2114
+ "grad_norm": 1.2520041285180355e-05,
2115
+ "learning_rate": 3.493618284357376e-07,
2116
+ "logits/chosen": 0.90234375,
2117
+ "logits/rejected": 1.3984375,
2118
+ "logps/chosen": -168.0,
2119
+ "logps/rejected": -312.0,
2120
+ "loss": 0.0,
2121
+ "rewards/accuracies": 1.0,
2122
+ "rewards/chosen": 3.4375,
2123
+ "rewards/margins": 19.125,
2124
+ "rewards/rejected": -15.75,
2125
+ "step": 1390
2126
+ },
2127
+ {
2128
+ "epoch": 1.1217948717948718,
2129
+ "grad_norm": 0.002926505425137915,
2130
+ "learning_rate": 3.478777085188483e-07,
2131
+ "logits/chosen": 0.578125,
2132
+ "logits/rejected": 0.921875,
2133
+ "logps/chosen": -153.0,
2134
+ "logps/rejected": -298.0,
2135
+ "loss": 0.0,
2136
+ "rewards/accuracies": 1.0,
2137
+ "rewards/chosen": 3.84375,
2138
+ "rewards/margins": 20.0,
2139
+ "rewards/rejected": -16.125,
2140
+ "step": 1400
2141
+ },
2142
+ {
2143
+ "epoch": 1.1298076923076923,
2144
+ "grad_norm": 0.01017669683918559,
2145
+ "learning_rate": 3.46393588601959e-07,
2146
+ "logits/chosen": 0.94921875,
2147
+ "logits/rejected": 1.125,
2148
+ "logps/chosen": -126.0,
2149
+ "logps/rejected": -308.0,
2150
+ "loss": 0.0,
2151
+ "rewards/accuracies": 1.0,
2152
+ "rewards/chosen": 3.46875,
2153
+ "rewards/margins": 21.125,
2154
+ "rewards/rejected": -17.625,
2155
+ "step": 1410
2156
+ },
2157
+ {
2158
+ "epoch": 1.1378205128205128,
2159
+ "grad_norm": 4.668022191357913e-06,
2160
+ "learning_rate": 3.449094686850698e-07,
2161
+ "logits/chosen": 0.8984375,
2162
+ "logits/rejected": 1.3984375,
2163
+ "logps/chosen": -155.0,
2164
+ "logps/rejected": -314.0,
2165
+ "loss": 0.0,
2166
+ "rewards/accuracies": 1.0,
2167
+ "rewards/chosen": 3.984375,
2168
+ "rewards/margins": 20.75,
2169
+ "rewards/rejected": -16.75,
2170
+ "step": 1420
2171
+ },
2172
+ {
2173
+ "epoch": 1.1458333333333333,
2174
+ "grad_norm": 4.630494393827752e-05,
2175
+ "learning_rate": 3.434253487681805e-07,
2176
+ "logits/chosen": 0.671875,
2177
+ "logits/rejected": 0.94140625,
2178
+ "logps/chosen": -177.0,
2179
+ "logps/rejected": -296.0,
2180
+ "loss": 0.0,
2181
+ "rewards/accuracies": 1.0,
2182
+ "rewards/chosen": 3.53125,
2183
+ "rewards/margins": 19.375,
2184
+ "rewards/rejected": -15.8125,
2185
+ "step": 1430
2186
+ },
2187
+ {
2188
+ "epoch": 1.1538461538461537,
2189
+ "grad_norm": 0.0006577813730898143,
2190
+ "learning_rate": 3.4194122885129115e-07,
2191
+ "logits/chosen": 0.73828125,
2192
+ "logits/rejected": 0.94921875,
2193
+ "logps/chosen": -180.0,
2194
+ "logps/rejected": -282.0,
2195
+ "loss": 0.0,
2196
+ "rewards/accuracies": 1.0,
2197
+ "rewards/chosen": 3.6875,
2198
+ "rewards/margins": 18.125,
2199
+ "rewards/rejected": -14.4375,
2200
+ "step": 1440
2201
+ },
2202
+ {
2203
+ "epoch": 1.1618589743589745,
2204
+ "grad_norm": 0.0004865147564756555,
2205
+ "learning_rate": 3.4045710893440187e-07,
2206
+ "logits/chosen": 0.66796875,
2207
+ "logits/rejected": 1.1640625,
2208
+ "logps/chosen": -170.0,
2209
+ "logps/rejected": -304.0,
2210
+ "loss": 0.0,
2211
+ "rewards/accuracies": 1.0,
2212
+ "rewards/chosen": 3.65625,
2213
+ "rewards/margins": 20.75,
2214
+ "rewards/rejected": -17.125,
2215
+ "step": 1450
2216
+ },
2217
+ {
2218
+ "epoch": 1.169871794871795,
2219
+ "grad_norm": 8.243889755055517e-05,
2220
+ "learning_rate": 3.389729890175126e-07,
2221
+ "logits/chosen": 0.73828125,
2222
+ "logits/rejected": 1.1171875,
2223
+ "logps/chosen": -174.0,
2224
+ "logps/rejected": -300.0,
2225
+ "loss": 0.0,
2226
+ "rewards/accuracies": 1.0,
2227
+ "rewards/chosen": 4.03125,
2228
+ "rewards/margins": 20.25,
2229
+ "rewards/rejected": -16.25,
2230
+ "step": 1460
2231
+ },
2232
+ {
2233
+ "epoch": 1.1778846153846154,
2234
+ "grad_norm": 0.055297406196161324,
2235
+ "learning_rate": 3.374888691006233e-07,
2236
+ "logits/chosen": 0.68359375,
2237
+ "logits/rejected": 1.0078125,
2238
+ "logps/chosen": -180.0,
2239
+ "logps/rejected": -292.0,
2240
+ "loss": 0.0001,
2241
+ "rewards/accuracies": 1.0,
2242
+ "rewards/chosen": 3.9375,
2243
+ "rewards/margins": 19.875,
2244
+ "rewards/rejected": -15.9375,
2245
+ "step": 1470
2246
+ },
2247
+ {
2248
+ "epoch": 1.185897435897436,
2249
+ "grad_norm": 3.940056875422956e-06,
2250
+ "learning_rate": 3.36004749183734e-07,
2251
+ "logits/chosen": 0.7734375,
2252
+ "logits/rejected": 1.1875,
2253
+ "logps/chosen": -187.0,
2254
+ "logps/rejected": -282.0,
2255
+ "loss": 0.0,
2256
+ "rewards/accuracies": 1.0,
2257
+ "rewards/chosen": 3.8125,
2258
+ "rewards/margins": 19.0,
2259
+ "rewards/rejected": -15.1875,
2260
+ "step": 1480
2261
+ },
2262
+ {
2263
+ "epoch": 1.1939102564102564,
2264
+ "grad_norm": 0.02924640642898095,
2265
+ "learning_rate": 3.3452062926684477e-07,
2266
+ "logits/chosen": 0.77734375,
2267
+ "logits/rejected": 0.87890625,
2268
+ "logps/chosen": -151.0,
2269
+ "logps/rejected": -294.0,
2270
+ "loss": 0.0,
2271
+ "rewards/accuracies": 1.0,
2272
+ "rewards/chosen": 3.359375,
2273
+ "rewards/margins": 19.625,
2274
+ "rewards/rejected": -16.25,
2275
+ "step": 1490
2276
+ },
2277
+ {
2278
+ "epoch": 1.2019230769230769,
2279
+ "grad_norm": 0.003264557976511741,
2280
+ "learning_rate": 3.330365093499555e-07,
2281
+ "logits/chosen": 0.7109375,
2282
+ "logits/rejected": 0.8828125,
2283
+ "logps/chosen": -181.0,
2284
+ "logps/rejected": -280.0,
2285
+ "loss": 0.0,
2286
+ "rewards/accuracies": 1.0,
2287
+ "rewards/chosen": 3.96875,
2288
+ "rewards/margins": 18.75,
2289
+ "rewards/rejected": -14.75,
2290
+ "step": 1500
2291
+ },
2292
+ {
2293
+ "epoch": 1.2099358974358974,
2294
+ "grad_norm": 1.5994308395955108e-05,
2295
+ "learning_rate": 3.315523894330662e-07,
2296
+ "logits/chosen": 0.63671875,
2297
+ "logits/rejected": 0.77734375,
2298
+ "logps/chosen": -180.0,
2299
+ "logps/rejected": -306.0,
2300
+ "loss": 0.0,
2301
+ "rewards/accuracies": 1.0,
2302
+ "rewards/chosen": 3.703125,
2303
+ "rewards/margins": 20.625,
2304
+ "rewards/rejected": -16.875,
2305
+ "step": 1510
2306
+ },
2307
+ {
2308
+ "epoch": 1.217948717948718,
2309
+ "grad_norm": 0.0006042891458313199,
2310
+ "learning_rate": 3.300682695161769e-07,
2311
+ "logits/chosen": 0.6484375,
2312
+ "logits/rejected": 1.328125,
2313
+ "logps/chosen": -158.0,
2314
+ "logps/rejected": -294.0,
2315
+ "loss": 0.0,
2316
+ "rewards/accuracies": 1.0,
2317
+ "rewards/chosen": 3.765625,
2318
+ "rewards/margins": 19.625,
2319
+ "rewards/rejected": -15.9375,
2320
+ "step": 1520
2321
+ },
2322
+ {
2323
+ "epoch": 1.2259615384615385,
2324
+ "grad_norm": 0.00021178126794762857,
2325
+ "learning_rate": 3.2858414959928757e-07,
2326
+ "logits/chosen": 0.65625,
2327
+ "logits/rejected": 1.078125,
2328
+ "logps/chosen": -116.0,
2329
+ "logps/rejected": -290.0,
2330
+ "loss": 0.0,
2331
+ "rewards/accuracies": 1.0,
2332
+ "rewards/chosen": 3.265625,
2333
+ "rewards/margins": 20.625,
2334
+ "rewards/rejected": -17.375,
2335
+ "step": 1530
2336
+ },
2337
+ {
2338
+ "epoch": 1.233974358974359,
2339
+ "grad_norm": 0.0010273708366364401,
2340
+ "learning_rate": 3.271000296823983e-07,
2341
+ "logits/chosen": 0.65625,
2342
+ "logits/rejected": 1.375,
2343
+ "logps/chosen": -152.0,
2344
+ "logps/rejected": -308.0,
2345
+ "loss": 0.0,
2346
+ "rewards/accuracies": 1.0,
2347
+ "rewards/chosen": 3.28125,
2348
+ "rewards/margins": 20.25,
2349
+ "rewards/rejected": -17.0,
2350
+ "step": 1540
2351
+ },
2352
+ {
2353
+ "epoch": 1.2419871794871795,
2354
+ "grad_norm": 3.07402349065646e-06,
2355
+ "learning_rate": 3.25615909765509e-07,
2356
+ "logits/chosen": 0.921875,
2357
+ "logits/rejected": 1.4453125,
2358
+ "logps/chosen": -142.0,
2359
+ "logps/rejected": -304.0,
2360
+ "loss": 0.0,
2361
+ "rewards/accuracies": 1.0,
2362
+ "rewards/chosen": 3.171875,
2363
+ "rewards/margins": 20.125,
2364
+ "rewards/rejected": -16.875,
2365
+ "step": 1550
2366
+ },
2367
+ {
2368
+ "epoch": 1.25,
2369
+ "grad_norm": 0.0011000584431525772,
2370
+ "learning_rate": 3.2413178984861977e-07,
2371
+ "logits/chosen": 0.70703125,
2372
+ "logits/rejected": 1.0390625,
2373
+ "logps/chosen": -117.0,
2374
+ "logps/rejected": -292.0,
2375
+ "loss": 0.0,
2376
+ "rewards/accuracies": 1.0,
2377
+ "rewards/chosen": 3.578125,
2378
+ "rewards/margins": 20.0,
2379
+ "rewards/rejected": -16.375,
2380
+ "step": 1560
2381
+ },
2382
+ {
2383
+ "epoch": 1.2580128205128205,
2384
+ "grad_norm": 0.000452369898432101,
2385
+ "learning_rate": 3.226476699317305e-07,
2386
+ "logits/chosen": 0.671875,
2387
+ "logits/rejected": 1.0703125,
2388
+ "logps/chosen": -150.0,
2389
+ "logps/rejected": -316.0,
2390
+ "loss": 0.0,
2391
+ "rewards/accuracies": 1.0,
2392
+ "rewards/chosen": 3.796875,
2393
+ "rewards/margins": 21.25,
2394
+ "rewards/rejected": -17.375,
2395
+ "step": 1570
2396
+ },
2397
+ {
2398
+ "epoch": 1.266025641025641,
2399
+ "grad_norm": 0.00026586619030460033,
2400
+ "learning_rate": 3.211635500148412e-07,
2401
+ "logits/chosen": 0.8671875,
2402
+ "logits/rejected": 1.1640625,
2403
+ "logps/chosen": -160.0,
2404
+ "logps/rejected": -306.0,
2405
+ "loss": 0.0,
2406
+ "rewards/accuracies": 1.0,
2407
+ "rewards/chosen": 3.578125,
2408
+ "rewards/margins": 20.25,
2409
+ "rewards/rejected": -16.75,
2410
+ "step": 1580
2411
+ },
2412
+ {
2413
+ "epoch": 1.2740384615384617,
2414
+ "grad_norm": 2.7345902360690067e-06,
2415
+ "learning_rate": 3.196794300979519e-07,
2416
+ "logits/chosen": 0.76171875,
2417
+ "logits/rejected": 1.140625,
2418
+ "logps/chosen": -185.0,
2419
+ "logps/rejected": -312.0,
2420
+ "loss": 0.0,
2421
+ "rewards/accuracies": 1.0,
2422
+ "rewards/chosen": 3.65625,
2423
+ "rewards/margins": 20.75,
2424
+ "rewards/rejected": -17.125,
2425
+ "step": 1590
2426
+ },
2427
+ {
2428
+ "epoch": 1.282051282051282,
2429
+ "grad_norm": 1.5071041983034111e-06,
2430
+ "learning_rate": 3.181953101810626e-07,
2431
+ "logits/chosen": 0.53515625,
2432
+ "logits/rejected": 0.96875,
2433
+ "logps/chosen": -153.0,
2434
+ "logps/rejected": -300.0,
2435
+ "loss": 0.0,
2436
+ "rewards/accuracies": 1.0,
2437
+ "rewards/chosen": 3.90625,
2438
+ "rewards/margins": 20.375,
2439
+ "rewards/rejected": -16.5,
2440
+ "step": 1600
2441
+ },
2442
+ {
2443
+ "epoch": 1.2900641025641026,
2444
+ "grad_norm": 9.809509824262037e-05,
2445
+ "learning_rate": 3.1671119026417333e-07,
2446
+ "logits/chosen": 0.71875,
2447
+ "logits/rejected": 0.9921875,
2448
+ "logps/chosen": -145.0,
2449
+ "logps/rejected": -302.0,
2450
+ "loss": 0.0,
2451
+ "rewards/accuracies": 1.0,
2452
+ "rewards/chosen": 3.203125,
2453
+ "rewards/margins": 19.625,
2454
+ "rewards/rejected": -16.375,
2455
+ "step": 1610
2456
+ },
2457
+ {
2458
+ "epoch": 1.2980769230769231,
2459
+ "grad_norm": 0.00019153122395682397,
2460
+ "learning_rate": 3.15227070347284e-07,
2461
+ "logits/chosen": 0.8828125,
2462
+ "logits/rejected": 1.1015625,
2463
+ "logps/chosen": -159.0,
2464
+ "logps/rejected": -292.0,
2465
+ "loss": 0.0,
2466
+ "rewards/accuracies": 1.0,
2467
+ "rewards/chosen": 3.96875,
2468
+ "rewards/margins": 19.5,
2469
+ "rewards/rejected": -15.625,
2470
+ "step": 1620
2471
+ },
2472
+ {
2473
+ "epoch": 1.3060897435897436,
2474
+ "grad_norm": 1.1285816682202219e-06,
2475
+ "learning_rate": 3.1374295043039476e-07,
2476
+ "logits/chosen": 0.91015625,
2477
+ "logits/rejected": 1.359375,
2478
+ "logps/chosen": -139.0,
2479
+ "logps/rejected": -304.0,
2480
+ "loss": 0.0,
2481
+ "rewards/accuracies": 1.0,
2482
+ "rewards/chosen": 3.46875,
2483
+ "rewards/margins": 21.75,
2484
+ "rewards/rejected": -18.25,
2485
+ "step": 1630
2486
+ },
2487
+ {
2488
+ "epoch": 1.314102564102564,
2489
+ "grad_norm": 0.00014596821162342946,
2490
+ "learning_rate": 3.122588305135055e-07,
2491
+ "logits/chosen": 0.8984375,
2492
+ "logits/rejected": 0.99609375,
2493
+ "logps/chosen": -164.0,
2494
+ "logps/rejected": -300.0,
2495
+ "loss": 0.0003,
2496
+ "rewards/accuracies": 1.0,
2497
+ "rewards/chosen": 3.0625,
2498
+ "rewards/margins": 19.625,
2499
+ "rewards/rejected": -16.5,
2500
+ "step": 1640
2501
+ },
2502
+ {
2503
+ "epoch": 1.3221153846153846,
2504
+ "grad_norm": 6.180429625577289e-05,
2505
+ "learning_rate": 3.107747105966162e-07,
2506
+ "logits/chosen": 0.5859375,
2507
+ "logits/rejected": 0.91015625,
2508
+ "logps/chosen": -140.0,
2509
+ "logps/rejected": -304.0,
2510
+ "loss": 0.0,
2511
+ "rewards/accuracies": 1.0,
2512
+ "rewards/chosen": 3.6875,
2513
+ "rewards/margins": 21.5,
2514
+ "rewards/rejected": -17.75,
2515
+ "step": 1650
2516
+ },
2517
+ {
2518
+ "epoch": 1.330128205128205,
2519
+ "grad_norm": 1.528718002747695e-05,
2520
+ "learning_rate": 3.092905906797269e-07,
2521
+ "logits/chosen": 0.96484375,
2522
+ "logits/rejected": 1.25,
2523
+ "logps/chosen": -160.0,
2524
+ "logps/rejected": -312.0,
2525
+ "loss": 0.0,
2526
+ "rewards/accuracies": 1.0,
2527
+ "rewards/chosen": 3.59375,
2528
+ "rewards/margins": 21.125,
2529
+ "rewards/rejected": -17.5,
2530
+ "step": 1660
2531
+ },
2532
+ {
2533
+ "epoch": 1.3381410256410255,
2534
+ "grad_norm": 0.0018059387091916529,
2535
+ "learning_rate": 3.078064707628376e-07,
2536
+ "logits/chosen": 0.96484375,
2537
+ "logits/rejected": 1.3515625,
2538
+ "logps/chosen": -150.0,
2539
+ "logps/rejected": -298.0,
2540
+ "loss": 0.0,
2541
+ "rewards/accuracies": 1.0,
2542
+ "rewards/chosen": 3.265625,
2543
+ "rewards/margins": 20.875,
2544
+ "rewards/rejected": -17.625,
2545
+ "step": 1670
2546
+ },
2547
+ {
2548
+ "epoch": 1.3461538461538463,
2549
+ "grad_norm": 9.308236631197583e-05,
2550
+ "learning_rate": 3.0632235084594833e-07,
2551
+ "logits/chosen": 0.94140625,
2552
+ "logits/rejected": 1.21875,
2553
+ "logps/chosen": -154.0,
2554
+ "logps/rejected": -298.0,
2555
+ "loss": 0.0,
2556
+ "rewards/accuracies": 1.0,
2557
+ "rewards/chosen": 3.59375,
2558
+ "rewards/margins": 20.25,
2559
+ "rewards/rejected": -16.75,
2560
+ "step": 1680
2561
+ },
2562
+ {
2563
+ "epoch": 1.3541666666666667,
2564
+ "grad_norm": 1.0292832754568623e-05,
2565
+ "learning_rate": 3.048382309290591e-07,
2566
+ "logits/chosen": 0.9453125,
2567
+ "logits/rejected": 1.1953125,
2568
+ "logps/chosen": -149.0,
2569
+ "logps/rejected": -314.0,
2570
+ "loss": 0.0,
2571
+ "rewards/accuracies": 1.0,
2572
+ "rewards/chosen": 3.578125,
2573
+ "rewards/margins": 21.0,
2574
+ "rewards/rejected": -17.375,
2575
+ "step": 1690
2576
+ },
2577
+ {
2578
+ "epoch": 1.3621794871794872,
2579
+ "grad_norm": 0.002936471913731386,
2580
+ "learning_rate": 3.033541110121698e-07,
2581
+ "logits/chosen": 0.67578125,
2582
+ "logits/rejected": 0.8515625,
2583
+ "logps/chosen": -178.0,
2584
+ "logps/rejected": -304.0,
2585
+ "loss": 0.0,
2586
+ "rewards/accuracies": 1.0,
2587
+ "rewards/chosen": 3.609375,
2588
+ "rewards/margins": 20.75,
2589
+ "rewards/rejected": -17.125,
2590
+ "step": 1700
2591
+ },
2592
+ {
2593
+ "epoch": 1.3701923076923077,
2594
+ "grad_norm": 0.00010907511941114075,
2595
+ "learning_rate": 3.018699910952805e-07,
2596
+ "logits/chosen": 0.796875,
2597
+ "logits/rejected": 0.9609375,
2598
+ "logps/chosen": -171.0,
2599
+ "logps/rejected": -296.0,
2600
+ "loss": 0.0,
2601
+ "rewards/accuracies": 1.0,
2602
+ "rewards/chosen": 3.625,
2603
+ "rewards/margins": 19.375,
2604
+ "rewards/rejected": -15.75,
2605
+ "step": 1710
2606
+ },
2607
+ {
2608
+ "epoch": 1.3782051282051282,
2609
+ "grad_norm": 0.004723298049564672,
2610
+ "learning_rate": 3.003858711783912e-07,
2611
+ "logits/chosen": 0.93359375,
2612
+ "logits/rejected": 1.34375,
2613
+ "logps/chosen": -128.0,
2614
+ "logps/rejected": -304.0,
2615
+ "loss": 0.0,
2616
+ "rewards/accuracies": 1.0,
2617
+ "rewards/chosen": 3.40625,
2618
+ "rewards/margins": 20.75,
2619
+ "rewards/rejected": -17.375,
2620
+ "step": 1720
2621
+ },
2622
+ {
2623
+ "epoch": 1.3862179487179487,
2624
+ "grad_norm": 0.0037479058484858564,
2625
+ "learning_rate": 2.989017512615019e-07,
2626
+ "logits/chosen": 0.81640625,
2627
+ "logits/rejected": 1.3359375,
2628
+ "logps/chosen": -187.0,
2629
+ "logps/rejected": -316.0,
2630
+ "loss": 0.0,
2631
+ "rewards/accuracies": 1.0,
2632
+ "rewards/chosen": 3.84375,
2633
+ "rewards/margins": 20.625,
2634
+ "rewards/rejected": -16.75,
2635
+ "step": 1730
2636
+ },
2637
+ {
2638
+ "epoch": 1.3942307692307692,
2639
+ "grad_norm": 6.403847125969602e-06,
2640
+ "learning_rate": 2.974176313446126e-07,
2641
+ "logits/chosen": 0.8984375,
2642
+ "logits/rejected": 1.40625,
2643
+ "logps/chosen": -167.0,
2644
+ "logps/rejected": -304.0,
2645
+ "loss": 0.0,
2646
+ "rewards/accuracies": 1.0,
2647
+ "rewards/chosen": 3.46875,
2648
+ "rewards/margins": 20.5,
2649
+ "rewards/rejected": -17.125,
2650
+ "step": 1740
2651
+ },
2652
+ {
2653
+ "epoch": 1.4022435897435899,
2654
+ "grad_norm": 3.170124782559705e-06,
2655
+ "learning_rate": 2.959335114277233e-07,
2656
+ "logits/chosen": 0.4765625,
2657
+ "logits/rejected": 0.98046875,
2658
+ "logps/chosen": -200.0,
2659
+ "logps/rejected": -312.0,
2660
+ "loss": 0.0,
2661
+ "rewards/accuracies": 1.0,
2662
+ "rewards/chosen": 3.71875,
2663
+ "rewards/margins": 20.75,
2664
+ "rewards/rejected": -17.0,
2665
+ "step": 1750
2666
+ },
2667
+ {
2668
+ "epoch": 1.4102564102564101,
2669
+ "grad_norm": 0.002625958073479466,
2670
+ "learning_rate": 2.944493915108341e-07,
2671
+ "logits/chosen": 0.8125,
2672
+ "logits/rejected": 0.984375,
2673
+ "logps/chosen": -145.0,
2674
+ "logps/rejected": -288.0,
2675
+ "loss": 0.0,
2676
+ "rewards/accuracies": 1.0,
2677
+ "rewards/chosen": 3.375,
2678
+ "rewards/margins": 19.375,
2679
+ "rewards/rejected": -16.0,
2680
+ "step": 1760
2681
+ },
2682
+ {
2683
+ "epoch": 1.4182692307692308,
2684
+ "grad_norm": 1.7345808005103415e-05,
2685
+ "learning_rate": 2.929652715939448e-07,
2686
+ "logits/chosen": 0.66015625,
2687
+ "logits/rejected": 1.203125,
2688
+ "logps/chosen": -186.0,
2689
+ "logps/rejected": -304.0,
2690
+ "loss": 0.0008,
2691
+ "rewards/accuracies": 1.0,
2692
+ "rewards/chosen": 3.53125,
2693
+ "rewards/margins": 20.875,
2694
+ "rewards/rejected": -17.375,
2695
+ "step": 1770
2696
+ },
2697
+ {
2698
+ "epoch": 1.4262820512820513,
2699
+ "grad_norm": 3.70136218823452e-06,
2700
+ "learning_rate": 2.914811516770555e-07,
2701
+ "logits/chosen": 0.486328125,
2702
+ "logits/rejected": 1.0078125,
2703
+ "logps/chosen": -209.0,
2704
+ "logps/rejected": -320.0,
2705
+ "loss": 0.0,
2706
+ "rewards/accuracies": 1.0,
2707
+ "rewards/chosen": 3.59375,
2708
+ "rewards/margins": 20.875,
2709
+ "rewards/rejected": -17.375,
2710
+ "step": 1780
2711
+ },
2712
+ {
2713
+ "epoch": 1.4342948717948718,
2714
+ "grad_norm": 1.1884302830057285e-05,
2715
+ "learning_rate": 2.8999703176016623e-07,
2716
+ "logits/chosen": 0.8046875,
2717
+ "logits/rejected": 1.34375,
2718
+ "logps/chosen": -176.0,
2719
+ "logps/rejected": -296.0,
2720
+ "loss": 0.0,
2721
+ "rewards/accuracies": 1.0,
2722
+ "rewards/chosen": 3.0625,
2723
+ "rewards/margins": 19.625,
2724
+ "rewards/rejected": -16.625,
2725
+ "step": 1790
2726
+ },
2727
+ {
2728
+ "epoch": 1.4423076923076923,
2729
+ "grad_norm": 0.0004609700315428155,
2730
+ "learning_rate": 2.8851291184327694e-07,
2731
+ "logits/chosen": 0.703125,
2732
+ "logits/rejected": 1.09375,
2733
+ "logps/chosen": -164.0,
2734
+ "logps/rejected": -322.0,
2735
+ "loss": 0.0,
2736
+ "rewards/accuracies": 1.0,
2737
+ "rewards/chosen": 3.078125,
2738
+ "rewards/margins": 22.125,
2739
+ "rewards/rejected": -19.125,
2740
+ "step": 1800
2741
+ },
2742
+ {
2743
+ "epoch": 1.4503205128205128,
2744
+ "grad_norm": 0.0006305642233578017,
2745
+ "learning_rate": 2.870287919263876e-07,
2746
+ "logits/chosen": 0.72265625,
2747
+ "logits/rejected": 1.1953125,
2748
+ "logps/chosen": -185.0,
2749
+ "logps/rejected": -312.0,
2750
+ "loss": 0.0,
2751
+ "rewards/accuracies": 1.0,
2752
+ "rewards/chosen": 2.875,
2753
+ "rewards/margins": 20.875,
2754
+ "rewards/rejected": -18.0,
2755
+ "step": 1810
2756
+ },
2757
+ {
2758
+ "epoch": 1.4583333333333333,
2759
+ "grad_norm": 3.4339114874993125e-06,
2760
+ "learning_rate": 2.855446720094983e-07,
2761
+ "logits/chosen": 0.7421875,
2762
+ "logits/rejected": 1.3203125,
2763
+ "logps/chosen": -165.0,
2764
+ "logps/rejected": -316.0,
2765
+ "loss": 0.0,
2766
+ "rewards/accuracies": 1.0,
2767
+ "rewards/chosen": 3.75,
2768
+ "rewards/margins": 21.25,
2769
+ "rewards/rejected": -17.5,
2770
+ "step": 1820
2771
+ },
2772
+ {
2773
+ "epoch": 1.4663461538461537,
2774
+ "grad_norm": 0.002348896026755423,
2775
+ "learning_rate": 2.840605520926091e-07,
2776
+ "logits/chosen": 0.7265625,
2777
+ "logits/rejected": 0.98046875,
2778
+ "logps/chosen": -181.0,
2779
+ "logps/rejected": -304.0,
2780
+ "loss": 0.0,
2781
+ "rewards/accuracies": 1.0,
2782
+ "rewards/chosen": 3.25,
2783
+ "rewards/margins": 20.875,
2784
+ "rewards/rejected": -17.625,
2785
+ "step": 1830
2786
+ },
2787
+ {
2788
+ "epoch": 1.4743589743589745,
2789
+ "grad_norm": 4.445170925547984e-05,
2790
+ "learning_rate": 2.825764321757198e-07,
2791
+ "logits/chosen": 0.78515625,
2792
+ "logits/rejected": 0.93359375,
2793
+ "logps/chosen": -172.0,
2794
+ "logps/rejected": -302.0,
2795
+ "loss": 0.0,
2796
+ "rewards/accuracies": 1.0,
2797
+ "rewards/chosen": 3.515625,
2798
+ "rewards/margins": 20.875,
2799
+ "rewards/rejected": -17.375,
2800
+ "step": 1840
2801
+ },
2802
+ {
2803
+ "epoch": 1.482371794871795,
2804
+ "grad_norm": 0.16403277766161206,
2805
+ "learning_rate": 2.810923122588305e-07,
2806
+ "logits/chosen": 0.7265625,
2807
+ "logits/rejected": 1.0859375,
2808
+ "logps/chosen": -162.0,
2809
+ "logps/rejected": -300.0,
2810
+ "loss": 0.0,
2811
+ "rewards/accuracies": 1.0,
2812
+ "rewards/chosen": 3.390625,
2813
+ "rewards/margins": 19.75,
2814
+ "rewards/rejected": -16.375,
2815
+ "step": 1850
2816
+ },
2817
+ {
2818
+ "epoch": 1.4903846153846154,
2819
+ "grad_norm": 0.0008420288269927029,
2820
+ "learning_rate": 2.796081923419412e-07,
2821
+ "logits/chosen": 0.71875,
2822
+ "logits/rejected": 1.3125,
2823
+ "logps/chosen": -197.0,
2824
+ "logps/rejected": -324.0,
2825
+ "loss": 0.0,
2826
+ "rewards/accuracies": 1.0,
2827
+ "rewards/chosen": 3.4375,
2828
+ "rewards/margins": 21.5,
2829
+ "rewards/rejected": -18.0,
2830
+ "step": 1860
2831
+ },
2832
+ {
2833
+ "epoch": 1.498397435897436,
2834
+ "grad_norm": 0.00014053022075140328,
2835
+ "learning_rate": 2.7812407242505194e-07,
2836
+ "logits/chosen": 0.9921875,
2837
+ "logits/rejected": 1.515625,
2838
+ "logps/chosen": -160.0,
2839
+ "logps/rejected": -328.0,
2840
+ "loss": 0.0,
2841
+ "rewards/accuracies": 1.0,
2842
+ "rewards/chosen": 3.828125,
2843
+ "rewards/margins": 23.0,
2844
+ "rewards/rejected": -19.125,
2845
+ "step": 1870
2846
+ },
2847
+ {
2848
+ "epoch": 1.5064102564102564,
2849
+ "grad_norm": 8.433668468764786e-05,
2850
+ "learning_rate": 2.7663995250816265e-07,
2851
+ "logits/chosen": 1.0390625,
2852
+ "logits/rejected": 1.09375,
2853
+ "logps/chosen": -150.0,
2854
+ "logps/rejected": -326.0,
2855
+ "loss": 0.0001,
2856
+ "rewards/accuracies": 1.0,
2857
+ "rewards/chosen": 3.21875,
2858
+ "rewards/margins": 22.25,
2859
+ "rewards/rejected": -19.0,
2860
+ "step": 1880
2861
+ },
2862
+ {
2863
+ "epoch": 1.5144230769230769,
2864
+ "grad_norm": 3.8086614833106377e-06,
2865
+ "learning_rate": 2.7515583259127337e-07,
2866
+ "logits/chosen": 0.5,
2867
+ "logits/rejected": 1.0234375,
2868
+ "logps/chosen": -216.0,
2869
+ "logps/rejected": -316.0,
2870
+ "loss": 0.0,
2871
+ "rewards/accuracies": 1.0,
2872
+ "rewards/chosen": 3.765625,
2873
+ "rewards/margins": 22.125,
2874
+ "rewards/rejected": -18.375,
2875
+ "step": 1890
2876
+ },
2877
+ {
2878
+ "epoch": 1.5224358974358974,
2879
+ "grad_norm": 5.226952675441339e-07,
2880
+ "learning_rate": 2.736717126743841e-07,
2881
+ "logits/chosen": 0.72265625,
2882
+ "logits/rejected": 1.0859375,
2883
+ "logps/chosen": -165.0,
2884
+ "logps/rejected": -334.0,
2885
+ "loss": 0.0,
2886
+ "rewards/accuracies": 1.0,
2887
+ "rewards/chosen": 3.703125,
2888
+ "rewards/margins": 22.75,
2889
+ "rewards/rejected": -19.0,
2890
+ "step": 1900
2891
+ },
2892
+ {
2893
+ "epoch": 1.530448717948718,
2894
+ "grad_norm": 2.2091624737157388e-05,
2895
+ "learning_rate": 2.721875927574948e-07,
2896
+ "logits/chosen": 0.75,
2897
+ "logits/rejected": 1.2578125,
2898
+ "logps/chosen": -189.0,
2899
+ "logps/rejected": -316.0,
2900
+ "loss": 0.0,
2901
+ "rewards/accuracies": 1.0,
2902
+ "rewards/chosen": 3.84375,
2903
+ "rewards/margins": 22.125,
2904
+ "rewards/rejected": -18.25,
2905
+ "step": 1910
2906
+ },
2907
+ {
2908
+ "epoch": 1.5384615384615383,
2909
+ "grad_norm": 0.00010413244451900578,
2910
+ "learning_rate": 2.707034728406055e-07,
2911
+ "logits/chosen": 0.76171875,
2912
+ "logits/rejected": 1.265625,
2913
+ "logps/chosen": -182.0,
2914
+ "logps/rejected": -312.0,
2915
+ "loss": 0.0,
2916
+ "rewards/accuracies": 1.0,
2917
+ "rewards/chosen": 3.6875,
2918
+ "rewards/margins": 21.5,
2919
+ "rewards/rejected": -17.75,
2920
+ "step": 1920
2921
+ },
2922
+ {
2923
+ "epoch": 1.546474358974359,
2924
+ "grad_norm": 5.596299324175904e-05,
2925
+ "learning_rate": 2.692193529237162e-07,
2926
+ "logits/chosen": 0.90234375,
2927
+ "logits/rejected": 0.984375,
2928
+ "logps/chosen": -145.0,
2929
+ "logps/rejected": -328.0,
2930
+ "loss": 0.0,
2931
+ "rewards/accuracies": 1.0,
2932
+ "rewards/chosen": 3.515625,
2933
+ "rewards/margins": 23.375,
2934
+ "rewards/rejected": -19.875,
2935
+ "step": 1930
2936
+ },
2937
+ {
2938
+ "epoch": 1.5544871794871795,
2939
+ "grad_norm": 9.505443984664321e-07,
2940
+ "learning_rate": 2.6773523300682693e-07,
2941
+ "logits/chosen": 0.8828125,
2942
+ "logits/rejected": 1.28125,
2943
+ "logps/chosen": -150.0,
2944
+ "logps/rejected": -306.0,
2945
+ "loss": 0.0029,
2946
+ "rewards/accuracies": 1.0,
2947
+ "rewards/chosen": 3.15625,
2948
+ "rewards/margins": 20.875,
2949
+ "rewards/rejected": -17.75,
2950
+ "step": 1940
2951
+ },
2952
+ {
2953
+ "epoch": 1.5625,
2954
+ "grad_norm": 7.9835406414024e-05,
2955
+ "learning_rate": 2.6625111308993765e-07,
2956
+ "logits/chosen": 0.78125,
2957
+ "logits/rejected": 1.25,
2958
+ "logps/chosen": -155.0,
2959
+ "logps/rejected": -310.0,
2960
+ "loss": 0.0,
2961
+ "rewards/accuracies": 1.0,
2962
+ "rewards/chosen": 3.65625,
2963
+ "rewards/margins": 22.125,
2964
+ "rewards/rejected": -18.5,
2965
+ "step": 1950
2966
+ },
2967
+ {
2968
+ "epoch": 1.5705128205128205,
2969
+ "grad_norm": 5.207840607481746e-06,
2970
+ "learning_rate": 2.6476699317304836e-07,
2971
+ "logits/chosen": 0.77734375,
2972
+ "logits/rejected": 1.2734375,
2973
+ "logps/chosen": -197.0,
2974
+ "logps/rejected": -308.0,
2975
+ "loss": 0.0001,
2976
+ "rewards/accuracies": 1.0,
2977
+ "rewards/chosen": 3.484375,
2978
+ "rewards/margins": 21.25,
2979
+ "rewards/rejected": -17.75,
2980
+ "step": 1960
2981
+ },
2982
+ {
2983
+ "epoch": 1.578525641025641,
2984
+ "grad_norm": 0.0055069480205136,
2985
+ "learning_rate": 2.6328287325615913e-07,
2986
+ "logits/chosen": 0.55859375,
2987
+ "logits/rejected": 0.9765625,
2988
+ "logps/chosen": -181.0,
2989
+ "logps/rejected": -310.0,
2990
+ "loss": 0.0,
2991
+ "rewards/accuracies": 1.0,
2992
+ "rewards/chosen": 3.5625,
2993
+ "rewards/margins": 20.875,
2994
+ "rewards/rejected": -17.25,
2995
+ "step": 1970
2996
+ },
2997
+ {
2998
+ "epoch": 1.5865384615384617,
2999
+ "grad_norm": 1.396497021073787e-05,
3000
+ "learning_rate": 2.6179875333926984e-07,
3001
+ "logits/chosen": 0.890625,
3002
+ "logits/rejected": 1.0546875,
3003
+ "logps/chosen": -162.0,
3004
+ "logps/rejected": -330.0,
3005
+ "loss": 0.0,
3006
+ "rewards/accuracies": 1.0,
3007
+ "rewards/chosen": 3.890625,
3008
+ "rewards/margins": 23.375,
3009
+ "rewards/rejected": -19.5,
3010
+ "step": 1980
3011
+ },
3012
+ {
3013
+ "epoch": 1.594551282051282,
3014
+ "grad_norm": 0.0017364799756809,
3015
+ "learning_rate": 2.603146334223805e-07,
3016
+ "logits/chosen": 0.74609375,
3017
+ "logits/rejected": 1.09375,
3018
+ "logps/chosen": -167.0,
3019
+ "logps/rejected": -302.0,
3020
+ "loss": 0.0,
3021
+ "rewards/accuracies": 1.0,
3022
+ "rewards/chosen": 3.296875,
3023
+ "rewards/margins": 20.5,
3024
+ "rewards/rejected": -17.25,
3025
+ "step": 1990
3026
+ },
3027
+ {
3028
+ "epoch": 1.6025641025641026,
3029
+ "grad_norm": 1.0811679994728376e-05,
3030
+ "learning_rate": 2.588305135054912e-07,
3031
+ "logits/chosen": 0.7890625,
3032
+ "logits/rejected": 1.0390625,
3033
+ "logps/chosen": -190.0,
3034
+ "logps/rejected": -322.0,
3035
+ "loss": 0.0,
3036
+ "rewards/accuracies": 1.0,
3037
+ "rewards/chosen": 4.3125,
3038
+ "rewards/margins": 23.125,
3039
+ "rewards/rejected": -18.75,
3040
+ "step": 2000
3041
+ },
3042
+ {
3043
+ "epoch": 1.6105769230769231,
3044
+ "grad_norm": 1.0141558630107686e-07,
3045
+ "learning_rate": 2.5734639358860193e-07,
3046
+ "logits/chosen": 0.80078125,
3047
+ "logits/rejected": 1.421875,
3048
+ "logps/chosen": -174.0,
3049
+ "logps/rejected": -316.0,
3050
+ "loss": 0.0,
3051
+ "rewards/accuracies": 1.0,
3052
+ "rewards/chosen": 3.765625,
3053
+ "rewards/margins": 21.625,
3054
+ "rewards/rejected": -17.875,
3055
+ "step": 2010
3056
+ },
3057
+ {
3058
+ "epoch": 1.6185897435897436,
3059
+ "grad_norm": 3.9088308067055224e-06,
3060
+ "learning_rate": 2.5586227367171264e-07,
3061
+ "logits/chosen": 0.86328125,
3062
+ "logits/rejected": 1.15625,
3063
+ "logps/chosen": -163.0,
3064
+ "logps/rejected": -302.0,
3065
+ "loss": 0.0,
3066
+ "rewards/accuracies": 1.0,
3067
+ "rewards/chosen": 3.28125,
3068
+ "rewards/margins": 21.0,
3069
+ "rewards/rejected": -17.75,
3070
+ "step": 2020
3071
+ },
3072
+ {
3073
+ "epoch": 1.626602564102564,
3074
+ "grad_norm": 4.13722853892595e-06,
3075
+ "learning_rate": 2.5437815375482335e-07,
3076
+ "logits/chosen": 0.84375,
3077
+ "logits/rejected": 1.0703125,
3078
+ "logps/chosen": -168.0,
3079
+ "logps/rejected": -338.0,
3080
+ "loss": 0.0,
3081
+ "rewards/accuracies": 1.0,
3082
+ "rewards/chosen": 3.25,
3083
+ "rewards/margins": 22.0,
3084
+ "rewards/rejected": -18.75,
3085
+ "step": 2030
3086
+ },
3087
+ {
3088
+ "epoch": 1.6346153846153846,
3089
+ "grad_norm": 3.401296459983001e-05,
3090
+ "learning_rate": 2.528940338379341e-07,
3091
+ "logits/chosen": 1.0234375,
3092
+ "logits/rejected": 1.1875,
3093
+ "logps/chosen": -159.0,
3094
+ "logps/rejected": -326.0,
3095
+ "loss": 0.0,
3096
+ "rewards/accuracies": 1.0,
3097
+ "rewards/chosen": 3.859375,
3098
+ "rewards/margins": 23.0,
3099
+ "rewards/rejected": -19.125,
3100
+ "step": 2040
3101
+ },
3102
+ {
3103
+ "epoch": 1.6426282051282053,
3104
+ "grad_norm": 2.957293101803746e-07,
3105
+ "learning_rate": 2.5140991392104483e-07,
3106
+ "logits/chosen": 0.87109375,
3107
+ "logits/rejected": 1.28125,
3108
+ "logps/chosen": -97.0,
3109
+ "logps/rejected": -328.0,
3110
+ "loss": 0.0,
3111
+ "rewards/accuracies": 1.0,
3112
+ "rewards/chosen": 3.203125,
3113
+ "rewards/margins": 22.625,
3114
+ "rewards/rejected": -19.375,
3115
+ "step": 2050
3116
+ },
3117
+ {
3118
+ "epoch": 1.6506410256410255,
3119
+ "grad_norm": 5.9562249969748895e-06,
3120
+ "learning_rate": 2.4992579400415555e-07,
3121
+ "logits/chosen": 0.796875,
3122
+ "logits/rejected": 1.0390625,
3123
+ "logps/chosen": -165.0,
3124
+ "logps/rejected": -296.0,
3125
+ "loss": 0.0,
3126
+ "rewards/accuracies": 1.0,
3127
+ "rewards/chosen": 3.71875,
3128
+ "rewards/margins": 21.25,
3129
+ "rewards/rejected": -17.5,
3130
+ "step": 2060
3131
+ },
3132
+ {
3133
+ "epoch": 1.6586538461538463,
3134
+ "grad_norm": 6.175150368480905e-06,
3135
+ "learning_rate": 2.4844167408726626e-07,
3136
+ "logits/chosen": 0.984375,
3137
+ "logits/rejected": 1.4375,
3138
+ "logps/chosen": -160.0,
3139
+ "logps/rejected": -324.0,
3140
+ "loss": 0.0,
3141
+ "rewards/accuracies": 1.0,
3142
+ "rewards/chosen": 3.8125,
3143
+ "rewards/margins": 22.125,
3144
+ "rewards/rejected": -18.25,
3145
+ "step": 2070
3146
+ },
3147
+ {
3148
+ "epoch": 1.6666666666666665,
3149
+ "grad_norm": 2.6561524482376626e-05,
3150
+ "learning_rate": 2.469575541703769e-07,
3151
+ "logits/chosen": 0.83203125,
3152
+ "logits/rejected": 1.1796875,
3153
+ "logps/chosen": -169.0,
3154
+ "logps/rejected": -322.0,
3155
+ "loss": 0.0,
3156
+ "rewards/accuracies": 1.0,
3157
+ "rewards/chosen": 3.546875,
3158
+ "rewards/margins": 22.625,
3159
+ "rewards/rejected": -19.0,
3160
+ "step": 2080
3161
+ },
3162
+ {
3163
+ "epoch": 1.6746794871794872,
3164
+ "grad_norm": 4.068151187669672e-06,
3165
+ "learning_rate": 2.454734342534877e-07,
3166
+ "logits/chosen": 0.96875,
3167
+ "logits/rejected": 1.171875,
3168
+ "logps/chosen": -184.0,
3169
+ "logps/rejected": -298.0,
3170
+ "loss": 0.0034,
3171
+ "rewards/accuracies": 1.0,
3172
+ "rewards/chosen": 3.703125,
3173
+ "rewards/margins": 20.625,
3174
+ "rewards/rejected": -16.875,
3175
+ "step": 2090
3176
+ },
3177
+ {
3178
+ "epoch": 1.6826923076923077,
3179
+ "grad_norm": 2.6204722144054077e-05,
3180
+ "learning_rate": 2.439893143365984e-07,
3181
+ "logits/chosen": 1.015625,
3182
+ "logits/rejected": 1.296875,
3183
+ "logps/chosen": -155.0,
3184
+ "logps/rejected": -314.0,
3185
+ "loss": 0.0,
3186
+ "rewards/accuracies": 1.0,
3187
+ "rewards/chosen": 3.625,
3188
+ "rewards/margins": 22.25,
3189
+ "rewards/rejected": -18.625,
3190
+ "step": 2100
3191
+ },
3192
+ {
3193
+ "epoch": 1.6907051282051282,
3194
+ "grad_norm": 2.398210733527027e-06,
3195
+ "learning_rate": 2.425051944197091e-07,
3196
+ "logits/chosen": 0.8984375,
3197
+ "logits/rejected": 1.1640625,
3198
+ "logps/chosen": -142.0,
3199
+ "logps/rejected": -338.0,
3200
+ "loss": 0.0002,
3201
+ "rewards/accuracies": 1.0,
3202
+ "rewards/chosen": 3.390625,
3203
+ "rewards/margins": 23.375,
3204
+ "rewards/rejected": -20.0,
3205
+ "step": 2110
3206
+ },
3207
+ {
3208
+ "epoch": 1.6987179487179487,
3209
+ "grad_norm": 4.12694708820085e-06,
3210
+ "learning_rate": 2.4102107450281983e-07,
3211
+ "logits/chosen": 0.8984375,
3212
+ "logits/rejected": 1.3359375,
3213
+ "logps/chosen": -160.0,
3214
+ "logps/rejected": -308.0,
3215
+ "loss": 0.0,
3216
+ "rewards/accuracies": 1.0,
3217
+ "rewards/chosen": 3.03125,
3218
+ "rewards/margins": 19.75,
3219
+ "rewards/rejected": -16.625,
3220
+ "step": 2120
3221
+ },
3222
+ {
3223
+ "epoch": 1.7067307692307692,
3224
+ "grad_norm": 0.00013371742562856578,
3225
+ "learning_rate": 2.3953695458593054e-07,
3226
+ "logits/chosen": 0.86328125,
3227
+ "logits/rejected": 1.109375,
3228
+ "logps/chosen": -146.0,
3229
+ "logps/rejected": -328.0,
3230
+ "loss": 0.0,
3231
+ "rewards/accuracies": 1.0,
3232
+ "rewards/chosen": 3.75,
3233
+ "rewards/margins": 23.75,
3234
+ "rewards/rejected": -19.875,
3235
+ "step": 2130
3236
+ },
3237
+ {
3238
+ "epoch": 1.7147435897435899,
3239
+ "grad_norm": 7.636681790511307e-06,
3240
+ "learning_rate": 2.3805283466904126e-07,
3241
+ "logits/chosen": 1.0078125,
3242
+ "logits/rejected": 1.1796875,
3243
+ "logps/chosen": -166.0,
3244
+ "logps/rejected": -310.0,
3245
+ "loss": 0.0,
3246
+ "rewards/accuracies": 1.0,
3247
+ "rewards/chosen": 3.8125,
3248
+ "rewards/margins": 21.625,
3249
+ "rewards/rejected": -17.75,
3250
+ "step": 2140
3251
+ },
3252
+ {
3253
+ "epoch": 1.7227564102564101,
3254
+ "grad_norm": 8.958009719967443e-07,
3255
+ "learning_rate": 2.3656871475215194e-07,
3256
+ "logits/chosen": 0.70703125,
3257
+ "logits/rejected": 1.1875,
3258
+ "logps/chosen": -154.0,
3259
+ "logps/rejected": -338.0,
3260
+ "loss": 0.0,
3261
+ "rewards/accuracies": 1.0,
3262
+ "rewards/chosen": 3.28125,
3263
+ "rewards/margins": 22.5,
3264
+ "rewards/rejected": -19.25,
3265
+ "step": 2150
3266
+ },
3267
+ {
3268
+ "epoch": 1.7307692307692308,
3269
+ "grad_norm": 3.747660209974031e-05,
3270
+ "learning_rate": 2.3508459483526268e-07,
3271
+ "logits/chosen": 0.9453125,
3272
+ "logits/rejected": 1.390625,
3273
+ "logps/chosen": -186.0,
3274
+ "logps/rejected": -312.0,
3275
+ "loss": 0.0,
3276
+ "rewards/accuracies": 1.0,
3277
+ "rewards/chosen": 3.84375,
3278
+ "rewards/margins": 22.125,
3279
+ "rewards/rejected": -18.25,
3280
+ "step": 2160
3281
+ },
3282
+ {
3283
+ "epoch": 1.7387820512820513,
3284
+ "grad_norm": 4.869196145111484e-07,
3285
+ "learning_rate": 2.336004749183734e-07,
3286
+ "logits/chosen": 0.81640625,
3287
+ "logits/rejected": 1.140625,
3288
+ "logps/chosen": -149.0,
3289
+ "logps/rejected": -320.0,
3290
+ "loss": 0.0,
3291
+ "rewards/accuracies": 1.0,
3292
+ "rewards/chosen": 3.328125,
3293
+ "rewards/margins": 22.75,
3294
+ "rewards/rejected": -19.375,
3295
+ "step": 2170
3296
+ },
3297
+ {
3298
+ "epoch": 1.7467948717948718,
3299
+ "grad_norm": 0.00026582023320058276,
3300
+ "learning_rate": 2.321163550014841e-07,
3301
+ "logits/chosen": 0.796875,
3302
+ "logits/rejected": 1.3046875,
3303
+ "logps/chosen": -133.0,
3304
+ "logps/rejected": -336.0,
3305
+ "loss": 0.0,
3306
+ "rewards/accuracies": 1.0,
3307
+ "rewards/chosen": 3.15625,
3308
+ "rewards/margins": 22.75,
3309
+ "rewards/rejected": -19.625,
3310
+ "step": 2180
3311
+ },
3312
+ {
3313
+ "epoch": 1.7548076923076923,
3314
+ "grad_norm": 2.8032704053085966e-06,
3315
+ "learning_rate": 2.3063223508459482e-07,
3316
+ "logits/chosen": 0.8125,
3317
+ "logits/rejected": 1.1171875,
3318
+ "logps/chosen": -172.0,
3319
+ "logps/rejected": -318.0,
3320
+ "loss": 0.0,
3321
+ "rewards/accuracies": 1.0,
3322
+ "rewards/chosen": 3.375,
3323
+ "rewards/margins": 22.375,
3324
+ "rewards/rejected": -19.0,
3325
+ "step": 2190
3326
+ },
3327
+ {
3328
+ "epoch": 1.7628205128205128,
3329
+ "grad_norm": 0.0028628851089617393,
3330
+ "learning_rate": 2.2914811516770554e-07,
3331
+ "logits/chosen": 0.859375,
3332
+ "logits/rejected": 1.1796875,
3333
+ "logps/chosen": -185.0,
3334
+ "logps/rejected": -300.0,
3335
+ "loss": 0.0,
3336
+ "rewards/accuracies": 1.0,
3337
+ "rewards/chosen": 3.25,
3338
+ "rewards/margins": 19.875,
3339
+ "rewards/rejected": -16.625,
3340
+ "step": 2200
3341
+ },
3342
+ {
3343
+ "epoch": 1.7708333333333335,
3344
+ "grad_norm": 0.0001342614273603189,
3345
+ "learning_rate": 2.2766399525081625e-07,
3346
+ "logits/chosen": 1.0859375,
3347
+ "logits/rejected": 1.1640625,
3348
+ "logps/chosen": -177.0,
3349
+ "logps/rejected": -320.0,
3350
+ "loss": 0.0,
3351
+ "rewards/accuracies": 1.0,
3352
+ "rewards/chosen": 3.875,
3353
+ "rewards/margins": 21.75,
3354
+ "rewards/rejected": -17.875,
3355
+ "step": 2210
3356
+ },
3357
+ {
3358
+ "epoch": 1.7788461538461537,
3359
+ "grad_norm": 4.955335994263196e-07,
3360
+ "learning_rate": 2.2617987533392696e-07,
3361
+ "logits/chosen": 0.828125,
3362
+ "logits/rejected": 1.21875,
3363
+ "logps/chosen": -158.0,
3364
+ "logps/rejected": -318.0,
3365
+ "loss": 0.0,
3366
+ "rewards/accuracies": 1.0,
3367
+ "rewards/chosen": 3.25,
3368
+ "rewards/margins": 21.5,
3369
+ "rewards/rejected": -18.25,
3370
+ "step": 2220
3371
+ },
3372
+ {
3373
+ "epoch": 1.7868589743589745,
3374
+ "grad_norm": 0.0014087673231885866,
3375
+ "learning_rate": 2.246957554170377e-07,
3376
+ "logits/chosen": 0.7265625,
3377
+ "logits/rejected": 1.03125,
3378
+ "logps/chosen": -188.0,
3379
+ "logps/rejected": -316.0,
3380
+ "loss": 0.0,
3381
+ "rewards/accuracies": 1.0,
3382
+ "rewards/chosen": 3.40625,
3383
+ "rewards/margins": 21.5,
3384
+ "rewards/rejected": -18.125,
3385
+ "step": 2230
3386
+ },
3387
+ {
3388
+ "epoch": 1.7948717948717947,
3389
+ "grad_norm": 0.00019680490630025088,
3390
+ "learning_rate": 2.232116355001484e-07,
3391
+ "logits/chosen": 0.7578125,
3392
+ "logits/rejected": 1.1875,
3393
+ "logps/chosen": -167.0,
3394
+ "logps/rejected": -326.0,
3395
+ "loss": 0.0,
3396
+ "rewards/accuracies": 1.0,
3397
+ "rewards/chosen": 3.21875,
3398
+ "rewards/margins": 21.25,
3399
+ "rewards/rejected": -18.0,
3400
+ "step": 2240
3401
+ },
3402
+ {
3403
+ "epoch": 1.8028846153846154,
3404
+ "grad_norm": 1.3924207072166965e-06,
3405
+ "learning_rate": 2.217275155832591e-07,
3406
+ "logits/chosen": 0.72265625,
3407
+ "logits/rejected": 0.98046875,
3408
+ "logps/chosen": -162.0,
3409
+ "logps/rejected": -336.0,
3410
+ "loss": 0.0,
3411
+ "rewards/accuracies": 1.0,
3412
+ "rewards/chosen": 4.03125,
3413
+ "rewards/margins": 23.75,
3414
+ "rewards/rejected": -19.625,
3415
+ "step": 2250
3416
+ },
3417
+ {
3418
+ "epoch": 1.810897435897436,
3419
+ "grad_norm": 2.7880513613496112e-05,
3420
+ "learning_rate": 2.2024339566636982e-07,
3421
+ "logits/chosen": 1.046875,
3422
+ "logits/rejected": 1.3671875,
3423
+ "logps/chosen": -152.0,
3424
+ "logps/rejected": -288.0,
3425
+ "loss": 0.0,
3426
+ "rewards/accuracies": 1.0,
3427
+ "rewards/chosen": 3.796875,
3428
+ "rewards/margins": 19.875,
3429
+ "rewards/rejected": -16.125,
3430
+ "step": 2260
3431
+ },
3432
+ {
3433
+ "epoch": 1.8189102564102564,
3434
+ "grad_norm": 2.01578072742099e-05,
3435
+ "learning_rate": 2.1875927574948056e-07,
3436
+ "logits/chosen": 0.9375,
3437
+ "logits/rejected": 1.015625,
3438
+ "logps/chosen": -170.0,
3439
+ "logps/rejected": -328.0,
3440
+ "loss": 0.0,
3441
+ "rewards/accuracies": 1.0,
3442
+ "rewards/chosen": 3.5625,
3443
+ "rewards/margins": 22.25,
3444
+ "rewards/rejected": -18.75,
3445
+ "step": 2270
3446
+ },
3447
+ {
3448
+ "epoch": 1.8269230769230769,
3449
+ "grad_norm": 0.0001453323384470551,
3450
+ "learning_rate": 2.1727515583259127e-07,
3451
+ "logits/chosen": 1.0390625,
3452
+ "logits/rejected": 1.3671875,
3453
+ "logps/chosen": -159.0,
3454
+ "logps/rejected": -316.0,
3455
+ "loss": 0.0,
3456
+ "rewards/accuracies": 1.0,
3457
+ "rewards/chosen": 3.40625,
3458
+ "rewards/margins": 20.75,
3459
+ "rewards/rejected": -17.375,
3460
+ "step": 2280
3461
+ },
3462
+ {
3463
+ "epoch": 1.8349358974358974,
3464
+ "grad_norm": 4.835283260495599e-06,
3465
+ "learning_rate": 2.1579103591570196e-07,
3466
+ "logits/chosen": 0.8359375,
3467
+ "logits/rejected": 1.375,
3468
+ "logps/chosen": -153.0,
3469
+ "logps/rejected": -314.0,
3470
+ "loss": 0.0,
3471
+ "rewards/accuracies": 1.0,
3472
+ "rewards/chosen": 3.8125,
3473
+ "rewards/margins": 22.125,
3474
+ "rewards/rejected": -18.25,
3475
+ "step": 2290
3476
+ },
3477
+ {
3478
+ "epoch": 1.842948717948718,
3479
+ "grad_norm": 9.787977293423134e-05,
3480
+ "learning_rate": 2.143069159988127e-07,
3481
+ "logits/chosen": 1.0546875,
3482
+ "logits/rejected": 1.3671875,
3483
+ "logps/chosen": -180.0,
3484
+ "logps/rejected": -320.0,
3485
+ "loss": 0.0,
3486
+ "rewards/accuracies": 1.0,
3487
+ "rewards/chosen": 3.59375,
3488
+ "rewards/margins": 21.375,
3489
+ "rewards/rejected": -17.75,
3490
+ "step": 2300
3491
+ },
3492
+ {
3493
+ "epoch": 1.8509615384615383,
3494
+ "grad_norm": 0.003511562539920723,
3495
+ "learning_rate": 2.128227960819234e-07,
3496
+ "logits/chosen": 0.8515625,
3497
+ "logits/rejected": 1.2890625,
3498
+ "logps/chosen": -156.0,
3499
+ "logps/rejected": -334.0,
3500
+ "loss": 0.0,
3501
+ "rewards/accuracies": 1.0,
3502
+ "rewards/chosen": 3.21875,
3503
+ "rewards/margins": 23.5,
3504
+ "rewards/rejected": -20.25,
3505
+ "step": 2310
3506
+ },
3507
+ {
3508
+ "epoch": 1.858974358974359,
3509
+ "grad_norm": 0.0017406830607479289,
3510
+ "learning_rate": 2.1133867616503413e-07,
3511
+ "logits/chosen": 0.84375,
3512
+ "logits/rejected": 1.4375,
3513
+ "logps/chosen": -135.0,
3514
+ "logps/rejected": -326.0,
3515
+ "loss": 0.0,
3516
+ "rewards/accuracies": 1.0,
3517
+ "rewards/chosen": 3.4375,
3518
+ "rewards/margins": 22.875,
3519
+ "rewards/rejected": -19.375,
3520
+ "step": 2320
3521
+ },
3522
+ {
3523
+ "epoch": 1.8669871794871795,
3524
+ "grad_norm": 1.757377642753892e-05,
3525
+ "learning_rate": 2.0985455624814487e-07,
3526
+ "logits/chosen": 0.66015625,
3527
+ "logits/rejected": 0.60546875,
3528
+ "logps/chosen": -167.0,
3529
+ "logps/rejected": -296.0,
3530
+ "loss": 0.0,
3531
+ "rewards/accuracies": 1.0,
3532
+ "rewards/chosen": 3.734375,
3533
+ "rewards/margins": 21.125,
3534
+ "rewards/rejected": -17.375,
3535
+ "step": 2330
3536
+ },
3537
+ {
3538
+ "epoch": 1.875,
3539
+ "grad_norm": 9.585856156991564e-05,
3540
+ "learning_rate": 2.0837043633125555e-07,
3541
+ "logits/chosen": 0.7265625,
3542
+ "logits/rejected": 1.0859375,
3543
+ "logps/chosen": -189.0,
3544
+ "logps/rejected": -318.0,
3545
+ "loss": 0.0,
3546
+ "rewards/accuracies": 1.0,
3547
+ "rewards/chosen": 3.71875,
3548
+ "rewards/margins": 21.875,
3549
+ "rewards/rejected": -18.125,
3550
+ "step": 2340
3551
+ },
3552
+ {
3553
+ "epoch": 1.8830128205128205,
3554
+ "grad_norm": 2.717371131495447e-06,
3555
+ "learning_rate": 2.0688631641436627e-07,
3556
+ "logits/chosen": 0.68359375,
3557
+ "logits/rejected": 0.91015625,
3558
+ "logps/chosen": -173.0,
3559
+ "logps/rejected": -310.0,
3560
+ "loss": 0.0,
3561
+ "rewards/accuracies": 1.0,
3562
+ "rewards/chosen": 3.953125,
3563
+ "rewards/margins": 21.875,
3564
+ "rewards/rejected": -17.875,
3565
+ "step": 2350
3566
+ },
3567
+ {
3568
+ "epoch": 1.891025641025641,
3569
+ "grad_norm": 2.557631865211708e-07,
3570
+ "learning_rate": 2.0540219649747698e-07,
3571
+ "logits/chosen": 0.9140625,
3572
+ "logits/rejected": 1.53125,
3573
+ "logps/chosen": -175.0,
3574
+ "logps/rejected": -328.0,
3575
+ "loss": 0.0,
3576
+ "rewards/accuracies": 1.0,
3577
+ "rewards/chosen": 3.8125,
3578
+ "rewards/margins": 23.25,
3579
+ "rewards/rejected": -19.375,
3580
+ "step": 2360
3581
+ },
3582
+ {
3583
+ "epoch": 1.8990384615384617,
3584
+ "grad_norm": 8.115167511795904e-05,
3585
+ "learning_rate": 2.0391807658058772e-07,
3586
+ "logits/chosen": 0.91015625,
3587
+ "logits/rejected": 1.2109375,
3588
+ "logps/chosen": -186.0,
3589
+ "logps/rejected": -312.0,
3590
+ "loss": 0.0,
3591
+ "rewards/accuracies": 1.0,
3592
+ "rewards/chosen": 3.953125,
3593
+ "rewards/margins": 22.125,
3594
+ "rewards/rejected": -18.25,
3595
+ "step": 2370
3596
+ },
3597
+ {
3598
+ "epoch": 1.907051282051282,
3599
+ "grad_norm": 5.5980367488511575e-06,
3600
+ "learning_rate": 2.024339566636984e-07,
3601
+ "logits/chosen": 0.52734375,
3602
+ "logits/rejected": 1.0859375,
3603
+ "logps/chosen": -178.0,
3604
+ "logps/rejected": -316.0,
3605
+ "loss": 0.0,
3606
+ "rewards/accuracies": 1.0,
3607
+ "rewards/chosen": 3.703125,
3608
+ "rewards/margins": 22.375,
3609
+ "rewards/rejected": -18.75,
3610
+ "step": 2380
3611
+ },
3612
+ {
3613
+ "epoch": 1.9150641025641026,
3614
+ "grad_norm": 0.0025820105156682694,
3615
+ "learning_rate": 2.0094983674680912e-07,
3616
+ "logits/chosen": 0.8828125,
3617
+ "logits/rejected": 1.3359375,
3618
+ "logps/chosen": -150.0,
3619
+ "logps/rejected": -300.0,
3620
+ "loss": 0.0,
3621
+ "rewards/accuracies": 1.0,
3622
+ "rewards/chosen": 3.34375,
3623
+ "rewards/margins": 20.875,
3624
+ "rewards/rejected": -17.5,
3625
+ "step": 2390
3626
+ },
3627
+ {
3628
+ "epoch": 1.9230769230769231,
3629
+ "grad_norm": 7.136824036042445e-07,
3630
+ "learning_rate": 1.9946571682991986e-07,
3631
+ "logits/chosen": 0.78125,
3632
+ "logits/rejected": 1.140625,
3633
+ "logps/chosen": -149.0,
3634
+ "logps/rejected": -328.0,
3635
+ "loss": 0.0,
3636
+ "rewards/accuracies": 1.0,
3637
+ "rewards/chosen": 2.890625,
3638
+ "rewards/margins": 22.5,
3639
+ "rewards/rejected": -19.625,
3640
+ "step": 2400
3641
+ },
3642
+ {
3643
+ "epoch": 1.9310897435897436,
3644
+ "grad_norm": 0.0003371284846547073,
3645
+ "learning_rate": 1.9798159691303057e-07,
3646
+ "logits/chosen": 0.7890625,
3647
+ "logits/rejected": 1.0078125,
3648
+ "logps/chosen": -182.0,
3649
+ "logps/rejected": -332.0,
3650
+ "loss": 0.0,
3651
+ "rewards/accuracies": 1.0,
3652
+ "rewards/chosen": 2.953125,
3653
+ "rewards/margins": 23.0,
3654
+ "rewards/rejected": -20.125,
3655
+ "step": 2410
3656
+ },
3657
+ {
3658
+ "epoch": 1.939102564102564,
3659
+ "grad_norm": 7.032612597839408e-05,
3660
+ "learning_rate": 1.964974769961413e-07,
3661
+ "logits/chosen": 0.8125,
3662
+ "logits/rejected": 1.0234375,
3663
+ "logps/chosen": -155.0,
3664
+ "logps/rejected": -330.0,
3665
+ "loss": 0.0,
3666
+ "rewards/accuracies": 1.0,
3667
+ "rewards/chosen": 2.890625,
3668
+ "rewards/margins": 22.5,
3669
+ "rewards/rejected": -19.625,
3670
+ "step": 2420
3671
+ },
3672
+ {
3673
+ "epoch": 1.9471153846153846,
3674
+ "grad_norm": 0.00020818829472435283,
3675
+ "learning_rate": 1.9501335707925197e-07,
3676
+ "logits/chosen": 0.87890625,
3677
+ "logits/rejected": 1.171875,
3678
+ "logps/chosen": -192.0,
3679
+ "logps/rejected": -338.0,
3680
+ "loss": 0.0,
3681
+ "rewards/accuracies": 1.0,
3682
+ "rewards/chosen": 3.546875,
3683
+ "rewards/margins": 23.125,
3684
+ "rewards/rejected": -19.625,
3685
+ "step": 2430
3686
+ },
3687
+ {
3688
+ "epoch": 1.9551282051282053,
3689
+ "grad_norm": 8.466910776291965e-07,
3690
+ "learning_rate": 1.9352923716236271e-07,
3691
+ "logits/chosen": 0.7421875,
3692
+ "logits/rejected": 1.375,
3693
+ "logps/chosen": -212.0,
3694
+ "logps/rejected": -330.0,
3695
+ "loss": 0.0001,
3696
+ "rewards/accuracies": 1.0,
3697
+ "rewards/chosen": 3.5,
3698
+ "rewards/margins": 22.125,
3699
+ "rewards/rejected": -18.625,
3700
+ "step": 2440
3701
+ },
3702
+ {
3703
+ "epoch": 1.9631410256410255,
3704
+ "grad_norm": 3.7142533186252094e-06,
3705
+ "learning_rate": 1.9204511724547343e-07,
3706
+ "logits/chosen": 0.72265625,
3707
+ "logits/rejected": 1.265625,
3708
+ "logps/chosen": -172.0,
3709
+ "logps/rejected": -348.0,
3710
+ "loss": 0.0,
3711
+ "rewards/accuracies": 1.0,
3712
+ "rewards/chosen": 3.875,
3713
+ "rewards/margins": 23.625,
3714
+ "rewards/rejected": -19.75,
3715
+ "step": 2450
3716
+ },
3717
+ {
3718
+ "epoch": 1.9711538461538463,
3719
+ "grad_norm": 1.4211106326699326e-06,
3720
+ "learning_rate": 1.9056099732858414e-07,
3721
+ "logits/chosen": 0.9140625,
3722
+ "logits/rejected": 1.2421875,
3723
+ "logps/chosen": -160.0,
3724
+ "logps/rejected": -332.0,
3725
+ "loss": 0.0001,
3726
+ "rewards/accuracies": 1.0,
3727
+ "rewards/chosen": 3.84375,
3728
+ "rewards/margins": 24.0,
3729
+ "rewards/rejected": -20.125,
3730
+ "step": 2460
3731
+ },
3732
+ {
3733
+ "epoch": 1.9791666666666665,
3734
+ "grad_norm": 7.012330169691276e-07,
3735
+ "learning_rate": 1.8907687741169488e-07,
3736
+ "logits/chosen": 0.52734375,
3737
+ "logits/rejected": 1.1640625,
3738
+ "logps/chosen": -200.0,
3739
+ "logps/rejected": -336.0,
3740
+ "loss": 0.0,
3741
+ "rewards/accuracies": 1.0,
3742
+ "rewards/chosen": 3.4375,
3743
+ "rewards/margins": 22.0,
3744
+ "rewards/rejected": -18.5,
3745
+ "step": 2470
3746
+ },
3747
+ {
3748
+ "epoch": 1.9871794871794872,
3749
+ "grad_norm": 0.00015207925609359054,
3750
+ "learning_rate": 1.8759275749480557e-07,
3751
+ "logits/chosen": 0.921875,
3752
+ "logits/rejected": 1.421875,
3753
+ "logps/chosen": -203.0,
3754
+ "logps/rejected": -328.0,
3755
+ "loss": 0.0,
3756
+ "rewards/accuracies": 1.0,
3757
+ "rewards/chosen": 3.296875,
3758
+ "rewards/margins": 22.5,
3759
+ "rewards/rejected": -19.25,
3760
+ "step": 2480
3761
+ },
3762
+ {
3763
+ "epoch": 1.9951923076923077,
3764
+ "grad_norm": 1.0755143190145868e-06,
3765
+ "learning_rate": 1.8610863757791628e-07,
3766
+ "logits/chosen": 0.84375,
3767
+ "logits/rejected": 1.3515625,
3768
+ "logps/chosen": -168.0,
3769
+ "logps/rejected": -332.0,
3770
+ "loss": 0.0,
3771
+ "rewards/accuracies": 1.0,
3772
+ "rewards/chosen": 3.75,
3773
+ "rewards/margins": 23.75,
3774
+ "rewards/rejected": -20.0,
3775
+ "step": 2490
3776
+ },
3777
+ {
3778
+ "epoch": 2.0,
3779
+ "eval_logits/chosen": 0.84375,
3780
+ "eval_logits/rejected": 1.3984375,
3781
+ "eval_logps/chosen": -169.0,
3782
+ "eval_logps/rejected": -334.0,
3783
+ "eval_loss": 1.841681842051912e-05,
3784
+ "eval_rewards/accuracies": 1.0,
3785
+ "eval_rewards/chosen": 3.4375,
3786
+ "eval_rewards/margins": 22.5,
3787
+ "eval_rewards/rejected": -19.125,
3788
+ "eval_runtime": 25.8897,
3789
+ "eval_samples_per_second": 7.686,
3790
+ "eval_steps_per_second": 0.966,
3791
+ "step": 2496
3792
  }
3793
  ],
3794
  "logging_steps": 10,