tcapelle commited on
Commit
b40eb1d
·
verified ·
1 Parent(s): 6f244fc

Uploaded from W&B

Browse files
model-00001-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:733d1ad4b591a1e0ea3f24761a4b6d0e99d07f0186201dd2788265b6a256b305
3
  size 4984780784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19d8c44654d03e9974c989cd3b4270e812ad66efec6f20381c1489a201a5d670
3
  size 4984780784
model-00002-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ccdcb7db2051bb04578c15fbc8de0908dc382e2e0c11d12e841a659b060a362
3
  size 4980892048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:719054f207c7550419e0a70ef585a181d64a661bbafbb6c3cac116bacd778896
3
  size 4980892048
model-00003-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64e3bf40e26d50d168975db66ec0be143153d22ac74b8dab5ed7bb399b25352f
3
  size 4928485104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6f0b9916006362401f797946f52af207421b6067cbb1334da7c4da482155994
3
  size 4928485104
model-00004-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd527b82bf98c9aa46042138f842cfcae9ee3f0029cb362d556a8d2344a907a0
3
  size 4980892112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d4829659786275428c6590064abed129a62b07c38df176b8751b97b89ad1265
3
  size 4980892112
model-00005-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a27920d463f37058f3a01ef15e895935a14b076f65075ce57c76fce4775d593
3
  size 4928485104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d55c22cdc249df507d11e2c297cef91135f3b60aa29717a0b29d629a0cafdff
3
  size 4928485104
model-00006-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bae46196571919129ce0bff9017fd324fa4833a01cb82bd6e1d392fa68c3ad9a
3
  size 4733130504
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9b335b24792043e47d8110ba4af61dd8869ad9681ab9a2ebf6157eb79b960d0
3
  size 4733130504
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
  "eval_steps": 27,
7
- "global_step": 264,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1936,6 +1936,970 @@
1936
  "learning_rate": 1.2622039877423267e-06,
1937
  "loss": 0.1916,
1938
  "step": 264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1939
  }
1940
  ],
1941
  "logging_steps": 1,
@@ -1950,12 +2914,12 @@
1950
  "should_evaluate": false,
1951
  "should_log": false,
1952
  "should_save": true,
1953
- "should_training_stop": false
1954
  },
1955
  "attributes": {}
1956
  }
1957
  },
1958
- "total_flos": 5.809316809735668e+18,
1959
  "train_batch_size": 2,
1960
  "trial_name": null,
1961
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 27,
7
+ "global_step": 396,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1936
  "learning_rate": 1.2622039877423267e-06,
1937
  "loss": 0.1916,
1938
  "step": 264
1939
+ },
1940
+ {
1941
+ "epoch": 2.007575757575758,
1942
+ "grad_norm": 0.281697154045105,
1943
+ "learning_rate": 1.2464987695172266e-06,
1944
+ "loss": 0.178,
1945
+ "step": 265
1946
+ },
1947
+ {
1948
+ "epoch": 2.015151515151515,
1949
+ "grad_norm": 0.3228111267089844,
1950
+ "learning_rate": 1.230822106959742e-06,
1951
+ "loss": 0.1893,
1952
+ "step": 266
1953
+ },
1954
+ {
1955
+ "epoch": 2.022727272727273,
1956
+ "grad_norm": 0.2976820170879364,
1957
+ "learning_rate": 1.2151757659688574e-06,
1958
+ "loss": 0.1755,
1959
+ "step": 267
1960
+ },
1961
+ {
1962
+ "epoch": 2.0303030303030303,
1963
+ "grad_norm": 0.2902336120605469,
1964
+ "learning_rate": 1.1995615090279815e-06,
1965
+ "loss": 0.1826,
1966
+ "step": 268
1967
+ },
1968
+ {
1969
+ "epoch": 2.037878787878788,
1970
+ "grad_norm": 0.32078248262405396,
1971
+ "learning_rate": 1.183981095006411e-06,
1972
+ "loss": 0.2049,
1973
+ "step": 269
1974
+ },
1975
+ {
1976
+ "epoch": 2.0454545454545454,
1977
+ "grad_norm": 0.28856366872787476,
1978
+ "learning_rate": 1.1684362789612054e-06,
1979
+ "loss": 0.1709,
1980
+ "step": 270
1981
+ },
1982
+ {
1983
+ "epoch": 2.0454545454545454,
1984
+ "eval_loss": 0.18923524022102356,
1985
+ "eval_runtime": 3.7815,
1986
+ "eval_samples_per_second": 11.9,
1987
+ "eval_steps_per_second": 0.793,
1988
+ "step": 270
1989
+ },
1990
+ {
1991
+ "epoch": 2.053030303030303,
1992
+ "grad_norm": 0.3012211322784424,
1993
+ "learning_rate": 1.1529288119394879e-06,
1994
+ "loss": 0.1863,
1995
+ "step": 271
1996
+ },
1997
+ {
1998
+ "epoch": 2.0606060606060606,
1999
+ "grad_norm": 0.2844583988189697,
2000
+ "learning_rate": 1.1374604407811962e-06,
2001
+ "loss": 0.1834,
2002
+ "step": 272
2003
+ },
2004
+ {
2005
+ "epoch": 2.0681818181818183,
2006
+ "grad_norm": 0.2947697639465332,
2007
+ "learning_rate": 1.1220329079223124e-06,
2008
+ "loss": 0.1842,
2009
+ "step": 273
2010
+ },
2011
+ {
2012
+ "epoch": 2.0757575757575757,
2013
+ "grad_norm": 0.3138524293899536,
2014
+ "learning_rate": 1.1066479511985838e-06,
2015
+ "loss": 0.1946,
2016
+ "step": 274
2017
+ },
2018
+ {
2019
+ "epoch": 2.0833333333333335,
2020
+ "grad_norm": 0.30240464210510254,
2021
+ "learning_rate": 1.091307303649766e-06,
2022
+ "loss": 0.1921,
2023
+ "step": 275
2024
+ },
2025
+ {
2026
+ "epoch": 2.090909090909091,
2027
+ "grad_norm": 0.2899230420589447,
2028
+ "learning_rate": 1.0760126933244036e-06,
2029
+ "loss": 0.1868,
2030
+ "step": 276
2031
+ },
2032
+ {
2033
+ "epoch": 2.0984848484848486,
2034
+ "grad_norm": 0.3104937672615051,
2035
+ "learning_rate": 1.0607658430851746e-06,
2036
+ "loss": 0.211,
2037
+ "step": 277
2038
+ },
2039
+ {
2040
+ "epoch": 2.106060606060606,
2041
+ "grad_norm": 0.29552116990089417,
2042
+ "learning_rate": 1.0455684704148174e-06,
2043
+ "loss": 0.1914,
2044
+ "step": 278
2045
+ },
2046
+ {
2047
+ "epoch": 2.1136363636363638,
2048
+ "grad_norm": 0.29408350586891174,
2049
+ "learning_rate": 1.030422287222667e-06,
2050
+ "loss": 0.1888,
2051
+ "step": 279
2052
+ },
2053
+ {
2054
+ "epoch": 2.121212121212121,
2055
+ "grad_norm": 0.278326153755188,
2056
+ "learning_rate": 1.0153289996518127e-06,
2057
+ "loss": 0.1642,
2058
+ "step": 280
2059
+ },
2060
+ {
2061
+ "epoch": 2.128787878787879,
2062
+ "grad_norm": 0.3248918354511261,
2063
+ "learning_rate": 1.0002903078869137e-06,
2064
+ "loss": 0.1991,
2065
+ "step": 281
2066
+ },
2067
+ {
2068
+ "epoch": 2.1363636363636362,
2069
+ "grad_norm": 0.3153297007083893,
2070
+ "learning_rate": 9.853079059626806e-07,
2071
+ "loss": 0.1929,
2072
+ "step": 282
2073
+ },
2074
+ {
2075
+ "epoch": 2.143939393939394,
2076
+ "grad_norm": 0.3812675476074219,
2077
+ "learning_rate": 9.703834815730489e-07,
2078
+ "loss": 0.1809,
2079
+ "step": 283
2080
+ },
2081
+ {
2082
+ "epoch": 2.1515151515151514,
2083
+ "grad_norm": 0.28265097737312317,
2084
+ "learning_rate": 9.555187158810704e-07,
2085
+ "loss": 0.1741,
2086
+ "step": 284
2087
+ },
2088
+ {
2089
+ "epoch": 2.159090909090909,
2090
+ "grad_norm": 0.30659934878349304,
2091
+ "learning_rate": 9.407152833295372e-07,
2092
+ "loss": 0.1928,
2093
+ "step": 285
2094
+ },
2095
+ {
2096
+ "epoch": 2.1666666666666665,
2097
+ "grad_norm": 0.2801172137260437,
2098
+ "learning_rate": 9.259748514523654e-07,
2099
+ "loss": 0.1762,
2100
+ "step": 286
2101
+ },
2102
+ {
2103
+ "epoch": 2.1742424242424243,
2104
+ "grad_norm": 0.3246462047100067,
2105
+ "learning_rate": 9.112990806867544e-07,
2106
+ "loss": 0.2035,
2107
+ "step": 287
2108
+ },
2109
+ {
2110
+ "epoch": 2.1818181818181817,
2111
+ "grad_norm": 0.29880762100219727,
2112
+ "learning_rate": 8.966896241861474e-07,
2113
+ "loss": 0.1765,
2114
+ "step": 288
2115
+ },
2116
+ {
2117
+ "epoch": 2.1893939393939394,
2118
+ "grad_norm": 0.32259929180145264,
2119
+ "learning_rate": 8.821481276340112e-07,
2120
+ "loss": 0.2088,
2121
+ "step": 289
2122
+ },
2123
+ {
2124
+ "epoch": 2.196969696969697,
2125
+ "grad_norm": 0.3025321662425995,
2126
+ "learning_rate": 8.676762290584585e-07,
2127
+ "loss": 0.1718,
2128
+ "step": 290
2129
+ },
2130
+ {
2131
+ "epoch": 2.2045454545454546,
2132
+ "grad_norm": 0.3053520917892456,
2133
+ "learning_rate": 8.532755586477326e-07,
2134
+ "loss": 0.196,
2135
+ "step": 291
2136
+ },
2137
+ {
2138
+ "epoch": 2.212121212121212,
2139
+ "grad_norm": 0.28334543108940125,
2140
+ "learning_rate": 8.389477385665733e-07,
2141
+ "loss": 0.1764,
2142
+ "step": 292
2143
+ },
2144
+ {
2145
+ "epoch": 2.2196969696969697,
2146
+ "grad_norm": 0.29423749446868896,
2147
+ "learning_rate": 8.246943827734898e-07,
2148
+ "loss": 0.1805,
2149
+ "step": 293
2150
+ },
2151
+ {
2152
+ "epoch": 2.227272727272727,
2153
+ "grad_norm": 0.2939305007457733,
2154
+ "learning_rate": 8.105170968389552e-07,
2155
+ "loss": 0.1806,
2156
+ "step": 294
2157
+ },
2158
+ {
2159
+ "epoch": 2.234848484848485,
2160
+ "grad_norm": 0.3079436719417572,
2161
+ "learning_rate": 7.964174777645448e-07,
2162
+ "loss": 0.1891,
2163
+ "step": 295
2164
+ },
2165
+ {
2166
+ "epoch": 2.242424242424242,
2167
+ "grad_norm": 0.3062392771244049,
2168
+ "learning_rate": 7.823971138030467e-07,
2169
+ "loss": 0.1859,
2170
+ "step": 296
2171
+ },
2172
+ {
2173
+ "epoch": 2.25,
2174
+ "grad_norm": 0.31168290972709656,
2175
+ "learning_rate": 7.684575842795485e-07,
2176
+ "loss": 0.2003,
2177
+ "step": 297
2178
+ },
2179
+ {
2180
+ "epoch": 2.25,
2181
+ "eval_loss": 0.18853209912776947,
2182
+ "eval_runtime": 4.3319,
2183
+ "eval_samples_per_second": 10.388,
2184
+ "eval_steps_per_second": 0.693,
2185
+ "step": 297
2186
+ },
2187
+ {
2188
+ "epoch": 2.257575757575758,
2189
+ "grad_norm": 0.29143401980400085,
2190
+ "learning_rate": 7.546004594135357e-07,
2191
+ "loss": 0.1889,
2192
+ "step": 298
2193
+ },
2194
+ {
2195
+ "epoch": 2.265151515151515,
2196
+ "grad_norm": 0.2893548607826233,
2197
+ "learning_rate": 7.408273001420153e-07,
2198
+ "loss": 0.1752,
2199
+ "step": 299
2200
+ },
2201
+ {
2202
+ "epoch": 2.2727272727272725,
2203
+ "grad_norm": 0.27464550733566284,
2204
+ "learning_rate": 7.271396579436825e-07,
2205
+ "loss": 0.163,
2206
+ "step": 300
2207
+ },
2208
+ {
2209
+ "epoch": 2.2803030303030303,
2210
+ "grad_norm": 0.2979916036128998,
2211
+ "learning_rate": 7.135390746641527e-07,
2212
+ "loss": 0.1814,
2213
+ "step": 301
2214
+ },
2215
+ {
2216
+ "epoch": 2.287878787878788,
2217
+ "grad_norm": 0.314494252204895,
2218
+ "learning_rate": 7.000270823422838e-07,
2219
+ "loss": 0.1906,
2220
+ "step": 302
2221
+ },
2222
+ {
2223
+ "epoch": 2.2954545454545454,
2224
+ "grad_norm": 0.30358201265335083,
2225
+ "learning_rate": 6.866052030375974e-07,
2226
+ "loss": 0.1807,
2227
+ "step": 303
2228
+ },
2229
+ {
2230
+ "epoch": 2.303030303030303,
2231
+ "grad_norm": 0.32943934202194214,
2232
+ "learning_rate": 6.732749486588266e-07,
2233
+ "loss": 0.2031,
2234
+ "step": 304
2235
+ },
2236
+ {
2237
+ "epoch": 2.3106060606060606,
2238
+ "grad_norm": 0.29630428552627563,
2239
+ "learning_rate": 6.60037820793607e-07,
2240
+ "loss": 0.1941,
2241
+ "step": 305
2242
+ },
2243
+ {
2244
+ "epoch": 2.3181818181818183,
2245
+ "grad_norm": 0.30913493037223816,
2246
+ "learning_rate": 6.468953105393297e-07,
2247
+ "loss": 0.1774,
2248
+ "step": 306
2249
+ },
2250
+ {
2251
+ "epoch": 2.3257575757575757,
2252
+ "grad_norm": 0.2982107400894165,
2253
+ "learning_rate": 6.338488983351778e-07,
2254
+ "loss": 0.19,
2255
+ "step": 307
2256
+ },
2257
+ {
2258
+ "epoch": 2.3333333333333335,
2259
+ "grad_norm": 0.29869621992111206,
2260
+ "learning_rate": 6.209000537953606e-07,
2261
+ "loss": 0.1866,
2262
+ "step": 308
2263
+ },
2264
+ {
2265
+ "epoch": 2.340909090909091,
2266
+ "grad_norm": 0.30003929138183594,
2267
+ "learning_rate": 6.080502355435701e-07,
2268
+ "loss": 0.1842,
2269
+ "step": 309
2270
+ },
2271
+ {
2272
+ "epoch": 2.3484848484848486,
2273
+ "grad_norm": 0.2731933891773224,
2274
+ "learning_rate": 5.953008910486739e-07,
2275
+ "loss": 0.1683,
2276
+ "step": 310
2277
+ },
2278
+ {
2279
+ "epoch": 2.356060606060606,
2280
+ "grad_norm": 0.27522197365760803,
2281
+ "learning_rate": 5.826534564616633e-07,
2282
+ "loss": 0.1493,
2283
+ "step": 311
2284
+ },
2285
+ {
2286
+ "epoch": 2.3636363636363638,
2287
+ "grad_norm": 0.2997157573699951,
2288
+ "learning_rate": 5.701093564538807e-07,
2289
+ "loss": 0.1707,
2290
+ "step": 312
2291
+ },
2292
+ {
2293
+ "epoch": 2.371212121212121,
2294
+ "grad_norm": 0.28120705485343933,
2295
+ "learning_rate": 5.576700040565365e-07,
2296
+ "loss": 0.1693,
2297
+ "step": 313
2298
+ },
2299
+ {
2300
+ "epoch": 2.378787878787879,
2301
+ "grad_norm": 0.2777678966522217,
2302
+ "learning_rate": 5.453368005015363e-07,
2303
+ "loss": 0.1789,
2304
+ "step": 314
2305
+ },
2306
+ {
2307
+ "epoch": 2.3863636363636362,
2308
+ "grad_norm": 0.2859909236431122,
2309
+ "learning_rate": 5.331111350636413e-07,
2310
+ "loss": 0.1654,
2311
+ "step": 315
2312
+ },
2313
+ {
2314
+ "epoch": 2.393939393939394,
2315
+ "grad_norm": 0.3338193893432617,
2316
+ "learning_rate": 5.209943849039722e-07,
2317
+ "loss": 0.2114,
2318
+ "step": 316
2319
+ },
2320
+ {
2321
+ "epoch": 2.4015151515151514,
2322
+ "grad_norm": 0.30051618814468384,
2323
+ "learning_rate": 5.089879149148781e-07,
2324
+ "loss": 0.19,
2325
+ "step": 317
2326
+ },
2327
+ {
2328
+ "epoch": 2.409090909090909,
2329
+ "grad_norm": 0.3063294291496277,
2330
+ "learning_rate": 4.970930775661899e-07,
2331
+ "loss": 0.1813,
2332
+ "step": 318
2333
+ },
2334
+ {
2335
+ "epoch": 2.4166666666666665,
2336
+ "grad_norm": 0.3148181438446045,
2337
+ "learning_rate": 4.853112127528699e-07,
2338
+ "loss": 0.174,
2339
+ "step": 319
2340
+ },
2341
+ {
2342
+ "epoch": 2.4242424242424243,
2343
+ "grad_norm": 0.2876652777194977,
2344
+ "learning_rate": 4.736436476440792e-07,
2345
+ "loss": 0.1658,
2346
+ "step": 320
2347
+ },
2348
+ {
2349
+ "epoch": 2.4318181818181817,
2350
+ "grad_norm": 0.28111517429351807,
2351
+ "learning_rate": 4.620916965336809e-07,
2352
+ "loss": 0.1784,
2353
+ "step": 321
2354
+ },
2355
+ {
2356
+ "epoch": 2.4393939393939394,
2357
+ "grad_norm": 0.27875590324401855,
2358
+ "learning_rate": 4.506566606921865e-07,
2359
+ "loss": 0.1737,
2360
+ "step": 322
2361
+ },
2362
+ {
2363
+ "epoch": 2.446969696969697,
2364
+ "grad_norm": 0.28706124424934387,
2365
+ "learning_rate": 4.3933982822017883e-07,
2366
+ "loss": 0.1678,
2367
+ "step": 323
2368
+ },
2369
+ {
2370
+ "epoch": 2.4545454545454546,
2371
+ "grad_norm": 0.28674250841140747,
2372
+ "learning_rate": 4.281424739032122e-07,
2373
+ "loss": 0.1757,
2374
+ "step": 324
2375
+ },
2376
+ {
2377
+ "epoch": 2.4545454545454546,
2378
+ "eval_loss": 0.18774640560150146,
2379
+ "eval_runtime": 3.7707,
2380
+ "eval_samples_per_second": 11.934,
2381
+ "eval_steps_per_second": 0.796,
2382
+ "step": 324
2383
+ },
2384
+ {
2385
+ "epoch": 2.462121212121212,
2386
+ "grad_norm": 0.30930793285369873,
2387
+ "learning_rate": 4.170658590682134e-07,
2388
+ "loss": 0.2137,
2389
+ "step": 325
2390
+ },
2391
+ {
2392
+ "epoch": 2.4696969696969697,
2393
+ "grad_norm": 0.28437668085098267,
2394
+ "learning_rate": 4.0611123144140083e-07,
2395
+ "loss": 0.1743,
2396
+ "step": 326
2397
+ },
2398
+ {
2399
+ "epoch": 2.4772727272727275,
2400
+ "grad_norm": 0.2986258864402771,
2401
+ "learning_rate": 3.952798250077318e-07,
2402
+ "loss": 0.1813,
2403
+ "step": 327
2404
+ },
2405
+ {
2406
+ "epoch": 2.484848484848485,
2407
+ "grad_norm": 0.28527748584747314,
2408
+ "learning_rate": 3.8457285987190406e-07,
2409
+ "loss": 0.1782,
2410
+ "step": 328
2411
+ },
2412
+ {
2413
+ "epoch": 2.492424242424242,
2414
+ "grad_norm": 0.3192787766456604,
2415
+ "learning_rate": 3.7399154212091333e-07,
2416
+ "loss": 0.1922,
2417
+ "step": 329
2418
+ },
2419
+ {
2420
+ "epoch": 2.5,
2421
+ "grad_norm": 0.3025767505168915,
2422
+ "learning_rate": 3.635370636881958e-07,
2423
+ "loss": 0.1775,
2424
+ "step": 330
2425
+ },
2426
+ {
2427
+ "epoch": 2.507575757575758,
2428
+ "grad_norm": 0.30758988857269287,
2429
+ "learning_rate": 3.532106022193615e-07,
2430
+ "loss": 0.2018,
2431
+ "step": 331
2432
+ },
2433
+ {
2434
+ "epoch": 2.515151515151515,
2435
+ "grad_norm": 0.2812161445617676,
2436
+ "learning_rate": 3.4301332093953813e-07,
2437
+ "loss": 0.175,
2438
+ "step": 332
2439
+ },
2440
+ {
2441
+ "epoch": 2.5227272727272725,
2442
+ "grad_norm": 0.2952413260936737,
2443
+ "learning_rate": 3.3294636852234106e-07,
2444
+ "loss": 0.1815,
2445
+ "step": 333
2446
+ },
2447
+ {
2448
+ "epoch": 2.5303030303030303,
2449
+ "grad_norm": 0.27842938899993896,
2450
+ "learning_rate": 3.230108789604792e-07,
2451
+ "loss": 0.1685,
2452
+ "step": 334
2453
+ },
2454
+ {
2455
+ "epoch": 2.537878787878788,
2456
+ "grad_norm": 0.3101184070110321,
2457
+ "learning_rate": 3.132079714380172e-07,
2458
+ "loss": 0.1805,
2459
+ "step": 335
2460
+ },
2461
+ {
2462
+ "epoch": 2.5454545454545454,
2463
+ "grad_norm": 0.28867068886756897,
2464
+ "learning_rate": 3.035387502043052e-07,
2465
+ "loss": 0.1697,
2466
+ "step": 336
2467
+ },
2468
+ {
2469
+ "epoch": 2.5530303030303028,
2470
+ "grad_norm": 0.2969781458377838,
2471
+ "learning_rate": 2.9400430444958937e-07,
2472
+ "loss": 0.1912,
2473
+ "step": 337
2474
+ },
2475
+ {
2476
+ "epoch": 2.5606060606060606,
2477
+ "grad_norm": 0.28201431035995483,
2478
+ "learning_rate": 2.8460570818232014e-07,
2479
+ "loss": 0.1801,
2480
+ "step": 338
2481
+ },
2482
+ {
2483
+ "epoch": 2.5681818181818183,
2484
+ "grad_norm": 0.2861873209476471,
2485
+ "learning_rate": 2.753440201081716e-07,
2486
+ "loss": 0.1833,
2487
+ "step": 339
2488
+ },
2489
+ {
2490
+ "epoch": 2.5757575757575757,
2491
+ "grad_norm": 0.2919836640357971,
2492
+ "learning_rate": 2.662202835107828e-07,
2493
+ "loss": 0.1702,
2494
+ "step": 340
2495
+ },
2496
+ {
2497
+ "epoch": 2.5833333333333335,
2498
+ "grad_norm": 0.28204408288002014,
2499
+ "learning_rate": 2.572355261342369e-07,
2500
+ "loss": 0.1693,
2501
+ "step": 341
2502
+ },
2503
+ {
2504
+ "epoch": 2.590909090909091,
2505
+ "grad_norm": 0.28755688667297363,
2506
+ "learning_rate": 2.4839076006729086e-07,
2507
+ "loss": 0.1747,
2508
+ "step": 342
2509
+ },
2510
+ {
2511
+ "epoch": 2.5984848484848486,
2512
+ "grad_norm": 0.2947797477245331,
2513
+ "learning_rate": 2.3968698162936857e-07,
2514
+ "loss": 0.1842,
2515
+ "step": 343
2516
+ },
2517
+ {
2518
+ "epoch": 2.606060606060606,
2519
+ "grad_norm": 0.2840390205383301,
2520
+ "learning_rate": 2.3112517125833071e-07,
2521
+ "loss": 0.1859,
2522
+ "step": 344
2523
+ },
2524
+ {
2525
+ "epoch": 2.6136363636363638,
2526
+ "grad_norm": 0.3154660165309906,
2527
+ "learning_rate": 2.2270629340003308e-07,
2528
+ "loss": 0.1898,
2529
+ "step": 345
2530
+ },
2531
+ {
2532
+ "epoch": 2.621212121212121,
2533
+ "grad_norm": 0.27370041608810425,
2534
+ "learning_rate": 2.1443129639968617e-07,
2535
+ "loss": 0.1641,
2536
+ "step": 346
2537
+ },
2538
+ {
2539
+ "epoch": 2.628787878787879,
2540
+ "grad_norm": 0.28645506501197815,
2541
+ "learning_rate": 2.0630111239502954e-07,
2542
+ "loss": 0.1664,
2543
+ "step": 347
2544
+ },
2545
+ {
2546
+ "epoch": 2.6363636363636362,
2547
+ "grad_norm": 0.3576584458351135,
2548
+ "learning_rate": 1.9831665721132957e-07,
2549
+ "loss": 0.1571,
2550
+ "step": 348
2551
+ },
2552
+ {
2553
+ "epoch": 2.643939393939394,
2554
+ "grad_norm": 0.3097120225429535,
2555
+ "learning_rate": 1.9047883025821777e-07,
2556
+ "loss": 0.1918,
2557
+ "step": 349
2558
+ },
2559
+ {
2560
+ "epoch": 2.6515151515151514,
2561
+ "grad_norm": 0.2783317565917969,
2562
+ "learning_rate": 1.827885144283769e-07,
2563
+ "loss": 0.1703,
2564
+ "step": 350
2565
+ },
2566
+ {
2567
+ "epoch": 2.659090909090909,
2568
+ "grad_norm": 0.282617449760437,
2569
+ "learning_rate": 1.7524657599808603e-07,
2570
+ "loss": 0.1743,
2571
+ "step": 351
2572
+ },
2573
+ {
2574
+ "epoch": 2.659090909090909,
2575
+ "eval_loss": 0.18733149766921997,
2576
+ "eval_runtime": 3.7928,
2577
+ "eval_samples_per_second": 11.864,
2578
+ "eval_steps_per_second": 0.791,
2579
+ "step": 351
2580
+ },
2581
+ {
2582
+ "epoch": 2.6666666666666665,
2583
+ "grad_norm": 0.3042234778404236,
2584
+ "learning_rate": 1.6785386452963914e-07,
2585
+ "loss": 0.1945,
2586
+ "step": 352
2587
+ },
2588
+ {
2589
+ "epoch": 2.6742424242424243,
2590
+ "grad_norm": 0.3053586483001709,
2591
+ "learning_rate": 1.6061121277564746e-07,
2592
+ "loss": 0.1968,
2593
+ "step": 353
2594
+ },
2595
+ {
2596
+ "epoch": 2.6818181818181817,
2597
+ "grad_norm": 0.30519694089889526,
2598
+ "learning_rate": 1.5351943658523153e-07,
2599
+ "loss": 0.1854,
2600
+ "step": 354
2601
+ },
2602
+ {
2603
+ "epoch": 2.6893939393939394,
2604
+ "grad_norm": 0.2871159613132477,
2605
+ "learning_rate": 1.4657933481212243e-07,
2606
+ "loss": 0.1891,
2607
+ "step": 355
2608
+ },
2609
+ {
2610
+ "epoch": 2.6969696969696972,
2611
+ "grad_norm": 0.2823057770729065,
2612
+ "learning_rate": 1.39791689224673e-07,
2613
+ "loss": 0.1788,
2614
+ "step": 356
2615
+ },
2616
+ {
2617
+ "epoch": 2.7045454545454546,
2618
+ "grad_norm": 0.3045274615287781,
2619
+ "learning_rate": 1.3315726441779629e-07,
2620
+ "loss": 0.1808,
2621
+ "step": 357
2622
+ },
2623
+ {
2624
+ "epoch": 2.712121212121212,
2625
+ "grad_norm": 0.2801673114299774,
2626
+ "learning_rate": 1.2667680772683826e-07,
2627
+ "loss": 0.1724,
2628
+ "step": 358
2629
+ },
2630
+ {
2631
+ "epoch": 2.7196969696969697,
2632
+ "grad_norm": 0.27538782358169556,
2633
+ "learning_rate": 1.203510491433919e-07,
2634
+ "loss": 0.168,
2635
+ "step": 359
2636
+ },
2637
+ {
2638
+ "epoch": 2.7272727272727275,
2639
+ "grad_norm": 0.2858332097530365,
2640
+ "learning_rate": 1.141807012330699e-07,
2641
+ "loss": 0.1781,
2642
+ "step": 360
2643
+ },
2644
+ {
2645
+ "epoch": 2.734848484848485,
2646
+ "grad_norm": 0.26091665029525757,
2647
+ "learning_rate": 1.0816645905523597e-07,
2648
+ "loss": 0.1618,
2649
+ "step": 361
2650
+ },
2651
+ {
2652
+ "epoch": 2.742424242424242,
2653
+ "grad_norm": 0.2917574346065521,
2654
+ "learning_rate": 1.0230900008471073e-07,
2655
+ "loss": 0.1867,
2656
+ "step": 362
2657
+ },
2658
+ {
2659
+ "epoch": 2.75,
2660
+ "grad_norm": 0.25802233815193176,
2661
+ "learning_rate": 9.660898413545694e-08,
2662
+ "loss": 0.1661,
2663
+ "step": 363
2664
+ },
2665
+ {
2666
+ "epoch": 2.757575757575758,
2667
+ "grad_norm": 0.29267409443855286,
2668
+ "learning_rate": 9.106705328625408e-08,
2669
+ "loss": 0.1777,
2670
+ "step": 364
2671
+ },
2672
+ {
2673
+ "epoch": 2.765151515151515,
2674
+ "grad_norm": 0.2785191237926483,
2675
+ "learning_rate": 8.568383180837369e-08,
2676
+ "loss": 0.1731,
2677
+ "step": 365
2678
+ },
2679
+ {
2680
+ "epoch": 2.7727272727272725,
2681
+ "grad_norm": 0.2805950939655304,
2682
+ "learning_rate": 8.045992609525571e-08,
2683
+ "loss": 0.1727,
2684
+ "step": 366
2685
+ },
2686
+ {
2687
+ "epoch": 2.7803030303030303,
2688
+ "grad_norm": 0.29362258315086365,
2689
+ "learning_rate": 7.539592459420219e-08,
2690
+ "loss": 0.1751,
2691
+ "step": 367
2692
+ },
2693
+ {
2694
+ "epoch": 2.787878787878788,
2695
+ "grad_norm": 0.28130269050598145,
2696
+ "learning_rate": 7.049239774009214e-08,
2697
+ "loss": 0.1861,
2698
+ "step": 368
2699
+ },
2700
+ {
2701
+ "epoch": 2.7954545454545454,
2702
+ "grad_norm": 0.29464268684387207,
2703
+ "learning_rate": 6.574989789112374e-08,
2704
+ "loss": 0.1967,
2705
+ "step": 369
2706
+ },
2707
+ {
2708
+ "epoch": 2.8030303030303028,
2709
+ "grad_norm": 0.2706058621406555,
2710
+ "learning_rate": 6.11689592665951e-08,
2711
+ "loss": 0.1656,
2712
+ "step": 370
2713
+ },
2714
+ {
2715
+ "epoch": 2.8106060606060606,
2716
+ "grad_norm": 0.31391721963882446,
2717
+ "learning_rate": 5.675009788672597e-08,
2718
+ "loss": 0.1395,
2719
+ "step": 371
2720
+ },
2721
+ {
2722
+ "epoch": 2.8181818181818183,
2723
+ "grad_norm": 0.3240405023097992,
2724
+ "learning_rate": 5.249381151453164e-08,
2725
+ "loss": 0.1889,
2726
+ "step": 372
2727
+ },
2728
+ {
2729
+ "epoch": 2.8257575757575757,
2730
+ "grad_norm": 0.286594033241272,
2731
+ "learning_rate": 4.8400579599751696e-08,
2732
+ "loss": 0.1758,
2733
+ "step": 373
2734
+ },
2735
+ {
2736
+ "epoch": 2.8333333333333335,
2737
+ "grad_norm": 0.29338592290878296,
2738
+ "learning_rate": 4.447086322484251e-08,
2739
+ "loss": 0.1869,
2740
+ "step": 374
2741
+ },
2742
+ {
2743
+ "epoch": 2.840909090909091,
2744
+ "grad_norm": 0.29780998826026917,
2745
+ "learning_rate": 4.070510505303815e-08,
2746
+ "loss": 0.1775,
2747
+ "step": 375
2748
+ },
2749
+ {
2750
+ "epoch": 2.8484848484848486,
2751
+ "grad_norm": 0.3508531153202057,
2752
+ "learning_rate": 3.7103729278487766e-08,
2753
+ "loss": 0.189,
2754
+ "step": 376
2755
+ },
2756
+ {
2757
+ "epoch": 2.856060606060606,
2758
+ "grad_norm": 0.28885170817375183,
2759
+ "learning_rate": 3.3667141578470783e-08,
2760
+ "loss": 0.1744,
2761
+ "step": 377
2762
+ },
2763
+ {
2764
+ "epoch": 2.8636363636363638,
2765
+ "grad_norm": 0.31082722544670105,
2766
+ "learning_rate": 3.039572906770033e-08,
2767
+ "loss": 0.1869,
2768
+ "step": 378
2769
+ },
2770
+ {
2771
+ "epoch": 2.8636363636363638,
2772
+ "eval_loss": 0.1873067021369934,
2773
+ "eval_runtime": 4.1732,
2774
+ "eval_samples_per_second": 10.783,
2775
+ "eval_steps_per_second": 0.719,
2776
+ "step": 378
2777
+ },
2778
+ {
2779
+ "epoch": 2.871212121212121,
2780
+ "grad_norm": 0.30285000801086426,
2781
+ "learning_rate": 2.7289860254716416e-08,
2782
+ "loss": 0.1871,
2783
+ "step": 379
2784
+ },
2785
+ {
2786
+ "epoch": 2.878787878787879,
2787
+ "grad_norm": 0.2864581048488617,
2788
+ "learning_rate": 2.434988500037466e-08,
2789
+ "loss": 0.1778,
2790
+ "step": 380
2791
+ },
2792
+ {
2793
+ "epoch": 2.8863636363636362,
2794
+ "grad_norm": 0.2580985724925995,
2795
+ "learning_rate": 2.1576134478437316e-08,
2796
+ "loss": 0.1421,
2797
+ "step": 381
2798
+ },
2799
+ {
2800
+ "epoch": 2.893939393939394,
2801
+ "grad_norm": 0.2779240608215332,
2802
+ "learning_rate": 1.896892113826709e-08,
2803
+ "loss": 0.1685,
2804
+ "step": 382
2805
+ },
2806
+ {
2807
+ "epoch": 2.9015151515151514,
2808
+ "grad_norm": 0.27729371190071106,
2809
+ "learning_rate": 1.6528538669631998e-08,
2810
+ "loss": 0.1854,
2811
+ "step": 383
2812
+ },
2813
+ {
2814
+ "epoch": 2.909090909090909,
2815
+ "grad_norm": 0.28567954897880554,
2816
+ "learning_rate": 1.4255261969622457e-08,
2817
+ "loss": 0.1767,
2818
+ "step": 384
2819
+ },
2820
+ {
2821
+ "epoch": 2.9166666666666665,
2822
+ "grad_norm": 0.3133796453475952,
2823
+ "learning_rate": 1.214934711168475e-08,
2824
+ "loss": 0.2042,
2825
+ "step": 385
2826
+ },
2827
+ {
2828
+ "epoch": 2.9242424242424243,
2829
+ "grad_norm": 0.31388959288597107,
2830
+ "learning_rate": 1.021103131677692e-08,
2831
+ "loss": 0.2098,
2832
+ "step": 386
2833
+ },
2834
+ {
2835
+ "epoch": 2.9318181818181817,
2836
+ "grad_norm": 0.30285680294036865,
2837
+ "learning_rate": 8.440532926646316e-09,
2838
+ "loss": 0.1935,
2839
+ "step": 387
2840
+ },
2841
+ {
2842
+ "epoch": 2.9393939393939394,
2843
+ "grad_norm": 0.29303839802742004,
2844
+ "learning_rate": 6.8380513792341e-09,
2845
+ "loss": 0.1649,
2846
+ "step": 388
2847
+ },
2848
+ {
2849
+ "epoch": 2.9469696969696972,
2850
+ "grad_norm": 0.28131937980651855,
2851
+ "learning_rate": 5.403767186210218e-09,
2852
+ "loss": 0.1741,
2853
+ "step": 389
2854
+ },
2855
+ {
2856
+ "epoch": 2.9545454545454546,
2857
+ "grad_norm": 0.27338099479675293,
2858
+ "learning_rate": 4.1378419126393285e-09,
2859
+ "loss": 0.1688,
2860
+ "step": 390
2861
+ },
2862
+ {
2863
+ "epoch": 2.962121212121212,
2864
+ "grad_norm": 0.2868610918521881,
2865
+ "learning_rate": 3.0404181587811996e-09,
2866
+ "loss": 0.1774,
2867
+ "step": 391
2868
+ },
2869
+ {
2870
+ "epoch": 2.9696969696969697,
2871
+ "grad_norm": 0.28331971168518066,
2872
+ "learning_rate": 2.1116195440278876e-09,
2873
+ "loss": 0.1636,
2874
+ "step": 392
2875
+ },
2876
+ {
2877
+ "epoch": 2.9772727272727275,
2878
+ "grad_norm": 0.2694188356399536,
2879
+ "learning_rate": 1.3515506929778764e-09,
2880
+ "loss": 0.1611,
2881
+ "step": 393
2882
+ },
2883
+ {
2884
+ "epoch": 2.984848484848485,
2885
+ "grad_norm": 0.2967832088470459,
2886
+ "learning_rate": 7.602972236513405e-10,
2887
+ "loss": 0.1834,
2888
+ "step": 394
2889
+ },
2890
+ {
2891
+ "epoch": 2.992424242424242,
2892
+ "grad_norm": 0.2818254828453064,
2893
+ "learning_rate": 3.379257378458567e-10,
2894
+ "loss": 0.1668,
2895
+ "step": 395
2896
+ },
2897
+ {
2898
+ "epoch": 3.0,
2899
+ "grad_norm": 0.291985422372818,
2900
+ "learning_rate": 8.448381363307389e-11,
2901
+ "loss": 0.1836,
2902
+ "step": 396
2903
  }
2904
  ],
2905
  "logging_steps": 1,
 
2914
  "should_evaluate": false,
2915
  "should_log": false,
2916
  "should_save": true,
2917
+ "should_training_stop": true
2918
  },
2919
  "attributes": {}
2920
  }
2921
  },
2922
+ "total_flos": 8.713975214603502e+18,
2923
  "train_batch_size": 2,
2924
  "trial_name": null,
2925
  "trial_params": null