stanpony commited on
Commit
1fbbc7d
·
verified ·
1 Parent(s): da209f8

Upload fine-tuned checkpoint

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +2 -2
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +5 -2813
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf80eb8bd6a1ec725556c7af1b6d06add9065e441daca5029f7f7b3caa4708af
3
- size 51880928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bffa9d18e49a345a9b66654a8b379c12896d28ad3d84beb951c5224569da50f1
3
+ size 13269912
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7db2e78c375a177814c7ee3591a0571d5341b57de4c55b68245693610e344ca
3
  size 7354554
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a94175e53964c6c84a5af9ab8815c4ee3b327e86338e6970e19fe608e3799137
3
  size 7354554
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c63bc435b1d960c6463e1ff9f5b79dff30f95a2fdcc198fa937f4e89051ed38e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1351e3cbde5fba28ca4ac8afda43f48563c490779fc18bffc14596d2ecdfa130
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:898bc6e3cf885914956e4b9a7ef3392a11cd744dc029e768b06639461408311a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e055298148c186988a54be766385f194e9e1ed4ff410a4916a76b1dcb1be936e
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 3.6955573558807373,
3
- "best_model_checkpoint": "checkpoints/test_1M_1-2025-02-12-12-32/checkpoint-20000",
4
- "epoch": 1.5829046299960428,
5
  "eval_steps": 10000,
6
- "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2815,2814 +2815,6 @@
2815
  "eval_samples_per_second": 264.165,
2816
  "eval_steps_per_second": 33.021,
2817
  "step": 10000
2818
- },
2819
- {
2820
- "epoch": 0.7934309457855164,
2821
- "grad_norm": 130.07545471191406,
2822
- "learning_rate": 2.169432389294147e-07,
2823
- "loss": 3.7883,
2824
- "step": 10025
2825
- },
2826
- {
2827
- "epoch": 0.7954095765730115,
2828
- "grad_norm": 135.5163116455078,
2829
- "learning_rate": 2.16860794892427e-07,
2830
- "loss": 3.8028,
2831
- "step": 10050
2832
- },
2833
- {
2834
- "epoch": 0.7973882073605065,
2835
- "grad_norm": 130.33010864257812,
2836
- "learning_rate": 2.1677835085543934e-07,
2837
- "loss": 3.9204,
2838
- "step": 10075
2839
- },
2840
- {
2841
- "epoch": 0.7993668381480016,
2842
- "grad_norm": 131.47166442871094,
2843
- "learning_rate": 2.1669590681845162e-07,
2844
- "loss": 3.7961,
2845
- "step": 10100
2846
- },
2847
- {
2848
- "epoch": 0.8013454689354966,
2849
- "grad_norm": 126.78470611572266,
2850
- "learning_rate": 2.1661346278146396e-07,
2851
- "loss": 3.8249,
2852
- "step": 10125
2853
- },
2854
- {
2855
- "epoch": 0.8033240997229917,
2856
- "grad_norm": 188.20448303222656,
2857
- "learning_rate": 2.1653101874447624e-07,
2858
- "loss": 3.778,
2859
- "step": 10150
2860
- },
2861
- {
2862
- "epoch": 0.8053027305104867,
2863
- "grad_norm": 137.7311553955078,
2864
- "learning_rate": 2.1644857470748853e-07,
2865
- "loss": 3.9346,
2866
- "step": 10175
2867
- },
2868
- {
2869
- "epoch": 0.8072813612979818,
2870
- "grad_norm": 89.07192993164062,
2871
- "learning_rate": 2.1636613067050087e-07,
2872
- "loss": 3.7072,
2873
- "step": 10200
2874
- },
2875
- {
2876
- "epoch": 0.8092599920854768,
2877
- "grad_norm": 96.4891357421875,
2878
- "learning_rate": 2.1628368663351315e-07,
2879
- "loss": 3.8609,
2880
- "step": 10225
2881
- },
2882
- {
2883
- "epoch": 0.8112386228729719,
2884
- "grad_norm": 144.67372131347656,
2885
- "learning_rate": 2.1620124259652546e-07,
2886
- "loss": 3.6157,
2887
- "step": 10250
2888
- },
2889
- {
2890
- "epoch": 0.8132172536604669,
2891
- "grad_norm": 115.51387023925781,
2892
- "learning_rate": 2.1611879855953778e-07,
2893
- "loss": 3.7226,
2894
- "step": 10275
2895
- },
2896
- {
2897
- "epoch": 0.815195884447962,
2898
- "grad_norm": 99.25152587890625,
2899
- "learning_rate": 2.160363545225501e-07,
2900
- "loss": 3.9583,
2901
- "step": 10300
2902
- },
2903
- {
2904
- "epoch": 0.817174515235457,
2905
- "grad_norm": 129.25799560546875,
2906
- "learning_rate": 2.1595391048556237e-07,
2907
- "loss": 3.9272,
2908
- "step": 10325
2909
- },
2910
- {
2911
- "epoch": 0.8191531460229521,
2912
- "grad_norm": 142.94081115722656,
2913
- "learning_rate": 2.158714664485747e-07,
2914
- "loss": 4.0403,
2915
- "step": 10350
2916
- },
2917
- {
2918
- "epoch": 0.8211317768104471,
2919
- "grad_norm": 138.73974609375,
2920
- "learning_rate": 2.15789022411587e-07,
2921
- "loss": 3.9837,
2922
- "step": 10375
2923
- },
2924
- {
2925
- "epoch": 0.8231104075979422,
2926
- "grad_norm": 128.65940856933594,
2927
- "learning_rate": 2.1570657837459933e-07,
2928
- "loss": 3.9414,
2929
- "step": 10400
2930
- },
2931
- {
2932
- "epoch": 0.8250890383854372,
2933
- "grad_norm": 98.70417022705078,
2934
- "learning_rate": 2.1562413433761162e-07,
2935
- "loss": 3.9875,
2936
- "step": 10425
2937
- },
2938
- {
2939
- "epoch": 0.8270676691729323,
2940
- "grad_norm": 95.58749389648438,
2941
- "learning_rate": 2.155416903006239e-07,
2942
- "loss": 4.0485,
2943
- "step": 10450
2944
- },
2945
- {
2946
- "epoch": 0.8290462999604273,
2947
- "grad_norm": 115.98033905029297,
2948
- "learning_rate": 2.1545924626363624e-07,
2949
- "loss": 3.9302,
2950
- "step": 10475
2951
- },
2952
- {
2953
- "epoch": 0.8310249307479224,
2954
- "grad_norm": 115.73841094970703,
2955
- "learning_rate": 2.1537680222664853e-07,
2956
- "loss": 3.8931,
2957
- "step": 10500
2958
- },
2959
- {
2960
- "epoch": 0.8330035615354174,
2961
- "grad_norm": 120.9448013305664,
2962
- "learning_rate": 2.1529435818966086e-07,
2963
- "loss": 3.9164,
2964
- "step": 10525
2965
- },
2966
- {
2967
- "epoch": 0.8349821923229126,
2968
- "grad_norm": 108.95042419433594,
2969
- "learning_rate": 2.1521191415267315e-07,
2970
- "loss": 3.8218,
2971
- "step": 10550
2972
- },
2973
- {
2974
- "epoch": 0.8369608231104076,
2975
- "grad_norm": 134.7786407470703,
2976
- "learning_rate": 2.1512947011568546e-07,
2977
- "loss": 3.8017,
2978
- "step": 10575
2979
- },
2980
- {
2981
- "epoch": 0.8389394538979027,
2982
- "grad_norm": 135.17352294921875,
2983
- "learning_rate": 2.1504702607869777e-07,
2984
- "loss": 3.7896,
2985
- "step": 10600
2986
- },
2987
- {
2988
- "epoch": 0.8409180846853977,
2989
- "grad_norm": 107.60875701904297,
2990
- "learning_rate": 2.1496458204171008e-07,
2991
- "loss": 3.8653,
2992
- "step": 10625
2993
- },
2994
- {
2995
- "epoch": 0.8428967154728928,
2996
- "grad_norm": 106.85396575927734,
2997
- "learning_rate": 2.1488213800472237e-07,
2998
- "loss": 3.7688,
2999
- "step": 10650
3000
- },
3001
- {
3002
- "epoch": 0.8448753462603878,
3003
- "grad_norm": 154.32797241210938,
3004
- "learning_rate": 2.147996939677347e-07,
3005
- "loss": 3.7928,
3006
- "step": 10675
3007
- },
3008
- {
3009
- "epoch": 0.8468539770478829,
3010
- "grad_norm": 105.45416259765625,
3011
- "learning_rate": 2.14717249930747e-07,
3012
- "loss": 3.9773,
3013
- "step": 10700
3014
- },
3015
- {
3016
- "epoch": 0.848832607835378,
3017
- "grad_norm": 130.95082092285156,
3018
- "learning_rate": 2.1463480589375933e-07,
3019
- "loss": 3.9007,
3020
- "step": 10725
3021
- },
3022
- {
3023
- "epoch": 0.850811238622873,
3024
- "grad_norm": 111.79964447021484,
3025
- "learning_rate": 2.1455236185677162e-07,
3026
- "loss": 3.8503,
3027
- "step": 10750
3028
- },
3029
- {
3030
- "epoch": 0.852789869410368,
3031
- "grad_norm": 111.66275787353516,
3032
- "learning_rate": 2.144699178197839e-07,
3033
- "loss": 4.1484,
3034
- "step": 10775
3035
- },
3036
- {
3037
- "epoch": 0.8547685001978631,
3038
- "grad_norm": 105.59733581542969,
3039
- "learning_rate": 2.1438747378279624e-07,
3040
- "loss": 3.8871,
3041
- "step": 10800
3042
- },
3043
- {
3044
- "epoch": 0.8567471309853582,
3045
- "grad_norm": 100.40887451171875,
3046
- "learning_rate": 2.1430502974580852e-07,
3047
- "loss": 3.8456,
3048
- "step": 10825
3049
- },
3050
- {
3051
- "epoch": 0.8587257617728532,
3052
- "grad_norm": 129.26177978515625,
3053
- "learning_rate": 2.1422258570882084e-07,
3054
- "loss": 3.673,
3055
- "step": 10850
3056
- },
3057
- {
3058
- "epoch": 0.8607043925603483,
3059
- "grad_norm": 144.7709197998047,
3060
- "learning_rate": 2.1414014167183315e-07,
3061
- "loss": 3.7465,
3062
- "step": 10875
3063
- },
3064
- {
3065
- "epoch": 0.8626830233478433,
3066
- "grad_norm": 137.7886505126953,
3067
- "learning_rate": 2.1405769763484546e-07,
3068
- "loss": 3.8944,
3069
- "step": 10900
3070
- },
3071
- {
3072
- "epoch": 0.8646616541353384,
3073
- "grad_norm": 117.49480438232422,
3074
- "learning_rate": 2.1397525359785774e-07,
3075
- "loss": 3.7995,
3076
- "step": 10925
3077
- },
3078
- {
3079
- "epoch": 0.8666402849228334,
3080
- "grad_norm": 168.4739990234375,
3081
- "learning_rate": 2.1389280956087008e-07,
3082
- "loss": 4.0448,
3083
- "step": 10950
3084
- },
3085
- {
3086
- "epoch": 0.8686189157103285,
3087
- "grad_norm": 133.91720581054688,
3088
- "learning_rate": 2.1381036552388237e-07,
3089
- "loss": 3.6778,
3090
- "step": 10975
3091
- },
3092
- {
3093
- "epoch": 0.8705975464978235,
3094
- "grad_norm": 100.49728393554688,
3095
- "learning_rate": 2.137279214868947e-07,
3096
- "loss": 3.9251,
3097
- "step": 11000
3098
- },
3099
- {
3100
- "epoch": 0.8725761772853186,
3101
- "grad_norm": 189.62327575683594,
3102
- "learning_rate": 2.13645477449907e-07,
3103
- "loss": 3.8032,
3104
- "step": 11025
3105
- },
3106
- {
3107
- "epoch": 0.8745548080728136,
3108
- "grad_norm": 108.47595977783203,
3109
- "learning_rate": 2.135630334129193e-07,
3110
- "loss": 3.92,
3111
- "step": 11050
3112
- },
3113
- {
3114
- "epoch": 0.8765334388603087,
3115
- "grad_norm": 150.95584106445312,
3116
- "learning_rate": 2.1348058937593161e-07,
3117
- "loss": 3.9559,
3118
- "step": 11075
3119
- },
3120
- {
3121
- "epoch": 0.8785120696478037,
3122
- "grad_norm": 126.86693572998047,
3123
- "learning_rate": 2.133981453389439e-07,
3124
- "loss": 3.9322,
3125
- "step": 11100
3126
- },
3127
- {
3128
- "epoch": 0.8804907004352988,
3129
- "grad_norm": 158.07049560546875,
3130
- "learning_rate": 2.1331570130195624e-07,
3131
- "loss": 3.7498,
3132
- "step": 11125
3133
- },
3134
- {
3135
- "epoch": 0.8824693312227938,
3136
- "grad_norm": 156.98699951171875,
3137
- "learning_rate": 2.1323325726496852e-07,
3138
- "loss": 3.9679,
3139
- "step": 11150
3140
- },
3141
- {
3142
- "epoch": 0.8844479620102889,
3143
- "grad_norm": 91.1626968383789,
3144
- "learning_rate": 2.1315081322798083e-07,
3145
- "loss": 3.9669,
3146
- "step": 11175
3147
- },
3148
- {
3149
- "epoch": 0.8864265927977839,
3150
- "grad_norm": 145.4545135498047,
3151
- "learning_rate": 2.1306836919099315e-07,
3152
- "loss": 3.9038,
3153
- "step": 11200
3154
- },
3155
- {
3156
- "epoch": 0.888405223585279,
3157
- "grad_norm": 134.78793334960938,
3158
- "learning_rate": 2.1298592515400546e-07,
3159
- "loss": 3.9122,
3160
- "step": 11225
3161
- },
3162
- {
3163
- "epoch": 0.890383854372774,
3164
- "grad_norm": 132.1155242919922,
3165
- "learning_rate": 2.1290348111701774e-07,
3166
- "loss": 4.0016,
3167
- "step": 11250
3168
- },
3169
- {
3170
- "epoch": 0.8923624851602691,
3171
- "grad_norm": 116.72430419921875,
3172
- "learning_rate": 2.1282103708003008e-07,
3173
- "loss": 3.9878,
3174
- "step": 11275
3175
- },
3176
- {
3177
- "epoch": 0.8943411159477641,
3178
- "grad_norm": 117.89616394042969,
3179
- "learning_rate": 2.1273859304304237e-07,
3180
- "loss": 3.7834,
3181
- "step": 11300
3182
- },
3183
- {
3184
- "epoch": 0.8963197467352592,
3185
- "grad_norm": 151.65277099609375,
3186
- "learning_rate": 2.126561490060547e-07,
3187
- "loss": 3.9696,
3188
- "step": 11325
3189
- },
3190
- {
3191
- "epoch": 0.8982983775227542,
3192
- "grad_norm": 139.39405822753906,
3193
- "learning_rate": 2.12573704969067e-07,
3194
- "loss": 3.9585,
3195
- "step": 11350
3196
- },
3197
- {
3198
- "epoch": 0.9002770083102493,
3199
- "grad_norm": 123.25849914550781,
3200
- "learning_rate": 2.124912609320793e-07,
3201
- "loss": 3.816,
3202
- "step": 11375
3203
- },
3204
- {
3205
- "epoch": 0.9022556390977443,
3206
- "grad_norm": 148.54562377929688,
3207
- "learning_rate": 2.124088168950916e-07,
3208
- "loss": 3.8433,
3209
- "step": 11400
3210
- },
3211
- {
3212
- "epoch": 0.9042342698852394,
3213
- "grad_norm": 135.60752868652344,
3214
- "learning_rate": 2.123263728581039e-07,
3215
- "loss": 3.7851,
3216
- "step": 11425
3217
- },
3218
- {
3219
- "epoch": 0.9062129006727345,
3220
- "grad_norm": 129.5711212158203,
3221
- "learning_rate": 2.122439288211162e-07,
3222
- "loss": 3.7133,
3223
- "step": 11450
3224
- },
3225
- {
3226
- "epoch": 0.9081915314602296,
3227
- "grad_norm": 136.88392639160156,
3228
- "learning_rate": 2.1216148478412852e-07,
3229
- "loss": 3.9864,
3230
- "step": 11475
3231
- },
3232
- {
3233
- "epoch": 0.9101701622477246,
3234
- "grad_norm": 148.6637725830078,
3235
- "learning_rate": 2.1207904074714083e-07,
3236
- "loss": 3.6906,
3237
- "step": 11500
3238
- },
3239
- {
3240
- "epoch": 0.9121487930352197,
3241
- "grad_norm": 164.6747283935547,
3242
- "learning_rate": 2.1199659671015314e-07,
3243
- "loss": 3.8543,
3244
- "step": 11525
3245
- },
3246
- {
3247
- "epoch": 0.9141274238227147,
3248
- "grad_norm": 114.64603424072266,
3249
- "learning_rate": 2.1191415267316546e-07,
3250
- "loss": 3.6973,
3251
- "step": 11550
3252
- },
3253
- {
3254
- "epoch": 0.9161060546102098,
3255
- "grad_norm": 129.53265380859375,
3256
- "learning_rate": 2.1183170863617774e-07,
3257
- "loss": 3.5896,
3258
- "step": 11575
3259
- },
3260
- {
3261
- "epoch": 0.9180846853977048,
3262
- "grad_norm": 145.63973999023438,
3263
- "learning_rate": 2.1174926459919008e-07,
3264
- "loss": 3.963,
3265
- "step": 11600
3266
- },
3267
- {
3268
- "epoch": 0.9200633161851999,
3269
- "grad_norm": 132.06729125976562,
3270
- "learning_rate": 2.1166682056220236e-07,
3271
- "loss": 3.6703,
3272
- "step": 11625
3273
- },
3274
- {
3275
- "epoch": 0.9220419469726949,
3276
- "grad_norm": 129.76583862304688,
3277
- "learning_rate": 2.1158437652521468e-07,
3278
- "loss": 3.6965,
3279
- "step": 11650
3280
- },
3281
- {
3282
- "epoch": 0.92402057776019,
3283
- "grad_norm": 115.21681213378906,
3284
- "learning_rate": 2.1150193248822699e-07,
3285
- "loss": 3.8711,
3286
- "step": 11675
3287
- },
3288
- {
3289
- "epoch": 0.925999208547685,
3290
- "grad_norm": 85.92119598388672,
3291
- "learning_rate": 2.114194884512393e-07,
3292
- "loss": 3.8377,
3293
- "step": 11700
3294
- },
3295
- {
3296
- "epoch": 0.9279778393351801,
3297
- "grad_norm": 117.11397552490234,
3298
- "learning_rate": 2.113370444142516e-07,
3299
- "loss": 3.7965,
3300
- "step": 11725
3301
- },
3302
- {
3303
- "epoch": 0.9299564701226751,
3304
- "grad_norm": 113.50032806396484,
3305
- "learning_rate": 2.112546003772639e-07,
3306
- "loss": 3.7684,
3307
- "step": 11750
3308
- },
3309
- {
3310
- "epoch": 0.9319351009101702,
3311
- "grad_norm": 107.85367584228516,
3312
- "learning_rate": 2.111721563402762e-07,
3313
- "loss": 3.6988,
3314
- "step": 11775
3315
- },
3316
- {
3317
- "epoch": 0.9339137316976652,
3318
- "grad_norm": 121.42901611328125,
3319
- "learning_rate": 2.1108971230328852e-07,
3320
- "loss": 3.6745,
3321
- "step": 11800
3322
- },
3323
- {
3324
- "epoch": 0.9358923624851603,
3325
- "grad_norm": 139.971923828125,
3326
- "learning_rate": 2.1100726826630083e-07,
3327
- "loss": 3.8403,
3328
- "step": 11825
3329
- },
3330
- {
3331
- "epoch": 0.9378709932726553,
3332
- "grad_norm": 126.40741729736328,
3333
- "learning_rate": 2.1092482422931312e-07,
3334
- "loss": 3.7846,
3335
- "step": 11850
3336
- },
3337
- {
3338
- "epoch": 0.9398496240601504,
3339
- "grad_norm": 110.96858215332031,
3340
- "learning_rate": 2.1084238019232545e-07,
3341
- "loss": 3.9256,
3342
- "step": 11875
3343
- },
3344
- {
3345
- "epoch": 0.9418282548476454,
3346
- "grad_norm": 107.72772979736328,
3347
- "learning_rate": 2.1075993615533774e-07,
3348
- "loss": 3.7888,
3349
- "step": 11900
3350
- },
3351
- {
3352
- "epoch": 0.9438068856351405,
3353
- "grad_norm": 94.70952606201172,
3354
- "learning_rate": 2.1067749211835008e-07,
3355
- "loss": 3.8496,
3356
- "step": 11925
3357
- },
3358
- {
3359
- "epoch": 0.9457855164226355,
3360
- "grad_norm": 178.84500122070312,
3361
- "learning_rate": 2.1059504808136236e-07,
3362
- "loss": 3.737,
3363
- "step": 11950
3364
- },
3365
- {
3366
- "epoch": 0.9477641472101306,
3367
- "grad_norm": 179.0364227294922,
3368
- "learning_rate": 2.1051260404437467e-07,
3369
- "loss": 3.7584,
3370
- "step": 11975
3371
- },
3372
- {
3373
- "epoch": 0.9497427779976256,
3374
- "grad_norm": 141.7266845703125,
3375
- "learning_rate": 2.1043016000738698e-07,
3376
- "loss": 3.7948,
3377
- "step": 12000
3378
- },
3379
- {
3380
- "epoch": 0.9517214087851207,
3381
- "grad_norm": 120.56340026855469,
3382
- "learning_rate": 2.1034771597039927e-07,
3383
- "loss": 3.8951,
3384
- "step": 12025
3385
- },
3386
- {
3387
- "epoch": 0.9537000395726157,
3388
- "grad_norm": 113.90511322021484,
3389
- "learning_rate": 2.1026527193341158e-07,
3390
- "loss": 3.8823,
3391
- "step": 12050
3392
- },
3393
- {
3394
- "epoch": 0.9556786703601108,
3395
- "grad_norm": 103.57383728027344,
3396
- "learning_rate": 2.101828278964239e-07,
3397
- "loss": 3.7853,
3398
- "step": 12075
3399
- },
3400
- {
3401
- "epoch": 0.9576573011476058,
3402
- "grad_norm": 111.0738525390625,
3403
- "learning_rate": 2.101003838594362e-07,
3404
- "loss": 3.8394,
3405
- "step": 12100
3406
- },
3407
- {
3408
- "epoch": 0.9596359319351009,
3409
- "grad_norm": 130.9629364013672,
3410
- "learning_rate": 2.1001793982244852e-07,
3411
- "loss": 3.7752,
3412
- "step": 12125
3413
- },
3414
- {
3415
- "epoch": 0.9616145627225959,
3416
- "grad_norm": 98.29790496826172,
3417
- "learning_rate": 2.0993549578546083e-07,
3418
- "loss": 3.8332,
3419
- "step": 12150
3420
- },
3421
- {
3422
- "epoch": 0.963593193510091,
3423
- "grad_norm": 116.66043853759766,
3424
- "learning_rate": 2.098530517484731e-07,
3425
- "loss": 3.964,
3426
- "step": 12175
3427
- },
3428
- {
3429
- "epoch": 0.965571824297586,
3430
- "grad_norm": 179.0226287841797,
3431
- "learning_rate": 2.0977060771148545e-07,
3432
- "loss": 3.8848,
3433
- "step": 12200
3434
- },
3435
- {
3436
- "epoch": 0.9675504550850811,
3437
- "grad_norm": 100.40991973876953,
3438
- "learning_rate": 2.0968816367449774e-07,
3439
- "loss": 3.8461,
3440
- "step": 12225
3441
- },
3442
- {
3443
- "epoch": 0.9695290858725761,
3444
- "grad_norm": 117.44912719726562,
3445
- "learning_rate": 2.0960571963751005e-07,
3446
- "loss": 3.723,
3447
- "step": 12250
3448
- },
3449
- {
3450
- "epoch": 0.9715077166600712,
3451
- "grad_norm": 106.5282974243164,
3452
- "learning_rate": 2.0952327560052236e-07,
3453
- "loss": 3.8557,
3454
- "step": 12275
3455
- },
3456
- {
3457
- "epoch": 0.9734863474475662,
3458
- "grad_norm": 111.7376480102539,
3459
- "learning_rate": 2.0944083156353467e-07,
3460
- "loss": 3.8505,
3461
- "step": 12300
3462
- },
3463
- {
3464
- "epoch": 0.9754649782350613,
3465
- "grad_norm": 185.3255157470703,
3466
- "learning_rate": 2.0935838752654698e-07,
3467
- "loss": 3.9339,
3468
- "step": 12325
3469
- },
3470
- {
3471
- "epoch": 0.9774436090225563,
3472
- "grad_norm": 128.0303955078125,
3473
- "learning_rate": 2.0927594348955927e-07,
3474
- "loss": 3.9595,
3475
- "step": 12350
3476
- },
3477
- {
3478
- "epoch": 0.9794222398100515,
3479
- "grad_norm": 117.01809692382812,
3480
- "learning_rate": 2.0919349945257158e-07,
3481
- "loss": 3.8385,
3482
- "step": 12375
3483
- },
3484
- {
3485
- "epoch": 0.9814008705975465,
3486
- "grad_norm": 132.46775817871094,
3487
- "learning_rate": 2.091110554155839e-07,
3488
- "loss": 3.6976,
3489
- "step": 12400
3490
- },
3491
- {
3492
- "epoch": 0.9833795013850416,
3493
- "grad_norm": 125.96062469482422,
3494
- "learning_rate": 2.090286113785962e-07,
3495
- "loss": 3.9652,
3496
- "step": 12425
3497
- },
3498
- {
3499
- "epoch": 0.9853581321725366,
3500
- "grad_norm": 172.13600158691406,
3501
- "learning_rate": 2.089461673416085e-07,
3502
- "loss": 3.9026,
3503
- "step": 12450
3504
- },
3505
- {
3506
- "epoch": 0.9873367629600317,
3507
- "grad_norm": 107.95278930664062,
3508
- "learning_rate": 2.0886372330462083e-07,
3509
- "loss": 3.8457,
3510
- "step": 12475
3511
- },
3512
- {
3513
- "epoch": 0.9893153937475268,
3514
- "grad_norm": 196.08303833007812,
3515
- "learning_rate": 2.087812792676331e-07,
3516
- "loss": 3.8319,
3517
- "step": 12500
3518
- },
3519
- {
3520
- "epoch": 0.9912940245350218,
3521
- "grad_norm": 119.07249450683594,
3522
- "learning_rate": 2.0869883523064545e-07,
3523
- "loss": 3.7464,
3524
- "step": 12525
3525
- },
3526
- {
3527
- "epoch": 0.9932726553225169,
3528
- "grad_norm": 118.91383361816406,
3529
- "learning_rate": 2.0861639119365773e-07,
3530
- "loss": 3.7734,
3531
- "step": 12550
3532
- },
3533
- {
3534
- "epoch": 0.9952512861100119,
3535
- "grad_norm": 122.37627410888672,
3536
- "learning_rate": 2.0853394715667005e-07,
3537
- "loss": 3.9263,
3538
- "step": 12575
3539
- },
3540
- {
3541
- "epoch": 0.997229916897507,
3542
- "grad_norm": 101.24188232421875,
3543
- "learning_rate": 2.0845150311968236e-07,
3544
- "loss": 3.6865,
3545
- "step": 12600
3546
- },
3547
- {
3548
- "epoch": 0.999208547685002,
3549
- "grad_norm": 253.74789428710938,
3550
- "learning_rate": 2.0836905908269467e-07,
3551
- "loss": 3.857,
3552
- "step": 12625
3553
- },
3554
- {
3555
- "epoch": 1.001187178472497,
3556
- "grad_norm": 121.46297454833984,
3557
- "learning_rate": 2.0828661504570695e-07,
3558
- "loss": 3.9145,
3559
- "step": 12650
3560
- },
3561
- {
3562
- "epoch": 1.003165809259992,
3563
- "grad_norm": 115.41104125976562,
3564
- "learning_rate": 2.0820417100871927e-07,
3565
- "loss": 3.9441,
3566
- "step": 12675
3567
- },
3568
- {
3569
- "epoch": 1.0051444400474872,
3570
- "grad_norm": 105.03096008300781,
3571
- "learning_rate": 2.0812172697173158e-07,
3572
- "loss": 3.6926,
3573
- "step": 12700
3574
- },
3575
- {
3576
- "epoch": 1.007123070834982,
3577
- "grad_norm": 98.86603546142578,
3578
- "learning_rate": 2.080392829347439e-07,
3579
- "loss": 3.5524,
3580
- "step": 12725
3581
- },
3582
- {
3583
- "epoch": 1.0091017016224773,
3584
- "grad_norm": 120.75003814697266,
3585
- "learning_rate": 2.079568388977562e-07,
3586
- "loss": 3.8494,
3587
- "step": 12750
3588
- },
3589
- {
3590
- "epoch": 1.0110803324099722,
3591
- "grad_norm": 86.03683471679688,
3592
- "learning_rate": 2.0787439486076849e-07,
3593
- "loss": 3.7635,
3594
- "step": 12775
3595
- },
3596
- {
3597
- "epoch": 1.0130589631974674,
3598
- "grad_norm": 132.28945922851562,
3599
- "learning_rate": 2.0779195082378082e-07,
3600
- "loss": 4.007,
3601
- "step": 12800
3602
- },
3603
- {
3604
- "epoch": 1.0150375939849625,
3605
- "grad_norm": 129.51051330566406,
3606
- "learning_rate": 2.077095067867931e-07,
3607
- "loss": 3.6065,
3608
- "step": 12825
3609
- },
3610
- {
3611
- "epoch": 1.0170162247724575,
3612
- "grad_norm": 103.38876342773438,
3613
- "learning_rate": 2.0762706274980542e-07,
3614
- "loss": 3.7508,
3615
- "step": 12850
3616
- },
3617
- {
3618
- "epoch": 1.0189948555599526,
3619
- "grad_norm": 163.7755584716797,
3620
- "learning_rate": 2.0754461871281773e-07,
3621
- "loss": 3.7519,
3622
- "step": 12875
3623
- },
3624
- {
3625
- "epoch": 1.0209734863474476,
3626
- "grad_norm": 123.19217681884766,
3627
- "learning_rate": 2.0746217467583004e-07,
3628
- "loss": 3.8441,
3629
- "step": 12900
3630
- },
3631
- {
3632
- "epoch": 1.0229521171349427,
3633
- "grad_norm": 122.48994445800781,
3634
- "learning_rate": 2.0737973063884236e-07,
3635
- "loss": 3.8664,
3636
- "step": 12925
3637
- },
3638
- {
3639
- "epoch": 1.0249307479224377,
3640
- "grad_norm": 97.31283569335938,
3641
- "learning_rate": 2.0729728660185467e-07,
3642
- "loss": 3.9103,
3643
- "step": 12950
3644
- },
3645
- {
3646
- "epoch": 1.0269093787099328,
3647
- "grad_norm": 178.35073852539062,
3648
- "learning_rate": 2.0721484256486695e-07,
3649
- "loss": 3.8718,
3650
- "step": 12975
3651
- },
3652
- {
3653
- "epoch": 1.0288880094974278,
3654
- "grad_norm": 124.15072631835938,
3655
- "learning_rate": 2.0713239852787926e-07,
3656
- "loss": 3.7962,
3657
- "step": 13000
3658
- },
3659
- {
3660
- "epoch": 1.030866640284923,
3661
- "grad_norm": 141.29360961914062,
3662
- "learning_rate": 2.0704995449089158e-07,
3663
- "loss": 3.8207,
3664
- "step": 13025
3665
- },
3666
- {
3667
- "epoch": 1.0328452710724179,
3668
- "grad_norm": 117.48384857177734,
3669
- "learning_rate": 2.0696751045390386e-07,
3670
- "loss": 4.0525,
3671
- "step": 13050
3672
- },
3673
- {
3674
- "epoch": 1.034823901859913,
3675
- "grad_norm": 124.6396713256836,
3676
- "learning_rate": 2.068850664169162e-07,
3677
- "loss": 3.8003,
3678
- "step": 13075
3679
- },
3680
- {
3681
- "epoch": 1.036802532647408,
3682
- "grad_norm": 164.6786346435547,
3683
- "learning_rate": 2.0680262237992848e-07,
3684
- "loss": 3.6338,
3685
- "step": 13100
3686
- },
3687
- {
3688
- "epoch": 1.0387811634349031,
3689
- "grad_norm": 144.25732421875,
3690
- "learning_rate": 2.0672017834294082e-07,
3691
- "loss": 3.9446,
3692
- "step": 13125
3693
- },
3694
- {
3695
- "epoch": 1.040759794222398,
3696
- "grad_norm": 130.24571228027344,
3697
- "learning_rate": 2.066377343059531e-07,
3698
- "loss": 3.9012,
3699
- "step": 13150
3700
- },
3701
- {
3702
- "epoch": 1.0427384250098932,
3703
- "grad_norm": 122.11434173583984,
3704
- "learning_rate": 2.0655529026896542e-07,
3705
- "loss": 3.7799,
3706
- "step": 13175
3707
- },
3708
- {
3709
- "epoch": 1.0447170557973882,
3710
- "grad_norm": 161.62745666503906,
3711
- "learning_rate": 2.0647284623197773e-07,
3712
- "loss": 3.8881,
3713
- "step": 13200
3714
- },
3715
- {
3716
- "epoch": 1.0466956865848833,
3717
- "grad_norm": 129.13072204589844,
3718
- "learning_rate": 2.0639040219499004e-07,
3719
- "loss": 3.8295,
3720
- "step": 13225
3721
- },
3722
- {
3723
- "epoch": 1.0486743173723783,
3724
- "grad_norm": 153.86805725097656,
3725
- "learning_rate": 2.0630795815800233e-07,
3726
- "loss": 3.7328,
3727
- "step": 13250
3728
- },
3729
- {
3730
- "epoch": 1.0506529481598734,
3731
- "grad_norm": 173.9022979736328,
3732
- "learning_rate": 2.0622551412101464e-07,
3733
- "loss": 3.9866,
3734
- "step": 13275
3735
- },
3736
- {
3737
- "epoch": 1.0526315789473684,
3738
- "grad_norm": 104.5372543334961,
3739
- "learning_rate": 2.0614307008402695e-07,
3740
- "loss": 3.6121,
3741
- "step": 13300
3742
- },
3743
- {
3744
- "epoch": 1.0546102097348635,
3745
- "grad_norm": 115.93688201904297,
3746
- "learning_rate": 2.0606062604703926e-07,
3747
- "loss": 3.8851,
3748
- "step": 13325
3749
- },
3750
- {
3751
- "epoch": 1.0565888405223585,
3752
- "grad_norm": 140.26748657226562,
3753
- "learning_rate": 2.0597818201005157e-07,
3754
- "loss": 3.9924,
3755
- "step": 13350
3756
- },
3757
- {
3758
- "epoch": 1.0585674713098536,
3759
- "grad_norm": 122.87477111816406,
3760
- "learning_rate": 2.0589573797306386e-07,
3761
- "loss": 3.7768,
3762
- "step": 13375
3763
- },
3764
- {
3765
- "epoch": 1.0605461020973486,
3766
- "grad_norm": 105.3402099609375,
3767
- "learning_rate": 2.058132939360762e-07,
3768
- "loss": 3.6371,
3769
- "step": 13400
3770
- },
3771
- {
3772
- "epoch": 1.0625247328848437,
3773
- "grad_norm": 112.1316146850586,
3774
- "learning_rate": 2.0573084989908848e-07,
3775
- "loss": 3.8185,
3776
- "step": 13425
3777
- },
3778
- {
3779
- "epoch": 1.0645033636723387,
3780
- "grad_norm": 105.67694854736328,
3781
- "learning_rate": 2.056484058621008e-07,
3782
- "loss": 3.7202,
3783
- "step": 13450
3784
- },
3785
- {
3786
- "epoch": 1.0664819944598338,
3787
- "grad_norm": 160.9032440185547,
3788
- "learning_rate": 2.055659618251131e-07,
3789
- "loss": 3.6819,
3790
- "step": 13475
3791
- },
3792
- {
3793
- "epoch": 1.0684606252473288,
3794
- "grad_norm": 197.6964569091797,
3795
- "learning_rate": 2.0548351778812542e-07,
3796
- "loss": 3.914,
3797
- "step": 13500
3798
- },
3799
- {
3800
- "epoch": 1.070439256034824,
3801
- "grad_norm": 100.68509674072266,
3802
- "learning_rate": 2.0540107375113773e-07,
3803
- "loss": 3.7281,
3804
- "step": 13525
3805
- },
3806
- {
3807
- "epoch": 1.0724178868223189,
3808
- "grad_norm": 121.31140899658203,
3809
- "learning_rate": 2.0531862971415004e-07,
3810
- "loss": 3.7413,
3811
- "step": 13550
3812
- },
3813
- {
3814
- "epoch": 1.074396517609814,
3815
- "grad_norm": 176.4757080078125,
3816
- "learning_rate": 2.0523618567716232e-07,
3817
- "loss": 4.1066,
3818
- "step": 13575
3819
- },
3820
- {
3821
- "epoch": 1.076375148397309,
3822
- "grad_norm": 98.44918823242188,
3823
- "learning_rate": 2.0515374164017464e-07,
3824
- "loss": 3.6721,
3825
- "step": 13600
3826
- },
3827
- {
3828
- "epoch": 1.0783537791848041,
3829
- "grad_norm": 114.9334487915039,
3830
- "learning_rate": 2.0507129760318695e-07,
3831
- "loss": 3.7888,
3832
- "step": 13625
3833
- },
3834
- {
3835
- "epoch": 1.080332409972299,
3836
- "grad_norm": 172.9710235595703,
3837
- "learning_rate": 2.0498885356619923e-07,
3838
- "loss": 3.8811,
3839
- "step": 13650
3840
- },
3841
- {
3842
- "epoch": 1.0823110407597942,
3843
- "grad_norm": 77.68281555175781,
3844
- "learning_rate": 2.0490640952921157e-07,
3845
- "loss": 3.7865,
3846
- "step": 13675
3847
- },
3848
- {
3849
- "epoch": 1.0842896715472894,
3850
- "grad_norm": 117.03499603271484,
3851
- "learning_rate": 2.0482396549222386e-07,
3852
- "loss": 3.6545,
3853
- "step": 13700
3854
- },
3855
- {
3856
- "epoch": 1.0862683023347843,
3857
- "grad_norm": 111.88362121582031,
3858
- "learning_rate": 2.047415214552362e-07,
3859
- "loss": 3.7833,
3860
- "step": 13725
3861
- },
3862
- {
3863
- "epoch": 1.0882469331222793,
3864
- "grad_norm": 177.05654907226562,
3865
- "learning_rate": 2.0465907741824848e-07,
3866
- "loss": 3.8182,
3867
- "step": 13750
3868
- },
3869
- {
3870
- "epoch": 1.0902255639097744,
3871
- "grad_norm": 110.29389953613281,
3872
- "learning_rate": 2.045766333812608e-07,
3873
- "loss": 3.571,
3874
- "step": 13775
3875
- },
3876
- {
3877
- "epoch": 1.0922041946972696,
3878
- "grad_norm": 81.29917907714844,
3879
- "learning_rate": 2.044941893442731e-07,
3880
- "loss": 3.7003,
3881
- "step": 13800
3882
- },
3883
- {
3884
- "epoch": 1.0941828254847645,
3885
- "grad_norm": 130.32589721679688,
3886
- "learning_rate": 2.0441174530728541e-07,
3887
- "loss": 3.965,
3888
- "step": 13825
3889
- },
3890
- {
3891
- "epoch": 1.0961614562722597,
3892
- "grad_norm": 134.07106018066406,
3893
- "learning_rate": 2.043293012702977e-07,
3894
- "loss": 4.0457,
3895
- "step": 13850
3896
- },
3897
- {
3898
- "epoch": 1.0981400870597546,
3899
- "grad_norm": 204.80294799804688,
3900
- "learning_rate": 2.0424685723331004e-07,
3901
- "loss": 3.5721,
3902
- "step": 13875
3903
- },
3904
- {
3905
- "epoch": 1.1001187178472498,
3906
- "grad_norm": 157.00570678710938,
3907
- "learning_rate": 2.0416441319632232e-07,
3908
- "loss": 3.8119,
3909
- "step": 13900
3910
- },
3911
- {
3912
- "epoch": 1.1020973486347447,
3913
- "grad_norm": 118.97384643554688,
3914
- "learning_rate": 2.0408196915933463e-07,
3915
- "loss": 3.6172,
3916
- "step": 13925
3917
- },
3918
- {
3919
- "epoch": 1.10407597942224,
3920
- "grad_norm": 147.43740844726562,
3921
- "learning_rate": 2.0399952512234695e-07,
3922
- "loss": 3.8369,
3923
- "step": 13950
3924
- },
3925
- {
3926
- "epoch": 1.1060546102097348,
3927
- "grad_norm": 116.36746215820312,
3928
- "learning_rate": 2.0391708108535923e-07,
3929
- "loss": 3.8669,
3930
- "step": 13975
3931
- },
3932
- {
3933
- "epoch": 1.10803324099723,
3934
- "grad_norm": 119.54280090332031,
3935
- "learning_rate": 2.0383463704837157e-07,
3936
- "loss": 3.8281,
3937
- "step": 14000
3938
- },
3939
- {
3940
- "epoch": 1.110011871784725,
3941
- "grad_norm": 161.6844024658203,
3942
- "learning_rate": 2.0375219301138385e-07,
3943
- "loss": 3.632,
3944
- "step": 14025
3945
- },
3946
- {
3947
- "epoch": 1.11199050257222,
3948
- "grad_norm": 115.4676284790039,
3949
- "learning_rate": 2.0366974897439617e-07,
3950
- "loss": 3.6665,
3951
- "step": 14050
3952
- },
3953
- {
3954
- "epoch": 1.113969133359715,
3955
- "grad_norm": 149.53504943847656,
3956
- "learning_rate": 2.0358730493740848e-07,
3957
- "loss": 3.8808,
3958
- "step": 14075
3959
- },
3960
- {
3961
- "epoch": 1.1159477641472102,
3962
- "grad_norm": 108.68144989013672,
3963
- "learning_rate": 2.035048609004208e-07,
3964
- "loss": 3.763,
3965
- "step": 14100
3966
- },
3967
- {
3968
- "epoch": 1.1179263949347051,
3969
- "grad_norm": 140.69000244140625,
3970
- "learning_rate": 2.034224168634331e-07,
3971
- "loss": 3.8483,
3972
- "step": 14125
3973
- },
3974
- {
3975
- "epoch": 1.1199050257222003,
3976
- "grad_norm": 116.24478149414062,
3977
- "learning_rate": 2.033399728264454e-07,
3978
- "loss": 3.6232,
3979
- "step": 14150
3980
- },
3981
- {
3982
- "epoch": 1.1218836565096952,
3983
- "grad_norm": 151.31192016601562,
3984
- "learning_rate": 2.032575287894577e-07,
3985
- "loss": 3.9463,
3986
- "step": 14175
3987
- },
3988
- {
3989
- "epoch": 1.1238622872971904,
3990
- "grad_norm": 104.94828796386719,
3991
- "learning_rate": 2.0317508475247004e-07,
3992
- "loss": 3.8141,
3993
- "step": 14200
3994
- },
3995
- {
3996
- "epoch": 1.1258409180846853,
3997
- "grad_norm": 174.0530548095703,
3998
- "learning_rate": 2.0309264071548232e-07,
3999
- "loss": 3.7278,
4000
- "step": 14225
4001
- },
4002
- {
4003
- "epoch": 1.1278195488721805,
4004
- "grad_norm": 123.31523895263672,
4005
- "learning_rate": 2.030101966784946e-07,
4006
- "loss": 3.6594,
4007
- "step": 14250
4008
- },
4009
- {
4010
- "epoch": 1.1297981796596754,
4011
- "grad_norm": 199.5244140625,
4012
- "learning_rate": 2.0292775264150694e-07,
4013
- "loss": 3.7397,
4014
- "step": 14275
4015
- },
4016
- {
4017
- "epoch": 1.1317768104471706,
4018
- "grad_norm": 121.85034942626953,
4019
- "learning_rate": 2.0284530860451923e-07,
4020
- "loss": 3.7268,
4021
- "step": 14300
4022
- },
4023
- {
4024
- "epoch": 1.1337554412346655,
4025
- "grad_norm": 100.23025512695312,
4026
- "learning_rate": 2.0276286456753157e-07,
4027
- "loss": 3.6129,
4028
- "step": 14325
4029
- },
4030
- {
4031
- "epoch": 1.1357340720221607,
4032
- "grad_norm": 194.6273651123047,
4033
- "learning_rate": 2.0268042053054385e-07,
4034
- "loss": 3.8529,
4035
- "step": 14350
4036
- },
4037
- {
4038
- "epoch": 1.1377127028096556,
4039
- "grad_norm": 86.26454162597656,
4040
- "learning_rate": 2.0259797649355616e-07,
4041
- "loss": 3.7558,
4042
- "step": 14375
4043
- },
4044
- {
4045
- "epoch": 1.1396913335971508,
4046
- "grad_norm": 94.16887664794922,
4047
- "learning_rate": 2.0251553245656847e-07,
4048
- "loss": 3.743,
4049
- "step": 14400
4050
- },
4051
- {
4052
- "epoch": 1.1416699643846457,
4053
- "grad_norm": 112.39076232910156,
4054
- "learning_rate": 2.0243308841958079e-07,
4055
- "loss": 3.6799,
4056
- "step": 14425
4057
- },
4058
- {
4059
- "epoch": 1.143648595172141,
4060
- "grad_norm": 146.2572479248047,
4061
- "learning_rate": 2.0235064438259307e-07,
4062
- "loss": 3.8062,
4063
- "step": 14450
4064
- },
4065
- {
4066
- "epoch": 1.1456272259596358,
4067
- "grad_norm": 100.57402801513672,
4068
- "learning_rate": 2.022682003456054e-07,
4069
- "loss": 3.7005,
4070
- "step": 14475
4071
- },
4072
- {
4073
- "epoch": 1.147605856747131,
4074
- "grad_norm": 110.64191436767578,
4075
- "learning_rate": 2.021857563086177e-07,
4076
- "loss": 4.007,
4077
- "step": 14500
4078
- },
4079
- {
4080
- "epoch": 1.149584487534626,
4081
- "grad_norm": 182.17724609375,
4082
- "learning_rate": 2.0210331227163003e-07,
4083
- "loss": 3.782,
4084
- "step": 14525
4085
- },
4086
- {
4087
- "epoch": 1.151563118322121,
4088
- "grad_norm": 135.72979736328125,
4089
- "learning_rate": 2.0202086823464232e-07,
4090
- "loss": 3.6697,
4091
- "step": 14550
4092
- },
4093
- {
4094
- "epoch": 1.1535417491096163,
4095
- "grad_norm": 101.66889953613281,
4096
- "learning_rate": 2.019384241976546e-07,
4097
- "loss": 3.7594,
4098
- "step": 14575
4099
- },
4100
- {
4101
- "epoch": 1.1555203798971112,
4102
- "grad_norm": 160.48109436035156,
4103
- "learning_rate": 2.0185598016066694e-07,
4104
- "loss": 3.7643,
4105
- "step": 14600
4106
- },
4107
- {
4108
- "epoch": 1.1574990106846061,
4109
- "grad_norm": 104.34113311767578,
4110
- "learning_rate": 2.0177353612367923e-07,
4111
- "loss": 3.9202,
4112
- "step": 14625
4113
- },
4114
- {
4115
- "epoch": 1.1594776414721013,
4116
- "grad_norm": 89.21700286865234,
4117
- "learning_rate": 2.0169109208669154e-07,
4118
- "loss": 3.5803,
4119
- "step": 14650
4120
- },
4121
- {
4122
- "epoch": 1.1614562722595965,
4123
- "grad_norm": 111.87966918945312,
4124
- "learning_rate": 2.0160864804970385e-07,
4125
- "loss": 3.5052,
4126
- "step": 14675
4127
- },
4128
- {
4129
- "epoch": 1.1634349030470914,
4130
- "grad_norm": 134.26589965820312,
4131
- "learning_rate": 2.0152620401271616e-07,
4132
- "loss": 3.7542,
4133
- "step": 14700
4134
- },
4135
- {
4136
- "epoch": 1.1654135338345863,
4137
- "grad_norm": 93.41925811767578,
4138
- "learning_rate": 2.0144375997572847e-07,
4139
- "loss": 3.8865,
4140
- "step": 14725
4141
- },
4142
- {
4143
- "epoch": 1.1673921646220815,
4144
- "grad_norm": 121.87728881835938,
4145
- "learning_rate": 2.0136131593874078e-07,
4146
- "loss": 3.9852,
4147
- "step": 14750
4148
- },
4149
- {
4150
- "epoch": 1.1693707954095767,
4151
- "grad_norm": 129.56564331054688,
4152
- "learning_rate": 2.0127887190175307e-07,
4153
- "loss": 3.6025,
4154
- "step": 14775
4155
- },
4156
- {
4157
- "epoch": 1.1713494261970716,
4158
- "grad_norm": 138.36422729492188,
4159
- "learning_rate": 2.011964278647654e-07,
4160
- "loss": 3.9298,
4161
- "step": 14800
4162
- },
4163
- {
4164
- "epoch": 1.1733280569845668,
4165
- "grad_norm": 115.63604736328125,
4166
- "learning_rate": 2.011139838277777e-07,
4167
- "loss": 3.811,
4168
- "step": 14825
4169
- },
4170
- {
4171
- "epoch": 1.1753066877720617,
4172
- "grad_norm": 128.9251708984375,
4173
- "learning_rate": 2.0103153979078998e-07,
4174
- "loss": 4.013,
4175
- "step": 14850
4176
- },
4177
- {
4178
- "epoch": 1.1772853185595569,
4179
- "grad_norm": 115.66397857666016,
4180
- "learning_rate": 2.0094909575380232e-07,
4181
- "loss": 3.9727,
4182
- "step": 14875
4183
- },
4184
- {
4185
- "epoch": 1.1792639493470518,
4186
- "grad_norm": 105.60704040527344,
4187
- "learning_rate": 2.008666517168146e-07,
4188
- "loss": 4.0072,
4189
- "step": 14900
4190
- },
4191
- {
4192
- "epoch": 1.181242580134547,
4193
- "grad_norm": 127.380859375,
4194
- "learning_rate": 2.0078420767982694e-07,
4195
- "loss": 3.763,
4196
- "step": 14925
4197
- },
4198
- {
4199
- "epoch": 1.183221210922042,
4200
- "grad_norm": 87.45874786376953,
4201
- "learning_rate": 2.0070176364283922e-07,
4202
- "loss": 3.6529,
4203
- "step": 14950
4204
- },
4205
- {
4206
- "epoch": 1.185199841709537,
4207
- "grad_norm": 257.6990966796875,
4208
- "learning_rate": 2.0061931960585154e-07,
4209
- "loss": 3.8049,
4210
- "step": 14975
4211
- },
4212
- {
4213
- "epoch": 1.187178472497032,
4214
- "grad_norm": 111.80567169189453,
4215
- "learning_rate": 2.0053687556886385e-07,
4216
- "loss": 3.9666,
4217
- "step": 15000
4218
- },
4219
- {
4220
- "epoch": 1.1891571032845272,
4221
- "grad_norm": 98.00049591064453,
4222
- "learning_rate": 2.0045443153187616e-07,
4223
- "loss": 3.773,
4224
- "step": 15025
4225
- },
4226
- {
4227
- "epoch": 1.1911357340720221,
4228
- "grad_norm": 117.19742584228516,
4229
- "learning_rate": 2.0037198749488844e-07,
4230
- "loss": 3.8228,
4231
- "step": 15050
4232
- },
4233
- {
4234
- "epoch": 1.1931143648595173,
4235
- "grad_norm": 99.34817504882812,
4236
- "learning_rate": 2.0028954345790078e-07,
4237
- "loss": 3.8045,
4238
- "step": 15075
4239
- },
4240
- {
4241
- "epoch": 1.1950929956470122,
4242
- "grad_norm": 193.61734008789062,
4243
- "learning_rate": 2.0020709942091307e-07,
4244
- "loss": 3.7231,
4245
- "step": 15100
4246
- },
4247
- {
4248
- "epoch": 1.1970716264345074,
4249
- "grad_norm": 110.1533203125,
4250
- "learning_rate": 2.001246553839254e-07,
4251
- "loss": 3.7013,
4252
- "step": 15125
4253
- },
4254
- {
4255
- "epoch": 1.1990502572220023,
4256
- "grad_norm": 81.41429901123047,
4257
- "learning_rate": 2.000422113469377e-07,
4258
- "loss": 3.5668,
4259
- "step": 15150
4260
- },
4261
- {
4262
- "epoch": 1.2010288880094975,
4263
- "grad_norm": 103.47528839111328,
4264
- "learning_rate": 1.9995976730994998e-07,
4265
- "loss": 3.6627,
4266
- "step": 15175
4267
- },
4268
- {
4269
- "epoch": 1.2030075187969924,
4270
- "grad_norm": 179.40489196777344,
4271
- "learning_rate": 1.9987732327296231e-07,
4272
- "loss": 3.8307,
4273
- "step": 15200
4274
- },
4275
- {
4276
- "epoch": 1.2049861495844876,
4277
- "grad_norm": 108.21305084228516,
4278
- "learning_rate": 1.997948792359746e-07,
4279
- "loss": 3.7619,
4280
- "step": 15225
4281
- },
4282
- {
4283
- "epoch": 1.2069647803719825,
4284
- "grad_norm": 206.6326904296875,
4285
- "learning_rate": 1.997124351989869e-07,
4286
- "loss": 3.7036,
4287
- "step": 15250
4288
- },
4289
- {
4290
- "epoch": 1.2089434111594777,
4291
- "grad_norm": 115.525390625,
4292
- "learning_rate": 1.9962999116199922e-07,
4293
- "loss": 3.823,
4294
- "step": 15275
4295
- },
4296
- {
4297
- "epoch": 1.2109220419469726,
4298
- "grad_norm": 124.59696197509766,
4299
- "learning_rate": 1.9954754712501153e-07,
4300
- "loss": 3.7983,
4301
- "step": 15300
4302
- },
4303
- {
4304
- "epoch": 1.2129006727344678,
4305
- "grad_norm": 107.92992401123047,
4306
- "learning_rate": 1.9946510308802385e-07,
4307
- "loss": 3.7677,
4308
- "step": 15325
4309
- },
4310
- {
4311
- "epoch": 1.2148793035219627,
4312
- "grad_norm": 164.7980194091797,
4313
- "learning_rate": 1.9938265905103616e-07,
4314
- "loss": 3.7692,
4315
- "step": 15350
4316
- },
4317
- {
4318
- "epoch": 1.2168579343094579,
4319
- "grad_norm": 95.48210906982422,
4320
- "learning_rate": 1.9930021501404844e-07,
4321
- "loss": 3.8568,
4322
- "step": 15375
4323
- },
4324
- {
4325
- "epoch": 1.2188365650969528,
4326
- "grad_norm": 172.18484497070312,
4327
- "learning_rate": 1.9921777097706078e-07,
4328
- "loss": 3.596,
4329
- "step": 15400
4330
- },
4331
- {
4332
- "epoch": 1.220815195884448,
4333
- "grad_norm": 146.89479064941406,
4334
- "learning_rate": 1.9913532694007307e-07,
4335
- "loss": 3.7992,
4336
- "step": 15425
4337
- },
4338
- {
4339
- "epoch": 1.222793826671943,
4340
- "grad_norm": 106.06961822509766,
4341
- "learning_rate": 1.9905288290308538e-07,
4342
- "loss": 3.5689,
4343
- "step": 15450
4344
- },
4345
- {
4346
- "epoch": 1.224772457459438,
4347
- "grad_norm": 124.81307220458984,
4348
- "learning_rate": 1.989704388660977e-07,
4349
- "loss": 3.8072,
4350
- "step": 15475
4351
- },
4352
- {
4353
- "epoch": 1.226751088246933,
4354
- "grad_norm": 144.33468627929688,
4355
- "learning_rate": 1.9888799482910997e-07,
4356
- "loss": 3.7889,
4357
- "step": 15500
4358
- },
4359
- {
4360
- "epoch": 1.2287297190344282,
4361
- "grad_norm": 108.416015625,
4362
- "learning_rate": 1.988055507921223e-07,
4363
- "loss": 3.7345,
4364
- "step": 15525
4365
- },
4366
- {
4367
- "epoch": 1.2307083498219233,
4368
- "grad_norm": 118.40252685546875,
4369
- "learning_rate": 1.987231067551346e-07,
4370
- "loss": 3.9306,
4371
- "step": 15550
4372
- },
4373
- {
4374
- "epoch": 1.2326869806094183,
4375
- "grad_norm": 121.7845230102539,
4376
- "learning_rate": 1.986406627181469e-07,
4377
- "loss": 3.6828,
4378
- "step": 15575
4379
- },
4380
- {
4381
- "epoch": 1.2346656113969132,
4382
- "grad_norm": 126.77731323242188,
4383
- "learning_rate": 1.9855821868115922e-07,
4384
- "loss": 3.8064,
4385
- "step": 15600
4386
- },
4387
- {
4388
- "epoch": 1.2366442421844084,
4389
- "grad_norm": 128.55198669433594,
4390
- "learning_rate": 1.9847577464417153e-07,
4391
- "loss": 3.952,
4392
- "step": 15625
4393
- },
4394
- {
4395
- "epoch": 1.2386228729719035,
4396
- "grad_norm": 99.97991180419922,
4397
- "learning_rate": 1.9839333060718382e-07,
4398
- "loss": 3.8026,
4399
- "step": 15650
4400
- },
4401
- {
4402
- "epoch": 1.2406015037593985,
4403
- "grad_norm": 109.74803924560547,
4404
- "learning_rate": 1.9831088657019615e-07,
4405
- "loss": 3.8499,
4406
- "step": 15675
4407
- },
4408
- {
4409
- "epoch": 1.2425801345468936,
4410
- "grad_norm": 125.961669921875,
4411
- "learning_rate": 1.9822844253320844e-07,
4412
- "loss": 3.8659,
4413
- "step": 15700
4414
- },
4415
- {
4416
- "epoch": 1.2445587653343886,
4417
- "grad_norm": 106.30962371826172,
4418
- "learning_rate": 1.9814599849622078e-07,
4419
- "loss": 3.7085,
4420
- "step": 15725
4421
- },
4422
- {
4423
- "epoch": 1.2465373961218837,
4424
- "grad_norm": 101.37333679199219,
4425
- "learning_rate": 1.9806355445923306e-07,
4426
- "loss": 3.6006,
4427
- "step": 15750
4428
- },
4429
- {
4430
- "epoch": 1.2485160269093787,
4431
- "grad_norm": 117.87637329101562,
4432
- "learning_rate": 1.9798111042224537e-07,
4433
- "loss": 3.7463,
4434
- "step": 15775
4435
- },
4436
- {
4437
- "epoch": 1.2504946576968738,
4438
- "grad_norm": 158.502685546875,
4439
- "learning_rate": 1.9789866638525769e-07,
4440
- "loss": 3.7699,
4441
- "step": 15800
4442
- },
4443
- {
4444
- "epoch": 1.2524732884843688,
4445
- "grad_norm": 213.62936401367188,
4446
- "learning_rate": 1.9781622234826997e-07,
4447
- "loss": 3.9892,
4448
- "step": 15825
4449
- },
4450
- {
4451
- "epoch": 1.254451919271864,
4452
- "grad_norm": 120.2110595703125,
4453
- "learning_rate": 1.9773377831128228e-07,
4454
- "loss": 3.6972,
4455
- "step": 15850
4456
- },
4457
- {
4458
- "epoch": 1.2564305500593589,
4459
- "grad_norm": 115.94564819335938,
4460
- "learning_rate": 1.976513342742946e-07,
4461
- "loss": 3.6659,
4462
- "step": 15875
4463
- },
4464
- {
4465
- "epoch": 1.258409180846854,
4466
- "grad_norm": 100.15674591064453,
4467
- "learning_rate": 1.975688902373069e-07,
4468
- "loss": 3.6585,
4469
- "step": 15900
4470
- },
4471
- {
4472
- "epoch": 1.260387811634349,
4473
- "grad_norm": 126.27003479003906,
4474
- "learning_rate": 1.9748644620031922e-07,
4475
- "loss": 3.793,
4476
- "step": 15925
4477
- },
4478
- {
4479
- "epoch": 1.2623664424218441,
4480
- "grad_norm": 122.43645477294922,
4481
- "learning_rate": 1.9740400216333153e-07,
4482
- "loss": 3.7562,
4483
- "step": 15950
4484
- },
4485
- {
4486
- "epoch": 1.264345073209339,
4487
- "grad_norm": 121.08145904541016,
4488
- "learning_rate": 1.9732155812634381e-07,
4489
- "loss": 3.759,
4490
- "step": 15975
4491
- },
4492
- {
4493
- "epoch": 1.2663237039968342,
4494
- "grad_norm": 116.81194305419922,
4495
- "learning_rate": 1.9723911408935615e-07,
4496
- "loss": 3.6303,
4497
- "step": 16000
4498
- },
4499
- {
4500
- "epoch": 1.2683023347843292,
4501
- "grad_norm": 107.8984146118164,
4502
- "learning_rate": 1.9715667005236844e-07,
4503
- "loss": 3.7525,
4504
- "step": 16025
4505
- },
4506
- {
4507
- "epoch": 1.2702809655718243,
4508
- "grad_norm": 114.40152740478516,
4509
- "learning_rate": 1.9707422601538075e-07,
4510
- "loss": 3.6044,
4511
- "step": 16050
4512
- },
4513
- {
4514
- "epoch": 1.2722595963593193,
4515
- "grad_norm": 128.336181640625,
4516
- "learning_rate": 1.9699178197839306e-07,
4517
- "loss": 3.6883,
4518
- "step": 16075
4519
- },
4520
- {
4521
- "epoch": 1.2742382271468145,
4522
- "grad_norm": 177.9373321533203,
4523
- "learning_rate": 1.9690933794140535e-07,
4524
- "loss": 3.7305,
4525
- "step": 16100
4526
- },
4527
- {
4528
- "epoch": 1.2762168579343094,
4529
- "grad_norm": 137.6163787841797,
4530
- "learning_rate": 1.9682689390441768e-07,
4531
- "loss": 3.4276,
4532
- "step": 16125
4533
- },
4534
- {
4535
- "epoch": 1.2781954887218046,
4536
- "grad_norm": 93.84378814697266,
4537
- "learning_rate": 1.9674444986742997e-07,
4538
- "loss": 3.954,
4539
- "step": 16150
4540
- },
4541
- {
4542
- "epoch": 1.2801741195092995,
4543
- "grad_norm": 114.7217788696289,
4544
- "learning_rate": 1.9666200583044228e-07,
4545
- "loss": 3.9032,
4546
- "step": 16175
4547
- },
4548
- {
4549
- "epoch": 1.2821527502967947,
4550
- "grad_norm": 127.64576721191406,
4551
- "learning_rate": 1.965795617934546e-07,
4552
- "loss": 3.9561,
4553
- "step": 16200
4554
- },
4555
- {
4556
- "epoch": 1.2841313810842896,
4557
- "grad_norm": 147.38531494140625,
4558
- "learning_rate": 1.964971177564669e-07,
4559
- "loss": 3.7639,
4560
- "step": 16225
4561
- },
4562
- {
4563
- "epoch": 1.2861100118717848,
4564
- "grad_norm": 106.52879333496094,
4565
- "learning_rate": 1.964146737194792e-07,
4566
- "loss": 3.6835,
4567
- "step": 16250
4568
- },
4569
- {
4570
- "epoch": 1.2880886426592797,
4571
- "grad_norm": 117.00553131103516,
4572
- "learning_rate": 1.9633222968249153e-07,
4573
- "loss": 3.5749,
4574
- "step": 16275
4575
- },
4576
- {
4577
- "epoch": 1.2900672734467749,
4578
- "grad_norm": 145.50201416015625,
4579
- "learning_rate": 1.962497856455038e-07,
4580
- "loss": 3.8452,
4581
- "step": 16300
4582
- },
4583
- {
4584
- "epoch": 1.29204590423427,
4585
- "grad_norm": 109.88056945800781,
4586
- "learning_rate": 1.9616734160851615e-07,
4587
- "loss": 3.7797,
4588
- "step": 16325
4589
- },
4590
- {
4591
- "epoch": 1.294024535021765,
4592
- "grad_norm": 98.50968170166016,
4593
- "learning_rate": 1.9608489757152844e-07,
4594
- "loss": 3.9423,
4595
- "step": 16350
4596
- },
4597
- {
4598
- "epoch": 1.29600316580926,
4599
- "grad_norm": 137.3463134765625,
4600
- "learning_rate": 1.9600245353454075e-07,
4601
- "loss": 3.8175,
4602
- "step": 16375
4603
- },
4604
- {
4605
- "epoch": 1.297981796596755,
4606
- "grad_norm": 104.44812774658203,
4607
- "learning_rate": 1.9592000949755306e-07,
4608
- "loss": 3.8284,
4609
- "step": 16400
4610
- },
4611
- {
4612
- "epoch": 1.2999604273842502,
4613
- "grad_norm": 134.79293823242188,
4614
- "learning_rate": 1.9583756546056534e-07,
4615
- "loss": 3.8295,
4616
- "step": 16425
4617
- },
4618
- {
4619
- "epoch": 1.3019390581717452,
4620
- "grad_norm": 136.05784606933594,
4621
- "learning_rate": 1.9575512142357766e-07,
4622
- "loss": 3.9577,
4623
- "step": 16450
4624
- },
4625
- {
4626
- "epoch": 1.30391768895924,
4627
- "grad_norm": 162.8216094970703,
4628
- "learning_rate": 1.9567267738658997e-07,
4629
- "loss": 3.9843,
4630
- "step": 16475
4631
- },
4632
- {
4633
- "epoch": 1.3058963197467353,
4634
- "grad_norm": 132.25741577148438,
4635
- "learning_rate": 1.9559023334960228e-07,
4636
- "loss": 3.8045,
4637
- "step": 16500
4638
- },
4639
- {
4640
- "epoch": 1.3078749505342304,
4641
- "grad_norm": 140.72642517089844,
4642
- "learning_rate": 1.955077893126146e-07,
4643
- "loss": 3.8217,
4644
- "step": 16525
4645
- },
4646
- {
4647
- "epoch": 1.3098535813217254,
4648
- "grad_norm": 119.49310302734375,
4649
- "learning_rate": 1.954253452756269e-07,
4650
- "loss": 3.6075,
4651
- "step": 16550
4652
- },
4653
- {
4654
- "epoch": 1.3118322121092203,
4655
- "grad_norm": 189.51280212402344,
4656
- "learning_rate": 1.953429012386392e-07,
4657
- "loss": 3.9516,
4658
- "step": 16575
4659
- },
4660
- {
4661
- "epoch": 1.3138108428967155,
4662
- "grad_norm": 106.88180541992188,
4663
- "learning_rate": 1.9526045720165153e-07,
4664
- "loss": 3.9774,
4665
- "step": 16600
4666
- },
4667
- {
4668
- "epoch": 1.3157894736842106,
4669
- "grad_norm": 136.42471313476562,
4670
- "learning_rate": 1.951780131646638e-07,
4671
- "loss": 3.7752,
4672
- "step": 16625
4673
- },
4674
- {
4675
- "epoch": 1.3177681044717056,
4676
- "grad_norm": 134.68948364257812,
4677
- "learning_rate": 1.9509556912767612e-07,
4678
- "loss": 3.6695,
4679
- "step": 16650
4680
- },
4681
- {
4682
- "epoch": 1.3197467352592005,
4683
- "grad_norm": 88.88003540039062,
4684
- "learning_rate": 1.9501312509068843e-07,
4685
- "loss": 3.9039,
4686
- "step": 16675
4687
- },
4688
- {
4689
- "epoch": 1.3217253660466957,
4690
- "grad_norm": 106.34395599365234,
4691
- "learning_rate": 1.9493068105370075e-07,
4692
- "loss": 3.6726,
4693
- "step": 16700
4694
- },
4695
- {
4696
- "epoch": 1.3237039968341908,
4697
- "grad_norm": 94.92339324951172,
4698
- "learning_rate": 1.9484823701671306e-07,
4699
- "loss": 3.7362,
4700
- "step": 16725
4701
- },
4702
- {
4703
- "epoch": 1.3256826276216858,
4704
- "grad_norm": 131.17221069335938,
4705
- "learning_rate": 1.9476579297972534e-07,
4706
- "loss": 3.6668,
4707
- "step": 16750
4708
- },
4709
- {
4710
- "epoch": 1.327661258409181,
4711
- "grad_norm": 115.50166320800781,
4712
- "learning_rate": 1.9468334894273765e-07,
4713
- "loss": 3.9881,
4714
- "step": 16775
4715
- },
4716
- {
4717
- "epoch": 1.3296398891966759,
4718
- "grad_norm": 146.72767639160156,
4719
- "learning_rate": 1.9460090490574997e-07,
4720
- "loss": 3.8375,
4721
- "step": 16800
4722
- },
4723
- {
4724
- "epoch": 1.331618519984171,
4725
- "grad_norm": 125.41651153564453,
4726
- "learning_rate": 1.9451846086876228e-07,
4727
- "loss": 3.7523,
4728
- "step": 16825
4729
- },
4730
- {
4731
- "epoch": 1.333597150771666,
4732
- "grad_norm": 148.9279327392578,
4733
- "learning_rate": 1.9443601683177456e-07,
4734
- "loss": 3.7209,
4735
- "step": 16850
4736
- },
4737
- {
4738
- "epoch": 1.3355757815591611,
4739
- "grad_norm": 132.95213317871094,
4740
- "learning_rate": 1.943535727947869e-07,
4741
- "loss": 3.8043,
4742
- "step": 16875
4743
- },
4744
- {
4745
- "epoch": 1.337554412346656,
4746
- "grad_norm": 121.32286834716797,
4747
- "learning_rate": 1.9427112875779919e-07,
4748
- "loss": 3.8463,
4749
- "step": 16900
4750
- },
4751
- {
4752
- "epoch": 1.3395330431341512,
4753
- "grad_norm": 104.13847351074219,
4754
- "learning_rate": 1.9418868472081152e-07,
4755
- "loss": 3.6028,
4756
- "step": 16925
4757
- },
4758
- {
4759
- "epoch": 1.3415116739216462,
4760
- "grad_norm": 116.32896423339844,
4761
- "learning_rate": 1.941062406838238e-07,
4762
- "loss": 3.6065,
4763
- "step": 16950
4764
- },
4765
- {
4766
- "epoch": 1.3434903047091413,
4767
- "grad_norm": 123.75446319580078,
4768
- "learning_rate": 1.9402379664683612e-07,
4769
- "loss": 3.8383,
4770
- "step": 16975
4771
- },
4772
- {
4773
- "epoch": 1.3454689354966363,
4774
- "grad_norm": 146.36181640625,
4775
- "learning_rate": 1.9394135260984843e-07,
4776
- "loss": 3.7784,
4777
- "step": 17000
4778
- },
4779
- {
4780
- "epoch": 1.3474475662841314,
4781
- "grad_norm": 153.75225830078125,
4782
- "learning_rate": 1.9385890857286074e-07,
4783
- "loss": 3.8691,
4784
- "step": 17025
4785
- },
4786
- {
4787
- "epoch": 1.3494261970716264,
4788
- "grad_norm": 144.79983520507812,
4789
- "learning_rate": 1.9377646453587303e-07,
4790
- "loss": 3.7902,
4791
- "step": 17050
4792
- },
4793
- {
4794
- "epoch": 1.3514048278591215,
4795
- "grad_norm": 107.72118377685547,
4796
- "learning_rate": 1.9369402049888534e-07,
4797
- "loss": 3.7424,
4798
- "step": 17075
4799
- },
4800
- {
4801
- "epoch": 1.3533834586466165,
4802
- "grad_norm": 107.33843994140625,
4803
- "learning_rate": 1.9361157646189765e-07,
4804
- "loss": 3.7423,
4805
- "step": 17100
4806
- },
4807
- {
4808
- "epoch": 1.3553620894341116,
4809
- "grad_norm": 154.79566955566406,
4810
- "learning_rate": 1.9352913242490996e-07,
4811
- "loss": 3.5984,
4812
- "step": 17125
4813
- },
4814
- {
4815
- "epoch": 1.3573407202216066,
4816
- "grad_norm": 118.29646301269531,
4817
- "learning_rate": 1.9344668838792227e-07,
4818
- "loss": 3.6184,
4819
- "step": 17150
4820
- },
4821
- {
4822
- "epoch": 1.3593193510091017,
4823
- "grad_norm": 194.39981079101562,
4824
- "learning_rate": 1.9336424435093456e-07,
4825
- "loss": 3.7566,
4826
- "step": 17175
4827
- },
4828
- {
4829
- "epoch": 1.361297981796597,
4830
- "grad_norm": 126.4017333984375,
4831
- "learning_rate": 1.932818003139469e-07,
4832
- "loss": 3.7378,
4833
- "step": 17200
4834
- },
4835
- {
4836
- "epoch": 1.3632766125840918,
4837
- "grad_norm": 154.46499633789062,
4838
- "learning_rate": 1.9319935627695918e-07,
4839
- "loss": 3.575,
4840
- "step": 17225
4841
- },
4842
- {
4843
- "epoch": 1.3652552433715868,
4844
- "grad_norm": 121.02112579345703,
4845
- "learning_rate": 1.9311691223997152e-07,
4846
- "loss": 3.5395,
4847
- "step": 17250
4848
- },
4849
- {
4850
- "epoch": 1.367233874159082,
4851
- "grad_norm": 112.81676483154297,
4852
- "learning_rate": 1.930344682029838e-07,
4853
- "loss": 3.7668,
4854
- "step": 17275
4855
- },
4856
- {
4857
- "epoch": 1.369212504946577,
4858
- "grad_norm": 128.57958984375,
4859
- "learning_rate": 1.9295202416599612e-07,
4860
- "loss": 3.6508,
4861
- "step": 17300
4862
- },
4863
- {
4864
- "epoch": 1.371191135734072,
4865
- "grad_norm": 100.09276580810547,
4866
- "learning_rate": 1.9286958012900843e-07,
4867
- "loss": 3.568,
4868
- "step": 17325
4869
- },
4870
- {
4871
- "epoch": 1.373169766521567,
4872
- "grad_norm": 102.0973892211914,
4873
- "learning_rate": 1.9278713609202074e-07,
4874
- "loss": 3.6707,
4875
- "step": 17350
4876
- },
4877
- {
4878
- "epoch": 1.3751483973090621,
4879
- "grad_norm": 180.7505645751953,
4880
- "learning_rate": 1.9270469205503303e-07,
4881
- "loss": 3.6053,
4882
- "step": 17375
4883
- },
4884
- {
4885
- "epoch": 1.3771270280965573,
4886
- "grad_norm": 141.00286865234375,
4887
- "learning_rate": 1.9262224801804534e-07,
4888
- "loss": 3.802,
4889
- "step": 17400
4890
- },
4891
- {
4892
- "epoch": 1.3791056588840522,
4893
- "grad_norm": 105.14612579345703,
4894
- "learning_rate": 1.9253980398105765e-07,
4895
- "loss": 3.8125,
4896
- "step": 17425
4897
- },
4898
- {
4899
- "epoch": 1.3810842896715472,
4900
- "grad_norm": 134.7199249267578,
4901
- "learning_rate": 1.9245735994406993e-07,
4902
- "loss": 3.6837,
4903
- "step": 17450
4904
- },
4905
- {
4906
- "epoch": 1.3830629204590423,
4907
- "grad_norm": 161.79367065429688,
4908
- "learning_rate": 1.9237491590708227e-07,
4909
- "loss": 3.4173,
4910
- "step": 17475
4911
- },
4912
- {
4913
- "epoch": 1.3850415512465375,
4914
- "grad_norm": 90.2686996459961,
4915
- "learning_rate": 1.9229247187009456e-07,
4916
- "loss": 3.7035,
4917
- "step": 17500
4918
- },
4919
- {
4920
- "epoch": 1.3870201820340324,
4921
- "grad_norm": 138.9261016845703,
4922
- "learning_rate": 1.922100278331069e-07,
4923
- "loss": 3.7375,
4924
- "step": 17525
4925
- },
4926
- {
4927
- "epoch": 1.3889988128215274,
4928
- "grad_norm": 99.18257141113281,
4929
- "learning_rate": 1.9212758379611918e-07,
4930
- "loss": 3.7228,
4931
- "step": 17550
4932
- },
4933
- {
4934
- "epoch": 1.3909774436090225,
4935
- "grad_norm": 108.61868286132812,
4936
- "learning_rate": 1.920451397591315e-07,
4937
- "loss": 3.8505,
4938
- "step": 17575
4939
- },
4940
- {
4941
- "epoch": 1.3929560743965177,
4942
- "grad_norm": 89.02510070800781,
4943
- "learning_rate": 1.919626957221438e-07,
4944
- "loss": 3.7366,
4945
- "step": 17600
4946
- },
4947
- {
4948
- "epoch": 1.3949347051840126,
4949
- "grad_norm": 183.8499755859375,
4950
- "learning_rate": 1.9188025168515612e-07,
4951
- "loss": 3.7842,
4952
- "step": 17625
4953
- },
4954
- {
4955
- "epoch": 1.3969133359715078,
4956
- "grad_norm": 111.4132080078125,
4957
- "learning_rate": 1.917978076481684e-07,
4958
- "loss": 4.0444,
4959
- "step": 17650
4960
- },
4961
- {
4962
- "epoch": 1.3988919667590027,
4963
- "grad_norm": 114.52892303466797,
4964
- "learning_rate": 1.917153636111807e-07,
4965
- "loss": 3.6568,
4966
- "step": 17675
4967
- },
4968
- {
4969
- "epoch": 1.400870597546498,
4970
- "grad_norm": 108.65158081054688,
4971
- "learning_rate": 1.9163291957419302e-07,
4972
- "loss": 3.7598,
4973
- "step": 17700
4974
- },
4975
- {
4976
- "epoch": 1.4028492283339928,
4977
- "grad_norm": 118.24126434326172,
4978
- "learning_rate": 1.9155047553720534e-07,
4979
- "loss": 3.5768,
4980
- "step": 17725
4981
- },
4982
- {
4983
- "epoch": 1.404827859121488,
4984
- "grad_norm": 116.43769073486328,
4985
- "learning_rate": 1.9146803150021765e-07,
4986
- "loss": 3.6195,
4987
- "step": 17750
4988
- },
4989
- {
4990
- "epoch": 1.406806489908983,
4991
- "grad_norm": 96.20083618164062,
4992
- "learning_rate": 1.9138558746322993e-07,
4993
- "loss": 3.7732,
4994
- "step": 17775
4995
- },
4996
- {
4997
- "epoch": 1.408785120696478,
4998
- "grad_norm": 106.93498229980469,
4999
- "learning_rate": 1.9130314342624227e-07,
5000
- "loss": 3.5733,
5001
- "step": 17800
5002
- },
5003
- {
5004
- "epoch": 1.410763751483973,
5005
- "grad_norm": 126.78841400146484,
5006
- "learning_rate": 1.9122069938925456e-07,
5007
- "loss": 3.4865,
5008
- "step": 17825
5009
- },
5010
- {
5011
- "epoch": 1.4127423822714682,
5012
- "grad_norm": 161.895263671875,
5013
- "learning_rate": 1.911382553522669e-07,
5014
- "loss": 3.7985,
5015
- "step": 17850
5016
- },
5017
- {
5018
- "epoch": 1.4147210130589631,
5019
- "grad_norm": 137.6743927001953,
5020
- "learning_rate": 1.9105581131527918e-07,
5021
- "loss": 3.5315,
5022
- "step": 17875
5023
- },
5024
- {
5025
- "epoch": 1.4166996438464583,
5026
- "grad_norm": 81.24407196044922,
5027
- "learning_rate": 1.909733672782915e-07,
5028
- "loss": 3.6007,
5029
- "step": 17900
5030
- },
5031
- {
5032
- "epoch": 1.4186782746339532,
5033
- "grad_norm": 109.42854309082031,
5034
- "learning_rate": 1.908909232413038e-07,
5035
- "loss": 3.5264,
5036
- "step": 17925
5037
- },
5038
- {
5039
- "epoch": 1.4206569054214484,
5040
- "grad_norm": 113.56609344482422,
5041
- "learning_rate": 1.9080847920431611e-07,
5042
- "loss": 3.7304,
5043
- "step": 17950
5044
- },
5045
- {
5046
- "epoch": 1.4226355362089433,
5047
- "grad_norm": 148.14793395996094,
5048
- "learning_rate": 1.907260351673284e-07,
5049
- "loss": 3.6923,
5050
- "step": 17975
5051
- },
5052
- {
5053
- "epoch": 1.4246141669964385,
5054
- "grad_norm": 141.70896911621094,
5055
- "learning_rate": 1.906435911303407e-07,
5056
- "loss": 3.813,
5057
- "step": 18000
5058
- },
5059
- {
5060
- "epoch": 1.4265927977839334,
5061
- "grad_norm": 107.8005142211914,
5062
- "learning_rate": 1.9056114709335302e-07,
5063
- "loss": 3.8025,
5064
- "step": 18025
5065
- },
5066
- {
5067
- "epoch": 1.4285714285714286,
5068
- "grad_norm": 119.71678924560547,
5069
- "learning_rate": 1.904787030563653e-07,
5070
- "loss": 3.6632,
5071
- "step": 18050
5072
- },
5073
- {
5074
- "epoch": 1.4305500593589235,
5075
- "grad_norm": 89.46141815185547,
5076
- "learning_rate": 1.9039625901937765e-07,
5077
- "loss": 3.5954,
5078
- "step": 18075
5079
- },
5080
- {
5081
- "epoch": 1.4325286901464187,
5082
- "grad_norm": 94.6341552734375,
5083
- "learning_rate": 1.9031381498238993e-07,
5084
- "loss": 3.7922,
5085
- "step": 18100
5086
- },
5087
- {
5088
- "epoch": 1.4345073209339136,
5089
- "grad_norm": 119.52442932128906,
5090
- "learning_rate": 1.9023137094540227e-07,
5091
- "loss": 3.8677,
5092
- "step": 18125
5093
- },
5094
- {
5095
- "epoch": 1.4364859517214088,
5096
- "grad_norm": 85.59278869628906,
5097
- "learning_rate": 1.9014892690841455e-07,
5098
- "loss": 3.7679,
5099
- "step": 18150
5100
- },
5101
- {
5102
- "epoch": 1.438464582508904,
5103
- "grad_norm": 140.89190673828125,
5104
- "learning_rate": 1.9006648287142687e-07,
5105
- "loss": 3.7617,
5106
- "step": 18175
5107
- },
5108
- {
5109
- "epoch": 1.440443213296399,
5110
- "grad_norm": 114.29212188720703,
5111
- "learning_rate": 1.8998403883443918e-07,
5112
- "loss": 3.9376,
5113
- "step": 18200
5114
- },
5115
- {
5116
- "epoch": 1.4424218440838938,
5117
- "grad_norm": 118.00497436523438,
5118
- "learning_rate": 1.899015947974515e-07,
5119
- "loss": 3.7841,
5120
- "step": 18225
5121
- },
5122
- {
5123
- "epoch": 1.444400474871389,
5124
- "grad_norm": 140.94200134277344,
5125
- "learning_rate": 1.8981915076046377e-07,
5126
- "loss": 3.694,
5127
- "step": 18250
5128
- },
5129
- {
5130
- "epoch": 1.4463791056588842,
5131
- "grad_norm": 153.66397094726562,
5132
- "learning_rate": 1.897367067234761e-07,
5133
- "loss": 3.7603,
5134
- "step": 18275
5135
- },
5136
- {
5137
- "epoch": 1.448357736446379,
5138
- "grad_norm": 128.06732177734375,
5139
- "learning_rate": 1.896542626864884e-07,
5140
- "loss": 3.7799,
5141
- "step": 18300
5142
- },
5143
- {
5144
- "epoch": 1.450336367233874,
5145
- "grad_norm": 112.23291778564453,
5146
- "learning_rate": 1.895718186495007e-07,
5147
- "loss": 3.7655,
5148
- "step": 18325
5149
- },
5150
- {
5151
- "epoch": 1.4523149980213692,
5152
- "grad_norm": 91.07350158691406,
5153
- "learning_rate": 1.8948937461251302e-07,
5154
- "loss": 3.643,
5155
- "step": 18350
5156
- },
5157
- {
5158
- "epoch": 1.4542936288088644,
5159
- "grad_norm": 98.65179443359375,
5160
- "learning_rate": 1.894069305755253e-07,
5161
- "loss": 3.7346,
5162
- "step": 18375
5163
- },
5164
- {
5165
- "epoch": 1.4562722595963593,
5166
- "grad_norm": 118.8671875,
5167
- "learning_rate": 1.8932448653853764e-07,
5168
- "loss": 3.7361,
5169
- "step": 18400
5170
- },
5171
- {
5172
- "epoch": 1.4582508903838542,
5173
- "grad_norm": 121.17200469970703,
5174
- "learning_rate": 1.8924204250154993e-07,
5175
- "loss": 3.7823,
5176
- "step": 18425
5177
- },
5178
- {
5179
- "epoch": 1.4602295211713494,
5180
- "grad_norm": 91.53225708007812,
5181
- "learning_rate": 1.8915959846456227e-07,
5182
- "loss": 3.659,
5183
- "step": 18450
5184
- },
5185
- {
5186
- "epoch": 1.4622081519588446,
5187
- "grad_norm": 140.78236389160156,
5188
- "learning_rate": 1.8907715442757455e-07,
5189
- "loss": 3.8495,
5190
- "step": 18475
5191
- },
5192
- {
5193
- "epoch": 1.4641867827463395,
5194
- "grad_norm": 136.43206787109375,
5195
- "learning_rate": 1.8899471039058686e-07,
5196
- "loss": 3.8468,
5197
- "step": 18500
5198
- },
5199
- {
5200
- "epoch": 1.4661654135338344,
5201
- "grad_norm": 106.83531951904297,
5202
- "learning_rate": 1.8891226635359917e-07,
5203
- "loss": 3.6799,
5204
- "step": 18525
5205
- },
5206
- {
5207
- "epoch": 1.4681440443213296,
5208
- "grad_norm": 132.8029327392578,
5209
- "learning_rate": 1.8882982231661149e-07,
5210
- "loss": 3.7441,
5211
- "step": 18550
5212
- },
5213
- {
5214
- "epoch": 1.4701226751088248,
5215
- "grad_norm": 125.74601745605469,
5216
- "learning_rate": 1.8874737827962377e-07,
5217
- "loss": 3.586,
5218
- "step": 18575
5219
- },
5220
- {
5221
- "epoch": 1.4721013058963197,
5222
- "grad_norm": 126.61151123046875,
5223
- "learning_rate": 1.886649342426361e-07,
5224
- "loss": 3.7686,
5225
- "step": 18600
5226
- },
5227
- {
5228
- "epoch": 1.4740799366838149,
5229
- "grad_norm": 127.62877655029297,
5230
- "learning_rate": 1.885824902056484e-07,
5231
- "loss": 3.74,
5232
- "step": 18625
5233
- },
5234
- {
5235
- "epoch": 1.4760585674713098,
5236
- "grad_norm": 134.3148193359375,
5237
- "learning_rate": 1.8850004616866068e-07,
5238
- "loss": 3.8815,
5239
- "step": 18650
5240
- },
5241
- {
5242
- "epoch": 1.478037198258805,
5243
- "grad_norm": 167.0948028564453,
5244
- "learning_rate": 1.8841760213167302e-07,
5245
- "loss": 3.7258,
5246
- "step": 18675
5247
- },
5248
- {
5249
- "epoch": 1.4800158290463,
5250
- "grad_norm": 103.17122650146484,
5251
- "learning_rate": 1.883351580946853e-07,
5252
- "loss": 3.7211,
5253
- "step": 18700
5254
- },
5255
- {
5256
- "epoch": 1.481994459833795,
5257
- "grad_norm": 98.9283218383789,
5258
- "learning_rate": 1.8825271405769764e-07,
5259
- "loss": 3.6401,
5260
- "step": 18725
5261
- },
5262
- {
5263
- "epoch": 1.48397309062129,
5264
- "grad_norm": 110.53363800048828,
5265
- "learning_rate": 1.8817027002070993e-07,
5266
- "loss": 3.84,
5267
- "step": 18750
5268
- },
5269
- {
5270
- "epoch": 1.4859517214087852,
5271
- "grad_norm": 115.60739135742188,
5272
- "learning_rate": 1.8808782598372224e-07,
5273
- "loss": 3.9043,
5274
- "step": 18775
5275
- },
5276
- {
5277
- "epoch": 1.4879303521962801,
5278
- "grad_norm": 130.4183807373047,
5279
- "learning_rate": 1.8800538194673455e-07,
5280
- "loss": 3.6778,
5281
- "step": 18800
5282
- },
5283
- {
5284
- "epoch": 1.4899089829837753,
5285
- "grad_norm": 94.30120086669922,
5286
- "learning_rate": 1.8792293790974686e-07,
5287
- "loss": 3.5853,
5288
- "step": 18825
5289
- },
5290
- {
5291
- "epoch": 1.4918876137712702,
5292
- "grad_norm": 102.75646209716797,
5293
- "learning_rate": 1.8784049387275917e-07,
5294
- "loss": 4.0326,
5295
- "step": 18850
5296
- },
5297
- {
5298
- "epoch": 1.4938662445587654,
5299
- "grad_norm": 153.1327362060547,
5300
- "learning_rate": 1.8775804983577148e-07,
5301
- "loss": 3.8785,
5302
- "step": 18875
5303
- },
5304
- {
5305
- "epoch": 1.4958448753462603,
5306
- "grad_norm": 100.03012084960938,
5307
- "learning_rate": 1.8767560579878377e-07,
5308
- "loss": 3.692,
5309
- "step": 18900
5310
- },
5311
- {
5312
- "epoch": 1.4978235061337555,
5313
- "grad_norm": 128.64657592773438,
5314
- "learning_rate": 1.8759316176179608e-07,
5315
- "loss": 3.7799,
5316
- "step": 18925
5317
- },
5318
- {
5319
- "epoch": 1.4998021369212504,
5320
- "grad_norm": 99.55155944824219,
5321
- "learning_rate": 1.875107177248084e-07,
5322
- "loss": 3.7112,
5323
- "step": 18950
5324
- },
5325
- {
5326
- "epoch": 1.5017807677087456,
5327
- "grad_norm": 171.94178771972656,
5328
- "learning_rate": 1.8742827368782068e-07,
5329
- "loss": 3.7663,
5330
- "step": 18975
5331
- },
5332
- {
5333
- "epoch": 1.5037593984962405,
5334
- "grad_norm": 107.23114013671875,
5335
- "learning_rate": 1.8734582965083302e-07,
5336
- "loss": 3.6127,
5337
- "step": 19000
5338
- },
5339
- {
5340
- "epoch": 1.5057380292837357,
5341
- "grad_norm": 101.01669311523438,
5342
- "learning_rate": 1.872633856138453e-07,
5343
- "loss": 3.5948,
5344
- "step": 19025
5345
- },
5346
- {
5347
- "epoch": 1.5077166600712308,
5348
- "grad_norm": 158.38211059570312,
5349
- "learning_rate": 1.8718094157685764e-07,
5350
- "loss": 3.5552,
5351
- "step": 19050
5352
- },
5353
- {
5354
- "epoch": 1.5096952908587258,
5355
- "grad_norm": 117.05270385742188,
5356
- "learning_rate": 1.8709849753986992e-07,
5357
- "loss": 3.7912,
5358
- "step": 19075
5359
- },
5360
- {
5361
- "epoch": 1.5116739216462207,
5362
- "grad_norm": 117.87310028076172,
5363
- "learning_rate": 1.8701605350288224e-07,
5364
- "loss": 3.7512,
5365
- "step": 19100
5366
- },
5367
- {
5368
- "epoch": 1.5136525524337159,
5369
- "grad_norm": 106.46678161621094,
5370
- "learning_rate": 1.8693360946589455e-07,
5371
- "loss": 3.7715,
5372
- "step": 19125
5373
- },
5374
- {
5375
- "epoch": 1.515631183221211,
5376
- "grad_norm": 107.43925476074219,
5377
- "learning_rate": 1.8685116542890686e-07,
5378
- "loss": 3.7395,
5379
- "step": 19150
5380
- },
5381
- {
5382
- "epoch": 1.517609814008706,
5383
- "grad_norm": 101.0313491821289,
5384
- "learning_rate": 1.8676872139191914e-07,
5385
- "loss": 3.6996,
5386
- "step": 19175
5387
- },
5388
- {
5389
- "epoch": 1.519588444796201,
5390
- "grad_norm": 179.96051025390625,
5391
- "learning_rate": 1.8668627735493148e-07,
5392
- "loss": 3.6133,
5393
- "step": 19200
5394
- },
5395
- {
5396
- "epoch": 1.521567075583696,
5397
- "grad_norm": 90.7691879272461,
5398
- "learning_rate": 1.8660383331794377e-07,
5399
- "loss": 3.6053,
5400
- "step": 19225
5401
- },
5402
- {
5403
- "epoch": 1.5235457063711912,
5404
- "grad_norm": 137.12637329101562,
5405
- "learning_rate": 1.8652138928095605e-07,
5406
- "loss": 3.6739,
5407
- "step": 19250
5408
- },
5409
- {
5410
- "epoch": 1.5255243371586862,
5411
- "grad_norm": 138.26014709472656,
5412
- "learning_rate": 1.864389452439684e-07,
5413
- "loss": 3.8328,
5414
- "step": 19275
5415
- },
5416
- {
5417
- "epoch": 1.5275029679461811,
5418
- "grad_norm": 94.83419799804688,
5419
- "learning_rate": 1.8635650120698068e-07,
5420
- "loss": 3.7555,
5421
- "step": 19300
5422
- },
5423
- {
5424
- "epoch": 1.5294815987336763,
5425
- "grad_norm": 105.08883666992188,
5426
- "learning_rate": 1.8627405716999301e-07,
5427
- "loss": 3.8474,
5428
- "step": 19325
5429
- },
5430
- {
5431
- "epoch": 1.5314602295211714,
5432
- "grad_norm": 124.04432678222656,
5433
- "learning_rate": 1.861916131330053e-07,
5434
- "loss": 3.5361,
5435
- "step": 19350
5436
- },
5437
- {
5438
- "epoch": 1.5334388603086664,
5439
- "grad_norm": 97.83289337158203,
5440
- "learning_rate": 1.861091690960176e-07,
5441
- "loss": 3.6283,
5442
- "step": 19375
5443
- },
5444
- {
5445
- "epoch": 1.5354174910961613,
5446
- "grad_norm": 110.26960754394531,
5447
- "learning_rate": 1.8602672505902992e-07,
5448
- "loss": 3.7872,
5449
- "step": 19400
5450
- },
5451
- {
5452
- "epoch": 1.5373961218836565,
5453
- "grad_norm": 115.99392700195312,
5454
- "learning_rate": 1.8594428102204223e-07,
5455
- "loss": 3.6191,
5456
- "step": 19425
5457
- },
5458
- {
5459
- "epoch": 1.5393747526711516,
5460
- "grad_norm": 105.29808044433594,
5461
- "learning_rate": 1.8586183698505454e-07,
5462
- "loss": 3.6837,
5463
- "step": 19450
5464
- },
5465
- {
5466
- "epoch": 1.5413533834586466,
5467
- "grad_norm": 100.55680847167969,
5468
- "learning_rate": 1.8577939294806686e-07,
5469
- "loss": 3.7439,
5470
- "step": 19475
5471
- },
5472
- {
5473
- "epoch": 1.5433320142461415,
5474
- "grad_norm": 117.83526611328125,
5475
- "learning_rate": 1.8569694891107914e-07,
5476
- "loss": 3.5938,
5477
- "step": 19500
5478
- },
5479
- {
5480
- "epoch": 1.5453106450336367,
5481
- "grad_norm": 84.95865631103516,
5482
- "learning_rate": 1.8561450487409148e-07,
5483
- "loss": 3.7425,
5484
- "step": 19525
5485
- },
5486
- {
5487
- "epoch": 1.5472892758211318,
5488
- "grad_norm": 128.07754516601562,
5489
- "learning_rate": 1.8553206083710376e-07,
5490
- "loss": 3.7379,
5491
- "step": 19550
5492
- },
5493
- {
5494
- "epoch": 1.5492679066086268,
5495
- "grad_norm": 103.81727600097656,
5496
- "learning_rate": 1.8544961680011605e-07,
5497
- "loss": 3.8121,
5498
- "step": 19575
5499
- },
5500
- {
5501
- "epoch": 1.5512465373961217,
5502
- "grad_norm": 113.71387481689453,
5503
- "learning_rate": 1.853671727631284e-07,
5504
- "loss": 3.5854,
5505
- "step": 19600
5506
- },
5507
- {
5508
- "epoch": 1.5532251681836169,
5509
- "grad_norm": 106.94216918945312,
5510
- "learning_rate": 1.8528472872614067e-07,
5511
- "loss": 3.7723,
5512
- "step": 19625
5513
- },
5514
- {
5515
- "epoch": 1.555203798971112,
5516
- "grad_norm": 113.77094268798828,
5517
- "learning_rate": 1.85202284689153e-07,
5518
- "loss": 3.7218,
5519
- "step": 19650
5520
- },
5521
- {
5522
- "epoch": 1.557182429758607,
5523
- "grad_norm": 102.69156646728516,
5524
- "learning_rate": 1.851198406521653e-07,
5525
- "loss": 3.5478,
5526
- "step": 19675
5527
- },
5528
- {
5529
- "epoch": 1.5591610605461022,
5530
- "grad_norm": 146.14686584472656,
5531
- "learning_rate": 1.850373966151776e-07,
5532
- "loss": 3.5888,
5533
- "step": 19700
5534
- },
5535
- {
5536
- "epoch": 1.5611396913335973,
5537
- "grad_norm": 117.55293273925781,
5538
- "learning_rate": 1.8495495257818992e-07,
5539
- "loss": 3.5802,
5540
- "step": 19725
5541
- },
5542
- {
5543
- "epoch": 1.5631183221210923,
5544
- "grad_norm": 116.557861328125,
5545
- "learning_rate": 1.8487250854120223e-07,
5546
- "loss": 3.6588,
5547
- "step": 19750
5548
- },
5549
- {
5550
- "epoch": 1.5650969529085872,
5551
- "grad_norm": 143.797607421875,
5552
- "learning_rate": 1.8479006450421452e-07,
5553
- "loss": 3.7888,
5554
- "step": 19775
5555
- },
5556
- {
5557
- "epoch": 1.5670755836960824,
5558
- "grad_norm": 90.89989471435547,
5559
- "learning_rate": 1.8470762046722685e-07,
5560
- "loss": 3.7565,
5561
- "step": 19800
5562
- },
5563
- {
5564
- "epoch": 1.5690542144835775,
5565
- "grad_norm": 93.96920776367188,
5566
- "learning_rate": 1.8462517643023914e-07,
5567
- "loss": 3.7576,
5568
- "step": 19825
5569
- },
5570
- {
5571
- "epoch": 1.5710328452710725,
5572
- "grad_norm": 146.9014129638672,
5573
- "learning_rate": 1.8454273239325148e-07,
5574
- "loss": 3.7507,
5575
- "step": 19850
5576
- },
5577
- {
5578
- "epoch": 1.5730114760585674,
5579
- "grad_norm": 119.38358306884766,
5580
- "learning_rate": 1.8446028835626376e-07,
5581
- "loss": 3.6588,
5582
- "step": 19875
5583
- },
5584
- {
5585
- "epoch": 1.5749901068460626,
5586
- "grad_norm": 113.38998413085938,
5587
- "learning_rate": 1.8437784431927605e-07,
5588
- "loss": 3.7866,
5589
- "step": 19900
5590
- },
5591
- {
5592
- "epoch": 1.5769687376335577,
5593
- "grad_norm": 103.051513671875,
5594
- "learning_rate": 1.8429540028228839e-07,
5595
- "loss": 3.6443,
5596
- "step": 19925
5597
- },
5598
- {
5599
- "epoch": 1.5789473684210527,
5600
- "grad_norm": 94.77304077148438,
5601
- "learning_rate": 1.8421295624530067e-07,
5602
- "loss": 3.6966,
5603
- "step": 19950
5604
- },
5605
- {
5606
- "epoch": 1.5809259992085476,
5607
- "grad_norm": 167.68310546875,
5608
- "learning_rate": 1.8413051220831298e-07,
5609
- "loss": 3.684,
5610
- "step": 19975
5611
- },
5612
- {
5613
- "epoch": 1.5829046299960428,
5614
- "grad_norm": 96.78482055664062,
5615
- "learning_rate": 1.840480681713253e-07,
5616
- "loss": 3.733,
5617
- "step": 20000
5618
- },
5619
- {
5620
- "epoch": 1.5829046299960428,
5621
- "eval_loss": 3.6955573558807373,
5622
- "eval_runtime": 9.5434,
5623
- "eval_samples_per_second": 264.894,
5624
- "eval_steps_per_second": 33.112,
5625
- "step": 20000
5626
  }
5627
  ],
5628
  "logging_steps": 25,
@@ -5642,7 +2834,7 @@
5642
  "attributes": {}
5643
  }
5644
  },
5645
- "total_flos": 71534592000000.0,
5646
  "train_batch_size": 2,
5647
  "trial_name": null,
5648
  "trial_params": null
 
1
  {
2
+ "best_metric": 3.8681728839874268,
3
+ "best_model_checkpoint": "checkpoints/test_1M_1-2025-02-12-12-32/checkpoint-10000",
4
+ "epoch": 0.7914523149980214,
5
  "eval_steps": 10000,
6
+ "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2815
  "eval_samples_per_second": 264.165,
2816
  "eval_steps_per_second": 33.021,
2817
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2818
  }
2819
  ],
2820
  "logging_steps": 25,
 
2834
  "attributes": {}
2835
  }
2836
  },
2837
+ "total_flos": 35767296000000.0,
2838
  "train_batch_size": 2,
2839
  "trial_name": null,
2840
  "trial_params": null