{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1380738695201933,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00034518467380048324,
"grad_norm": 25.833335876464844,
"learning_rate": 0.0,
"loss": 2.2926,
"step": 1
},
{
"epoch": 0.0006903693476009665,
"grad_norm": 23.58083724975586,
"learning_rate": 2e-05,
"loss": 2.1943,
"step": 2
},
{
"epoch": 0.0010355540214014498,
"grad_norm": 24.2187442779541,
"learning_rate": 2e-05,
"loss": 2.3772,
"step": 3
},
{
"epoch": 0.001380738695201933,
"grad_norm": 14.579557418823242,
"learning_rate": 2e-05,
"loss": 2.0945,
"step": 4
},
{
"epoch": 0.0017259233690024164,
"grad_norm": 15.069305419921875,
"learning_rate": 2e-05,
"loss": 2.1537,
"step": 5
},
{
"epoch": 0.0020711080428028996,
"grad_norm": 14.385339736938477,
"learning_rate": 2e-05,
"loss": 2.1585,
"step": 6
},
{
"epoch": 0.002416292716603383,
"grad_norm": 9.95476245880127,
"learning_rate": 2e-05,
"loss": 1.8333,
"step": 7
},
{
"epoch": 0.002761477390403866,
"grad_norm": 9.15005111694336,
"learning_rate": 2e-05,
"loss": 1.937,
"step": 8
},
{
"epoch": 0.0031066620642043494,
"grad_norm": 7.361661911010742,
"learning_rate": 2e-05,
"loss": 1.9484,
"step": 9
},
{
"epoch": 0.0034518467380048328,
"grad_norm": 5.931179046630859,
"learning_rate": 2e-05,
"loss": 1.7668,
"step": 10
},
{
"epoch": 0.0037970314118053157,
"grad_norm": 8.192151069641113,
"learning_rate": 2e-05,
"loss": 1.8234,
"step": 11
},
{
"epoch": 0.004142216085605799,
"grad_norm": 6.827078342437744,
"learning_rate": 2e-05,
"loss": 1.9133,
"step": 12
},
{
"epoch": 0.0044874007594062825,
"grad_norm": 7.808759689331055,
"learning_rate": 2e-05,
"loss": 1.8314,
"step": 13
},
{
"epoch": 0.004832585433206766,
"grad_norm": 7.531504154205322,
"learning_rate": 2e-05,
"loss": 1.7961,
"step": 14
},
{
"epoch": 0.0051777701070072485,
"grad_norm": 6.617609977722168,
"learning_rate": 2e-05,
"loss": 1.7624,
"step": 15
},
{
"epoch": 0.005522954780807732,
"grad_norm": 4.966954708099365,
"learning_rate": 2e-05,
"loss": 1.7034,
"step": 16
},
{
"epoch": 0.005868139454608215,
"grad_norm": 5.566157817840576,
"learning_rate": 2e-05,
"loss": 1.7722,
"step": 17
},
{
"epoch": 0.006213324128408699,
"grad_norm": 4.915079593658447,
"learning_rate": 2e-05,
"loss": 1.6602,
"step": 18
},
{
"epoch": 0.006558508802209182,
"grad_norm": 4.77263069152832,
"learning_rate": 2e-05,
"loss": 1.6854,
"step": 19
},
{
"epoch": 0.0069036934760096655,
"grad_norm": 4.299682140350342,
"learning_rate": 2e-05,
"loss": 1.6217,
"step": 20
},
{
"epoch": 0.007248878149810148,
"grad_norm": 4.1203694343566895,
"learning_rate": 2e-05,
"loss": 1.6546,
"step": 21
},
{
"epoch": 0.0075940628236106315,
"grad_norm": 4.058311939239502,
"learning_rate": 2e-05,
"loss": 1.6795,
"step": 22
},
{
"epoch": 0.007939247497411116,
"grad_norm": 3.668001651763916,
"learning_rate": 2e-05,
"loss": 1.7228,
"step": 23
},
{
"epoch": 0.008284432171211598,
"grad_norm": 3.397109270095825,
"learning_rate": 2e-05,
"loss": 1.555,
"step": 24
},
{
"epoch": 0.00862961684501208,
"grad_norm": 4.351656436920166,
"learning_rate": 2e-05,
"loss": 1.6977,
"step": 25
},
{
"epoch": 0.008974801518812565,
"grad_norm": 3.7789554595947266,
"learning_rate": 2e-05,
"loss": 1.6534,
"step": 26
},
{
"epoch": 0.009319986192613048,
"grad_norm": 3.4693193435668945,
"learning_rate": 2e-05,
"loss": 1.6248,
"step": 27
},
{
"epoch": 0.009665170866413532,
"grad_norm": 2.6027116775512695,
"learning_rate": 2e-05,
"loss": 1.4901,
"step": 28
},
{
"epoch": 0.010010355540214014,
"grad_norm": 2.6816513538360596,
"learning_rate": 2e-05,
"loss": 1.4964,
"step": 29
},
{
"epoch": 0.010355540214014497,
"grad_norm": 5.244405746459961,
"learning_rate": 2e-05,
"loss": 1.5238,
"step": 30
},
{
"epoch": 0.010700724887814981,
"grad_norm": 4.071628570556641,
"learning_rate": 2e-05,
"loss": 1.5401,
"step": 31
},
{
"epoch": 0.011045909561615464,
"grad_norm": 3.897395372390747,
"learning_rate": 2e-05,
"loss": 1.4759,
"step": 32
},
{
"epoch": 0.011391094235415948,
"grad_norm": 2.9609882831573486,
"learning_rate": 2e-05,
"loss": 1.4919,
"step": 33
},
{
"epoch": 0.01173627890921643,
"grad_norm": 3.1106183528900146,
"learning_rate": 2e-05,
"loss": 1.5436,
"step": 34
},
{
"epoch": 0.012081463583016915,
"grad_norm": 3.7441351413726807,
"learning_rate": 2e-05,
"loss": 1.5496,
"step": 35
},
{
"epoch": 0.012426648256817397,
"grad_norm": 3.6406350135803223,
"learning_rate": 2e-05,
"loss": 1.4531,
"step": 36
},
{
"epoch": 0.01277183293061788,
"grad_norm": 2.915447950363159,
"learning_rate": 2e-05,
"loss": 1.48,
"step": 37
},
{
"epoch": 0.013117017604418364,
"grad_norm": 2.8513331413269043,
"learning_rate": 2e-05,
"loss": 1.5192,
"step": 38
},
{
"epoch": 0.013462202278218847,
"grad_norm": 3.2347466945648193,
"learning_rate": 2e-05,
"loss": 1.5727,
"step": 39
},
{
"epoch": 0.013807386952019331,
"grad_norm": 2.82135272026062,
"learning_rate": 2e-05,
"loss": 1.5098,
"step": 40
},
{
"epoch": 0.014152571625819814,
"grad_norm": 2.694873809814453,
"learning_rate": 2e-05,
"loss": 1.4223,
"step": 41
},
{
"epoch": 0.014497756299620296,
"grad_norm": 2.7129478454589844,
"learning_rate": 2e-05,
"loss": 1.4062,
"step": 42
},
{
"epoch": 0.01484294097342078,
"grad_norm": 2.6555328369140625,
"learning_rate": 2e-05,
"loss": 1.4993,
"step": 43
},
{
"epoch": 0.015188125647221263,
"grad_norm": 2.3439159393310547,
"learning_rate": 2e-05,
"loss": 1.4281,
"step": 44
},
{
"epoch": 0.015533310321021747,
"grad_norm": 2.40368914604187,
"learning_rate": 2e-05,
"loss": 1.4261,
"step": 45
},
{
"epoch": 0.01587849499482223,
"grad_norm": 2.3288614749908447,
"learning_rate": 2e-05,
"loss": 1.3958,
"step": 46
},
{
"epoch": 0.016223679668622714,
"grad_norm": 2.474519968032837,
"learning_rate": 2e-05,
"loss": 1.4246,
"step": 47
},
{
"epoch": 0.016568864342423197,
"grad_norm": 2.680997371673584,
"learning_rate": 2e-05,
"loss": 1.4318,
"step": 48
},
{
"epoch": 0.01691404901622368,
"grad_norm": 2.6899235248565674,
"learning_rate": 2e-05,
"loss": 1.3581,
"step": 49
},
{
"epoch": 0.01725923369002416,
"grad_norm": 2.5045225620269775,
"learning_rate": 2e-05,
"loss": 1.3983,
"step": 50
},
{
"epoch": 0.017604418363824648,
"grad_norm": 2.650184154510498,
"learning_rate": 2e-05,
"loss": 1.4005,
"step": 51
},
{
"epoch": 0.01794960303762513,
"grad_norm": 2.560302495956421,
"learning_rate": 2e-05,
"loss": 1.3706,
"step": 52
},
{
"epoch": 0.018294787711425613,
"grad_norm": 2.116626739501953,
"learning_rate": 2e-05,
"loss": 1.2987,
"step": 53
},
{
"epoch": 0.018639972385226095,
"grad_norm": 3.0732431411743164,
"learning_rate": 2e-05,
"loss": 1.4394,
"step": 54
},
{
"epoch": 0.018985157059026578,
"grad_norm": 2.880014657974243,
"learning_rate": 2e-05,
"loss": 1.3772,
"step": 55
},
{
"epoch": 0.019330341732827064,
"grad_norm": 2.65002179145813,
"learning_rate": 2e-05,
"loss": 1.344,
"step": 56
},
{
"epoch": 0.019675526406627546,
"grad_norm": 2.4109294414520264,
"learning_rate": 2e-05,
"loss": 1.3818,
"step": 57
},
{
"epoch": 0.02002071108042803,
"grad_norm": 3.318305730819702,
"learning_rate": 2e-05,
"loss": 1.4488,
"step": 58
},
{
"epoch": 0.02036589575422851,
"grad_norm": 2.9551541805267334,
"learning_rate": 2e-05,
"loss": 1.3245,
"step": 59
},
{
"epoch": 0.020711080428028994,
"grad_norm": 2.512965679168701,
"learning_rate": 2e-05,
"loss": 1.3476,
"step": 60
},
{
"epoch": 0.02105626510182948,
"grad_norm": 2.608680248260498,
"learning_rate": 2e-05,
"loss": 1.335,
"step": 61
},
{
"epoch": 0.021401449775629963,
"grad_norm": 2.2880616188049316,
"learning_rate": 2e-05,
"loss": 1.2781,
"step": 62
},
{
"epoch": 0.021746634449430445,
"grad_norm": 2.7310123443603516,
"learning_rate": 2e-05,
"loss": 1.3449,
"step": 63
},
{
"epoch": 0.022091819123230928,
"grad_norm": 2.5008130073547363,
"learning_rate": 2e-05,
"loss": 1.4042,
"step": 64
},
{
"epoch": 0.02243700379703141,
"grad_norm": 3.5907320976257324,
"learning_rate": 2e-05,
"loss": 1.344,
"step": 65
},
{
"epoch": 0.022782188470831896,
"grad_norm": 2.4797489643096924,
"learning_rate": 2e-05,
"loss": 1.3342,
"step": 66
},
{
"epoch": 0.02312737314463238,
"grad_norm": 2.556204319000244,
"learning_rate": 2e-05,
"loss": 1.3244,
"step": 67
},
{
"epoch": 0.02347255781843286,
"grad_norm": 5.068964004516602,
"learning_rate": 2e-05,
"loss": 1.3041,
"step": 68
},
{
"epoch": 0.023817742492233344,
"grad_norm": 2.165076494216919,
"learning_rate": 2e-05,
"loss": 1.2727,
"step": 69
},
{
"epoch": 0.02416292716603383,
"grad_norm": 2.202164649963379,
"learning_rate": 2e-05,
"loss": 1.2887,
"step": 70
},
{
"epoch": 0.024508111839834312,
"grad_norm": 2.3834869861602783,
"learning_rate": 2e-05,
"loss": 1.2694,
"step": 71
},
{
"epoch": 0.024853296513634795,
"grad_norm": 2.5316550731658936,
"learning_rate": 2e-05,
"loss": 1.2514,
"step": 72
},
{
"epoch": 0.025198481187435277,
"grad_norm": 1.9449459314346313,
"learning_rate": 2e-05,
"loss": 1.26,
"step": 73
},
{
"epoch": 0.02554366586123576,
"grad_norm": 2.2826900482177734,
"learning_rate": 2e-05,
"loss": 1.2931,
"step": 74
},
{
"epoch": 0.025888850535036246,
"grad_norm": 2.260650396347046,
"learning_rate": 2e-05,
"loss": 1.2732,
"step": 75
},
{
"epoch": 0.02623403520883673,
"grad_norm": 2.356182336807251,
"learning_rate": 2e-05,
"loss": 1.2449,
"step": 76
},
{
"epoch": 0.02657921988263721,
"grad_norm": 2.199906826019287,
"learning_rate": 2e-05,
"loss": 1.2733,
"step": 77
},
{
"epoch": 0.026924404556437694,
"grad_norm": 2.3083510398864746,
"learning_rate": 2e-05,
"loss": 1.2257,
"step": 78
},
{
"epoch": 0.027269589230238176,
"grad_norm": 2.2658169269561768,
"learning_rate": 2e-05,
"loss": 1.2525,
"step": 79
},
{
"epoch": 0.027614773904038662,
"grad_norm": 2.352308988571167,
"learning_rate": 2e-05,
"loss": 1.2202,
"step": 80
},
{
"epoch": 0.027959958577839145,
"grad_norm": 2.523381471633911,
"learning_rate": 2e-05,
"loss": 1.2996,
"step": 81
},
{
"epoch": 0.028305143251639627,
"grad_norm": 2.4327428340911865,
"learning_rate": 2e-05,
"loss": 1.2135,
"step": 82
},
{
"epoch": 0.02865032792544011,
"grad_norm": 2.4549570083618164,
"learning_rate": 2e-05,
"loss": 1.2813,
"step": 83
},
{
"epoch": 0.028995512599240592,
"grad_norm": 2.4394640922546387,
"learning_rate": 2e-05,
"loss": 1.2828,
"step": 84
},
{
"epoch": 0.029340697273041078,
"grad_norm": 2.4780633449554443,
"learning_rate": 2e-05,
"loss": 1.2517,
"step": 85
},
{
"epoch": 0.02968588194684156,
"grad_norm": 2.326880931854248,
"learning_rate": 2e-05,
"loss": 1.2288,
"step": 86
},
{
"epoch": 0.030031066620642043,
"grad_norm": 3.1833627223968506,
"learning_rate": 2e-05,
"loss": 1.1978,
"step": 87
},
{
"epoch": 0.030376251294442526,
"grad_norm": 2.624091625213623,
"learning_rate": 2e-05,
"loss": 1.2534,
"step": 88
},
{
"epoch": 0.03072143596824301,
"grad_norm": 2.531895160675049,
"learning_rate": 2e-05,
"loss": 1.2263,
"step": 89
},
{
"epoch": 0.031066620642043494,
"grad_norm": 2.2346715927124023,
"learning_rate": 2e-05,
"loss": 1.2301,
"step": 90
},
{
"epoch": 0.03141180531584398,
"grad_norm": 2.237839698791504,
"learning_rate": 2e-05,
"loss": 1.1897,
"step": 91
},
{
"epoch": 0.03175698998964446,
"grad_norm": 2.4267807006835938,
"learning_rate": 2e-05,
"loss": 1.2508,
"step": 92
},
{
"epoch": 0.03210217466344494,
"grad_norm": 2.2506682872772217,
"learning_rate": 2e-05,
"loss": 1.2262,
"step": 93
},
{
"epoch": 0.03244735933724543,
"grad_norm": 2.5266077518463135,
"learning_rate": 2e-05,
"loss": 1.3139,
"step": 94
},
{
"epoch": 0.03279254401104591,
"grad_norm": 2.2002406120300293,
"learning_rate": 2e-05,
"loss": 1.2755,
"step": 95
},
{
"epoch": 0.03313772868484639,
"grad_norm": 2.20263409614563,
"learning_rate": 2e-05,
"loss": 1.2388,
"step": 96
},
{
"epoch": 0.03348291335864688,
"grad_norm": 2.297576665878296,
"learning_rate": 2e-05,
"loss": 1.2406,
"step": 97
},
{
"epoch": 0.03382809803244736,
"grad_norm": 2.3951361179351807,
"learning_rate": 2e-05,
"loss": 1.2244,
"step": 98
},
{
"epoch": 0.034173282706247844,
"grad_norm": 2.2707271575927734,
"learning_rate": 2e-05,
"loss": 1.2719,
"step": 99
},
{
"epoch": 0.03451846738004832,
"grad_norm": 2.2766411304473877,
"learning_rate": 2e-05,
"loss": 1.23,
"step": 100
},
{
"epoch": 0.03486365205384881,
"grad_norm": 2.253887414932251,
"learning_rate": 2e-05,
"loss": 1.1369,
"step": 101
},
{
"epoch": 0.035208836727649295,
"grad_norm": 2.0003821849823,
"learning_rate": 2e-05,
"loss": 1.2547,
"step": 102
},
{
"epoch": 0.035554021401449774,
"grad_norm": 2.277153253555298,
"learning_rate": 2e-05,
"loss": 1.2244,
"step": 103
},
{
"epoch": 0.03589920607525026,
"grad_norm": 2.1561081409454346,
"learning_rate": 2e-05,
"loss": 1.2222,
"step": 104
},
{
"epoch": 0.03624439074905074,
"grad_norm": 2.0002012252807617,
"learning_rate": 2e-05,
"loss": 1.1599,
"step": 105
},
{
"epoch": 0.036589575422851225,
"grad_norm": 2.3313021659851074,
"learning_rate": 2e-05,
"loss": 1.1658,
"step": 106
},
{
"epoch": 0.03693476009665171,
"grad_norm": 2.58686900138855,
"learning_rate": 2e-05,
"loss": 1.2282,
"step": 107
},
{
"epoch": 0.03727994477045219,
"grad_norm": 2.485671043395996,
"learning_rate": 2e-05,
"loss": 1.1537,
"step": 108
},
{
"epoch": 0.037625129444252677,
"grad_norm": 2.3962278366088867,
"learning_rate": 2e-05,
"loss": 1.133,
"step": 109
},
{
"epoch": 0.037970314118053156,
"grad_norm": 2.118319034576416,
"learning_rate": 2e-05,
"loss": 1.1769,
"step": 110
},
{
"epoch": 0.03831549879185364,
"grad_norm": 2.095940113067627,
"learning_rate": 2e-05,
"loss": 1.1763,
"step": 111
},
{
"epoch": 0.03866068346565413,
"grad_norm": 2.0862512588500977,
"learning_rate": 2e-05,
"loss": 1.1356,
"step": 112
},
{
"epoch": 0.03900586813945461,
"grad_norm": 1.9276546239852905,
"learning_rate": 2e-05,
"loss": 1.2068,
"step": 113
},
{
"epoch": 0.03935105281325509,
"grad_norm": 2.5604825019836426,
"learning_rate": 2e-05,
"loss": 1.16,
"step": 114
},
{
"epoch": 0.03969623748705557,
"grad_norm": 2.200289726257324,
"learning_rate": 2e-05,
"loss": 1.1811,
"step": 115
},
{
"epoch": 0.04004142216085606,
"grad_norm": 2.2654318809509277,
"learning_rate": 2e-05,
"loss": 1.2745,
"step": 116
},
{
"epoch": 0.040386606834656544,
"grad_norm": 2.194129228591919,
"learning_rate": 2e-05,
"loss": 1.1633,
"step": 117
},
{
"epoch": 0.04073179150845702,
"grad_norm": 2.267609119415283,
"learning_rate": 2e-05,
"loss": 1.1025,
"step": 118
},
{
"epoch": 0.04107697618225751,
"grad_norm": 2.3966379165649414,
"learning_rate": 2e-05,
"loss": 1.1624,
"step": 119
},
{
"epoch": 0.04142216085605799,
"grad_norm": 2.3428127765655518,
"learning_rate": 2e-05,
"loss": 1.2521,
"step": 120
},
{
"epoch": 0.041767345529858474,
"grad_norm": 2.257154941558838,
"learning_rate": 2e-05,
"loss": 1.1427,
"step": 121
},
{
"epoch": 0.04211253020365896,
"grad_norm": 2.4615261554718018,
"learning_rate": 2e-05,
"loss": 1.209,
"step": 122
},
{
"epoch": 0.04245771487745944,
"grad_norm": 2.215366840362549,
"learning_rate": 2e-05,
"loss": 1.1213,
"step": 123
},
{
"epoch": 0.042802899551259925,
"grad_norm": 2.293936014175415,
"learning_rate": 2e-05,
"loss": 1.1118,
"step": 124
},
{
"epoch": 0.043148084225060404,
"grad_norm": 2.351991653442383,
"learning_rate": 2e-05,
"loss": 1.1849,
"step": 125
},
{
"epoch": 0.04349326889886089,
"grad_norm": 2.118906259536743,
"learning_rate": 2e-05,
"loss": 1.1905,
"step": 126
},
{
"epoch": 0.043838453572661376,
"grad_norm": 2.1108460426330566,
"learning_rate": 2e-05,
"loss": 1.1787,
"step": 127
},
{
"epoch": 0.044183638246461855,
"grad_norm": 2.560715436935425,
"learning_rate": 2e-05,
"loss": 1.0727,
"step": 128
},
{
"epoch": 0.04452882292026234,
"grad_norm": 1.9983900785446167,
"learning_rate": 2e-05,
"loss": 1.1665,
"step": 129
},
{
"epoch": 0.04487400759406282,
"grad_norm": 2.3714847564697266,
"learning_rate": 2e-05,
"loss": 1.1488,
"step": 130
},
{
"epoch": 0.045219192267863306,
"grad_norm": 2.243546724319458,
"learning_rate": 2e-05,
"loss": 1.0965,
"step": 131
},
{
"epoch": 0.04556437694166379,
"grad_norm": 1.989858627319336,
"learning_rate": 2e-05,
"loss": 1.0833,
"step": 132
},
{
"epoch": 0.04590956161546427,
"grad_norm": 2.298943519592285,
"learning_rate": 2e-05,
"loss": 1.1548,
"step": 133
},
{
"epoch": 0.04625474628926476,
"grad_norm": 2.256289005279541,
"learning_rate": 2e-05,
"loss": 1.1139,
"step": 134
},
{
"epoch": 0.04659993096306524,
"grad_norm": 2.386530876159668,
"learning_rate": 2e-05,
"loss": 1.1245,
"step": 135
},
{
"epoch": 0.04694511563686572,
"grad_norm": 2.073687791824341,
"learning_rate": 2e-05,
"loss": 1.1337,
"step": 136
},
{
"epoch": 0.04729030031066621,
"grad_norm": 2.307697057723999,
"learning_rate": 2e-05,
"loss": 1.1325,
"step": 137
},
{
"epoch": 0.04763548498446669,
"grad_norm": 2.1158194541931152,
"learning_rate": 2e-05,
"loss": 1.1637,
"step": 138
},
{
"epoch": 0.047980669658267174,
"grad_norm": 2.21449613571167,
"learning_rate": 2e-05,
"loss": 1.1802,
"step": 139
},
{
"epoch": 0.04832585433206766,
"grad_norm": 2.041476249694824,
"learning_rate": 2e-05,
"loss": 1.1007,
"step": 140
},
{
"epoch": 0.04867103900586814,
"grad_norm": 2.262849807739258,
"learning_rate": 2e-05,
"loss": 1.1006,
"step": 141
},
{
"epoch": 0.049016223679668625,
"grad_norm": 2.207761287689209,
"learning_rate": 2e-05,
"loss": 1.133,
"step": 142
},
{
"epoch": 0.049361408353469104,
"grad_norm": 2.083613872528076,
"learning_rate": 2e-05,
"loss": 1.1319,
"step": 143
},
{
"epoch": 0.04970659302726959,
"grad_norm": 2.1225838661193848,
"learning_rate": 2e-05,
"loss": 1.0786,
"step": 144
},
{
"epoch": 0.050051777701070076,
"grad_norm": 2.17154598236084,
"learning_rate": 2e-05,
"loss": 1.2138,
"step": 145
},
{
"epoch": 0.050396962374870555,
"grad_norm": 2.655251979827881,
"learning_rate": 2e-05,
"loss": 1.1639,
"step": 146
},
{
"epoch": 0.05074214704867104,
"grad_norm": 2.241605758666992,
"learning_rate": 2e-05,
"loss": 1.1345,
"step": 147
},
{
"epoch": 0.05108733172247152,
"grad_norm": 2.397520065307617,
"learning_rate": 2e-05,
"loss": 1.1592,
"step": 148
},
{
"epoch": 0.051432516396272006,
"grad_norm": 2.2444629669189453,
"learning_rate": 2e-05,
"loss": 1.134,
"step": 149
},
{
"epoch": 0.05177770107007249,
"grad_norm": 2.179332971572876,
"learning_rate": 2e-05,
"loss": 1.1726,
"step": 150
},
{
"epoch": 0.05212288574387297,
"grad_norm": 2.1803085803985596,
"learning_rate": 2e-05,
"loss": 1.1458,
"step": 151
},
{
"epoch": 0.05246807041767346,
"grad_norm": 2.06752872467041,
"learning_rate": 2e-05,
"loss": 1.1741,
"step": 152
},
{
"epoch": 0.052813255091473936,
"grad_norm": 1.9567921161651611,
"learning_rate": 2e-05,
"loss": 1.1783,
"step": 153
},
{
"epoch": 0.05315843976527442,
"grad_norm": 1.8780710697174072,
"learning_rate": 2e-05,
"loss": 1.0461,
"step": 154
},
{
"epoch": 0.05350362443907491,
"grad_norm": 2.142355442047119,
"learning_rate": 2e-05,
"loss": 1.1683,
"step": 155
},
{
"epoch": 0.05384880911287539,
"grad_norm": 2.404834270477295,
"learning_rate": 2e-05,
"loss": 1.1367,
"step": 156
},
{
"epoch": 0.05419399378667587,
"grad_norm": 2.467586040496826,
"learning_rate": 2e-05,
"loss": 1.0616,
"step": 157
},
{
"epoch": 0.05453917846047635,
"grad_norm": 2.428678035736084,
"learning_rate": 2e-05,
"loss": 1.1587,
"step": 158
},
{
"epoch": 0.05488436313427684,
"grad_norm": 1.9721519947052002,
"learning_rate": 2e-05,
"loss": 1.0864,
"step": 159
},
{
"epoch": 0.055229547808077324,
"grad_norm": 2.282735586166382,
"learning_rate": 2e-05,
"loss": 1.1805,
"step": 160
},
{
"epoch": 0.0555747324818778,
"grad_norm": 2.136899471282959,
"learning_rate": 2e-05,
"loss": 1.0941,
"step": 161
},
{
"epoch": 0.05591991715567829,
"grad_norm": 2.0251846313476562,
"learning_rate": 2e-05,
"loss": 1.0796,
"step": 162
},
{
"epoch": 0.05626510182947877,
"grad_norm": 2.1328299045562744,
"learning_rate": 2e-05,
"loss": 1.0917,
"step": 163
},
{
"epoch": 0.056610286503279254,
"grad_norm": 2.205331802368164,
"learning_rate": 2e-05,
"loss": 1.1792,
"step": 164
},
{
"epoch": 0.05695547117707974,
"grad_norm": 2.0339744091033936,
"learning_rate": 2e-05,
"loss": 1.1874,
"step": 165
},
{
"epoch": 0.05730065585088022,
"grad_norm": 1.8030365705490112,
"learning_rate": 2e-05,
"loss": 1.0929,
"step": 166
},
{
"epoch": 0.057645840524680705,
"grad_norm": 2.1905670166015625,
"learning_rate": 2e-05,
"loss": 1.1831,
"step": 167
},
{
"epoch": 0.057991025198481184,
"grad_norm": 2.142336845397949,
"learning_rate": 2e-05,
"loss": 1.1237,
"step": 168
},
{
"epoch": 0.05833620987228167,
"grad_norm": 2.3521053791046143,
"learning_rate": 2e-05,
"loss": 1.1077,
"step": 169
},
{
"epoch": 0.058681394546082156,
"grad_norm": 2.105743408203125,
"learning_rate": 2e-05,
"loss": 1.1048,
"step": 170
},
{
"epoch": 0.059026579219882636,
"grad_norm": 1.9414738416671753,
"learning_rate": 2e-05,
"loss": 1.112,
"step": 171
},
{
"epoch": 0.05937176389368312,
"grad_norm": 1.9606658220291138,
"learning_rate": 2e-05,
"loss": 1.1109,
"step": 172
},
{
"epoch": 0.0597169485674836,
"grad_norm": 2.3274831771850586,
"learning_rate": 2e-05,
"loss": 1.1803,
"step": 173
},
{
"epoch": 0.06006213324128409,
"grad_norm": 2.1384570598602295,
"learning_rate": 2e-05,
"loss": 1.0515,
"step": 174
},
{
"epoch": 0.06040731791508457,
"grad_norm": 2.0795719623565674,
"learning_rate": 2e-05,
"loss": 1.0581,
"step": 175
},
{
"epoch": 0.06075250258888505,
"grad_norm": 2.0180423259735107,
"learning_rate": 2e-05,
"loss": 1.0686,
"step": 176
},
{
"epoch": 0.06109768726268554,
"grad_norm": 2.0913267135620117,
"learning_rate": 2e-05,
"loss": 1.117,
"step": 177
},
{
"epoch": 0.06144287193648602,
"grad_norm": 2.0325934886932373,
"learning_rate": 2e-05,
"loss": 1.1526,
"step": 178
},
{
"epoch": 0.0617880566102865,
"grad_norm": 2.222254991531372,
"learning_rate": 2e-05,
"loss": 1.113,
"step": 179
},
{
"epoch": 0.06213324128408699,
"grad_norm": 2.2039270401000977,
"learning_rate": 2e-05,
"loss": 1.1886,
"step": 180
},
{
"epoch": 0.06247842595788747,
"grad_norm": 2.0291781425476074,
"learning_rate": 2e-05,
"loss": 1.0884,
"step": 181
},
{
"epoch": 0.06282361063168795,
"grad_norm": 2.2183430194854736,
"learning_rate": 2e-05,
"loss": 1.0223,
"step": 182
},
{
"epoch": 0.06316879530548844,
"grad_norm": 2.37440824508667,
"learning_rate": 2e-05,
"loss": 1.0775,
"step": 183
},
{
"epoch": 0.06351397997928893,
"grad_norm": 1.8214384317398071,
"learning_rate": 2e-05,
"loss": 1.1279,
"step": 184
},
{
"epoch": 0.0638591646530894,
"grad_norm": 2.205291271209717,
"learning_rate": 2e-05,
"loss": 1.0396,
"step": 185
},
{
"epoch": 0.06420434932688988,
"grad_norm": 2.137577533721924,
"learning_rate": 2e-05,
"loss": 1.0861,
"step": 186
},
{
"epoch": 0.06454953400069037,
"grad_norm": 1.982663869857788,
"learning_rate": 2e-05,
"loss": 1.1612,
"step": 187
},
{
"epoch": 0.06489471867449086,
"grad_norm": 21.140506744384766,
"learning_rate": 2e-05,
"loss": 1.0824,
"step": 188
},
{
"epoch": 0.06523990334829134,
"grad_norm": 2.2611193656921387,
"learning_rate": 2e-05,
"loss": 1.0795,
"step": 189
},
{
"epoch": 0.06558508802209181,
"grad_norm": 2.0905325412750244,
"learning_rate": 2e-05,
"loss": 1.1989,
"step": 190
},
{
"epoch": 0.0659302726958923,
"grad_norm": 1.9430997371673584,
"learning_rate": 2e-05,
"loss": 1.0885,
"step": 191
},
{
"epoch": 0.06627545736969279,
"grad_norm": 1.8876497745513916,
"learning_rate": 2e-05,
"loss": 1.0708,
"step": 192
},
{
"epoch": 0.06662064204349327,
"grad_norm": 2.0716099739074707,
"learning_rate": 2e-05,
"loss": 1.124,
"step": 193
},
{
"epoch": 0.06696582671729376,
"grad_norm": 2.413959503173828,
"learning_rate": 2e-05,
"loss": 1.1856,
"step": 194
},
{
"epoch": 0.06731101139109423,
"grad_norm": 1.8021107912063599,
"learning_rate": 2e-05,
"loss": 1.1284,
"step": 195
},
{
"epoch": 0.06765619606489472,
"grad_norm": 2.2795395851135254,
"learning_rate": 2e-05,
"loss": 1.1101,
"step": 196
},
{
"epoch": 0.0680013807386952,
"grad_norm": 1.936448097229004,
"learning_rate": 2e-05,
"loss": 1.0921,
"step": 197
},
{
"epoch": 0.06834656541249569,
"grad_norm": 1.940928339958191,
"learning_rate": 2e-05,
"loss": 1.1236,
"step": 198
},
{
"epoch": 0.06869175008629617,
"grad_norm": 2.1147520542144775,
"learning_rate": 2e-05,
"loss": 1.0332,
"step": 199
},
{
"epoch": 0.06903693476009665,
"grad_norm": 1.9784513711929321,
"learning_rate": 2e-05,
"loss": 1.0644,
"step": 200
},
{
"epoch": 0.06938211943389713,
"grad_norm": 2.135711431503296,
"learning_rate": 2e-05,
"loss": 1.0189,
"step": 201
},
{
"epoch": 0.06972730410769762,
"grad_norm": 2.3416550159454346,
"learning_rate": 2e-05,
"loss": 1.0788,
"step": 202
},
{
"epoch": 0.0700724887814981,
"grad_norm": 2.143134593963623,
"learning_rate": 2e-05,
"loss": 1.0657,
"step": 203
},
{
"epoch": 0.07041767345529859,
"grad_norm": 9.058279991149902,
"learning_rate": 2e-05,
"loss": 1.0657,
"step": 204
},
{
"epoch": 0.07076285812909906,
"grad_norm": 2.2367799282073975,
"learning_rate": 2e-05,
"loss": 1.0822,
"step": 205
},
{
"epoch": 0.07110804280289955,
"grad_norm": 1.9047443866729736,
"learning_rate": 2e-05,
"loss": 1.0999,
"step": 206
},
{
"epoch": 0.07145322747670003,
"grad_norm": 2.307863473892212,
"learning_rate": 2e-05,
"loss": 1.1275,
"step": 207
},
{
"epoch": 0.07179841215050052,
"grad_norm": 2.0635058879852295,
"learning_rate": 2e-05,
"loss": 1.0477,
"step": 208
},
{
"epoch": 0.072143596824301,
"grad_norm": 2.144148111343384,
"learning_rate": 2e-05,
"loss": 1.0351,
"step": 209
},
{
"epoch": 0.07248878149810148,
"grad_norm": 1.9361531734466553,
"learning_rate": 2e-05,
"loss": 1.0704,
"step": 210
},
{
"epoch": 0.07283396617190196,
"grad_norm": 1.8395414352416992,
"learning_rate": 2e-05,
"loss": 1.1099,
"step": 211
},
{
"epoch": 0.07317915084570245,
"grad_norm": 2.1172099113464355,
"learning_rate": 2e-05,
"loss": 1.1169,
"step": 212
},
{
"epoch": 0.07352433551950294,
"grad_norm": 2.084325075149536,
"learning_rate": 2e-05,
"loss": 1.0347,
"step": 213
},
{
"epoch": 0.07386952019330342,
"grad_norm": 2.1838455200195312,
"learning_rate": 2e-05,
"loss": 1.0926,
"step": 214
},
{
"epoch": 0.0742147048671039,
"grad_norm": 2.1516849994659424,
"learning_rate": 2e-05,
"loss": 1.082,
"step": 215
},
{
"epoch": 0.07455988954090438,
"grad_norm": 2.011460781097412,
"learning_rate": 2e-05,
"loss": 1.073,
"step": 216
},
{
"epoch": 0.07490507421470487,
"grad_norm": 2.2096571922302246,
"learning_rate": 2e-05,
"loss": 1.1239,
"step": 217
},
{
"epoch": 0.07525025888850535,
"grad_norm": 2.088879346847534,
"learning_rate": 2e-05,
"loss": 1.1313,
"step": 218
},
{
"epoch": 0.07559544356230584,
"grad_norm": 2.0846405029296875,
"learning_rate": 2e-05,
"loss": 0.9951,
"step": 219
},
{
"epoch": 0.07594062823610631,
"grad_norm": 1.9645204544067383,
"learning_rate": 2e-05,
"loss": 1.0431,
"step": 220
},
{
"epoch": 0.0762858129099068,
"grad_norm": 2.1063179969787598,
"learning_rate": 2e-05,
"loss": 1.0659,
"step": 221
},
{
"epoch": 0.07663099758370728,
"grad_norm": 2.0268285274505615,
"learning_rate": 2e-05,
"loss": 1.0909,
"step": 222
},
{
"epoch": 0.07697618225750777,
"grad_norm": 1.9405102729797363,
"learning_rate": 2e-05,
"loss": 1.0,
"step": 223
},
{
"epoch": 0.07732136693130826,
"grad_norm": 2.1061298847198486,
"learning_rate": 2e-05,
"loss": 1.083,
"step": 224
},
{
"epoch": 0.07766655160510873,
"grad_norm": 2.07513165473938,
"learning_rate": 2e-05,
"loss": 1.0872,
"step": 225
},
{
"epoch": 0.07801173627890921,
"grad_norm": 2.2630527019500732,
"learning_rate": 2e-05,
"loss": 1.0296,
"step": 226
},
{
"epoch": 0.0783569209527097,
"grad_norm": 2.0668439865112305,
"learning_rate": 2e-05,
"loss": 1.1433,
"step": 227
},
{
"epoch": 0.07870210562651019,
"grad_norm": 2.3092525005340576,
"learning_rate": 2e-05,
"loss": 1.0347,
"step": 228
},
{
"epoch": 0.07904729030031067,
"grad_norm": 2.0190646648406982,
"learning_rate": 2e-05,
"loss": 1.0423,
"step": 229
},
{
"epoch": 0.07939247497411114,
"grad_norm": 2.0675878524780273,
"learning_rate": 2e-05,
"loss": 1.1551,
"step": 230
},
{
"epoch": 0.07973765964791163,
"grad_norm": 2.282857894897461,
"learning_rate": 2e-05,
"loss": 1.043,
"step": 231
},
{
"epoch": 0.08008284432171212,
"grad_norm": 1.886343240737915,
"learning_rate": 2e-05,
"loss": 1.0244,
"step": 232
},
{
"epoch": 0.0804280289955126,
"grad_norm": 2.2882308959960938,
"learning_rate": 2e-05,
"loss": 1.0228,
"step": 233
},
{
"epoch": 0.08077321366931309,
"grad_norm": 2.05058217048645,
"learning_rate": 2e-05,
"loss": 1.0567,
"step": 234
},
{
"epoch": 0.08111839834311356,
"grad_norm": 2.2782809734344482,
"learning_rate": 2e-05,
"loss": 1.0744,
"step": 235
},
{
"epoch": 0.08146358301691405,
"grad_norm": 1.9740854501724243,
"learning_rate": 2e-05,
"loss": 1.0601,
"step": 236
},
{
"epoch": 0.08180876769071453,
"grad_norm": 2.08333158493042,
"learning_rate": 2e-05,
"loss": 1.0873,
"step": 237
},
{
"epoch": 0.08215395236451502,
"grad_norm": 2.0546019077301025,
"learning_rate": 2e-05,
"loss": 1.088,
"step": 238
},
{
"epoch": 0.0824991370383155,
"grad_norm": 1.9426814317703247,
"learning_rate": 2e-05,
"loss": 1.0512,
"step": 239
},
{
"epoch": 0.08284432171211598,
"grad_norm": 2.0802295207977295,
"learning_rate": 2e-05,
"loss": 1.1123,
"step": 240
},
{
"epoch": 0.08318950638591646,
"grad_norm": 2.010526657104492,
"learning_rate": 2e-05,
"loss": 1.0883,
"step": 241
},
{
"epoch": 0.08353469105971695,
"grad_norm": 2.083188056945801,
"learning_rate": 2e-05,
"loss": 1.0987,
"step": 242
},
{
"epoch": 0.08387987573351743,
"grad_norm": 2.137660264968872,
"learning_rate": 2e-05,
"loss": 0.9924,
"step": 243
},
{
"epoch": 0.08422506040731792,
"grad_norm": 2.041710376739502,
"learning_rate": 2e-05,
"loss": 1.1403,
"step": 244
},
{
"epoch": 0.08457024508111839,
"grad_norm": 2.0598714351654053,
"learning_rate": 2e-05,
"loss": 1.02,
"step": 245
},
{
"epoch": 0.08491542975491888,
"grad_norm": 2.168576955795288,
"learning_rate": 2e-05,
"loss": 1.0771,
"step": 246
},
{
"epoch": 0.08526061442871936,
"grad_norm": 2.145132303237915,
"learning_rate": 2e-05,
"loss": 1.0746,
"step": 247
},
{
"epoch": 0.08560579910251985,
"grad_norm": 2.24804425239563,
"learning_rate": 2e-05,
"loss": 1.0898,
"step": 248
},
{
"epoch": 0.08595098377632034,
"grad_norm": 1.8360575437545776,
"learning_rate": 2e-05,
"loss": 1.0741,
"step": 249
},
{
"epoch": 0.08629616845012081,
"grad_norm": 2.1169514656066895,
"learning_rate": 2e-05,
"loss": 1.1025,
"step": 250
},
{
"epoch": 0.0866413531239213,
"grad_norm": 1.929721474647522,
"learning_rate": 2e-05,
"loss": 1.0455,
"step": 251
},
{
"epoch": 0.08698653779772178,
"grad_norm": 5.5121026039123535,
"learning_rate": 2e-05,
"loss": 1.07,
"step": 252
},
{
"epoch": 0.08733172247152227,
"grad_norm": 2.2410662174224854,
"learning_rate": 2e-05,
"loss": 1.1145,
"step": 253
},
{
"epoch": 0.08767690714532275,
"grad_norm": 2.027545213699341,
"learning_rate": 2e-05,
"loss": 0.9882,
"step": 254
},
{
"epoch": 0.08802209181912322,
"grad_norm": 2.0337374210357666,
"learning_rate": 2e-05,
"loss": 0.999,
"step": 255
},
{
"epoch": 0.08836727649292371,
"grad_norm": 2.1120731830596924,
"learning_rate": 2e-05,
"loss": 1.0424,
"step": 256
},
{
"epoch": 0.0887124611667242,
"grad_norm": 2.039837121963501,
"learning_rate": 2e-05,
"loss": 1.0316,
"step": 257
},
{
"epoch": 0.08905764584052468,
"grad_norm": 2.008521318435669,
"learning_rate": 2e-05,
"loss": 1.019,
"step": 258
},
{
"epoch": 0.08940283051432517,
"grad_norm": 2.081023693084717,
"learning_rate": 2e-05,
"loss": 1.003,
"step": 259
},
{
"epoch": 0.08974801518812564,
"grad_norm": 4.9017229080200195,
"learning_rate": 2e-05,
"loss": 1.1215,
"step": 260
},
{
"epoch": 0.09009319986192613,
"grad_norm": 2.0149810314178467,
"learning_rate": 2e-05,
"loss": 1.0643,
"step": 261
},
{
"epoch": 0.09043838453572661,
"grad_norm": 2.0311553478240967,
"learning_rate": 2e-05,
"loss": 1.0287,
"step": 262
},
{
"epoch": 0.0907835692095271,
"grad_norm": 2.089172124862671,
"learning_rate": 2e-05,
"loss": 1.0435,
"step": 263
},
{
"epoch": 0.09112875388332758,
"grad_norm": 2.233536720275879,
"learning_rate": 2e-05,
"loss": 1.0787,
"step": 264
},
{
"epoch": 0.09147393855712806,
"grad_norm": 2.050518035888672,
"learning_rate": 2e-05,
"loss": 1.0818,
"step": 265
},
{
"epoch": 0.09181912323092854,
"grad_norm": 2.117332935333252,
"learning_rate": 2e-05,
"loss": 1.0334,
"step": 266
},
{
"epoch": 0.09216430790472903,
"grad_norm": 2.1400556564331055,
"learning_rate": 2e-05,
"loss": 1.0997,
"step": 267
},
{
"epoch": 0.09250949257852951,
"grad_norm": 6.34127950668335,
"learning_rate": 2e-05,
"loss": 1.0854,
"step": 268
},
{
"epoch": 0.09285467725233,
"grad_norm": 2.346954584121704,
"learning_rate": 2e-05,
"loss": 1.041,
"step": 269
},
{
"epoch": 0.09319986192613049,
"grad_norm": 2.049189329147339,
"learning_rate": 2e-05,
"loss": 1.1082,
"step": 270
},
{
"epoch": 0.09354504659993096,
"grad_norm": 2.0327305793762207,
"learning_rate": 2e-05,
"loss": 1.0755,
"step": 271
},
{
"epoch": 0.09389023127373144,
"grad_norm": 2.1110620498657227,
"learning_rate": 2e-05,
"loss": 1.0721,
"step": 272
},
{
"epoch": 0.09423541594753193,
"grad_norm": 1.8782284259796143,
"learning_rate": 2e-05,
"loss": 0.9842,
"step": 273
},
{
"epoch": 0.09458060062133242,
"grad_norm": 1.8504958152770996,
"learning_rate": 2e-05,
"loss": 1.0183,
"step": 274
},
{
"epoch": 0.0949257852951329,
"grad_norm": 2.0672526359558105,
"learning_rate": 2e-05,
"loss": 1.0126,
"step": 275
},
{
"epoch": 0.09527096996893337,
"grad_norm": 2.104374885559082,
"learning_rate": 2e-05,
"loss": 1.0911,
"step": 276
},
{
"epoch": 0.09561615464273386,
"grad_norm": 1.9912065267562866,
"learning_rate": 2e-05,
"loss": 1.0659,
"step": 277
},
{
"epoch": 0.09596133931653435,
"grad_norm": 2.093083143234253,
"learning_rate": 2e-05,
"loss": 1.0771,
"step": 278
},
{
"epoch": 0.09630652399033483,
"grad_norm": 1.92844820022583,
"learning_rate": 2e-05,
"loss": 1.006,
"step": 279
},
{
"epoch": 0.09665170866413532,
"grad_norm": 1.7608734369277954,
"learning_rate": 2e-05,
"loss": 1.0353,
"step": 280
},
{
"epoch": 0.09699689333793579,
"grad_norm": 2.1199417114257812,
"learning_rate": 2e-05,
"loss": 1.0065,
"step": 281
},
{
"epoch": 0.09734207801173628,
"grad_norm": 1.7883626222610474,
"learning_rate": 2e-05,
"loss": 1.0815,
"step": 282
},
{
"epoch": 0.09768726268553676,
"grad_norm": 1.9652053117752075,
"learning_rate": 2e-05,
"loss": 1.0158,
"step": 283
},
{
"epoch": 0.09803244735933725,
"grad_norm": 2.1138057708740234,
"learning_rate": 2e-05,
"loss": 1.029,
"step": 284
},
{
"epoch": 0.09837763203313774,
"grad_norm": 2.0844762325286865,
"learning_rate": 2e-05,
"loss": 1.0783,
"step": 285
},
{
"epoch": 0.09872281670693821,
"grad_norm": 1.954588532447815,
"learning_rate": 2e-05,
"loss": 1.0256,
"step": 286
},
{
"epoch": 0.0990680013807387,
"grad_norm": 2.3191030025482178,
"learning_rate": 2e-05,
"loss": 1.1083,
"step": 287
},
{
"epoch": 0.09941318605453918,
"grad_norm": 2.0656120777130127,
"learning_rate": 2e-05,
"loss": 1.0798,
"step": 288
},
{
"epoch": 0.09975837072833967,
"grad_norm": 1.8802495002746582,
"learning_rate": 2e-05,
"loss": 1.0258,
"step": 289
},
{
"epoch": 0.10010355540214015,
"grad_norm": 2.062614917755127,
"learning_rate": 2e-05,
"loss": 1.0565,
"step": 290
},
{
"epoch": 0.10044874007594062,
"grad_norm": 2.0783498287200928,
"learning_rate": 2e-05,
"loss": 1.1027,
"step": 291
},
{
"epoch": 0.10079392474974111,
"grad_norm": 2.0610833168029785,
"learning_rate": 2e-05,
"loss": 1.0019,
"step": 292
},
{
"epoch": 0.1011391094235416,
"grad_norm": 2.029587745666504,
"learning_rate": 2e-05,
"loss": 1.0369,
"step": 293
},
{
"epoch": 0.10148429409734208,
"grad_norm": 1.8925073146820068,
"learning_rate": 2e-05,
"loss": 1.0196,
"step": 294
},
{
"epoch": 0.10182947877114257,
"grad_norm": 1.9382961988449097,
"learning_rate": 2e-05,
"loss": 1.0812,
"step": 295
},
{
"epoch": 0.10217466344494304,
"grad_norm": 1.8473429679870605,
"learning_rate": 2e-05,
"loss": 1.0407,
"step": 296
},
{
"epoch": 0.10251984811874353,
"grad_norm": 2.0330350399017334,
"learning_rate": 2e-05,
"loss": 1.0093,
"step": 297
},
{
"epoch": 0.10286503279254401,
"grad_norm": 2.0864484310150146,
"learning_rate": 2e-05,
"loss": 1.0155,
"step": 298
},
{
"epoch": 0.1032102174663445,
"grad_norm": 1.8072278499603271,
"learning_rate": 2e-05,
"loss": 1.0674,
"step": 299
},
{
"epoch": 0.10355540214014498,
"grad_norm": 2.2771360874176025,
"learning_rate": 2e-05,
"loss": 1.0345,
"step": 300
},
{
"epoch": 0.10390058681394546,
"grad_norm": 1.8649107217788696,
"learning_rate": 2e-05,
"loss": 0.9528,
"step": 301
},
{
"epoch": 0.10424577148774594,
"grad_norm": 1.751585841178894,
"learning_rate": 2e-05,
"loss": 0.9587,
"step": 302
},
{
"epoch": 0.10459095616154643,
"grad_norm": 2.0218489170074463,
"learning_rate": 2e-05,
"loss": 0.9844,
"step": 303
},
{
"epoch": 0.10493614083534691,
"grad_norm": 2.0804543495178223,
"learning_rate": 2e-05,
"loss": 1.0221,
"step": 304
},
{
"epoch": 0.1052813255091474,
"grad_norm": 2.1906816959381104,
"learning_rate": 2e-05,
"loss": 1.0347,
"step": 305
},
{
"epoch": 0.10562651018294787,
"grad_norm": 1.852725863456726,
"learning_rate": 2e-05,
"loss": 1.0482,
"step": 306
},
{
"epoch": 0.10597169485674836,
"grad_norm": 1.9083342552185059,
"learning_rate": 2e-05,
"loss": 1.0901,
"step": 307
},
{
"epoch": 0.10631687953054884,
"grad_norm": 2.0140769481658936,
"learning_rate": 2e-05,
"loss": 1.087,
"step": 308
},
{
"epoch": 0.10666206420434933,
"grad_norm": 1.8648165464401245,
"learning_rate": 2e-05,
"loss": 1.0129,
"step": 309
},
{
"epoch": 0.10700724887814982,
"grad_norm": 2.034452438354492,
"learning_rate": 2e-05,
"loss": 1.0424,
"step": 310
},
{
"epoch": 0.10735243355195029,
"grad_norm": 1.7214909791946411,
"learning_rate": 2e-05,
"loss": 1.0461,
"step": 311
},
{
"epoch": 0.10769761822575077,
"grad_norm": 2.31937575340271,
"learning_rate": 2e-05,
"loss": 1.0606,
"step": 312
},
{
"epoch": 0.10804280289955126,
"grad_norm": 1.9707671403884888,
"learning_rate": 2e-05,
"loss": 0.9882,
"step": 313
},
{
"epoch": 0.10838798757335175,
"grad_norm": 1.837477207183838,
"learning_rate": 2e-05,
"loss": 0.9285,
"step": 314
},
{
"epoch": 0.10873317224715223,
"grad_norm": 1.8579028844833374,
"learning_rate": 2e-05,
"loss": 1.0121,
"step": 315
},
{
"epoch": 0.1090783569209527,
"grad_norm": 1.8835140466690063,
"learning_rate": 2e-05,
"loss": 1.0724,
"step": 316
},
{
"epoch": 0.10942354159475319,
"grad_norm": 1.7847641706466675,
"learning_rate": 2e-05,
"loss": 1.0265,
"step": 317
},
{
"epoch": 0.10976872626855368,
"grad_norm": 2.0330307483673096,
"learning_rate": 2e-05,
"loss": 1.0218,
"step": 318
},
{
"epoch": 0.11011391094235416,
"grad_norm": 1.8466086387634277,
"learning_rate": 2e-05,
"loss": 1.0594,
"step": 319
},
{
"epoch": 0.11045909561615465,
"grad_norm": 1.884079933166504,
"learning_rate": 2e-05,
"loss": 1.0734,
"step": 320
},
{
"epoch": 0.11080428028995512,
"grad_norm": 1.8187580108642578,
"learning_rate": 2e-05,
"loss": 0.9995,
"step": 321
},
{
"epoch": 0.1111494649637556,
"grad_norm": 2.196646213531494,
"learning_rate": 2e-05,
"loss": 1.0265,
"step": 322
},
{
"epoch": 0.11149464963755609,
"grad_norm": 1.9797489643096924,
"learning_rate": 2e-05,
"loss": 1.0325,
"step": 323
},
{
"epoch": 0.11183983431135658,
"grad_norm": 2.0785601139068604,
"learning_rate": 2e-05,
"loss": 1.1051,
"step": 324
},
{
"epoch": 0.11218501898515706,
"grad_norm": 2.0432729721069336,
"learning_rate": 2e-05,
"loss": 1.0815,
"step": 325
},
{
"epoch": 0.11253020365895754,
"grad_norm": 2.0694308280944824,
"learning_rate": 2e-05,
"loss": 1.0851,
"step": 326
},
{
"epoch": 0.11287538833275802,
"grad_norm": 1.8386410474777222,
"learning_rate": 2e-05,
"loss": 1.0106,
"step": 327
},
{
"epoch": 0.11322057300655851,
"grad_norm": 2.018885850906372,
"learning_rate": 2e-05,
"loss": 1.0318,
"step": 328
},
{
"epoch": 0.113565757680359,
"grad_norm": 2.105708360671997,
"learning_rate": 2e-05,
"loss": 1.0567,
"step": 329
},
{
"epoch": 0.11391094235415948,
"grad_norm": 1.9204944372177124,
"learning_rate": 2e-05,
"loss": 1.0262,
"step": 330
},
{
"epoch": 0.11425612702795995,
"grad_norm": 1.9768996238708496,
"learning_rate": 2e-05,
"loss": 1.0085,
"step": 331
},
{
"epoch": 0.11460131170176044,
"grad_norm": 1.785104751586914,
"learning_rate": 2e-05,
"loss": 1.102,
"step": 332
},
{
"epoch": 0.11494649637556092,
"grad_norm": 2.0644032955169678,
"learning_rate": 2e-05,
"loss": 1.0617,
"step": 333
},
{
"epoch": 0.11529168104936141,
"grad_norm": 2.0390021800994873,
"learning_rate": 2e-05,
"loss": 0.9609,
"step": 334
},
{
"epoch": 0.1156368657231619,
"grad_norm": 1.919952392578125,
"learning_rate": 2e-05,
"loss": 1.0701,
"step": 335
},
{
"epoch": 0.11598205039696237,
"grad_norm": 1.8437055349349976,
"learning_rate": 2e-05,
"loss": 1.0486,
"step": 336
},
{
"epoch": 0.11632723507076285,
"grad_norm": 1.864856243133545,
"learning_rate": 2e-05,
"loss": 1.0564,
"step": 337
},
{
"epoch": 0.11667241974456334,
"grad_norm": 2.0349135398864746,
"learning_rate": 2e-05,
"loss": 0.9971,
"step": 338
},
{
"epoch": 0.11701760441836383,
"grad_norm": 2.007643461227417,
"learning_rate": 2e-05,
"loss": 0.986,
"step": 339
},
{
"epoch": 0.11736278909216431,
"grad_norm": 2.1241238117218018,
"learning_rate": 2e-05,
"loss": 1.0278,
"step": 340
},
{
"epoch": 0.11770797376596479,
"grad_norm": 1.960552453994751,
"learning_rate": 2e-05,
"loss": 1.057,
"step": 341
},
{
"epoch": 0.11805315843976527,
"grad_norm": 2.3228769302368164,
"learning_rate": 2e-05,
"loss": 1.029,
"step": 342
},
{
"epoch": 0.11839834311356576,
"grad_norm": 1.9010450839996338,
"learning_rate": 2e-05,
"loss": 0.9918,
"step": 343
},
{
"epoch": 0.11874352778736624,
"grad_norm": 1.9832115173339844,
"learning_rate": 2e-05,
"loss": 0.9904,
"step": 344
},
{
"epoch": 0.11908871246116673,
"grad_norm": 2.176405668258667,
"learning_rate": 2e-05,
"loss": 1.0155,
"step": 345
},
{
"epoch": 0.1194338971349672,
"grad_norm": 2.0718116760253906,
"learning_rate": 2e-05,
"loss": 1.0706,
"step": 346
},
{
"epoch": 0.11977908180876769,
"grad_norm": 2.000976085662842,
"learning_rate": 2e-05,
"loss": 0.9916,
"step": 347
},
{
"epoch": 0.12012426648256817,
"grad_norm": 1.9181327819824219,
"learning_rate": 2e-05,
"loss": 1.0556,
"step": 348
},
{
"epoch": 0.12046945115636866,
"grad_norm": 1.7830644845962524,
"learning_rate": 2e-05,
"loss": 0.9755,
"step": 349
},
{
"epoch": 0.12081463583016915,
"grad_norm": 2.0355966091156006,
"learning_rate": 2e-05,
"loss": 1.0421,
"step": 350
},
{
"epoch": 0.12115982050396962,
"grad_norm": 1.8209973573684692,
"learning_rate": 2e-05,
"loss": 1.0213,
"step": 351
},
{
"epoch": 0.1215050051777701,
"grad_norm": 1.9484202861785889,
"learning_rate": 2e-05,
"loss": 1.11,
"step": 352
},
{
"epoch": 0.12185018985157059,
"grad_norm": 1.959164023399353,
"learning_rate": 2e-05,
"loss": 1.0318,
"step": 353
},
{
"epoch": 0.12219537452537108,
"grad_norm": 1.893936276435852,
"learning_rate": 2e-05,
"loss": 1.0534,
"step": 354
},
{
"epoch": 0.12254055919917156,
"grad_norm": 1.9669185876846313,
"learning_rate": 2e-05,
"loss": 0.9959,
"step": 355
},
{
"epoch": 0.12288574387297203,
"grad_norm": 2.152151584625244,
"learning_rate": 2e-05,
"loss": 1.0081,
"step": 356
},
{
"epoch": 0.12323092854677252,
"grad_norm": 2.203021764755249,
"learning_rate": 2e-05,
"loss": 1.0443,
"step": 357
},
{
"epoch": 0.123576113220573,
"grad_norm": 2.069221258163452,
"learning_rate": 2e-05,
"loss": 1.0519,
"step": 358
},
{
"epoch": 0.12392129789437349,
"grad_norm": 2.054393768310547,
"learning_rate": 2e-05,
"loss": 1.0314,
"step": 359
},
{
"epoch": 0.12426648256817398,
"grad_norm": 2.0708425045013428,
"learning_rate": 2e-05,
"loss": 0.9856,
"step": 360
},
{
"epoch": 0.12461166724197445,
"grad_norm": 2.4350380897521973,
"learning_rate": 2e-05,
"loss": 0.9796,
"step": 361
},
{
"epoch": 0.12495685191577494,
"grad_norm": 1.8085037469863892,
"learning_rate": 2e-05,
"loss": 0.9932,
"step": 362
},
{
"epoch": 0.12530203658957542,
"grad_norm": 1.824069619178772,
"learning_rate": 2e-05,
"loss": 1.024,
"step": 363
},
{
"epoch": 0.1256472212633759,
"grad_norm": 2.675426959991455,
"learning_rate": 2e-05,
"loss": 1.0231,
"step": 364
},
{
"epoch": 0.1259924059371764,
"grad_norm": 2.127661943435669,
"learning_rate": 2e-05,
"loss": 1.043,
"step": 365
},
{
"epoch": 0.12633759061097688,
"grad_norm": 1.9300974607467651,
"learning_rate": 2e-05,
"loss": 1.0163,
"step": 366
},
{
"epoch": 0.12668277528477737,
"grad_norm": 1.984744668006897,
"learning_rate": 2e-05,
"loss": 1.0101,
"step": 367
},
{
"epoch": 0.12702795995857785,
"grad_norm": 1.9173483848571777,
"learning_rate": 2e-05,
"loss": 1.0672,
"step": 368
},
{
"epoch": 0.1273731446323783,
"grad_norm": 2.148045778274536,
"learning_rate": 2e-05,
"loss": 1.0037,
"step": 369
},
{
"epoch": 0.1277183293061788,
"grad_norm": 2.0799989700317383,
"learning_rate": 2e-05,
"loss": 1.0067,
"step": 370
},
{
"epoch": 0.12806351397997928,
"grad_norm": 1.5450844764709473,
"learning_rate": 2e-05,
"loss": 0.9314,
"step": 371
},
{
"epoch": 0.12840869865377977,
"grad_norm": 2.0938477516174316,
"learning_rate": 2e-05,
"loss": 1.0326,
"step": 372
},
{
"epoch": 0.12875388332758025,
"grad_norm": 2.148625612258911,
"learning_rate": 2e-05,
"loss": 1.006,
"step": 373
},
{
"epoch": 0.12909906800138074,
"grad_norm": 1.964438796043396,
"learning_rate": 2e-05,
"loss": 0.9921,
"step": 374
},
{
"epoch": 0.12944425267518123,
"grad_norm": 1.7084150314331055,
"learning_rate": 2e-05,
"loss": 1.0276,
"step": 375
},
{
"epoch": 0.1297894373489817,
"grad_norm": 1.8643776178359985,
"learning_rate": 2e-05,
"loss": 1.071,
"step": 376
},
{
"epoch": 0.1301346220227822,
"grad_norm": 1.9768637418746948,
"learning_rate": 2e-05,
"loss": 1.0634,
"step": 377
},
{
"epoch": 0.13047980669658268,
"grad_norm": 1.8979171514511108,
"learning_rate": 2e-05,
"loss": 1.0746,
"step": 378
},
{
"epoch": 0.13082499137038314,
"grad_norm": 2.2266244888305664,
"learning_rate": 2e-05,
"loss": 1.0139,
"step": 379
},
{
"epoch": 0.13117017604418363,
"grad_norm": 2.767505645751953,
"learning_rate": 2e-05,
"loss": 1.0483,
"step": 380
},
{
"epoch": 0.13151536071798411,
"grad_norm": 1.9986623525619507,
"learning_rate": 2e-05,
"loss": 1.0527,
"step": 381
},
{
"epoch": 0.1318605453917846,
"grad_norm": 2.7748537063598633,
"learning_rate": 2e-05,
"loss": 1.0117,
"step": 382
},
{
"epoch": 0.1322057300655851,
"grad_norm": 2.404034376144409,
"learning_rate": 2e-05,
"loss": 1.0509,
"step": 383
},
{
"epoch": 0.13255091473938557,
"grad_norm": 1.8266741037368774,
"learning_rate": 2e-05,
"loss": 0.9569,
"step": 384
},
{
"epoch": 0.13289609941318606,
"grad_norm": 1.6617871522903442,
"learning_rate": 2e-05,
"loss": 1.0284,
"step": 385
},
{
"epoch": 0.13324128408698654,
"grad_norm": 2.1641674041748047,
"learning_rate": 2e-05,
"loss": 1.0185,
"step": 386
},
{
"epoch": 0.13358646876078703,
"grad_norm": 2.230027675628662,
"learning_rate": 2e-05,
"loss": 1.0059,
"step": 387
},
{
"epoch": 0.13393165343458752,
"grad_norm": 2.0304672718048096,
"learning_rate": 2e-05,
"loss": 0.9458,
"step": 388
},
{
"epoch": 0.13427683810838797,
"grad_norm": 1.9289956092834473,
"learning_rate": 2e-05,
"loss": 0.9867,
"step": 389
},
{
"epoch": 0.13462202278218846,
"grad_norm": 2.290512800216675,
"learning_rate": 2e-05,
"loss": 1.0438,
"step": 390
},
{
"epoch": 0.13496720745598895,
"grad_norm": 1.9645930528640747,
"learning_rate": 2e-05,
"loss": 1.0647,
"step": 391
},
{
"epoch": 0.13531239212978943,
"grad_norm": 1.9078412055969238,
"learning_rate": 2e-05,
"loss": 1.0219,
"step": 392
},
{
"epoch": 0.13565757680358992,
"grad_norm": 1.8987268209457397,
"learning_rate": 2e-05,
"loss": 0.9697,
"step": 393
},
{
"epoch": 0.1360027614773904,
"grad_norm": 1.8962979316711426,
"learning_rate": 2e-05,
"loss": 1.0337,
"step": 394
},
{
"epoch": 0.1363479461511909,
"grad_norm": 1.955389380455017,
"learning_rate": 2e-05,
"loss": 0.9921,
"step": 395
},
{
"epoch": 0.13669313082499138,
"grad_norm": 1.89638352394104,
"learning_rate": 2e-05,
"loss": 0.9957,
"step": 396
},
{
"epoch": 0.13703831549879186,
"grad_norm": 1.949547290802002,
"learning_rate": 2e-05,
"loss": 1.0009,
"step": 397
},
{
"epoch": 0.13738350017259235,
"grad_norm": 1.9283349514007568,
"learning_rate": 2e-05,
"loss": 0.9941,
"step": 398
},
{
"epoch": 0.1377286848463928,
"grad_norm": 1.9636731147766113,
"learning_rate": 2e-05,
"loss": 0.9358,
"step": 399
},
{
"epoch": 0.1380738695201933,
"grad_norm": 1.988175630569458,
"learning_rate": 2e-05,
"loss": 1.0143,
"step": 400
}
],
"logging_steps": 1.0,
"max_steps": 14485,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 165248866713600.0,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}