Malaysian-Llama-3.2-3B-Instruct/lora-embedding-128-llama3.2-3b-malaysian-8k/checkpoint-400/trainer_state.json
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.1380738695201933,
  "eval_steps": 500,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.00034518467380048324, "grad_norm": 25.833335876464844, "learning_rate": 0.0, "loss": 2.2926, "step": 1 },
    { "epoch": 0.0006903693476009665, "grad_norm": 23.58083724975586, "learning_rate": 2e-05, "loss": 2.1943, "step": 2 },
    { "epoch": 0.0010355540214014498, "grad_norm": 24.2187442779541, "learning_rate": 2e-05, "loss": 2.3772, "step": 3 },
    { "epoch": 0.001380738695201933, "grad_norm": 14.579557418823242, "learning_rate": 2e-05, "loss": 2.0945, "step": 4 },
    { "epoch": 0.0017259233690024164, "grad_norm": 15.069305419921875, "learning_rate": 2e-05, "loss": 2.1537, "step": 5 },
    { "epoch": 0.0020711080428028996, "grad_norm": 14.385339736938477, "learning_rate": 2e-05, "loss": 2.1585, "step": 6 },
    { "epoch": 0.002416292716603383, "grad_norm": 9.95476245880127, "learning_rate": 2e-05, "loss": 1.8333, "step": 7 },
    { "epoch": 0.002761477390403866, "grad_norm": 9.15005111694336, "learning_rate": 2e-05, "loss": 1.937, "step": 8 },
    { "epoch": 0.0031066620642043494, "grad_norm": 7.361661911010742, "learning_rate": 2e-05, "loss": 1.9484, "step": 9 },
    { "epoch": 0.0034518467380048328, "grad_norm": 5.931179046630859, "learning_rate": 2e-05, "loss": 1.7668, "step": 10 },
    { "epoch": 0.0037970314118053157, "grad_norm": 8.192151069641113, "learning_rate": 2e-05, "loss": 1.8234, "step": 11 },
    { "epoch": 0.004142216085605799, "grad_norm": 6.827078342437744, "learning_rate": 2e-05, "loss": 1.9133, "step": 12 },
    { "epoch": 0.0044874007594062825, "grad_norm": 7.808759689331055, "learning_rate": 2e-05, "loss": 1.8314, "step": 13 },
    { "epoch": 0.004832585433206766, "grad_norm": 7.531504154205322, "learning_rate": 2e-05, "loss": 1.7961, "step": 14 },
    { "epoch": 0.0051777701070072485, "grad_norm": 6.617609977722168, "learning_rate": 2e-05, "loss": 1.7624, "step": 15 },
    { "epoch": 0.005522954780807732, "grad_norm": 4.966954708099365, "learning_rate": 2e-05, "loss": 1.7034, "step": 16 },
    { "epoch": 0.005868139454608215, "grad_norm": 5.566157817840576, "learning_rate": 2e-05, "loss": 1.7722, "step": 17 },
    { "epoch": 0.006213324128408699, "grad_norm": 4.915079593658447, "learning_rate": 2e-05, "loss": 1.6602, "step": 18 },
    { "epoch": 0.006558508802209182, "grad_norm": 4.77263069152832, "learning_rate": 2e-05, "loss": 1.6854, "step": 19 },
    { "epoch": 0.0069036934760096655, "grad_norm": 4.299682140350342, "learning_rate": 2e-05, "loss": 1.6217, "step": 20 },
    { "epoch": 0.007248878149810148, "grad_norm": 4.1203694343566895, "learning_rate": 2e-05, "loss": 1.6546, "step": 21 },
    { "epoch": 0.0075940628236106315, "grad_norm": 4.058311939239502, "learning_rate": 2e-05, "loss": 1.6795, "step": 22 },
    { "epoch": 0.007939247497411116, "grad_norm": 3.668001651763916, "learning_rate": 2e-05, "loss": 1.7228, "step": 23 },
    { "epoch": 0.008284432171211598, "grad_norm": 3.397109270095825, "learning_rate": 2e-05, "loss": 1.555, "step": 24 },
    { "epoch": 0.00862961684501208, "grad_norm": 4.351656436920166, "learning_rate": 2e-05, "loss": 1.6977, "step": 25 },
    { "epoch": 0.008974801518812565, "grad_norm": 3.7789554595947266, "learning_rate": 2e-05, "loss": 1.6534, "step": 26 },
    { "epoch": 0.009319986192613048, "grad_norm": 3.4693193435668945, "learning_rate": 2e-05, "loss": 1.6248, "step": 27 },
    { "epoch": 0.009665170866413532, "grad_norm": 2.6027116775512695, "learning_rate": 2e-05, "loss": 1.4901, "step": 28 },
    { "epoch": 0.010010355540214014, "grad_norm": 2.6816513538360596, "learning_rate": 2e-05, "loss": 1.4964, "step": 29 },
    { "epoch": 0.010355540214014497, "grad_norm": 5.244405746459961, "learning_rate": 2e-05, "loss": 1.5238, "step": 30 },
    { "epoch": 0.010700724887814981, "grad_norm": 4.071628570556641, "learning_rate": 2e-05, "loss": 1.5401, "step": 31 },
    { "epoch": 0.011045909561615464, "grad_norm": 3.897395372390747, "learning_rate": 2e-05, "loss": 1.4759, "step": 32 },
    { "epoch": 0.011391094235415948, "grad_norm": 2.9609882831573486, "learning_rate": 2e-05, "loss": 1.4919, "step": 33 },
    { "epoch": 0.01173627890921643, "grad_norm": 3.1106183528900146, "learning_rate": 2e-05, "loss": 1.5436, "step": 34 },
    { "epoch": 0.012081463583016915, "grad_norm": 3.7441351413726807, "learning_rate": 2e-05, "loss": 1.5496, "step": 35 },
    { "epoch": 0.012426648256817397, "grad_norm": 3.6406350135803223, "learning_rate": 2e-05, "loss": 1.4531, "step": 36 },
    { "epoch": 0.01277183293061788, "grad_norm": 2.915447950363159, "learning_rate": 2e-05, "loss": 1.48, "step": 37 },
    { "epoch": 0.013117017604418364, "grad_norm": 2.8513331413269043, "learning_rate": 2e-05, "loss": 1.5192, "step": 38 },
    { "epoch": 0.013462202278218847, "grad_norm": 3.2347466945648193, "learning_rate": 2e-05, "loss": 1.5727, "step": 39 },
    { "epoch": 0.013807386952019331, "grad_norm": 2.82135272026062, "learning_rate": 2e-05, "loss": 1.5098, "step": 40 },
    { "epoch": 0.014152571625819814, "grad_norm": 2.694873809814453, "learning_rate": 2e-05, "loss": 1.4223, "step": 41 },
    { "epoch": 0.014497756299620296, "grad_norm": 2.7129478454589844, "learning_rate": 2e-05, "loss": 1.4062, "step": 42 },
    { "epoch": 0.01484294097342078, "grad_norm": 2.6555328369140625, "learning_rate": 2e-05, "loss": 1.4993, "step": 43 },
    { "epoch": 0.015188125647221263, "grad_norm": 2.3439159393310547, "learning_rate": 2e-05, "loss": 1.4281, "step": 44 },
    { "epoch": 0.015533310321021747, "grad_norm": 2.40368914604187, "learning_rate": 2e-05, "loss": 1.4261, "step": 45 },
    { "epoch": 0.01587849499482223, "grad_norm": 2.3288614749908447, "learning_rate": 2e-05, "loss": 1.3958, "step": 46 },
    { "epoch": 0.016223679668622714, "grad_norm": 2.474519968032837, "learning_rate": 2e-05, "loss": 1.4246, "step": 47 },
    { "epoch": 0.016568864342423197, "grad_norm": 2.680997371673584, "learning_rate": 2e-05, "loss": 1.4318, "step": 48 },
    { "epoch": 0.01691404901622368, "grad_norm": 2.6899235248565674, "learning_rate": 2e-05, "loss": 1.3581, "step": 49 },
    { "epoch": 0.01725923369002416, "grad_norm": 2.5045225620269775, "learning_rate": 2e-05, "loss": 1.3983, "step": 50 },
    { "epoch": 0.017604418363824648, "grad_norm": 2.650184154510498, "learning_rate": 2e-05, "loss": 1.4005, "step": 51 },
    { "epoch": 0.01794960303762513, "grad_norm": 2.560302495956421, "learning_rate": 2e-05, "loss": 1.3706, "step": 52 },
    { "epoch": 0.018294787711425613, "grad_norm": 2.116626739501953, "learning_rate": 2e-05, "loss": 1.2987, "step": 53 },
    { "epoch": 0.018639972385226095, "grad_norm": 3.0732431411743164, "learning_rate": 2e-05, "loss": 1.4394, "step": 54 },
    { "epoch": 0.018985157059026578, "grad_norm": 2.880014657974243, "learning_rate": 2e-05, "loss": 1.3772, "step": 55 },
    { "epoch": 0.019330341732827064, "grad_norm": 2.65002179145813, "learning_rate": 2e-05, "loss": 1.344, "step": 56 },
    { "epoch": 0.019675526406627546, "grad_norm": 2.4109294414520264, "learning_rate": 2e-05, "loss": 1.3818, "step": 57 },
    { "epoch": 0.02002071108042803, "grad_norm": 3.318305730819702, "learning_rate": 2e-05, "loss": 1.4488, "step": 58 },
    { "epoch": 0.02036589575422851, "grad_norm": 2.9551541805267334, "learning_rate": 2e-05, "loss": 1.3245, "step": 59 },
    { "epoch": 0.020711080428028994, "grad_norm": 2.512965679168701, "learning_rate": 2e-05, "loss": 1.3476, "step": 60 },
    { "epoch": 0.02105626510182948, "grad_norm": 2.608680248260498, "learning_rate": 2e-05, "loss": 1.335, "step": 61 },
    { "epoch": 0.021401449775629963, "grad_norm": 2.2880616188049316, "learning_rate": 2e-05, "loss": 1.2781, "step": 62 },
    { "epoch": 0.021746634449430445, "grad_norm": 2.7310123443603516, "learning_rate": 2e-05, "loss": 1.3449, "step": 63 },
    { "epoch": 0.022091819123230928, "grad_norm": 2.5008130073547363, "learning_rate": 2e-05, "loss": 1.4042, "step": 64 },
    { "epoch": 0.02243700379703141, "grad_norm": 3.5907320976257324, "learning_rate": 2e-05, "loss": 1.344, "step": 65 },
    { "epoch": 0.022782188470831896, "grad_norm": 2.4797489643096924, "learning_rate": 2e-05, "loss": 1.3342, "step": 66 },
    { "epoch": 0.02312737314463238, "grad_norm": 2.556204319000244, "learning_rate": 2e-05, "loss": 1.3244, "step": 67 },
    { "epoch": 0.02347255781843286, "grad_norm": 5.068964004516602, "learning_rate": 2e-05, "loss": 1.3041, "step": 68 },
    { "epoch": 0.023817742492233344, "grad_norm": 2.165076494216919, "learning_rate": 2e-05, "loss": 1.2727, "step": 69 },
    { "epoch": 0.02416292716603383, "grad_norm": 2.202164649963379, "learning_rate": 2e-05, "loss": 1.2887, "step": 70 },
    { "epoch": 0.024508111839834312, "grad_norm": 2.3834869861602783, "learning_rate": 2e-05, "loss": 1.2694, "step": 71 },
    { "epoch": 0.024853296513634795, "grad_norm": 2.5316550731658936, "learning_rate": 2e-05, "loss": 1.2514, "step": 72 },
    { "epoch": 0.025198481187435277, "grad_norm": 1.9449459314346313, "learning_rate": 2e-05, "loss": 1.26, "step": 73 },
    { "epoch": 0.02554366586123576, "grad_norm": 2.2826900482177734, "learning_rate": 2e-05, "loss": 1.2931, "step": 74 },
    { "epoch": 0.025888850535036246, "grad_norm": 2.260650396347046, "learning_rate": 2e-05, "loss": 1.2732, "step": 75 },
    { "epoch": 0.02623403520883673, "grad_norm": 2.356182336807251, "learning_rate": 2e-05, "loss": 1.2449, "step": 76 },
    { "epoch": 0.02657921988263721, "grad_norm": 2.199906826019287, "learning_rate": 2e-05, "loss": 1.2733, "step": 77 },
    { "epoch": 0.026924404556437694, "grad_norm": 2.3083510398864746, "learning_rate": 2e-05, "loss": 1.2257, "step": 78 },
    { "epoch": 0.027269589230238176, "grad_norm": 2.2658169269561768, "learning_rate": 2e-05, "loss": 1.2525, "step": 79 },
    { "epoch": 0.027614773904038662, "grad_norm": 2.352308988571167, "learning_rate": 2e-05, "loss": 1.2202, "step": 80 },
    { "epoch": 0.027959958577839145, "grad_norm": 2.523381471633911, "learning_rate": 2e-05, "loss": 1.2996, "step": 81 },
    { "epoch": 0.028305143251639627, "grad_norm": 2.4327428340911865, "learning_rate": 2e-05, "loss": 1.2135, "step": 82 },
    { "epoch": 0.02865032792544011, "grad_norm": 2.4549570083618164, "learning_rate": 2e-05, "loss": 1.2813, "step": 83 },
    { "epoch": 0.028995512599240592, "grad_norm": 2.4394640922546387, "learning_rate": 2e-05, "loss": 1.2828, "step": 84 },
    { "epoch": 0.029340697273041078, "grad_norm": 2.4780633449554443, "learning_rate": 2e-05, "loss": 1.2517, "step": 85 },
    { "epoch": 0.02968588194684156, "grad_norm": 2.326880931854248, "learning_rate": 2e-05, "loss": 1.2288, "step": 86 },
    { "epoch": 0.030031066620642043, "grad_norm": 3.1833627223968506, "learning_rate": 2e-05, "loss": 1.1978, "step": 87 },
    { "epoch": 0.030376251294442526, "grad_norm": 2.624091625213623, "learning_rate": 2e-05, "loss": 1.2534, "step": 88 },
    { "epoch": 0.03072143596824301, "grad_norm": 2.531895160675049, "learning_rate": 2e-05, "loss": 1.2263, "step": 89 },
    { "epoch": 0.031066620642043494, "grad_norm": 2.2346715927124023, "learning_rate": 2e-05, "loss": 1.2301, "step": 90 },
    { "epoch": 0.03141180531584398, "grad_norm": 2.237839698791504, "learning_rate": 2e-05, "loss": 1.1897, "step": 91 },
    { "epoch": 0.03175698998964446, "grad_norm": 2.4267807006835938, "learning_rate": 2e-05, "loss": 1.2508, "step": 92 },
    { "epoch": 0.03210217466344494, "grad_norm": 2.2506682872772217, "learning_rate": 2e-05, "loss": 1.2262, "step": 93 },
    { "epoch": 0.03244735933724543, "grad_norm": 2.5266077518463135, "learning_rate": 2e-05, "loss": 1.3139, "step": 94 },
    { "epoch": 0.03279254401104591, "grad_norm": 2.2002406120300293, "learning_rate": 2e-05, "loss": 1.2755, "step": 95 },
    { "epoch": 0.03313772868484639, "grad_norm": 2.20263409614563, "learning_rate": 2e-05, "loss": 1.2388, "step": 96 },
    { "epoch": 0.03348291335864688, "grad_norm": 2.297576665878296, "learning_rate": 2e-05, "loss": 1.2406, "step": 97 },
    { "epoch": 0.03382809803244736, "grad_norm": 2.3951361179351807, "learning_rate": 2e-05, "loss": 1.2244, "step": 98 },
    { "epoch": 0.034173282706247844, "grad_norm": 2.2707271575927734, "learning_rate": 2e-05, "loss": 1.2719, "step": 99 },
    { "epoch": 0.03451846738004832, "grad_norm": 2.2766411304473877, "learning_rate": 2e-05, "loss": 1.23, "step": 100 },
    { "epoch": 0.03486365205384881, "grad_norm": 2.253887414932251, "learning_rate": 2e-05, "loss": 1.1369, "step": 101 },
    { "epoch": 0.035208836727649295, "grad_norm": 2.0003821849823, "learning_rate": 2e-05, "loss": 1.2547, "step": 102 },
    { "epoch": 0.035554021401449774, "grad_norm": 2.277153253555298, "learning_rate": 2e-05, "loss": 1.2244, "step": 103 },
    { "epoch": 0.03589920607525026, "grad_norm": 2.1561081409454346, "learning_rate": 2e-05, "loss": 1.2222, "step": 104 },
    { "epoch": 0.03624439074905074, "grad_norm": 2.0002012252807617, "learning_rate": 2e-05, "loss": 1.1599, "step": 105 },
    { "epoch": 0.036589575422851225, "grad_norm": 2.3313021659851074, "learning_rate": 2e-05, "loss": 1.1658, "step": 106 },
    { "epoch": 0.03693476009665171, "grad_norm": 2.58686900138855, "learning_rate": 2e-05, "loss": 1.2282, "step": 107 },
    { "epoch": 0.03727994477045219, "grad_norm": 2.485671043395996, "learning_rate": 2e-05, "loss": 1.1537, "step": 108 },
    { "epoch": 0.037625129444252677, "grad_norm": 2.3962278366088867, "learning_rate": 2e-05, "loss": 1.133, "step": 109 },
    { "epoch": 0.037970314118053156, "grad_norm": 2.118319034576416, "learning_rate": 2e-05, "loss": 1.1769, "step": 110 },
    { "epoch": 0.03831549879185364, "grad_norm": 2.095940113067627, "learning_rate": 2e-05, "loss": 1.1763, "step": 111 },
    { "epoch": 0.03866068346565413, "grad_norm": 2.0862512588500977, "learning_rate": 2e-05, "loss": 1.1356, "step": 112 },
    { "epoch": 0.03900586813945461, "grad_norm": 1.9276546239852905, "learning_rate": 2e-05, "loss": 1.2068, "step": 113 },
    { "epoch": 0.03935105281325509, "grad_norm": 2.5604825019836426, "learning_rate": 2e-05, "loss": 1.16, "step": 114 },
    { "epoch": 0.03969623748705557, "grad_norm": 2.200289726257324, "learning_rate": 2e-05, "loss": 1.1811, "step": 115 },
    { "epoch": 0.04004142216085606, "grad_norm": 2.2654318809509277, "learning_rate": 2e-05, "loss": 1.2745, "step": 116 },
    { "epoch": 0.040386606834656544, "grad_norm": 2.194129228591919, "learning_rate": 2e-05, "loss": 1.1633, "step": 117 },
    { "epoch": 0.04073179150845702, "grad_norm": 2.267609119415283, "learning_rate": 2e-05, "loss": 1.1025, "step": 118 },
    { "epoch": 0.04107697618225751, "grad_norm": 2.3966379165649414, "learning_rate": 2e-05, "loss": 1.1624, "step": 119 },
    { "epoch": 0.04142216085605799, "grad_norm": 2.3428127765655518, "learning_rate": 2e-05, "loss": 1.2521, "step": 120 },
    { "epoch": 0.041767345529858474, "grad_norm": 2.257154941558838, "learning_rate": 2e-05, "loss": 1.1427, "step": 121 },
    { "epoch": 0.04211253020365896, "grad_norm": 2.4615261554718018, "learning_rate": 2e-05, "loss": 1.209, "step": 122 },
    { "epoch": 0.04245771487745944, "grad_norm": 2.215366840362549, "learning_rate": 2e-05, "loss": 1.1213, "step": 123 },
    { "epoch": 0.042802899551259925, "grad_norm": 2.293936014175415, "learning_rate": 2e-05, "loss": 1.1118, "step": 124 },
    { "epoch": 0.043148084225060404, "grad_norm": 2.351991653442383, "learning_rate": 2e-05, "loss": 1.1849, "step": 125 },
    { "epoch": 0.04349326889886089, "grad_norm": 2.118906259536743, "learning_rate": 2e-05, "loss": 1.1905, "step": 126 },
    { "epoch": 0.043838453572661376, "grad_norm": 2.1108460426330566, "learning_rate": 2e-05, "loss": 1.1787, "step": 127 },
    { "epoch": 0.044183638246461855, "grad_norm": 2.560715436935425, "learning_rate": 2e-05, "loss": 1.0727, "step": 128 },
    { "epoch": 0.04452882292026234, "grad_norm": 1.9983900785446167, "learning_rate": 2e-05, "loss": 1.1665, "step": 129 },
    { "epoch": 0.04487400759406282, "grad_norm": 2.3714847564697266, "learning_rate": 2e-05, "loss": 1.1488, "step": 130 },
    { "epoch": 0.045219192267863306, "grad_norm": 2.243546724319458, "learning_rate": 2e-05, "loss": 1.0965, "step": 131 },
    { "epoch": 0.04556437694166379, "grad_norm": 1.989858627319336, "learning_rate": 2e-05, "loss": 1.0833, "step": 132 },
    { "epoch": 0.04590956161546427, "grad_norm": 2.298943519592285, "learning_rate": 2e-05, "loss": 1.1548, "step": 133 },
    { "epoch": 0.04625474628926476, "grad_norm": 2.256289005279541, "learning_rate": 2e-05, "loss": 1.1139, "step": 134 },
    { "epoch": 0.04659993096306524, "grad_norm": 2.386530876159668, "learning_rate": 2e-05, "loss": 1.1245, "step": 135 },
    { "epoch": 0.04694511563686572, "grad_norm": 2.073687791824341, "learning_rate": 2e-05, "loss": 1.1337, "step": 136 },
    { "epoch": 0.04729030031066621, "grad_norm": 2.307697057723999, "learning_rate": 2e-05, "loss": 1.1325, "step": 137 },
    { "epoch": 0.04763548498446669, "grad_norm": 2.1158194541931152, "learning_rate": 2e-05, "loss": 1.1637, "step": 138 },
    { "epoch": 0.047980669658267174, "grad_norm": 2.21449613571167, "learning_rate": 2e-05, "loss": 1.1802, "step": 139 },
    { "epoch": 0.04832585433206766, "grad_norm": 2.041476249694824, "learning_rate": 2e-05, "loss": 1.1007, "step": 140 },
    { "epoch": 0.04867103900586814, "grad_norm": 2.262849807739258, "learning_rate": 2e-05, "loss": 1.1006, "step": 141 },
    { "epoch": 0.049016223679668625, "grad_norm": 2.207761287689209, "learning_rate": 2e-05, "loss": 1.133, "step": 142 },
    { "epoch": 0.049361408353469104, "grad_norm": 2.083613872528076, "learning_rate": 2e-05, "loss": 1.1319, "step": 143 },
    { "epoch": 0.04970659302726959, "grad_norm": 2.1225838661193848, "learning_rate": 2e-05, "loss": 1.0786, "step": 144 },
    { "epoch": 0.050051777701070076, "grad_norm": 2.17154598236084, "learning_rate": 2e-05, "loss": 1.2138, "step": 145 },
    { "epoch": 0.050396962374870555, "grad_norm": 2.655251979827881, "learning_rate": 2e-05, "loss": 1.1639, "step": 146 },
    { "epoch": 0.05074214704867104, "grad_norm": 2.241605758666992, "learning_rate": 2e-05, "loss": 1.1345, "step": 147 },
    { "epoch": 0.05108733172247152, "grad_norm": 2.397520065307617, "learning_rate": 2e-05, "loss": 1.1592, "step": 148 },
    { "epoch": 0.051432516396272006, "grad_norm": 2.2444629669189453, "learning_rate": 2e-05, "loss": 1.134, "step": 149 },
    { "epoch": 0.05177770107007249, "grad_norm": 2.179332971572876, "learning_rate": 2e-05, "loss": 1.1726, "step": 150 },
    { "epoch": 0.05212288574387297, "grad_norm": 2.1803085803985596, "learning_rate": 2e-05, "loss": 1.1458, "step": 151 },
    { "epoch": 0.05246807041767346, "grad_norm": 2.06752872467041, "learning_rate": 2e-05, "loss": 1.1741, "step": 152 },
    { "epoch": 0.052813255091473936, "grad_norm": 1.9567921161651611, "learning_rate": 2e-05, "loss": 1.1783, "step": 153 },
    { "epoch": 0.05315843976527442, "grad_norm": 1.8780710697174072, "learning_rate": 2e-05, "loss": 1.0461, "step": 154 },
    { "epoch": 0.05350362443907491, "grad_norm": 2.142355442047119, "learning_rate": 2e-05, "loss": 1.1683, "step": 155 },
    { "epoch": 0.05384880911287539, "grad_norm": 2.404834270477295, "learning_rate": 2e-05, "loss": 1.1367, "step": 156 },
    { "epoch": 0.05419399378667587, "grad_norm": 2.467586040496826, "learning_rate": 2e-05, "loss": 1.0616, "step": 157 },
    { "epoch": 0.05453917846047635, "grad_norm": 2.428678035736084, "learning_rate": 2e-05, "loss": 1.1587, "step": 158 },
    { "epoch": 0.05488436313427684, "grad_norm": 1.9721519947052002, "learning_rate": 2e-05, "loss": 1.0864, "step": 159 },
    { "epoch": 0.055229547808077324, "grad_norm": 2.282735586166382, "learning_rate": 2e-05, "loss": 1.1805, "step": 160 },
    { "epoch": 0.0555747324818778, "grad_norm": 2.136899471282959, "learning_rate": 2e-05, "loss": 1.0941, "step": 161 },
    { "epoch": 0.05591991715567829, "grad_norm": 2.0251846313476562, "learning_rate": 2e-05, "loss": 1.0796, "step": 162 },
    { "epoch": 0.05626510182947877, "grad_norm": 2.1328299045562744, "learning_rate": 2e-05, "loss": 1.0917, "step": 163 },
    { "epoch": 0.056610286503279254, "grad_norm": 2.205331802368164, "learning_rate": 2e-05, "loss": 1.1792, "step": 164 },
    { "epoch": 0.05695547117707974, "grad_norm": 2.0339744091033936, "learning_rate": 2e-05, "loss": 1.1874, "step": 165 },
    { "epoch": 0.05730065585088022, "grad_norm": 1.8030365705490112, "learning_rate": 2e-05, "loss": 1.0929, "step": 166 },
    { "epoch": 0.057645840524680705, "grad_norm": 2.1905670166015625, "learning_rate": 2e-05, "loss": 1.1831, "step": 167 },
    { "epoch": 0.057991025198481184, "grad_norm": 2.142336845397949, "learning_rate": 2e-05, "loss": 1.1237, "step": 168 },
    { "epoch": 0.05833620987228167, "grad_norm": 2.3521053791046143, "learning_rate": 2e-05, "loss": 1.1077, "step": 169 },
    { "epoch": 0.058681394546082156, "grad_norm": 2.105743408203125, "learning_rate": 2e-05, "loss": 1.1048, "step": 170 },
    { "epoch": 0.059026579219882636, "grad_norm": 1.9414738416671753, "learning_rate": 2e-05, "loss": 1.112, "step": 171 },
    { "epoch": 0.05937176389368312, "grad_norm": 1.9606658220291138, "learning_rate": 2e-05, "loss": 1.1109, "step": 172 },
    { "epoch": 0.0597169485674836, "grad_norm": 2.3274831771850586, "learning_rate": 2e-05, "loss": 1.1803, "step": 173 },
    { "epoch": 0.06006213324128409, "grad_norm": 2.1384570598602295, "learning_rate": 2e-05, "loss": 1.0515, "step": 174 },
    { "epoch": 0.06040731791508457, "grad_norm": 2.0795719623565674, "learning_rate": 2e-05, "loss": 1.0581, "step": 175 },
    { "epoch": 0.06075250258888505, "grad_norm": 2.0180423259735107, "learning_rate": 2e-05, "loss": 1.0686, "step": 176 },
    { "epoch": 0.06109768726268554, "grad_norm": 2.0913267135620117, "learning_rate": 2e-05, "loss": 1.117, "step": 177 },
    { "epoch": 0.06144287193648602, "grad_norm": 2.0325934886932373, "learning_rate": 2e-05, "loss": 1.1526, "step": 178 },
    { "epoch": 0.0617880566102865, "grad_norm": 2.222254991531372, "learning_rate": 2e-05, "loss": 1.113, "step": 179 },
    { "epoch": 0.06213324128408699, "grad_norm": 2.2039270401000977, "learning_rate": 2e-05, "loss": 1.1886, "step": 180 },
    { "epoch": 0.06247842595788747, "grad_norm": 2.0291781425476074, "learning_rate": 2e-05, "loss": 1.0884, "step": 181 },
    { "epoch": 0.06282361063168795, "grad_norm": 2.2183430194854736, "learning_rate": 2e-05, "loss": 1.0223, "step": 182 },
    { "epoch": 0.06316879530548844, "grad_norm": 2.37440824508667, "learning_rate": 2e-05, "loss": 1.0775, "step": 183 },
    { "epoch": 0.06351397997928893, "grad_norm": 1.8214384317398071, "learning_rate": 2e-05, "loss": 1.1279, "step": 184 },
    { "epoch": 0.0638591646530894, "grad_norm": 2.205291271209717, "learning_rate": 2e-05, "loss": 1.0396, "step": 185 },
    { "epoch": 0.06420434932688988, "grad_norm": 2.137577533721924, "learning_rate": 2e-05, "loss": 1.0861, "step": 186 },
    { "epoch": 0.06454953400069037, "grad_norm": 1.982663869857788, "learning_rate": 2e-05, "loss": 1.1612, "step": 187 },
    { "epoch": 0.06489471867449086, "grad_norm": 21.140506744384766, "learning_rate": 2e-05, "loss": 1.0824, "step": 188 },
    { "epoch": 0.06523990334829134, "grad_norm": 2.2611193656921387, "learning_rate": 2e-05, "loss": 1.0795, "step": 189 },
    { "epoch": 0.06558508802209181, "grad_norm": 2.0905325412750244, "learning_rate": 2e-05, "loss": 1.1989, "step": 190 },
    { "epoch": 0.0659302726958923, "grad_norm": 1.9430997371673584, "learning_rate": 2e-05, "loss": 1.0885, "step": 191 },
    { "epoch": 0.06627545736969279, "grad_norm": 1.8876497745513916, "learning_rate": 2e-05, "loss": 1.0708, "step": 192 },
    { "epoch": 0.06662064204349327, "grad_norm": 2.0716099739074707, "learning_rate": 2e-05, "loss": 1.124, "step": 193 },
    { "epoch": 0.06696582671729376, "grad_norm": 2.413959503173828, "learning_rate": 2e-05, "loss": 1.1856, "step": 194 },
    { "epoch": 0.06731101139109423, "grad_norm": 1.8021107912063599, "learning_rate": 2e-05, "loss": 1.1284, "step": 195 },
    { "epoch": 0.06765619606489472, "grad_norm": 2.2795395851135254, "learning_rate": 2e-05, "loss": 1.1101, "step": 196 },
    { "epoch": 0.0680013807386952, "grad_norm": 1.936448097229004, "learning_rate": 2e-05, "loss": 1.0921, "step": 197 },
    { "epoch": 0.06834656541249569, "grad_norm": 1.940928339958191, "learning_rate": 2e-05, "loss": 1.1236, "step": 198 },
    { "epoch": 0.06869175008629617, "grad_norm": 2.1147520542144775, "learning_rate": 2e-05, "loss": 1.0332, "step": 199 },
    { "epoch": 0.06903693476009665, "grad_norm": 1.9784513711929321, "learning_rate": 2e-05, "loss": 1.0644, "step": 200 },
    { "epoch": 0.06938211943389713, "grad_norm": 2.135711431503296, "learning_rate": 2e-05, "loss": 1.0189, "step": 201 },
    { "epoch": 0.06972730410769762, "grad_norm": 2.3416550159454346, "learning_rate": 2e-05, "loss": 1.0788, "step": 202 },
    { "epoch": 0.0700724887814981, "grad_norm": 2.143134593963623, "learning_rate": 2e-05, "loss": 1.0657, "step": 203 },
    { "epoch": 0.07041767345529859, "grad_norm": 9.058279991149902, "learning_rate": 2e-05, "loss": 1.0657, "step": 204 },
    { "epoch": 0.07076285812909906, "grad_norm": 2.2367799282073975, "learning_rate": 2e-05, "loss": 1.0822, "step": 205 },
    { "epoch": 0.07110804280289955, "grad_norm": 1.9047443866729736, "learning_rate": 2e-05, "loss": 1.0999, "step": 206 },
    { "epoch": 0.07145322747670003, "grad_norm": 2.307863473892212, "learning_rate": 2e-05, "loss": 1.1275, "step": 207 },
    { "epoch": 0.07179841215050052, "grad_norm": 2.0635058879852295, "learning_rate": 2e-05, "loss": 1.0477, "step": 208 },
    { "epoch": 0.072143596824301, "grad_norm": 2.144148111343384, "learning_rate": 2e-05, "loss": 1.0351, "step": 209 },
    { "epoch": 0.07248878149810148, "grad_norm": 1.9361531734466553, "learning_rate": 2e-05, "loss": 1.0704, "step": 210 },
    { "epoch": 0.07283396617190196, "grad_norm": 1.8395414352416992, "learning_rate": 2e-05, "loss": 1.1099, "step": 211 },
    { "epoch": 0.07317915084570245, "grad_norm": 2.1172099113464355, "learning_rate": 2e-05, "loss": 1.1169, "step": 212 },
    { "epoch": 0.07352433551950294, "grad_norm": 2.084325075149536, "learning_rate": 2e-05, "loss": 1.0347, "step": 213 },
    { "epoch": 0.07386952019330342, "grad_norm": 2.1838455200195312, "learning_rate": 2e-05, "loss": 1.0926, "step": 214 },
    { "epoch": 0.0742147048671039, "grad_norm": 2.1516849994659424, "learning_rate": 2e-05, "loss": 1.082, "step": 215 },
    { "epoch": 0.07455988954090438, "grad_norm": 2.011460781097412, "learning_rate": 2e-05, "loss": 1.073, "step": 216 },
    { "epoch": 0.07490507421470487, "grad_norm": 2.2096571922302246, "learning_rate": 2e-05, "loss": 1.1239, "step": 217 },
    { "epoch": 0.07525025888850535, "grad_norm": 2.088879346847534, "learning_rate": 2e-05, "loss": 1.1313, "step": 218 },
    { "epoch": 0.07559544356230584, "grad_norm": 2.0846405029296875, "learning_rate": 2e-05, "loss": 0.9951, "step": 219 },
    { "epoch": 0.07594062823610631, "grad_norm": 1.9645204544067383, "learning_rate": 2e-05, "loss": 1.0431, "step": 220 },
    { "epoch": 0.0762858129099068, "grad_norm": 2.1063179969787598, "learning_rate": 2e-05, "loss": 1.0659, "step": 221 },
    { "epoch": 0.07663099758370728, "grad_norm": 2.0268285274505615, "learning_rate": 2e-05, "loss": 1.0909, "step": 222 },
    { "epoch": 0.07697618225750777, "grad_norm": 1.9405102729797363, "learning_rate": 2e-05, "loss": 1.0, "step": 223 },
    { "epoch": 0.07732136693130826, "grad_norm": 2.1061298847198486, "learning_rate": 2e-05, "loss": 1.083, "step": 224 },
    { "epoch": 0.07766655160510873, "grad_norm": 2.07513165473938, "learning_rate": 2e-05, "loss": 1.0872, "step": 225 },
    { "epoch": 0.07801173627890921, "grad_norm": 2.2630527019500732, "learning_rate": 2e-05, "loss": 1.0296, "step": 226 },
    { "epoch": 0.0783569209527097, "grad_norm": 2.0668439865112305, "learning_rate": 2e-05, "loss": 1.1433, "step": 227 },
    { "epoch": 0.07870210562651019, "grad_norm": 2.3092525005340576, "learning_rate": 2e-05, "loss": 1.0347, "step": 228 },
    { "epoch": 0.07904729030031067, "grad_norm": 2.0190646648406982, "learning_rate": 2e-05, "loss": 1.0423, "step": 229 },
    { "epoch": 0.07939247497411114, "grad_norm": 2.0675878524780273, "learning_rate": 2e-05, "loss": 1.1551, "step": 230 },
    { "epoch": 0.07973765964791163, "grad_norm": 2.282857894897461, "learning_rate": 2e-05, "loss": 1.043, "step": 231 },
    { "epoch": 0.08008284432171212, "grad_norm": 1.886343240737915, "learning_rate": 2e-05, "loss": 1.0244, "step": 232 },
    { "epoch": 0.0804280289955126, "grad_norm": 2.2882308959960938, "learning_rate": 2e-05, "loss": 1.0228, "step": 233 },
    { "epoch": 0.08077321366931309, "grad_norm": 2.05058217048645, "learning_rate": 2e-05, "loss": 1.0567, "step": 234 },
    { "epoch": 0.08111839834311356, "grad_norm": 2.2782809734344482, "learning_rate": 2e-05, "loss": 1.0744, "step": 235 },
    { "epoch": 0.08146358301691405, "grad_norm": 1.9740854501724243, "learning_rate": 2e-05, "loss": 1.0601, "step": 236 },
    { "epoch": 0.08180876769071453, "grad_norm": 2.08333158493042, "learning_rate": 2e-05, "loss": 1.0873, "step": 237 },
    { "epoch": 0.08215395236451502, "grad_norm": 2.0546019077301025, "learning_rate": 2e-05, "loss": 1.088, "step": 238 },
    { "epoch": 0.0824991370383155, "grad_norm": 1.9426814317703247, "learning_rate": 2e-05, "loss": 1.0512, "step": 239 },
    { "epoch": 0.08284432171211598, "grad_norm": 2.0802295207977295, "learning_rate": 2e-05, "loss": 1.1123, "step": 240 },
    { "epoch": 0.08318950638591646, "grad_norm": 2.010526657104492, "learning_rate": 2e-05, "loss": 1.0883, "step": 241 },
    { "epoch": 0.08353469105971695, "grad_norm": 2.083188056945801, "learning_rate": 2e-05, "loss": 1.0987, "step": 242 },
    { "epoch": 0.08387987573351743, "grad_norm": 2.137660264968872, "learning_rate": 2e-05, "loss": 0.9924, "step": 243 },
    { "epoch": 0.08422506040731792, "grad_norm": 2.041710376739502, "learning_rate": 2e-05, "loss": 1.1403, "step": 244 },
    { "epoch": 0.08457024508111839, "grad_norm": 2.0598714351654053, "learning_rate": 2e-05, "loss": 1.02, "step": 245 },
    { "epoch": 0.08491542975491888, "grad_norm": 2.168576955795288, "learning_rate": 2e-05, "loss": 1.0771, "step": 246 },
    { "epoch": 0.08526061442871936, "grad_norm": 2.145132303237915, "learning_rate": 2e-05, "loss": 1.0746, "step": 247 },
    { "epoch": 0.08560579910251985, "grad_norm": 2.24804425239563, "learning_rate": 2e-05, "loss": 1.0898, "step": 248 },
    { "epoch": 0.08595098377632034, "grad_norm": 1.8360575437545776, "learning_rate": 2e-05, "loss": 1.0741, "step": 249 },
    { "epoch": 0.08629616845012081, "grad_norm": 2.1169514656066895, "learning_rate": 2e-05, "loss": 1.1025, "step": 250 },
    { "epoch": 0.0866413531239213, "grad_norm": 1.929721474647522, "learning_rate": 2e-05, "loss": 1.0455, "step": 251 },
    { "epoch": 0.08698653779772178, "grad_norm": 5.5121026039123535, "learning_rate": 2e-05, "loss": 1.07, "step": 252 },
    { "epoch": 0.08733172247152227, "grad_norm": 2.2410662174224854, "learning_rate": 2e-05, "loss": 1.1145, "step": 253 },
    { "epoch": 0.08767690714532275, "grad_norm": 2.027545213699341, "learning_rate": 2e-05, "loss": 0.9882, "step": 254 },
    { "epoch": 0.08802209181912322, "grad_norm": 2.0337374210357666, "learning_rate": 2e-05, "loss": 0.999, "step": 255 },
    { "epoch": 0.08836727649292371, "grad_norm": 2.1120731830596924, "learning_rate": 2e-05, "loss": 1.0424, "step": 256 },
    { "epoch": 0.0887124611667242, "grad_norm": 2.039837121963501, "learning_rate": 2e-05, "loss": 1.0316, "step": 257 },
    { "epoch": 0.08905764584052468, "grad_norm": 2.008521318435669, "learning_rate": 2e-05, "loss": 1.019, "step": 258 },
    { "epoch": 0.08940283051432517, "grad_norm": 2.081023693084717, "learning_rate": 2e-05, "loss": 1.003, "step": 259 },
    { "epoch": 0.08974801518812564, "grad_norm": 4.9017229080200195, "learning_rate": 2e-05, "loss": 1.1215, "step": 260 },
    { "epoch": 0.09009319986192613, "grad_norm": 2.0149810314178467, "learning_rate": 2e-05, "loss": 1.0643, "step": 261 },
    { "epoch": 0.09043838453572661, "grad_norm": 2.0311553478240967, "learning_rate": 2e-05, "loss": 1.0287, "step": 262 },
    { "epoch": 0.0907835692095271, "grad_norm": 2.089172124862671, "learning_rate": 2e-05, "loss": 1.0435, "step": 263 },
    { "epoch": 0.09112875388332758, "grad_norm": 2.233536720275879, "learning_rate": 2e-05, "loss": 1.0787, "step": 264 },
    { "epoch": 0.09147393855712806, "grad_norm": 2.050518035888672, "learning_rate": 2e-05, "loss": 1.0818, "step": 265 },
    { "epoch": 0.09181912323092854, "grad_norm": 2.117332935333252, "learning_rate": 2e-05, "loss": 1.0334, "step": 266 },
    { "epoch": 0.09216430790472903, "grad_norm": 2.1400556564331055, "learning_rate": 2e-05, "loss": 1.0997, "step": 267 },
    { "epoch": 0.09250949257852951, "grad_norm": 6.34127950668335, "learning_rate": 2e-05, "loss": 1.0854, "step": 268 },
    { "epoch": 0.09285467725233, "grad_norm": 2.346954584121704, "learning_rate": 2e-05, "loss": 1.041, "step": 269 },
    { "epoch": 0.09319986192613049, "grad_norm": 2.049189329147339, "learning_rate": 2e-05, "loss": 1.1082, "step": 270 },
    { "epoch": 0.09354504659993096, "grad_norm": 2.0327305793762207, "learning_rate": 2e-05, "loss": 1.0755, "step": 271 },
    { "epoch": 0.09389023127373144, "grad_norm": 2.1110620498657227, "learning_rate": 2e-05, "loss": 1.0721, "step": 272 },
    { "epoch": 0.09423541594753193, "grad_norm": 1.8782284259796143, "learning_rate": 2e-05, "loss": 0.9842, "step": 273 },
    { "epoch": 0.09458060062133242, "grad_norm": 1.8504958152770996, "learning_rate": 2e-05, "loss": 1.0183, "step": 274 },
    { "epoch": 0.0949257852951329, "grad_norm": 2.0672526359558105, "learning_rate": 2e-05, "loss": 1.0126, "step": 275 },
    { "epoch": 0.09527096996893337, "grad_norm": 2.104374885559082, "learning_rate": 2e-05, "loss": 1.0911, "step": 276 },
    { "epoch": 0.09561615464273386, "grad_norm": 1.9912065267562866, "learning_rate": 2e-05, "loss": 1.0659, "step": 277 },
    { "epoch": 0.09596133931653435, "grad_norm": 2.093083143234253, "learning_rate": 2e-05, "loss": 1.0771, "step": 278 },
    { "epoch": 0.09630652399033483, "grad_norm": 1.92844820022583, "learning_rate": 2e-05, "loss": 1.006, "step": 279 },
    { "epoch": 0.09665170866413532, "grad_norm": 1.7608734369277954, "learning_rate": 2e-05, "loss": 1.0353, "step": 280 },
    { "epoch": 0.09699689333793579, "grad_norm": 2.1199417114257812, "learning_rate": 2e-05, "loss": 1.0065, "step": 281 },
    { "epoch": 0.09734207801173628, "grad_norm": 1.7883626222610474, "learning_rate": 2e-05, "loss": 1.0815, "step": 282 },
    { "epoch": 0.09768726268553676, "grad_norm": 1.9652053117752075, "learning_rate": 2e-05, "loss": 1.0158, "step": 283 },
    { "epoch": 0.09803244735933725, "grad_norm": 2.1138057708740234, "learning_rate": 2e-05, "loss": 1.029, "step": 284 },
    { "epoch": 0.09837763203313774, "grad_norm": 2.0844762325286865, "learning_rate": 2e-05, "loss": 1.0783, "step": 285 },
    { "epoch": 0.09872281670693821, "grad_norm": 1.954588532447815, "learning_rate": 2e-05, "loss": 1.0256, "step": 286 },
    { "epoch": 0.0990680013807387, "grad_norm": 2.3191030025482178, "learning_rate": 2e-05, "loss": 1.1083, "step": 287 },
    { "epoch": 0.09941318605453918, "grad_norm": 2.0656120777130127, "learning_rate": 2e-05, "loss": 1.0798, "step": 288 },
    { "epoch": 0.09975837072833967, "grad_norm": 1.8802495002746582, "learning_rate": 2e-05, "loss": 1.0258, "step": 289 },
    { "epoch": 0.10010355540214015, "grad_norm": 2.062614917755127, "learning_rate": 2e-05, "loss": 1.0565, "step": 290 },
    { "epoch": 0.10044874007594062, "grad_norm": 2.0783498287200928, "learning_rate": 2e-05, "loss": 1.1027, "step": 291 },
    { "epoch": 0.10079392474974111, "grad_norm": 2.0610833168029785, "learning_rate": 2e-05, "loss": 1.0019, "step": 292 },
    { "epoch": 0.1011391094235416, "grad_norm": 2.029587745666504, "learning_rate": 2e-05, "loss": 1.0369, "step": 293 },
    { "epoch": 0.10148429409734208, "grad_norm": 1.8925073146820068, "learning_rate": 2e-05, "loss": 1.0196, "step": 294 },
    { "epoch": 0.10182947877114257, "grad_norm": 1.9382961988449097, "learning_rate": 2e-05, "loss": 1.0812, "step": 295 },
    { "epoch": 0.10217466344494304, "grad_norm": 1.8473429679870605, "learning_rate": 2e-05, "loss": 1.0407, "step": 296 },
    { "epoch": 0.10251984811874353, "grad_norm": 2.0330350399017334, "learning_rate": 2e-05, "loss": 1.0093, "step": 297 },
    { "epoch": 0.10286503279254401, "grad_norm": 2.0864484310150146, "learning_rate": 2e-05, "loss": 1.0155, "step": 298 },
    { "epoch": 0.1032102174663445, "grad_norm": 1.8072278499603271, "learning_rate": 2e-05, "loss": 1.0674, "step": 299 },
    { "epoch": 0.10355540214014498, "grad_norm": 2.2771360874176025, "learning_rate": 2e-05, "loss": 1.0345, "step": 300 },
    { "epoch": 0.10390058681394546, "grad_norm": 1.8649107217788696, "learning_rate": 2e-05, "loss": 0.9528, "step": 301 },
    { "epoch": 0.10424577148774594, "grad_norm": 1.751585841178894, "learning_rate": 2e-05, "loss": 0.9587, "step": 302 },
    { "epoch": 0.10459095616154643, "grad_norm": 2.0218489170074463, "learning_rate": 2e-05, "loss": 0.9844, "step": 303 },
    { "epoch": 0.10493614083534691, "grad_norm": 2.0804543495178223, "learning_rate": 2e-05, "loss": 1.0221, "step": 304 },
    { "epoch": 0.1052813255091474, "grad_norm": 2.1906816959381104, "learning_rate": 2e-05, "loss": 1.0347, "step": 305 },
    { "epoch": 0.10562651018294787, "grad_norm": 1.852725863456726, "learning_rate": 2e-05, "loss": 1.0482, "step": 306 },
    { "epoch": 0.10597169485674836, "grad_norm": 1.9083342552185059, "learning_rate": 2e-05, "loss": 1.0901, "step": 307 },
    { "epoch": 0.10631687953054884, "grad_norm": 2.0140769481658936, "learning_rate": 2e-05, "loss": 1.087, "step": 308 },
    { "epoch": 0.10666206420434933, "grad_norm": 1.8648165464401245, "learning_rate": 2e-05, "loss": 1.0129, "step": 309 },
    { "epoch": 0.10700724887814982, "grad_norm": 2.034452438354492, "learning_rate": 2e-05, "loss": 1.0424, "step": 310 },
    { "epoch": 0.10735243355195029, "grad_norm": 1.7214909791946411, "learning_rate": 2e-05, "loss": 1.0461, "step": 311 },
    { "epoch": 0.10769761822575077, "grad_norm": 2.31937575340271, "learning_rate": 2e-05, "loss": 1.0606, "step": 312 },
    { "epoch": 0.10804280289955126, "grad_norm": 1.9707671403884888, "learning_rate": 2e-05, "loss": 0.9882, "step": 313 },
    { "epoch": 0.10838798757335175, "grad_norm": 1.837477207183838, "learning_rate": 2e-05, "loss": 0.9285, "step": 314 },
    { "epoch": 0.10873317224715223, "grad_norm": 1.8579028844833374, "learning_rate": 2e-05, "loss": 1.0121, "step": 315 },
    { "epoch": 0.1090783569209527, "grad_norm": 1.8835140466690063, "learning_rate": 2e-05, "loss": 1.0724, "step": 316 },
    { "epoch": 0.10942354159475319, "grad_norm": 1.7847641706466675, "learning_rate": 2e-05, "loss": 1.0265, "step": 317 },
    { "epoch": 0.10976872626855368, "grad_norm": 2.0330307483673096, "learning_rate": 2e-05, "loss": 1.0218, "step": 318 },
    { "epoch": 0.11011391094235416, "grad_norm": 1.8466086387634277, "learning_rate": 2e-05, "loss": 1.0594, "step": 319 },
    { "epoch": 0.11045909561615465, "grad_norm": 1.884079933166504, "learning_rate": 2e-05, "loss": 1.0734, "step": 320 },
    { "epoch": 0.11080428028995512, "grad_norm": 1.8187580108642578, "learning_rate": 2e-05, "loss": 0.9995, "step": 321 },
    { "epoch": 0.1111494649637556, "grad_norm": 2.196646213531494, "learning_rate": 2e-05, "loss": 1.0265, "step": 322 },
    { "epoch": 0.11149464963755609, "grad_norm": 1.9797489643096924, "learning_rate": 2e-05, "loss": 1.0325, "step": 323 },
    { "epoch": 0.11183983431135658, "grad_norm": 2.0785601139068604, "learning_rate": 2e-05, "loss": 1.1051, "step": 324 },
    { "epoch": 0.11218501898515706, "grad_norm": 2.0432729721069336, "learning_rate": 2e-05, "loss": 1.0815, "step": 325 },
    { "epoch": 0.11253020365895754, "grad_norm": 2.0694308280944824, "learning_rate": 2e-05, "loss": 1.0851, "step": 326 },
    { "epoch": 0.11287538833275802, "grad_norm": 1.8386410474777222, "learning_rate": 2e-05, "loss": 1.0106, "step": 327 },
    { "epoch": 0.11322057300655851, "grad_norm": 2.018885850906372, "learning_rate": 2e-05, "loss": 1.0318, "step": 328 },
    { "epoch": 0.113565757680359, "grad_norm": 2.105708360671997, "learning_rate": 2e-05, "loss": 1.0567, "step": 329 },
    { "epoch": 0.11391094235415948, "grad_norm": 1.9204944372177124, "learning_rate": 2e-05, "loss": 1.0262, "step": 330 },
    { "epoch": 0.11425612702795995, "grad_norm": 1.9768996238708496, "learning_rate": 2e-05, "loss": 1.0085, "step": 331 },
    { "epoch": 0.11460131170176044, "grad_norm": 1.785104751586914, "learning_rate": 2e-05, "loss": 1.102, "step": 332 },
    { "epoch": 0.11494649637556092, "grad_norm": 2.0644032955169678, "learning_rate": 2e-05, "loss": 1.0617, "step": 333 },
    { "epoch": 0.11529168104936141, "grad_norm": 2.0390021800994873, "learning_rate": 2e-05, "loss": 0.9609, "step": 334 },
    { "epoch": 0.1156368657231619, "grad_norm": 1.919952392578125, "learning_rate": 2e-05, "loss": 1.0701, "step": 335 },
    { "epoch": 0.11598205039696237, "grad_norm": 1.8437055349349976, "learning_rate": 2e-05, "loss": 1.0486, "step": 336 },
    { "epoch": 0.11632723507076285, "grad_norm": 1.864856243133545, "learning_rate": 2e-05, "loss": 1.0564, "step": 337 },
    { "epoch": 0.11667241974456334, "grad_norm": 2.0349135398864746, "learning_rate": 2e-05, "loss": 0.9971, "step": 338 },
    { "epoch": 0.11701760441836383, "grad_norm": 2.007643461227417, "learning_rate": 2e-05, "loss": 0.986, "step": 339 },
    { "epoch": 0.11736278909216431, "grad_norm": 2.1241238117218018, "learning_rate": 2e-05, "loss": 1.0278, "step": 340 },
    { "epoch": 0.11770797376596479, "grad_norm": 1.960552453994751, "learning_rate": 2e-05, "loss": 1.057, "step": 341 },
    { "epoch": 0.11805315843976527, "grad_norm": 2.3228769302368164, "learning_rate": 2e-05, "loss": 1.029, "step": 342 },
    { "epoch": 0.11839834311356576, "grad_norm": 1.9010450839996338, "learning_rate": 2e-05, "loss": 0.9918, "step": 343 },
    { "epoch": 0.11874352778736624, "grad_norm": 1.9832115173339844, "learning_rate": 2e-05, "loss": 0.9904, "step": 344 },
    { "epoch": 0.11908871246116673, "grad_norm": 2.176405668258667, "learning_rate": 2e-05, "loss": 1.0155, "step": 345 },
    { "epoch": 0.1194338971349672, "grad_norm": 2.0718116760253906, "learning_rate": 2e-05, "loss": 1.0706, "step": 346 },
    { "epoch": 0.11977908180876769, "grad_norm": 2.000976085662842, "learning_rate": 2e-05, "loss": 0.9916, "step": 347 },
    { "epoch": 0.12012426648256817, "grad_norm": 1.9181327819824219, "learning_rate": 2e-05, "loss": 1.0556, "step": 348 },
    { "epoch": 0.12046945115636866, "grad_norm": 1.7830644845962524, "learning_rate": 2e-05, "loss": 0.9755, "step": 349 },
    { "epoch": 0.12081463583016915, "grad_norm": 2.0355966091156006, "learning_rate": 2e-05, "loss": 1.0421, "step": 350 },
    { "epoch": 0.12115982050396962, "grad_norm": 1.8209973573684692, "learning_rate": 2e-05, "loss": 1.0213, "step": 351 },
    { "epoch": 0.1215050051777701, "grad_norm": 1.9484202861785889, "learning_rate": 2e-05, "loss": 1.11, "step": 352 },
    { "epoch": 0.12185018985157059, "grad_norm": 1.959164023399353, "learning_rate": 2e-05, "loss": 1.0318, "step": 353 },
    { "epoch": 0.12219537452537108, "grad_norm": 1.893936276435852, "learning_rate": 2e-05, "loss": 1.0534, "step": 354 },
    { "epoch": 0.12254055919917156, "grad_norm": 1.9669185876846313, "learning_rate": 2e-05, "loss": 0.9959, "step": 355 },
    { "epoch": 0.12288574387297203, "grad_norm": 2.152151584625244, "learning_rate": 2e-05, "loss": 1.0081, "step": 356 },
    { "epoch": 0.12323092854677252, "grad_norm": 2.203021764755249, "learning_rate": 2e-05, "loss": 1.0443, "step": 357 },
    { "epoch": 0.123576113220573, "grad_norm": 2.069221258163452, "learning_rate": 2e-05, "loss": 1.0519, "step": 358 },
    { "epoch": 0.12392129789437349, "grad_norm": 2.054393768310547, "learning_rate": 2e-05, "loss": 1.0314, "step": 359 },
    { "epoch": 0.12426648256817398, "grad_norm": 2.0708425045013428, "learning_rate": 2e-05, "loss": 0.9856, "step": 360 },
    { "epoch": 0.12461166724197445, "grad_norm": 2.4350380897521973, "learning_rate": 2e-05, "loss": 0.9796, "step": 361 },
    { "epoch": 0.12495685191577494, "grad_norm": 1.8085037469863892, "learning_rate": 2e-05, "loss": 0.9932, "step": 362 },
    { "epoch": 0.12530203658957542, "grad_norm": 1.824069619178772, "learning_rate": 2e-05, "loss": 1.024, "step": 363 },
    { "epoch": 0.1256472212633759, "grad_norm": 2.675426959991455, "learning_rate": 2e-05, "loss": 1.0231, "step": 364 },
    { "epoch": 0.1259924059371764, "grad_norm": 2.127661943435669, "learning_rate": 2e-05, "loss": 1.043, "step": 365 },
    { "epoch": 0.12633759061097688, "grad_norm": 1.9300974607467651, "learning_rate": 2e-05, "loss": 1.0163, "step": 366 },
    { "epoch": 0.12668277528477737, "grad_norm": 1.984744668006897, "learning_rate": 2e-05, "loss": 1.0101, "step": 367 },
    { "epoch": 0.12702795995857785, "grad_norm": 1.9173483848571777, "learning_rate": 2e-05, "loss": 1.0672, "step": 368 },
    { "epoch": 0.1273731446323783, "grad_norm": 2.148045778274536, "learning_rate": 2e-05, "loss": 1.0037, "step": 369 },
    { "epoch": 0.1277183293061788, "grad_norm": 2.0799989700317383, "learning_rate": 2e-05, "loss": 1.0067, "step": 370 },
    { "epoch": 0.12806351397997928, "grad_norm": 1.5450844764709473, "learning_rate": 2e-05, "loss": 0.9314, "step": 371 },
    { "epoch": 0.12840869865377977, "grad_norm": 2.0938477516174316, "learning_rate": 2e-05, "loss": 1.0326, "step": 372 },
    { "epoch": 0.12875388332758025, "grad_norm": 2.148625612258911, "learning_rate": 2e-05, "loss": 1.006, "step": 373 },
    { "epoch": 0.12909906800138074, "grad_norm": 1.964438796043396, "learning_rate": 2e-05, "loss": 0.9921, "step": 374 },
    { "epoch": 0.12944425267518123, "grad_norm": 1.7084150314331055, "learning_rate": 2e-05, "loss": 1.0276, "step": 375 },
    { "epoch": 0.1297894373489817, "grad_norm": 1.8643776178359985, "learning_rate": 2e-05, "loss": 1.071, "step": 376 },
    { "epoch": 0.1301346220227822, "grad_norm": 1.9768637418746948, "learning_rate": 2e-05, "loss": 1.0634, "step": 377 },
    { "epoch": 0.13047980669658268, "grad_norm": 1.8979171514511108, "learning_rate": 2e-05, "loss": 1.0746, "step": 378 },
    { "epoch": 0.13082499137038314, "grad_norm": 2.2266244888305664, "learning_rate": 2e-05, "loss": 1.0139, "step": 379 },
    { "epoch": 0.13117017604418363, "grad_norm": 2.767505645751953, "learning_rate": 2e-05, "loss": 1.0483, "step": 380 },
    { "epoch": 0.13151536071798411, "grad_norm": 1.9986623525619507, "learning_rate": 2e-05, "loss": 1.0527, "step": 381 },
    { "epoch": 0.1318605453917846, "grad_norm": 2.7748537063598633, "learning_rate": 2e-05, "loss": 1.0117, "step": 382 },
    { "epoch": 0.1322057300655851, "grad_norm": 2.404034376144409, "learning_rate": 2e-05, "loss": 1.0509, "step": 383 },
    { "epoch": 0.13255091473938557, "grad_norm": 1.8266741037368774, "learning_rate": 2e-05, "loss": 0.9569, "step": 384 },
    { "epoch": 0.13289609941318606, "grad_norm": 1.6617871522903442, "learning_rate": 2e-05, "loss": 1.0284, "step": 385 },
    { "epoch": 0.13324128408698654, "grad_norm": 2.1641674041748047, "learning_rate": 2e-05, "loss": 1.0185, "step": 386 },
    { "epoch": 0.13358646876078703, "grad_norm": 2.230027675628662, "learning_rate": 2e-05, "loss": 1.0059, "step": 387 },
    { "epoch": 0.13393165343458752, "grad_norm": 2.0304672718048096, "learning_rate": 2e-05, "loss": 0.9458, "step": 388 },
    { "epoch": 0.13427683810838797, "grad_norm": 1.9289956092834473, "learning_rate": 2e-05, "loss": 0.9867, "step": 389 },
    { "epoch": 0.13462202278218846, "grad_norm": 2.290512800216675, "learning_rate": 2e-05, "loss": 1.0438, "step": 390 },
    { "epoch": 0.13496720745598895, "grad_norm": 1.9645930528640747, "learning_rate": 2e-05, "loss": 1.0647, "step": 391 },
    { "epoch": 0.13531239212978943, "grad_norm": 1.9078412055969238, "learning_rate": 2e-05, "loss": 1.0219, "step": 392 },
    { "epoch": 0.13565757680358992, "grad_norm": 1.8987268209457397, "learning_rate": 2e-05, "loss": 0.9697, "step": 393 },
    { "epoch": 0.1360027614773904, "grad_norm": 1.8962979316711426, "learning_rate": 2e-05, "loss": 1.0337, "step": 394 },
    { "epoch": 0.1363479461511909, "grad_norm": 1.955389380455017, "learning_rate": 2e-05, "loss": 0.9921, "step": 395 },
    { "epoch": 0.13669313082499138, "grad_norm": 1.89638352394104, "learning_rate": 2e-05, "loss": 0.9957, "step": 396 },
    { "epoch": 0.13703831549879186, "grad_norm": 1.949547290802002, "learning_rate": 2e-05, "loss": 1.0009, "step": 397 },
    { "epoch": 0.13738350017259235, "grad_norm": 1.9283349514007568, "learning_rate": 2e-05, "loss": 0.9941, "step": 398 },
    { "epoch": 0.1377286848463928, "grad_norm": 1.9636731147766113, "learning_rate": 2e-05, "loss": 0.9358, "step": 399 },
    { "epoch": 0.1380738695201933, "grad_norm": 1.988175630569458, "learning_rate": 2e-05, "loss": 1.0143, "step": 400 }
  ],
  "logging_steps": 1.0,
  "max_steps": 14485,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 165248866713600.0,
  "train_batch_size": 12,
  "trial_name": null,
  "trial_params": null
}
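A minimal sketch of how a `trainer_state.json` like the one above can be inspected. It is not part of the checkpoint itself; it assumes the file sits at the relative path shown in the breadcrumb and that `matplotlib` is installed. The output filename `loss_curve.png` is arbitrary.

```python
import json

import matplotlib.pyplot as plt

# Path assumed from the breadcrumb; adjust to wherever the checkpoint lives.
with open("checkpoint-400/trainer_state.json") as f:
    state = json.load(f)

# Each log_history record written at a training step carries "loss" and "step";
# evaluation records (none in this checkpoint) would lack the "loss" key.
records = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in records]
losses = [e["loss"] for e in records]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("lora-embedding-128-llama3.2-3b-malaysian-8k, checkpoint-400")
plt.savefig("loss_curve.png")
```

The same pattern works for `grad_norm`, which is worth plotting here: the log shows isolated spikes (e.g. steps 188, 204, 252, 260, 268) against an otherwise steady decline from ~25 at step 1 to ~2 by step 400.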