{
  "best_metric": 2.435030460357666,
  "best_model_checkpoint": "zhtw-en/checkpoint-92500",
  "epoch": 3.0,
  "eval_steps": 2500,
  "global_step": 93276,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016081307088640164,
      "grad_norm": 8.550692558288574,
      "learning_rate": 4.919593464556799e-05,
      "loss": 3.4707,
      "num_input_tokens_seen": 301408,
      "step": 500
    },
    {
      "epoch": 0.03216261417728033,
      "grad_norm": 9.999076843261719,
      "learning_rate": 4.8391869291135987e-05,
      "loss": 3.3664,
      "num_input_tokens_seen": 597672,
      "step": 1000
    },
    {
      "epoch": 0.048243921265920496,
      "grad_norm": 8.839406967163086,
      "learning_rate": 4.7587803936703975e-05,
      "loss": 3.3089,
      "num_input_tokens_seen": 896520,
      "step": 1500
    },
    {
      "epoch": 0.06432522835456066,
      "grad_norm": 7.105090618133545,
      "learning_rate": 4.678373858227197e-05,
      "loss": 3.267,
      "num_input_tokens_seen": 1194832,
      "step": 2000
    },
    {
      "epoch": 0.08040653544320082,
      "grad_norm": 7.992733001708984,
      "learning_rate": 4.597967322783996e-05,
      "loss": 3.2254,
      "num_input_tokens_seen": 1493088,
      "step": 2500
    },
    {
      "epoch": 0.08040653544320082,
      "eval_loss": 2.910461664199829,
      "eval_runtime": 2.5005,
      "eval_samples_per_second": 999.788,
      "eval_steps_per_second": 125.173,
      "num_input_tokens_seen": 1493088,
      "step": 2500
    },
    {
      "epoch": 0.09648784253184099,
      "grad_norm": 8.68657112121582,
      "learning_rate": 4.5175607873407955e-05,
      "loss": 3.1691,
      "num_input_tokens_seen": 1793976,
      "step": 3000
    },
    {
      "epoch": 0.11256914962048115,
      "grad_norm": 7.246800899505615,
      "learning_rate": 4.4371542518975943e-05,
      "loss": 3.1685,
      "num_input_tokens_seen": 2095352,
      "step": 3500
    },
    {
      "epoch": 0.1286504567091213,
      "grad_norm": 9.030860900878906,
      "learning_rate": 4.356747716454393e-05,
      "loss": 3.1333,
      "num_input_tokens_seen": 2393856,
      "step": 4000
    },
    {
      "epoch": 0.14473176379776148,
      "grad_norm": 7.463845252990723,
      "learning_rate": 4.276341181011193e-05,
      "loss": 3.1295,
      "num_input_tokens_seen": 2694496,
      "step": 4500
    },
    {
      "epoch": 0.16081307088640165,
      "grad_norm": 8.482089042663574,
      "learning_rate": 4.195934645567992e-05,
      "loss": 3.0946,
      "num_input_tokens_seen": 2990968,
      "step": 5000
    },
    {
      "epoch": 0.16081307088640165,
      "eval_loss": 2.830476999282837,
      "eval_runtime": 2.5201,
      "eval_samples_per_second": 992.009,
      "eval_steps_per_second": 124.199,
      "num_input_tokens_seen": 2990968,
      "step": 5000
    },
    {
      "epoch": 0.17689437797504182,
      "grad_norm": 8.570518493652344,
      "learning_rate": 4.115528110124791e-05,
      "loss": 3.112,
      "num_input_tokens_seen": 3289488,
      "step": 5500
    },
    {
      "epoch": 0.19297568506368198,
      "grad_norm": 9.759325981140137,
      "learning_rate": 4.03512157468159e-05,
      "loss": 3.0933,
      "num_input_tokens_seen": 3590264,
      "step": 6000
    },
    {
      "epoch": 0.20905699215232215,
      "grad_norm": 6.518988609313965,
      "learning_rate": 3.9547150392383896e-05,
      "loss": 3.0858,
      "num_input_tokens_seen": 3885160,
      "step": 6500
    },
    {
      "epoch": 0.2251382992409623,
      "grad_norm": 6.913475036621094,
      "learning_rate": 3.8743085037951885e-05,
      "loss": 3.0543,
      "num_input_tokens_seen": 4184600,
      "step": 7000
    },
    {
      "epoch": 0.24121960632960246,
      "grad_norm": 8.485562324523926,
      "learning_rate": 3.793901968351988e-05,
      "loss": 3.0473,
      "num_input_tokens_seen": 4477792,
      "step": 7500
    },
    {
      "epoch": 0.24121960632960246,
      "eval_loss": 2.773728847503662,
      "eval_runtime": 2.5738,
      "eval_samples_per_second": 971.323,
      "eval_steps_per_second": 121.61,
      "num_input_tokens_seen": 4477792,
      "step": 7500
    },
    {
      "epoch": 0.2573009134182426,
      "grad_norm": 7.89262056350708,
      "learning_rate": 3.713495432908787e-05,
      "loss": 3.029,
      "num_input_tokens_seen": 4779520,
      "step": 8000
    },
    {
      "epoch": 0.2733822205068828,
      "grad_norm": 6.879751205444336,
      "learning_rate": 3.6330888974655864e-05,
      "loss": 3.0127,
      "num_input_tokens_seen": 5078952,
      "step": 8500
    },
    {
      "epoch": 0.28946352759552296,
      "grad_norm": 8.109273910522461,
      "learning_rate": 3.552682362022385e-05,
      "loss": 3.0078,
      "num_input_tokens_seen": 5376128,
      "step": 9000
    },
    {
      "epoch": 0.3055448346841631,
      "grad_norm": 8.074146270751953,
      "learning_rate": 3.472275826579184e-05,
      "loss": 2.9988,
      "num_input_tokens_seen": 5671664,
      "step": 9500
    },
    {
      "epoch": 0.3216261417728033,
      "grad_norm": 6.523529529571533,
      "learning_rate": 3.391869291135984e-05,
      "loss": 2.9633,
      "num_input_tokens_seen": 5967560,
      "step": 10000
    },
    {
      "epoch": 0.3216261417728033,
      "eval_loss": 2.7306864261627197,
      "eval_runtime": 2.505,
      "eval_samples_per_second": 997.986,
      "eval_steps_per_second": 124.948,
      "num_input_tokens_seen": 5967560,
      "step": 10000
    },
    {
      "epoch": 0.33770744886144344,
      "grad_norm": 6.974269866943359,
      "learning_rate": 3.311462755692783e-05,
      "loss": 2.9732,
      "num_input_tokens_seen": 6265312,
      "step": 10500
    },
    {
      "epoch": 0.35378875595008363,
      "grad_norm": 7.644798278808594,
      "learning_rate": 3.231056220249582e-05,
      "loss": 2.9729,
      "num_input_tokens_seen": 6563632,
      "step": 11000
    },
    {
      "epoch": 0.3698700630387238,
      "grad_norm": 7.96437406539917,
      "learning_rate": 3.150649684806381e-05,
      "loss": 2.9484,
      "num_input_tokens_seen": 6865528,
      "step": 11500
    },
    {
      "epoch": 0.38595137012736397,
      "grad_norm": 7.939519882202148,
      "learning_rate": 3.0702431493631805e-05,
      "loss": 2.9387,
      "num_input_tokens_seen": 7165632,
      "step": 12000
    },
    {
      "epoch": 0.4020326772160041,
      "grad_norm": 7.698306083679199,
      "learning_rate": 2.9898366139199797e-05,
      "loss": 2.9355,
      "num_input_tokens_seen": 7463192,
      "step": 12500
    },
    {
      "epoch": 0.4020326772160041,
      "eval_loss": 2.684298515319824,
      "eval_runtime": 2.5214,
      "eval_samples_per_second": 991.52,
      "eval_steps_per_second": 124.138,
      "num_input_tokens_seen": 7463192,
      "step": 12500
    },
    {
      "epoch": 0.4181139843046443,
      "grad_norm": 7.21583890914917,
      "learning_rate": 2.9094300784767786e-05,
      "loss": 2.9418,
      "num_input_tokens_seen": 7758024,
      "step": 13000
    },
    {
      "epoch": 0.43419529139328444,
      "grad_norm": 7.767180919647217,
      "learning_rate": 2.8290235430335778e-05,
      "loss": 2.923,
      "num_input_tokens_seen": 8052032,
      "step": 13500
    },
    {
      "epoch": 0.4502765984819246,
      "grad_norm": 7.057159423828125,
      "learning_rate": 2.7486170075903773e-05,
      "loss": 2.9016,
      "num_input_tokens_seen": 8347768,
      "step": 14000
    },
    {
      "epoch": 0.4663579055705648,
      "grad_norm": 7.320003032684326,
      "learning_rate": 2.6682104721471762e-05,
      "loss": 2.885,
      "num_input_tokens_seen": 8646192,
      "step": 14500
    },
    {
      "epoch": 0.4824392126592049,
      "grad_norm": 7.630561828613281,
      "learning_rate": 2.587803936703975e-05,
      "loss": 2.9076,
      "num_input_tokens_seen": 8950264,
      "step": 15000
    },
    {
      "epoch": 0.4824392126592049,
      "eval_loss": 2.658709764480591,
      "eval_runtime": 2.6566,
      "eval_samples_per_second": 941.035,
      "eval_steps_per_second": 117.818,
      "num_input_tokens_seen": 8950264,
      "step": 15000
    },
    {
      "epoch": 0.4985205197478451,
      "grad_norm": 6.413602828979492,
      "learning_rate": 2.507397401260775e-05,
      "loss": 2.8818,
      "num_input_tokens_seen": 9247272,
      "step": 15500
    },
    {
      "epoch": 0.5146018268364853,
      "grad_norm": 6.833747863769531,
      "learning_rate": 2.426990865817574e-05,
      "loss": 2.8998,
      "num_input_tokens_seen": 9547432,
      "step": 16000
    },
    {
      "epoch": 0.5306831339251254,
      "grad_norm": 7.930506706237793,
      "learning_rate": 2.3465843303743727e-05,
      "loss": 2.8543,
      "num_input_tokens_seen": 9844072,
      "step": 16500
    },
    {
      "epoch": 0.5467644410137656,
      "grad_norm": 6.841344356536865,
      "learning_rate": 2.2661777949311722e-05,
      "loss": 2.8669,
      "num_input_tokens_seen": 10142344,
      "step": 17000
    },
    {
      "epoch": 0.5628457481024057,
      "grad_norm": 6.899343967437744,
      "learning_rate": 2.185771259487971e-05,
      "loss": 2.8714,
      "num_input_tokens_seen": 10443344,
      "step": 17500
    },
    {
      "epoch": 0.5628457481024057,
      "eval_loss": 2.6303601264953613,
      "eval_runtime": 2.5354,
      "eval_samples_per_second": 986.046,
      "eval_steps_per_second": 123.453,
      "num_input_tokens_seen": 10443344,
      "step": 17500
    },
    {
      "epoch": 0.5789270551910459,
      "grad_norm": 6.256689071655273,
      "learning_rate": 2.1053647240447703e-05,
      "loss": 2.8418,
      "num_input_tokens_seen": 10744312,
      "step": 18000
    },
    {
      "epoch": 0.5950083622796861,
      "grad_norm": 7.627821445465088,
      "learning_rate": 2.0249581886015695e-05,
      "loss": 2.8462,
      "num_input_tokens_seen": 11048208,
      "step": 18500
    },
    {
      "epoch": 0.6110896693683262,
      "grad_norm": 7.331953525543213,
      "learning_rate": 1.9445516531583687e-05,
      "loss": 2.8345,
      "num_input_tokens_seen": 11347880,
      "step": 19000
    },
    {
      "epoch": 0.6271709764569664,
      "grad_norm": 6.463207244873047,
      "learning_rate": 1.864145117715168e-05,
      "loss": 2.8531,
      "num_input_tokens_seen": 11650144,
      "step": 19500
    },
    {
      "epoch": 0.6432522835456066,
      "grad_norm": 7.423746109008789,
      "learning_rate": 1.783738582271967e-05,
      "loss": 2.8716,
      "num_input_tokens_seen": 11951096,
      "step": 20000
    },
    {
      "epoch": 0.6432522835456066,
      "eval_loss": 2.6024744510650635,
      "eval_runtime": 2.5182,
      "eval_samples_per_second": 992.755,
      "eval_steps_per_second": 124.293,
      "num_input_tokens_seen": 11951096,
      "step": 20000
    },
    {
      "epoch": 0.6593335906342468,
      "grad_norm": 7.352589130401611,
      "learning_rate": 1.7033320468287664e-05,
      "loss": 2.8243,
      "num_input_tokens_seen": 12252592,
      "step": 20500
    },
    {
      "epoch": 0.6754148977228869,
      "grad_norm": 7.22981071472168,
      "learning_rate": 1.6229255113855656e-05,
      "loss": 2.8454,
      "num_input_tokens_seen": 12546792,
      "step": 21000
    },
    {
      "epoch": 0.6914962048115271,
      "grad_norm": 6.819567680358887,
      "learning_rate": 1.5425189759423648e-05,
      "loss": 2.8047,
      "num_input_tokens_seen": 12838728,
      "step": 21500
    },
    {
      "epoch": 0.7075775119001673,
      "grad_norm": 8.716426849365234,
      "learning_rate": 1.4621124404991638e-05,
      "loss": 2.8144,
      "num_input_tokens_seen": 13137688,
      "step": 22000
    },
    {
      "epoch": 0.7236588189888075,
      "grad_norm": 7.324875831604004,
      "learning_rate": 1.381705905055963e-05,
      "loss": 2.7989,
      "num_input_tokens_seen": 13432464,
      "step": 22500
    },
    {
      "epoch": 0.7236588189888075,
      "eval_loss": 2.5822224617004395,
      "eval_runtime": 2.5158,
      "eval_samples_per_second": 993.705,
      "eval_steps_per_second": 124.412,
      "num_input_tokens_seen": 13432464,
      "step": 22500
    },
    {
      "epoch": 0.7397401260774475,
      "grad_norm": 7.962778568267822,
      "learning_rate": 1.301299369612762e-05,
      "loss": 2.7653,
      "num_input_tokens_seen": 13730024,
      "step": 23000
    },
    {
      "epoch": 0.7558214331660877,
      "grad_norm": 6.807019233703613,
      "learning_rate": 1.2208928341695614e-05,
      "loss": 2.7933,
      "num_input_tokens_seen": 14026400,
      "step": 23500
    },
    {
      "epoch": 0.7719027402547279,
      "grad_norm": 8.716556549072266,
      "learning_rate": 1.1404862987263605e-05,
      "loss": 2.7988,
      "num_input_tokens_seen": 14326608,
      "step": 24000
    },
    {
      "epoch": 0.787984047343368,
      "grad_norm": 7.388988018035889,
      "learning_rate": 1.0600797632831597e-05,
      "loss": 2.7928,
      "num_input_tokens_seen": 14623864,
      "step": 24500
    },
    {
      "epoch": 0.8040653544320082,
      "grad_norm": 7.011099815368652,
      "learning_rate": 9.796732278399589e-06,
      "loss": 2.7941,
      "num_input_tokens_seen": 14919424,
      "step": 25000
    },
    {
      "epoch": 0.8040653544320082,
      "eval_loss": 2.5630149841308594,
      "eval_runtime": 2.5601,
      "eval_samples_per_second": 976.534,
      "eval_steps_per_second": 122.262,
      "num_input_tokens_seen": 14919424,
      "step": 25000
    },
    {
      "epoch": 0.8201466615206484,
      "grad_norm": 6.740393161773682,
      "learning_rate": 8.992666923967581e-06,
      "loss": 2.8089,
      "num_input_tokens_seen": 15216136,
      "step": 25500
    },
    {
      "epoch": 0.8362279686092886,
      "grad_norm": 7.124479293823242,
      "learning_rate": 8.188601569535573e-06,
      "loss": 2.7704,
      "num_input_tokens_seen": 15515592,
      "step": 26000
    },
    {
      "epoch": 0.8523092756979287,
      "grad_norm": 7.781102180480957,
      "learning_rate": 7.384536215103564e-06,
      "loss": 2.8022,
      "num_input_tokens_seen": 15818560,
      "step": 26500
    },
    {
      "epoch": 0.8683905827865689,
      "grad_norm": 6.861135005950928,
      "learning_rate": 6.580470860671556e-06,
      "loss": 2.7891,
      "num_input_tokens_seen": 16114056,
      "step": 27000
    },
    {
      "epoch": 0.8844718898752091,
      "grad_norm": 7.128973484039307,
      "learning_rate": 5.776405506239547e-06,
      "loss": 2.7692,
      "num_input_tokens_seen": 16415080,
      "step": 27500
    },
    {
      "epoch": 0.8844718898752091,
      "eval_loss": 2.5496785640716553,
      "eval_runtime": 2.6422,
      "eval_samples_per_second": 946.175,
      "eval_steps_per_second": 118.461,
      "num_input_tokens_seen": 16415080,
      "step": 27500
    },
    {
      "epoch": 0.9005531969638492,
      "grad_norm": 8.560084342956543,
      "learning_rate": 4.9723401518075395e-06,
      "loss": 2.7627,
      "num_input_tokens_seen": 16711136,
      "step": 28000
    },
    {
      "epoch": 0.9166345040524894,
      "grad_norm": 7.5000224113464355,
      "learning_rate": 4.168274797375531e-06,
      "loss": 2.7687,
      "num_input_tokens_seen": 17005880,
      "step": 28500
    },
    {
      "epoch": 0.9327158111411296,
      "grad_norm": 6.699025630950928,
      "learning_rate": 3.3642094429435228e-06,
      "loss": 2.779,
      "num_input_tokens_seen": 17307064,
      "step": 29000
    },
    {
      "epoch": 0.9487971182297698,
      "grad_norm": 6.6417131423950195,
      "learning_rate": 2.560144088511514e-06,
      "loss": 2.7493,
      "num_input_tokens_seen": 17602296,
      "step": 29500
    },
    {
      "epoch": 0.9648784253184098,
      "grad_norm": 6.775792121887207,
      "learning_rate": 1.756078734079506e-06,
      "loss": 2.757,
      "num_input_tokens_seen": 17897832,
      "step": 30000
    },
    {
      "epoch": 0.9648784253184098,
      "eval_loss": 2.5388031005859375,
      "eval_runtime": 2.6123,
      "eval_samples_per_second": 957.018,
      "eval_steps_per_second": 119.819,
      "num_input_tokens_seen": 17897832,
      "step": 30000
    },
    {
      "epoch": 0.98095973240705,
      "grad_norm": 7.619235038757324,
      "learning_rate": 9.520133796474978e-07,
      "loss": 2.7433,
      "num_input_tokens_seen": 18195568,
      "step": 30500
    },
    {
      "epoch": 0.9970410394956902,
      "grad_norm": 6.682379722595215,
      "learning_rate": 1.479480252154895e-07,
      "loss": 2.7266,
      "num_input_tokens_seen": 18491904,
      "step": 31000
    },
    {
      "epoch": 1.0131223465843304,
      "grad_norm": 6.5948309898376465,
      "learning_rate": 3.311462755692783e-05,
      "loss": 2.6645,
      "num_input_tokens_seen": 18790628,
      "step": 31500
    },
    {
      "epoch": 1.0292036536729705,
      "grad_norm": 6.920671463012695,
      "learning_rate": 3.2846605772117164e-05,
      "loss": 2.6881,
      "num_input_tokens_seen": 19090780,
      "step": 32000
    },
    {
      "epoch": 1.0452849607616108,
      "grad_norm": 6.296219348907471,
      "learning_rate": 3.257858398730649e-05,
      "loss": 2.7024,
      "num_input_tokens_seen": 19384812,
      "step": 32500
    },
    {
      "epoch": 1.0452849607616108,
      "eval_loss": 2.6005640029907227,
      "eval_runtime": 2.5084,
      "eval_samples_per_second": 996.636,
      "eval_steps_per_second": 124.779,
      "num_input_tokens_seen": 19384812,
      "step": 32500
    },
    {
      "epoch": 1.061366267850251,
      "grad_norm": 7.068648815155029,
      "learning_rate": 3.231056220249582e-05,
      "loss": 2.6939,
      "num_input_tokens_seen": 19683060,
      "step": 33000
    },
    {
      "epoch": 1.077447574938891,
      "grad_norm": 5.753154754638672,
      "learning_rate": 3.204254041768515e-05,
      "loss": 2.6977,
      "num_input_tokens_seen": 19979196,
      "step": 33500
    },
    {
      "epoch": 1.0935288820275313,
      "grad_norm": 8.155505180358887,
      "learning_rate": 3.1774518632874485e-05,
      "loss": 2.7048,
      "num_input_tokens_seen": 20278524,
      "step": 34000
    },
    {
      "epoch": 1.1096101891161714,
      "grad_norm": 7.031659126281738,
      "learning_rate": 3.150649684806381e-05,
      "loss": 2.7237,
      "num_input_tokens_seen": 20577572,
      "step": 34500
    },
    {
      "epoch": 1.1256914962048115,
      "grad_norm": 7.90298318862915,
      "learning_rate": 3.123847506325314e-05,
      "loss": 2.7248,
      "num_input_tokens_seen": 20876844,
      "step": 35000
    },
    {
      "epoch": 1.1256914962048115,
      "eval_loss": 2.6041972637176514,
      "eval_runtime": 2.7564,
      "eval_samples_per_second": 906.969,
      "eval_steps_per_second": 113.552,
      "num_input_tokens_seen": 20876844,
      "step": 35000
    },
    {
      "epoch": 1.1417728032934518,
      "grad_norm": 6.368433475494385,
      "learning_rate": 3.0970453278442473e-05,
      "loss": 2.7246,
      "num_input_tokens_seen": 21179820,
      "step": 35500
    },
    {
      "epoch": 1.1578541103820919,
      "grad_norm": 7.143220901489258,
      "learning_rate": 3.0702431493631805e-05,
      "loss": 2.7211,
      "num_input_tokens_seen": 21476172,
      "step": 36000
    },
    {
      "epoch": 1.173935417470732,
      "grad_norm": 7.216341972351074,
      "learning_rate": 3.0434409708821134e-05,
      "loss": 2.7088,
      "num_input_tokens_seen": 21774292,
      "step": 36500
    },
    {
      "epoch": 1.1900167245593722,
      "grad_norm": 6.958596706390381,
      "learning_rate": 3.0166387924010465e-05,
      "loss": 2.7166,
      "num_input_tokens_seen": 22070908,
      "step": 37000
    },
    {
      "epoch": 1.2060980316480123,
      "grad_norm": 7.161530494689941,
      "learning_rate": 2.9898366139199797e-05,
      "loss": 2.6764,
      "num_input_tokens_seen": 22372340,
      "step": 37500
    },
    {
      "epoch": 1.2060980316480123,
      "eval_loss": 2.5923423767089844,
      "eval_runtime": 2.6849,
      "eval_samples_per_second": 931.145,
      "eval_steps_per_second": 116.579,
      "num_input_tokens_seen": 22372340,
      "step": 37500
    },
    {
      "epoch": 1.2221793387366526,
      "grad_norm": 7.448612213134766,
      "learning_rate": 2.963034435438913e-05,
      "loss": 2.7098,
      "num_input_tokens_seen": 22672932,
      "step": 38000
    },
    {
      "epoch": 1.2382606458252927,
      "grad_norm": 8.339189529418945,
      "learning_rate": 2.9362322569578454e-05,
      "loss": 2.702,
      "num_input_tokens_seen": 22971844,
      "step": 38500
    },
    {
      "epoch": 1.2543419529139328,
      "grad_norm": 6.795124053955078,
      "learning_rate": 2.9094300784767786e-05,
      "loss": 2.7007,
      "num_input_tokens_seen": 23266964,
      "step": 39000
    },
    {
      "epoch": 1.2704232600025729,
      "grad_norm": 6.3036298751831055,
      "learning_rate": 2.8826278999957118e-05,
      "loss": 2.71,
      "num_input_tokens_seen": 23564068,
      "step": 39500
    },
    {
      "epoch": 1.2865045670912132,
      "grad_norm": 8.75069808959961,
      "learning_rate": 2.855825721514645e-05,
      "loss": 2.6854,
      "num_input_tokens_seen": 23866100,
      "step": 40000
    },
    {
      "epoch": 1.2865045670912132,
      "eval_loss": 2.5792863368988037,
      "eval_runtime": 2.5776,
      "eval_samples_per_second": 969.878,
      "eval_steps_per_second": 121.429,
      "num_input_tokens_seen": 23866100,
      "step": 40000
    },
    {
      "epoch": 1.3025858741798533,
      "grad_norm": 6.966170310974121,
      "learning_rate": 2.8290235430335778e-05,
      "loss": 2.697,
      "num_input_tokens_seen": 24162356,
      "step": 40500
    },
    {
      "epoch": 1.3186671812684936,
      "grad_norm": 7.854964733123779,
      "learning_rate": 2.802221364552511e-05,
      "loss": 2.6954,
      "num_input_tokens_seen": 24458980,
      "step": 41000
    },
    {
      "epoch": 1.3347484883571337,
      "grad_norm": 7.1461944580078125,
      "learning_rate": 2.775419186071444e-05,
      "loss": 2.6839,
      "num_input_tokens_seen": 24757828,
      "step": 41500
    },
    {
      "epoch": 1.3508297954457738,
      "grad_norm": 8.25295639038086,
      "learning_rate": 2.7486170075903773e-05,
      "loss": 2.7035,
      "num_input_tokens_seen": 25052236,
      "step": 42000
    },
    {
      "epoch": 1.366911102534414,
      "grad_norm": 6.336223602294922,
      "learning_rate": 2.7218148291093105e-05,
      "loss": 2.683,
      "num_input_tokens_seen": 25348084,
      "step": 42500
    },
    {
      "epoch": 1.366911102534414,
      "eval_loss": 2.5722219944000244,
      "eval_runtime": 2.7384,
      "eval_samples_per_second": 912.958,
      "eval_steps_per_second": 114.302,
      "num_input_tokens_seen": 25348084,
      "step": 42500
    },
    {
      "epoch": 1.3829924096230541,
      "grad_norm": 9.477555274963379,
      "learning_rate": 2.695012650628243e-05,
      "loss": 2.6877,
      "num_input_tokens_seen": 25642372,
      "step": 43000
    },
    {
      "epoch": 1.3990737167116944,
      "grad_norm": 8.233431816101074,
      "learning_rate": 2.6682104721471762e-05,
      "loss": 2.6927,
      "num_input_tokens_seen": 25939652,
      "step": 43500
    },
    {
      "epoch": 1.4151550238003345,
      "grad_norm": 5.860446929931641,
      "learning_rate": 2.6414082936661094e-05,
      "loss": 2.6819,
      "num_input_tokens_seen": 26248940,
      "step": 44000
    },
    {
      "epoch": 1.4312363308889746,
      "grad_norm": 6.748124599456787,
      "learning_rate": 2.6146061151850426e-05,
      "loss": 2.6893,
      "num_input_tokens_seen": 26552860,
      "step": 44500
    },
    {
      "epoch": 1.4473176379776147,
      "grad_norm": 6.038182258605957,
      "learning_rate": 2.587803936703975e-05,
      "loss": 2.6871,
      "num_input_tokens_seen": 26854100,
      "step": 45000
    },
    {
      "epoch": 1.4473176379776147,
      "eval_loss": 2.5538456439971924,
      "eval_runtime": 2.6078,
      "eval_samples_per_second": 958.67,
      "eval_steps_per_second": 120.026,
      "num_input_tokens_seen": 26854100,
      "step": 45000
    },
    {
      "epoch": 1.463398945066255,
      "grad_norm": 7.815784454345703,
      "learning_rate": 2.5610017582229086e-05,
      "loss": 2.6709,
      "num_input_tokens_seen": 27148148,
      "step": 45500
    },
    {
      "epoch": 1.479480252154895,
      "grad_norm": 7.8851094245910645,
      "learning_rate": 2.5341995797418418e-05,
      "loss": 2.6698,
      "num_input_tokens_seen": 27445020,
      "step": 46000
    },
    {
      "epoch": 1.4955615592435354,
      "grad_norm": 7.389246940612793,
      "learning_rate": 2.507397401260775e-05,
      "loss": 2.6787,
      "num_input_tokens_seen": 27742908,
      "step": 46500
    },
    {
      "epoch": 1.5116428663321755,
      "grad_norm": 7.621913909912109,
      "learning_rate": 2.4805952227797078e-05,
      "loss": 2.6713,
      "num_input_tokens_seen": 28037284,
      "step": 47000
    },
    {
      "epoch": 1.5277241734208156,
      "grad_norm": 7.889066219329834,
      "learning_rate": 2.4537930442986407e-05,
      "loss": 2.6551,
      "num_input_tokens_seen": 28332612,
      "step": 47500
    },
    {
      "epoch": 1.5277241734208156,
      "eval_loss": 2.5442593097686768,
      "eval_runtime": 2.6341,
      "eval_samples_per_second": 949.086,
      "eval_steps_per_second": 118.826,
      "num_input_tokens_seen": 28332612,
      "step": 47500
    },
    {
      "epoch": 1.5438054805094557,
      "grad_norm": 7.912906646728516,
      "learning_rate": 2.426990865817574e-05,
      "loss": 2.6881,
      "num_input_tokens_seen": 28630948,
      "step": 48000
    },
    {
      "epoch": 1.559886787598096,
      "grad_norm": 6.370878219604492,
      "learning_rate": 2.4001886873365067e-05,
      "loss": 2.6424,
      "num_input_tokens_seen": 28928732,
      "step": 48500
    },
    {
      "epoch": 1.5759680946867363,
      "grad_norm": 7.0892653465271,
      "learning_rate": 2.37338650885544e-05,
      "loss": 2.6626,
      "num_input_tokens_seen": 29224436,
      "step": 49000
    },
    {
      "epoch": 1.5920494017753763,
      "grad_norm": 6.357864856719971,
      "learning_rate": 2.3465843303743727e-05,
      "loss": 2.6546,
      "num_input_tokens_seen": 29520148,
      "step": 49500
    },
    {
      "epoch": 1.6081307088640164,
      "grad_norm": 8.4866943359375,
      "learning_rate": 2.319782151893306e-05,
      "loss": 2.661,
      "num_input_tokens_seen": 29822156,
      "step": 50000
    },
    {
      "epoch": 1.6081307088640164,
      "eval_loss": 2.527804374694824,
      "eval_runtime": 2.6771,
      "eval_samples_per_second": 933.855,
      "eval_steps_per_second": 116.919,
      "num_input_tokens_seen": 29822156,
      "step": 50000
    },
    {
      "epoch": 1.6242120159526565,
      "grad_norm": 6.843733787536621,
      "learning_rate": 2.292979973412239e-05,
      "loss": 2.6521,
      "num_input_tokens_seen": 30122052,
      "step": 50500
    },
    {
      "epoch": 1.6402933230412968,
      "grad_norm": 6.88835334777832,
      "learning_rate": 2.2661777949311722e-05,
      "loss": 2.6614,
      "num_input_tokens_seen": 30422196,
      "step": 51000
    },
    {
      "epoch": 1.656374630129937,
      "grad_norm": 5.855214595794678,
      "learning_rate": 2.239375616450105e-05,
      "loss": 2.6454,
      "num_input_tokens_seen": 30722660,
      "step": 51500
    },
    {
      "epoch": 1.6724559372185772,
      "grad_norm": 6.58035135269165,
      "learning_rate": 2.2125734379690383e-05,
      "loss": 2.6524,
      "num_input_tokens_seen": 31018220,
      "step": 52000
    },
    {
      "epoch": 1.6885372443072173,
      "grad_norm": 6.767495155334473,
      "learning_rate": 2.185771259487971e-05,
      "loss": 2.6497,
      "num_input_tokens_seen": 31319476,
      "step": 52500
    },
    {
      "epoch": 1.6885372443072173,
      "eval_loss": 2.526638984680176,
      "eval_runtime": 2.5213,
      "eval_samples_per_second": 991.545,
      "eval_steps_per_second": 124.141,
      "num_input_tokens_seen": 31319476,
      "step": 52500
    },
    {
      "epoch": 1.7046185513958574,
      "grad_norm": 7.022729873657227,
      "learning_rate": 2.1589690810069043e-05,
      "loss": 2.6437,
      "num_input_tokens_seen": 31621308,
      "step": 53000
    },
    {
      "epoch": 1.7206998584844975,
      "grad_norm": 6.241069793701172,
      "learning_rate": 2.132166902525837e-05,
      "loss": 2.6447,
      "num_input_tokens_seen": 31917460,
      "step": 53500
    },
    {
      "epoch": 1.7367811655731378,
      "grad_norm": 7.7204084396362305,
      "learning_rate": 2.1053647240447703e-05,
      "loss": 2.6448,
      "num_input_tokens_seen": 32217596,
      "step": 54000
    },
    {
      "epoch": 1.752862472661778,
      "grad_norm": 6.703210830688477,
      "learning_rate": 2.0785625455637035e-05,
      "loss": 2.6366,
      "num_input_tokens_seen": 32513884,
      "step": 54500
    },
    {
      "epoch": 1.7689437797504182,
      "grad_norm": 6.371466159820557,
      "learning_rate": 2.0517603670826367e-05,
      "loss": 2.6281,
      "num_input_tokens_seen": 32813220,
      "step": 55000
    },
    {
      "epoch": 1.7689437797504182,
      "eval_loss": 2.5115973949432373,
      "eval_runtime": 2.5216,
      "eval_samples_per_second": 991.443,
      "eval_steps_per_second": 124.129,
      "num_input_tokens_seen": 32813220,
      "step": 55000
    },
    {
      "epoch": 1.7850250868390583,
      "grad_norm": 7.277946949005127,
      "learning_rate": 2.0249581886015695e-05,
      "loss": 2.6536,
      "num_input_tokens_seen": 33110188,
      "step": 55500
    },
    {
      "epoch": 1.8011063939276983,
      "grad_norm": 7.93104887008667,
      "learning_rate": 1.9981560101205027e-05,
      "loss": 2.5981,
      "num_input_tokens_seen": 33405980,
      "step": 56000
    },
    {
      "epoch": 1.8171877010163386,
      "grad_norm": 7.782486438751221,
      "learning_rate": 1.971353831639436e-05,
      "loss": 2.635,
      "num_input_tokens_seen": 33700596,
      "step": 56500
    },
    {
      "epoch": 1.8332690081049787,
      "grad_norm": 8.59358024597168,
      "learning_rate": 1.9445516531583687e-05,
      "loss": 2.6269,
      "num_input_tokens_seen": 33997724,
      "step": 57000
    },
    {
      "epoch": 1.849350315193619,
      "grad_norm": 6.669950485229492,
      "learning_rate": 1.917749474677302e-05,
      "loss": 2.6067,
      "num_input_tokens_seen": 34298052,
      "step": 57500
    },
    {
      "epoch": 1.849350315193619,
      "eval_loss": 2.5047078132629395,
      "eval_runtime": 2.5169,
      "eval_samples_per_second": 993.288,
      "eval_steps_per_second": 124.36,
      "num_input_tokens_seen": 34298052,
      "step": 57500
    },
    {
      "epoch": 1.8654316222822591,
      "grad_norm": 6.265903949737549,
      "learning_rate": 1.8909472961962348e-05,
      "loss": 2.5966,
      "num_input_tokens_seen": 34593980,
      "step": 58000
    },
    {
      "epoch": 1.8815129293708992,
      "grad_norm": 7.943974018096924,
      "learning_rate": 1.864145117715168e-05,
      "loss": 2.6303,
      "num_input_tokens_seen": 34894428,
      "step": 58500
    },
    {
      "epoch": 1.8975942364595393,
      "grad_norm": 8.290629386901855,
      "learning_rate": 1.837342939234101e-05,
      "loss": 2.6303,
      "num_input_tokens_seen": 35193236,
      "step": 59000
    },
    {
      "epoch": 1.9136755435481796,
      "grad_norm": 7.974947929382324,
      "learning_rate": 1.8105407607530343e-05,
      "loss": 2.6272,
      "num_input_tokens_seen": 35486796,
      "step": 59500
    },
    {
      "epoch": 1.92975685063682,
      "grad_norm": 5.827637195587158,
      "learning_rate": 1.783738582271967e-05,
      "loss": 2.6112,
      "num_input_tokens_seen": 35783604,
      "step": 60000
    },
    {
      "epoch": 1.92975685063682,
      "eval_loss": 2.4935405254364014,
      "eval_runtime": 2.5107,
      "eval_samples_per_second": 995.741,
      "eval_steps_per_second": 124.667,
      "num_input_tokens_seen": 35783604,
      "step": 60000
    },
    {
      "epoch": 1.94583815772546,
      "grad_norm": 6.535378456115723,
      "learning_rate": 1.7569364037909003e-05,
      "loss": 2.6135,
      "num_input_tokens_seen": 36086620,
      "step": 60500
    },
    {
      "epoch": 1.9619194648141,
      "grad_norm": 6.398725986480713,
      "learning_rate": 1.7301342253098332e-05,
      "loss": 2.602,
      "num_input_tokens_seen": 36386140,
      "step": 61000
    },
    {
      "epoch": 1.9780007719027402,
      "grad_norm": 6.332113265991211,
      "learning_rate": 1.7033320468287664e-05,
      "loss": 2.6258,
      "num_input_tokens_seen": 36684308,
      "step": 61500
    },
    {
      "epoch": 1.9940820789913805,
      "grad_norm": 7.8002753257751465,
      "learning_rate": 1.6765298683476992e-05,
      "loss": 2.6226,
      "num_input_tokens_seen": 36984724,
      "step": 62000
    },
    {
      "epoch": 2.0101633860800208,
      "grad_norm": 6.9457011222839355,
      "learning_rate": 1.6497276898666324e-05,
      "loss": 2.5207,
      "num_input_tokens_seen": 37281092,
      "step": 62500
    },
    {
      "epoch": 2.0101633860800208,
      "eval_loss": 2.4945950508117676,
      "eval_runtime": 2.5094,
      "eval_samples_per_second": 996.26,
      "eval_steps_per_second": 124.732,
      "num_input_tokens_seen": 37281092,
      "step": 62500
    },
    {
      "epoch": 2.026244693168661,
      "grad_norm": 7.541498184204102,
      "learning_rate": 1.6229255113855656e-05,
      "loss": 2.4728,
      "num_input_tokens_seen": 37582300,
      "step": 63000
    },
    {
      "epoch": 2.042326000257301,
      "grad_norm": 6.7798027992248535,
      "learning_rate": 1.5961233329044987e-05,
      "loss": 2.4539,
      "num_input_tokens_seen": 37880828,
      "step": 63500
    },
    {
      "epoch": 2.058407307345941,
      "grad_norm": 7.033351898193359,
      "learning_rate": 1.5693211544234316e-05,
      "loss": 2.4467,
      "num_input_tokens_seen": 38181276,
      "step": 64000
    },
    {
      "epoch": 2.074488614434581,
      "grad_norm": 6.487890720367432,
      "learning_rate": 1.5425189759423648e-05,
      "loss": 2.4764,
      "num_input_tokens_seen": 38473348,
      "step": 64500
    },
    {
      "epoch": 2.0905699215232216,
      "grad_norm": 6.955127716064453,
      "learning_rate": 1.5157167974612976e-05,
      "loss": 2.4799,
      "num_input_tokens_seen": 38768588,
      "step": 65000
    },
    {
      "epoch": 2.0905699215232216,
      "eval_loss": 2.491555690765381,
      "eval_runtime": 2.5076,
      "eval_samples_per_second": 996.967,
      "eval_steps_per_second": 124.82,
      "num_input_tokens_seen": 38768588,
      "step": 65000
    },
    {
      "epoch": 2.1066512286118617,
      "grad_norm": 6.78762674331665,
      "learning_rate": 1.4889146189802308e-05,
      "loss": 2.4726,
      "num_input_tokens_seen": 39067460,
      "step": 65500
    },
    {
      "epoch": 2.122732535700502,
      "grad_norm": 7.199331283569336,
      "learning_rate": 1.4621124404991638e-05,
      "loss": 2.4562,
      "num_input_tokens_seen": 39360244,
      "step": 66000
    },
    {
      "epoch": 2.138813842789142,
      "grad_norm": 7.353775501251221,
      "learning_rate": 1.435310262018097e-05,
      "loss": 2.4629,
      "num_input_tokens_seen": 39660020,
      "step": 66500
    },
    {
      "epoch": 2.154895149877782,
      "grad_norm": 6.827337265014648,
      "learning_rate": 1.4085080835370298e-05,
      "loss": 2.4817,
      "num_input_tokens_seen": 39960476,
      "step": 67000
    },
    {
      "epoch": 2.170976456966422,
      "grad_norm": 6.532020092010498,
      "learning_rate": 1.381705905055963e-05,
      "loss": 2.4727,
      "num_input_tokens_seen": 40252972,
      "step": 67500
    },
    {
      "epoch": 2.170976456966422,
      "eval_loss": 2.4865615367889404,
      "eval_runtime": 2.5372,
      "eval_samples_per_second": 985.335,
      "eval_steps_per_second": 123.364,
      "num_input_tokens_seen": 40252972,
      "step": 67500
    },
    {
      "epoch": 2.1870577640550626,
      "grad_norm": 6.601158142089844,
      "learning_rate": 1.354903726574896e-05,
      "loss": 2.4666,
      "num_input_tokens_seen": 40553732,
      "step": 68000
    },
    {
      "epoch": 2.2031390711437027,
      "grad_norm": 7.200645446777344,
      "learning_rate": 1.3281015480938292e-05,
      "loss": 2.4657,
      "num_input_tokens_seen": 40851196,
      "step": 68500
    },
    {
      "epoch": 2.2192203782323427,
      "grad_norm": 8.067240715026855,
      "learning_rate": 1.301299369612762e-05,
      "loss": 2.4801,
      "num_input_tokens_seen": 41149276,
      "step": 69000
    },
    {
      "epoch": 2.235301685320983,
      "grad_norm": 7.724194526672363,
      "learning_rate": 1.2744971911316952e-05,
      "loss": 2.4766,
      "num_input_tokens_seen": 41448540,
      "step": 69500
    },
    {
      "epoch": 2.251382992409623,
      "grad_norm": 6.999200344085693,
      "learning_rate": 1.2476950126506282e-05,
      "loss": 2.4719,
      "num_input_tokens_seen": 41746300,
      "step": 70000
    },
    {
      "epoch": 2.251382992409623,
      "eval_loss": 2.476020097732544,
      "eval_runtime": 2.5166,
      "eval_samples_per_second": 993.407,
      "eval_steps_per_second": 124.375,
      "num_input_tokens_seen": 41746300,
      "step": 70000
    },
    {
      "epoch": 2.267464299498263,
      "grad_norm": 6.666884899139404,
      "learning_rate": 1.2208928341695614e-05,
      "loss": 2.4771,
      "num_input_tokens_seen": 42042532,
      "step": 70500
    },
    {
      "epoch": 2.2835456065869035,
      "grad_norm": 8.354509353637695,
      "learning_rate": 1.1940906556884944e-05,
      "loss": 2.4679,
      "num_input_tokens_seen": 42341628,
      "step": 71000
    },
    {
      "epoch": 2.2996269136755436,
      "grad_norm": 8.39284610748291,
      "learning_rate": 1.1672884772074275e-05,
      "loss": 2.4597,
      "num_input_tokens_seen": 42642948,
      "step": 71500
    },
    {
      "epoch": 2.3157082207641837,
      "grad_norm": 7.233700275421143,
      "learning_rate": 1.1404862987263605e-05,
      "loss": 2.4592,
      "num_input_tokens_seen": 42941820,
      "step": 72000
    },
    {
      "epoch": 2.331789527852824,
      "grad_norm": 7.843503475189209,
      "learning_rate": 1.1136841202452935e-05,
      "loss": 2.4738,
      "num_input_tokens_seen": 43241188,
      "step": 72500
    },
    {
      "epoch": 2.331789527852824,
      "eval_loss": 2.47127103805542,
      "eval_runtime": 2.5277,
      "eval_samples_per_second": 989.05,
      "eval_steps_per_second": 123.829,
      "num_input_tokens_seen": 43241188,
      "step": 72500
    },
    {
      "epoch": 2.347870834941464,
      "grad_norm": 6.37482213973999,
      "learning_rate": 1.0868819417642267e-05,
      "loss": 2.4576,
      "num_input_tokens_seen": 43535900,
      "step": 73000
    },
    {
      "epoch": 2.3639521420301044,
      "grad_norm": 6.642532825469971,
      "learning_rate": 1.0600797632831597e-05,
      "loss": 2.467,
      "num_input_tokens_seen": 43833516,
      "step": 73500
    },
    {
      "epoch": 2.3800334491187445,
      "grad_norm": 6.606197357177734,
      "learning_rate": 1.0332775848020927e-05,
      "loss": 2.4752,
      "num_input_tokens_seen": 44134084,
      "step": 74000
    },
    {
      "epoch": 2.3961147562073846,
      "grad_norm": 6.338978290557861,
      "learning_rate": 1.0064754063210257e-05,
      "loss": 2.4473,
      "num_input_tokens_seen": 44432540,
      "step": 74500
    },
    {
      "epoch": 2.4121960632960247,
      "grad_norm": 7.172792434692383,
      "learning_rate": 9.796732278399589e-06,
      "loss": 2.4629,
      "num_input_tokens_seen": 44730244,
      "step": 75000
    },
    {
      "epoch": 2.4121960632960247,
      "eval_loss": 2.4629955291748047,
      "eval_runtime": 2.521,
      "eval_samples_per_second": 991.68,
      "eval_steps_per_second": 124.158,
      "num_input_tokens_seen": 44730244,
      "step": 75000
    },
    {
      "epoch": 2.4282773703846647,
      "grad_norm": 6.65930700302124,
      "learning_rate": 9.528710493588919e-06,
      "loss": 2.4512,
      "num_input_tokens_seen": 45031884,
      "step": 75500
    },
    {
      "epoch": 2.4443586774733053,
      "grad_norm": 7.209745407104492,
      "learning_rate": 9.260688708778249e-06,
      "loss": 2.4557,
      "num_input_tokens_seen": 45334924,
      "step": 76000
    },
    {
      "epoch": 2.4604399845619453,
      "grad_norm": 6.847073078155518,
      "learning_rate": 8.992666923967581e-06,
      "loss": 2.4512,
      "num_input_tokens_seen": 45633532,
      "step": 76500
    },
    {
      "epoch": 2.4765212916505854,
      "grad_norm": 7.705162525177002,
      "learning_rate": 8.724645139156911e-06,
      "loss": 2.4568,
      "num_input_tokens_seen": 45933804,
      "step": 77000
    },
    {
      "epoch": 2.4926025987392255,
      "grad_norm": 7.5681962966918945,
      "learning_rate": 8.456623354346243e-06,
      "loss": 2.4524,
      "num_input_tokens_seen": 46231060,
      "step": 77500
    },
    {
      "epoch": 2.4926025987392255,
      "eval_loss": 2.457481861114502,
      "eval_runtime": 2.5921,
      "eval_samples_per_second": 964.467,
      "eval_steps_per_second": 120.751,
      "num_input_tokens_seen": 46231060,
      "step": 77500
    },
    {
      "epoch": 2.5086839058278656,
      "grad_norm": 6.8857269287109375,
      "learning_rate": 8.188601569535573e-06,
      "loss": 2.4622,
      "num_input_tokens_seen": 46525772,
      "step": 78000
    },
    {
      "epoch": 2.524765212916506,
      "grad_norm": 6.347681522369385,
      "learning_rate": 7.920579784724903e-06,
      "loss": 2.4528,
      "num_input_tokens_seen": 46822532,
      "step": 78500
    },
    {
      "epoch": 2.5408465200051458,
      "grad_norm": 6.935575008392334,
      "learning_rate": 7.652557999914233e-06,
      "loss": 2.4414,
      "num_input_tokens_seen": 47122964,
      "step": 79000
    },
    {
      "epoch": 2.5569278270937863,
      "grad_norm": 6.603360652923584,
      "learning_rate": 7.384536215103564e-06,
      "loss": 2.4655,
      "num_input_tokens_seen": 47423300,
      "step": 79500
    },
    {
      "epoch": 2.5730091341824264,
      "grad_norm": 7.182071208953857,
      "learning_rate": 7.116514430292895e-06,
      "loss": 2.435,
      "num_input_tokens_seen": 47718964,
      "step": 80000
    },
    {
      "epoch": 2.5730091341824264,
      "eval_loss": 2.455320358276367,
      "eval_runtime": 2.5065,
      "eval_samples_per_second": 997.4,
      "eval_steps_per_second": 124.874,
      "num_input_tokens_seen": 47718964,
      "step": 80000
    },
    {
      "epoch": 2.5890904412710665,
      "grad_norm": 7.3647260665893555,
      "learning_rate": 6.848492645482225e-06,
      "loss": 2.4356,
      "num_input_tokens_seen": 48015996,
      "step": 80500
    },
    {
      "epoch": 2.6051717483597066,
      "grad_norm": 7.950341701507568,
      "learning_rate": 6.580470860671556e-06,
      "loss": 2.4453,
      "num_input_tokens_seen": 48316420,
      "step": 81000
    },
    {
      "epoch": 2.6212530554483466,
      "grad_norm": 6.016787052154541,
      "learning_rate": 6.312449075860886e-06,
      "loss": 2.45,
      "num_input_tokens_seen": 48611452,
      "step": 81500
    },
    {
      "epoch": 2.637334362536987,
      "grad_norm": 7.281980514526367,
      "learning_rate": 6.044427291050217e-06,
      "loss": 2.4687,
      "num_input_tokens_seen": 48913668,
      "step": 82000
    },
    {
      "epoch": 2.6534156696256272,
      "grad_norm": 6.644787311553955,
      "learning_rate": 5.776405506239547e-06,
      "loss": 2.4621,
      "num_input_tokens_seen": 49209724,
      "step": 82500
    },
    {
      "epoch": 2.6534156696256272,
      "eval_loss": 2.4475488662719727,
      "eval_runtime": 2.517,
      "eval_samples_per_second": 993.239,
      "eval_steps_per_second": 124.354,
      "num_input_tokens_seen": 49209724,
      "step": 82500
    },
    {
      "epoch": 2.6694969767142673,
      "grad_norm": 6.181220054626465,
      "learning_rate": 5.508383721428878e-06,
      "loss": 2.4343,
      "num_input_tokens_seen": 49505772,
      "step": 83000
    },
    {
      "epoch": 2.6855782838029074,
      "grad_norm": 6.418393135070801,
      "learning_rate": 5.2403619366182085e-06,
      "loss": 2.4329,
      "num_input_tokens_seen": 49809956,
      "step": 83500
    },
    {
      "epoch": 2.7016595908915475,
      "grad_norm": 6.279716491699219,
      "learning_rate": 4.9723401518075395e-06,
      "loss": 2.4481,
      "num_input_tokens_seen": 50112060,
      "step": 84000
    },
    {
      "epoch": 2.717740897980188,
      "grad_norm": 6.502873420715332,
      "learning_rate": 4.70431836699687e-06,
      "loss": 2.4464,
      "num_input_tokens_seen": 50414356,
      "step": 84500
    },
    {
      "epoch": 2.733822205068828,
      "grad_norm": 6.15990686416626,
      "learning_rate": 4.4362965821862e-06,
      "loss": 2.4492,
      "num_input_tokens_seen": 50712980,
      "step": 85000
    },
    {
      "epoch": 2.733822205068828,
      "eval_loss": 2.4440150260925293,
      "eval_runtime": 2.529,
      "eval_samples_per_second": 988.544,
      "eval_steps_per_second": 123.766,
      "num_input_tokens_seen": 50712980,
      "step": 85000
    },
    {
      "epoch": 2.749903512157468,
      "grad_norm": 6.876352310180664,
      "learning_rate": 4.168274797375531e-06,
      "loss": 2.4514,
      "num_input_tokens_seen": 51012460,
      "step": 85500
    },
    {
      "epoch": 2.7659848192461083,
      "grad_norm": 7.305426597595215,
      "learning_rate": 3.900253012564861e-06,
      "loss": 2.4317,
      "num_input_tokens_seen": 51308524,
      "step": 86000
    },
    {
      "epoch": 2.7820661263347484,
      "grad_norm": 6.460892677307129,
      "learning_rate": 3.632231227754192e-06,
      "loss": 2.4559,
      "num_input_tokens_seen": 51610700,
      "step": 86500
    },
    {
      "epoch": 2.798147433423389,
      "grad_norm": 8.062651634216309,
      "learning_rate": 3.3642094429435228e-06,
      "loss": 2.4535,
      "num_input_tokens_seen": 51910236,
      "step": 87000
    },
    {
      "epoch": 2.814228740512029,
      "grad_norm": 7.140311241149902,
      "learning_rate": 3.0961876581328533e-06,
      "loss": 2.4536,
      "num_input_tokens_seen": 52204380,
      "step": 87500
    },
    {
      "epoch": 2.814228740512029,
      "eval_loss": 2.4393906593322754,
      "eval_runtime": 2.5312,
      "eval_samples_per_second": 987.685,
      "eval_steps_per_second": 123.658,
      "num_input_tokens_seen": 52204380,
      "step": 87500
    },
    {
      "epoch": 2.830310047600669,
      "grad_norm": 6.569787502288818,
      "learning_rate": 2.8281658733221834e-06,
      "loss": 2.4379,
      "num_input_tokens_seen": 52504668,
      "step": 88000
    },
    {
      "epoch": 2.846391354689309,
      "grad_norm": 7.735711097717285,
      "learning_rate": 2.560144088511514e-06,
      "loss": 2.4239,
      "num_input_tokens_seen": 52798740,
      "step": 88500
    },
    {
      "epoch": 2.8624726617779492,
      "grad_norm": 7.504124641418457,
      "learning_rate": 2.292122303700845e-06,
      "loss": 2.4427,
      "num_input_tokens_seen": 53097716,
      "step": 89000
    },
    {
      "epoch": 2.8785539688665893,
      "grad_norm": 6.647756099700928,
      "learning_rate": 2.0241005188901755e-06,
      "loss": 2.4682,
      "num_input_tokens_seen": 53397564,
      "step": 89500
    },
    {
      "epoch": 2.8946352759552294,
      "grad_norm": 6.640815734863281,
      "learning_rate": 1.756078734079506e-06,
      "loss": 2.4148,
      "num_input_tokens_seen": 53695620,
      "step": 90000
    },
    {
      "epoch": 2.8946352759552294,
      "eval_loss": 2.43597674369812,
      "eval_runtime": 2.5128,
      "eval_samples_per_second": 994.891,
      "eval_steps_per_second": 124.56,
      "num_input_tokens_seen": 53695620,
      "step": 90000
    },
    {
      "epoch": 2.91071658304387,
      "grad_norm": 7.346447467803955,
      "learning_rate": 1.4880569492688366e-06,
      "loss": 2.4352,
      "num_input_tokens_seen": 53991180,
      "step": 90500
    },
    {
      "epoch": 2.92679789013251,
      "grad_norm": 6.777767658233643,
      "learning_rate": 1.2200351644581672e-06,
      "loss": 2.4664,
      "num_input_tokens_seen": 54288348,
      "step": 91000
    },
    {
      "epoch": 2.94287919722115,
      "grad_norm": 6.908254623413086,
      "learning_rate": 9.520133796474978e-07,
      "loss": 2.4474,
      "num_input_tokens_seen": 54590740,
      "step": 91500
    },
    {
      "epoch": 2.95896050430979,
      "grad_norm": 7.04544734954834,
      "learning_rate": 6.839915948368284e-07,
      "loss": 2.4554,
      "num_input_tokens_seen": 54889220,
      "step": 92000
    },
    {
      "epoch": 2.9750418113984303,
      "grad_norm": 9.98161792755127,
      "learning_rate": 4.159698100261589e-07,
      "loss": 2.4243,
      "num_input_tokens_seen": 55190020,
      "step": 92500
    },
    {
      "epoch": 2.9750418113984303,
      "eval_loss": 2.435030460357666,
      "eval_runtime": 2.5128,
      "eval_samples_per_second": 994.908,
      "eval_steps_per_second": 124.563,
      "num_input_tokens_seen": 55190020,
      "step": 92500
    },
    {
      "epoch": 2.991123118487071,
      "grad_norm": 6.586206912994385,
      "learning_rate": 1.479480252154895e-07,
      "loss": 2.44,
      "num_input_tokens_seen": 55490868,
      "step": 93000
    },
    {
      "epoch": 3.0,
      "num_input_tokens_seen": 55653732,
      "step": 93276,
      "total_flos": 1.4738832163602432e+16,
      "train_loss": 1.7073542784139526,
      "train_runtime": 2504.3889,
      "train_samples_per_second": 297.955,
      "train_steps_per_second": 37.245,
      "train_tokens_per_second": 22216.164
    }
  ],
  "logging_steps": 500,
  "max_steps": 93276,
  "num_input_tokens_seen": 55653732,
  "num_train_epochs": 3,
  "save_steps": 2500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4738832163602432e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}