diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6415 @@ +{ + "best_global_step": 2800, + "best_metric": 1.25887144, + "best_model_checkpoint": "/output/v0-20250330-123600/checkpoint-2800", + "epoch": 0.9999357917941913, + "eval_steps": 50, + "global_step": 2920, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003424437643130792, + "grad_norm": 10.562653541564941, + "learning_rate": 6.849315068493151e-07, + "loss": 3.7932324409484863, + "memory(GiB)": 15.32, + "step": 1, + "token_acc": 0.48090169067000627, + "train_speed(iter/s)": 0.095328 + }, + { + "epoch": 0.001712218821565396, + "grad_norm": 13.306646347045898, + "learning_rate": 3.4246575342465754e-06, + "loss": 4.11192512512207, + "memory(GiB)": 15.32, + "step": 5, + "token_acc": 0.48935819171753836, + "train_speed(iter/s)": 0.136408 + }, + { + "epoch": 0.003424437643130792, + "grad_norm": 13.110633850097656, + "learning_rate": 6.849315068493151e-06, + "loss": 4.3073070526123045, + "memory(GiB)": 15.32, + "step": 10, + "token_acc": 0.47730028290448606, + "train_speed(iter/s)": 0.148193 + }, + { + "epoch": 0.0051366564646961885, + "grad_norm": 15.585564613342285, + "learning_rate": 1.0273972602739726e-05, + "loss": 4.278420257568359, + "memory(GiB)": 15.32, + "step": 15, + "token_acc": 0.487364090508375, + "train_speed(iter/s)": 0.151401 + }, + { + "epoch": 0.006848875286261584, + "grad_norm": 9.919371604919434, + "learning_rate": 1.3698630136986302e-05, + "loss": 3.2780494689941406, + "memory(GiB)": 15.32, + "step": 20, + "token_acc": 0.5149952015355086, + "train_speed(iter/s)": 0.153251 + }, + { + "epoch": 0.00856109410782698, + "grad_norm": 6.564103126525879, + "learning_rate": 1.7123287671232875e-05, + "loss": 2.935646629333496, + "memory(GiB)": 15.32, + "step": 25, + "token_acc": 0.5059960841899168, + "train_speed(iter/s)": 0.154291 + }, + { + "epoch": 0.010273312929392377, + "grad_norm": 3.8877317905426025, + "learning_rate": 2.0547945205479453e-05, + "loss": 2.700718307495117, + "memory(GiB)": 15.32, + "step": 30, + "token_acc": 0.49509137592508684, + "train_speed(iter/s)": 0.152838 + }, + { + "epoch": 0.011985531750957772, + "grad_norm": 3.0155110359191895, + "learning_rate": 2.3972602739726026e-05, + "loss": 1.908003044128418, + "memory(GiB)": 15.32, + "step": 35, + "token_acc": 0.5628140703517588, + "train_speed(iter/s)": 0.155035 + }, + { + "epoch": 0.013697750572523168, + "grad_norm": 1.9901460409164429, + "learning_rate": 2.7397260273972603e-05, + "loss": 1.951570701599121, + "memory(GiB)": 15.32, + "step": 40, + "token_acc": 0.5446540880503145, + "train_speed(iter/s)": 0.156204 + }, + { + "epoch": 0.015409969394088565, + "grad_norm": 1.2631195783615112, + "learning_rate": 3.082191780821918e-05, + "loss": 1.8759283065795898, + "memory(GiB)": 15.32, + "step": 45, + "token_acc": 0.5555123720689208, + "train_speed(iter/s)": 0.156183 + }, + { + "epoch": 0.01712218821565396, + "grad_norm": 1.6672759056091309, + "learning_rate": 3.424657534246575e-05, + "loss": 1.7698347091674804, + "memory(GiB)": 15.32, + "step": 50, + "token_acc": 0.5656245280169159, + "train_speed(iter/s)": 0.156511 + }, + { + "epoch": 0.01712218821565396, + "eval_loss": 1.726967215538025, + "eval_runtime": 44.6015, + "eval_samples_per_second": 10.403, + "eval_steps_per_second": 10.403, + "eval_token_acc": 0.5752919619400844, + "step": 50 + }, + { + "epoch": 0.018834407037219357, + "grad_norm": 1.6087548732757568, + "learning_rate": 3.767123287671233e-05, + "loss": 1.7173377990722656, + "memory(GiB)": 15.32, + "step": 55, + "token_acc": 0.5751355830069298, + "train_speed(iter/s)": 0.13676 + }, + { + "epoch": 0.020546625858784754, + "grad_norm": 1.5402779579162598, + "learning_rate": 4.1095890410958905e-05, + "loss": 1.7572795867919921, + "memory(GiB)": 15.32, + "step": 60, + "token_acc": 0.5718085106382979, + "train_speed(iter/s)": 0.137374 + }, + { + "epoch": 0.022258844680350148, + "grad_norm": 1.476481556892395, + "learning_rate": 4.452054794520548e-05, + "loss": 1.6557682037353516, + "memory(GiB)": 15.32, + "step": 65, + "token_acc": 0.5803955288048152, + "train_speed(iter/s)": 0.138688 + }, + { + "epoch": 0.023971063501915545, + "grad_norm": 1.2691764831542969, + "learning_rate": 4.794520547945205e-05, + "loss": 1.6802217483520507, + "memory(GiB)": 15.32, + "step": 70, + "token_acc": 0.5775040328493914, + "train_speed(iter/s)": 0.139788 + }, + { + "epoch": 0.02568328232348094, + "grad_norm": 1.526488184928894, + "learning_rate": 5.136986301369864e-05, + "loss": 1.6561800003051759, + "memory(GiB)": 15.32, + "step": 75, + "token_acc": 0.5892834252018971, + "train_speed(iter/s)": 0.140383 + }, + { + "epoch": 0.027395501145046335, + "grad_norm": 1.6070963144302368, + "learning_rate": 5.479452054794521e-05, + "loss": 1.6377702713012696, + "memory(GiB)": 15.32, + "step": 80, + "token_acc": 0.5973952434881087, + "train_speed(iter/s)": 0.140891 + }, + { + "epoch": 0.029107719966611732, + "grad_norm": 1.4103753566741943, + "learning_rate": 5.821917808219178e-05, + "loss": 1.7173212051391602, + "memory(GiB)": 15.32, + "step": 85, + "token_acc": 0.5771731917717319, + "train_speed(iter/s)": 0.14159 + }, + { + "epoch": 0.03081993878817713, + "grad_norm": 1.519978404045105, + "learning_rate": 6.164383561643835e-05, + "loss": 1.647627067565918, + "memory(GiB)": 15.32, + "step": 90, + "token_acc": 0.5874305837735337, + "train_speed(iter/s)": 0.141995 + }, + { + "epoch": 0.032532157609742526, + "grad_norm": 1.327723503112793, + "learning_rate": 6.506849315068494e-05, + "loss": 1.6138336181640625, + "memory(GiB)": 15.32, + "step": 95, + "token_acc": 0.5970721028868519, + "train_speed(iter/s)": 0.142301 + }, + { + "epoch": 0.03424437643130792, + "grad_norm": 1.5655895471572876, + "learning_rate": 6.84931506849315e-05, + "loss": 1.6160991668701172, + "memory(GiB)": 15.32, + "step": 100, + "token_acc": 0.5930098578927154, + "train_speed(iter/s)": 0.142956 + }, + { + "epoch": 0.03424437643130792, + "eval_loss": 1.5346044301986694, + "eval_runtime": 45.0308, + "eval_samples_per_second": 10.304, + "eval_steps_per_second": 10.304, + "eval_token_acc": 0.599267059628673, + "step": 100 + }, + { + "epoch": 0.03595659525287332, + "grad_norm": 1.8872798681259155, + "learning_rate": 7.191780821917809e-05, + "loss": 1.6932052612304687, + "memory(GiB)": 15.32, + "step": 105, + "token_acc": 0.5967494406745188, + "train_speed(iter/s)": 0.132951 + }, + { + "epoch": 0.037668814074438714, + "grad_norm": 1.4912633895874023, + "learning_rate": 7.534246575342466e-05, + "loss": 1.4904816627502442, + "memory(GiB)": 15.32, + "step": 110, + "token_acc": 0.6213143428285857, + "train_speed(iter/s)": 0.134126 + }, + { + "epoch": 0.03938103289600411, + "grad_norm": 1.2947896718978882, + "learning_rate": 7.876712328767124e-05, + "loss": 1.5470872879028321, + "memory(GiB)": 15.32, + "step": 115, + "token_acc": 0.5979495705181491, + "train_speed(iter/s)": 0.13544 + }, + { + "epoch": 0.04109325171756951, + "grad_norm": 1.3727917671203613, + "learning_rate": 8.219178082191781e-05, + "loss": 1.5511229515075684, + "memory(GiB)": 15.32, + "step": 120, + "token_acc": 0.602931803696622, + "train_speed(iter/s)": 0.13611 + }, + { + "epoch": 0.0428054705391349, + "grad_norm": 1.6801995038986206, + "learning_rate": 8.561643835616438e-05, + "loss": 1.4787715911865233, + "memory(GiB)": 15.32, + "step": 125, + "token_acc": 0.6094570928196147, + "train_speed(iter/s)": 0.137181 + }, + { + "epoch": 0.044517689360700295, + "grad_norm": 1.708738923072815, + "learning_rate": 8.904109589041096e-05, + "loss": 1.4594125747680664, + "memory(GiB)": 15.32, + "step": 130, + "token_acc": 0.6156867714140902, + "train_speed(iter/s)": 0.138114 + }, + { + "epoch": 0.046229908182265696, + "grad_norm": 1.1958949565887451, + "learning_rate": 9.246575342465755e-05, + "loss": 1.4886094093322755, + "memory(GiB)": 15.32, + "step": 135, + "token_acc": 0.6231549815498155, + "train_speed(iter/s)": 0.138388 + }, + { + "epoch": 0.04794212700383109, + "grad_norm": 1.772101640701294, + "learning_rate": 9.58904109589041e-05, + "loss": 1.5204880714416504, + "memory(GiB)": 15.32, + "step": 140, + "token_acc": 0.6100154083204931, + "train_speed(iter/s)": 0.139232 + }, + { + "epoch": 0.04965434582539648, + "grad_norm": 1.8606083393096924, + "learning_rate": 9.931506849315069e-05, + "loss": 1.5220369338989257, + "memory(GiB)": 15.32, + "step": 145, + "token_acc": 0.6051495758297366, + "train_speed(iter/s)": 0.139852 + }, + { + "epoch": 0.05136656464696188, + "grad_norm": 1.5017919540405273, + "learning_rate": 9.999948696607946e-05, + "loss": 1.5608755111694337, + "memory(GiB)": 15.32, + "step": 150, + "token_acc": 0.5953513660459426, + "train_speed(iter/s)": 0.140459 + }, + { + "epoch": 0.05136656464696188, + "eval_loss": 1.4748371839523315, + "eval_runtime": 45.2008, + "eval_samples_per_second": 10.265, + "eval_steps_per_second": 10.265, + "eval_token_acc": 0.6077665187540013, + "step": 150 + }, + { + "epoch": 0.05307878346852728, + "grad_norm": 1.5679197311401367, + "learning_rate": 9.999740278382108e-05, + "loss": 1.5313417434692382, + "memory(GiB)": 15.32, + "step": 155, + "token_acc": 0.6085361913991981, + "train_speed(iter/s)": 0.134524 + }, + { + "epoch": 0.05479100229009267, + "grad_norm": 1.4405403137207031, + "learning_rate": 9.99937154553819e-05, + "loss": 1.6128368377685547, + "memory(GiB)": 15.32, + "step": 160, + "token_acc": 0.6020935471048099, + "train_speed(iter/s)": 0.135049 + }, + { + "epoch": 0.05650322111165807, + "grad_norm": 1.611666202545166, + "learning_rate": 9.998842509899456e-05, + "loss": 1.4315536499023438, + "memory(GiB)": 15.32, + "step": 165, + "token_acc": 0.6300712719298246, + "train_speed(iter/s)": 0.135709 + }, + { + "epoch": 0.058215439933223465, + "grad_norm": 1.841750979423523, + "learning_rate": 9.998153188429216e-05, + "loss": 1.6267351150512694, + "memory(GiB)": 15.32, + "step": 170, + "token_acc": 0.6016483516483516, + "train_speed(iter/s)": 0.135811 + }, + { + "epoch": 0.059927658754788865, + "grad_norm": 1.4180922508239746, + "learning_rate": 9.997303603230282e-05, + "loss": 1.4912917137145996, + "memory(GiB)": 15.32, + "step": 175, + "token_acc": 0.6141880744756069, + "train_speed(iter/s)": 0.136595 + }, + { + "epoch": 0.06163987757635426, + "grad_norm": 1.5234973430633545, + "learning_rate": 9.996293781544253e-05, + "loss": 1.4663347244262694, + "memory(GiB)": 15.32, + "step": 180, + "token_acc": 0.6194945078383278, + "train_speed(iter/s)": 0.136777 + }, + { + "epoch": 0.06335209639791965, + "grad_norm": 1.2434215545654297, + "learning_rate": 9.995123755750648e-05, + "loss": 1.4156696319580078, + "memory(GiB)": 15.32, + "step": 185, + "token_acc": 0.618693134822167, + "train_speed(iter/s)": 0.137456 + }, + { + "epoch": 0.06506431521948505, + "grad_norm": 1.489881992340088, + "learning_rate": 9.993793563365863e-05, + "loss": 1.557717990875244, + "memory(GiB)": 15.32, + "step": 190, + "token_acc": 0.6018673355629878, + "train_speed(iter/s)": 0.137977 + }, + { + "epoch": 0.06677653404105044, + "grad_norm": 1.589721441268921, + "learning_rate": 9.99230324704197e-05, + "loss": 1.484724235534668, + "memory(GiB)": 15.32, + "step": 195, + "token_acc": 0.6126569173029637, + "train_speed(iter/s)": 0.138501 + }, + { + "epoch": 0.06848875286261584, + "grad_norm": 1.5822319984436035, + "learning_rate": 9.990652854565348e-05, + "loss": 1.533009910583496, + "memory(GiB)": 15.32, + "step": 200, + "token_acc": 0.6064755384395949, + "train_speed(iter/s)": 0.138866 + }, + { + "epoch": 0.06848875286261584, + "eval_loss": 1.4378730058670044, + "eval_runtime": 44.9673, + "eval_samples_per_second": 10.319, + "eval_steps_per_second": 10.319, + "eval_token_acc": 0.6127116586087379, + "step": 200 + }, + { + "epoch": 0.07020097168418124, + "grad_norm": 1.3716204166412354, + "learning_rate": 9.988842438855157e-05, + "loss": 1.5009285926818847, + "memory(GiB)": 15.32, + "step": 205, + "token_acc": 0.6134132518778936, + "train_speed(iter/s)": 0.134483 + }, + { + "epoch": 0.07191319050574664, + "grad_norm": 1.141250729560852, + "learning_rate": 9.98687205796163e-05, + "loss": 1.3672036170959472, + "memory(GiB)": 15.32, + "step": 210, + "token_acc": 0.6230096784264751, + "train_speed(iter/s)": 0.13531 + }, + { + "epoch": 0.07362540932731203, + "grad_norm": 1.2666934728622437, + "learning_rate": 9.984741775064222e-05, + "loss": 1.4257469177246094, + "memory(GiB)": 15.32, + "step": 215, + "token_acc": 0.6238795267120831, + "train_speed(iter/s)": 0.135705 + }, + { + "epoch": 0.07533762814887743, + "grad_norm": 1.3408336639404297, + "learning_rate": 9.982451658469581e-05, + "loss": 1.5237869262695312, + "memory(GiB)": 15.32, + "step": 220, + "token_acc": 0.6102232201822745, + "train_speed(iter/s)": 0.136093 + }, + { + "epoch": 0.07704984697044283, + "grad_norm": 1.258309006690979, + "learning_rate": 9.980001781609352e-05, + "loss": 1.4744735717773438, + "memory(GiB)": 15.32, + "step": 225, + "token_acc": 0.6063757011903134, + "train_speed(iter/s)": 0.136576 + }, + { + "epoch": 0.07876206579200822, + "grad_norm": 1.4546130895614624, + "learning_rate": 9.977392223037831e-05, + "loss": 1.4169251441955566, + "memory(GiB)": 15.32, + "step": 230, + "token_acc": 0.6351069379238393, + "train_speed(iter/s)": 0.137064 + }, + { + "epoch": 0.08047428461357362, + "grad_norm": 1.385501742362976, + "learning_rate": 9.97462306642944e-05, + "loss": 1.453796672821045, + "memory(GiB)": 15.32, + "step": 235, + "token_acc": 0.6182396606574762, + "train_speed(iter/s)": 0.137373 + }, + { + "epoch": 0.08218650343513902, + "grad_norm": 1.577653169631958, + "learning_rate": 9.971694400576053e-05, + "loss": 1.4679311752319335, + "memory(GiB)": 15.32, + "step": 240, + "token_acc": 0.6250562303193882, + "train_speed(iter/s)": 0.137784 + }, + { + "epoch": 0.0838987222567044, + "grad_norm": 1.2884719371795654, + "learning_rate": 9.968606319384132e-05, + "loss": 1.4336971282958983, + "memory(GiB)": 15.32, + "step": 245, + "token_acc": 0.6329868006074056, + "train_speed(iter/s)": 0.138133 + }, + { + "epoch": 0.0856109410782698, + "grad_norm": 1.5178383588790894, + "learning_rate": 9.965358921871735e-05, + "loss": 1.556607723236084, + "memory(GiB)": 15.32, + "step": 250, + "token_acc": 0.6054276972196355, + "train_speed(iter/s)": 0.138371 + }, + { + "epoch": 0.0856109410782698, + "eval_loss": 1.4204825162887573, + "eval_runtime": 44.9973, + "eval_samples_per_second": 10.312, + "eval_steps_per_second": 10.312, + "eval_token_acc": 0.6165088195686248, + "step": 250 + }, + { + "epoch": 0.0873231598998352, + "grad_norm": 1.404898762702942, + "learning_rate": 9.961952312165326e-05, + "loss": 1.4677803993225098, + "memory(GiB)": 15.32, + "step": 255, + "token_acc": 0.6150018989745537, + "train_speed(iter/s)": 0.13496 + }, + { + "epoch": 0.08903537872140059, + "grad_norm": 1.2489333152770996, + "learning_rate": 9.95838659949645e-05, + "loss": 1.462001419067383, + "memory(GiB)": 15.32, + "step": 260, + "token_acc": 0.6207925010651896, + "train_speed(iter/s)": 0.135457 + }, + { + "epoch": 0.09074759754296599, + "grad_norm": 1.403685212135315, + "learning_rate": 9.954661898198216e-05, + "loss": 1.522507095336914, + "memory(GiB)": 15.32, + "step": 265, + "token_acc": 0.6147701793721974, + "train_speed(iter/s)": 0.135657 + }, + { + "epoch": 0.09245981636453139, + "grad_norm": 1.347508430480957, + "learning_rate": 9.950778327701643e-05, + "loss": 1.4577245712280273, + "memory(GiB)": 15.32, + "step": 270, + "token_acc": 0.6204641620720606, + "train_speed(iter/s)": 0.13586 + }, + { + "epoch": 0.09417203518609678, + "grad_norm": 1.4898968935012817, + "learning_rate": 9.946736012531821e-05, + "loss": 1.4915903091430665, + "memory(GiB)": 15.32, + "step": 275, + "token_acc": 0.6117818760780152, + "train_speed(iter/s)": 0.13626 + }, + { + "epoch": 0.09588425400766218, + "grad_norm": 1.5971683263778687, + "learning_rate": 9.942535082303927e-05, + "loss": 1.466657829284668, + "memory(GiB)": 15.32, + "step": 280, + "token_acc": 0.6194750441996464, + "train_speed(iter/s)": 0.13679 + }, + { + "epoch": 0.09759647282922758, + "grad_norm": 1.1435737609863281, + "learning_rate": 9.938175671719063e-05, + "loss": 1.5881732940673827, + "memory(GiB)": 15.32, + "step": 285, + "token_acc": 0.5941022280471822, + "train_speed(iter/s)": 0.136916 + }, + { + "epoch": 0.09930869165079297, + "grad_norm": 1.170966386795044, + "learning_rate": 9.933657920559939e-05, + "loss": 1.4529515266418458, + "memory(GiB)": 15.32, + "step": 290, + "token_acc": 0.6212765957446809, + "train_speed(iter/s)": 0.137235 + }, + { + "epoch": 0.10102091047235837, + "grad_norm": 1.4379491806030273, + "learning_rate": 9.928981973686388e-05, + "loss": 1.4789634704589845, + "memory(GiB)": 15.32, + "step": 295, + "token_acc": 0.6101909039865244, + "train_speed(iter/s)": 0.137716 + }, + { + "epoch": 0.10273312929392377, + "grad_norm": 1.3779692649841309, + "learning_rate": 9.924147981030727e-05, + "loss": 1.501486873626709, + "memory(GiB)": 15.32, + "step": 300, + "token_acc": 0.609641638225256, + "train_speed(iter/s)": 0.138122 + }, + { + "epoch": 0.10273312929392377, + "eval_loss": 1.3993631601333618, + "eval_runtime": 44.6573, + "eval_samples_per_second": 10.39, + "eval_steps_per_second": 10.39, + "eval_token_acc": 0.6185619356690288, + "step": 300 + }, + { + "epoch": 0.10444534811548915, + "grad_norm": 1.2503433227539062, + "learning_rate": 9.919156097592943e-05, + "loss": 1.3608002662658691, + "memory(GiB)": 15.32, + "step": 305, + "token_acc": 0.6209487580271558, + "train_speed(iter/s)": 0.135484 + }, + { + "epoch": 0.10615756693705455, + "grad_norm": 1.4793421030044556, + "learning_rate": 9.914006483435731e-05, + "loss": 1.448813533782959, + "memory(GiB)": 15.32, + "step": 310, + "token_acc": 0.6231512224569876, + "train_speed(iter/s)": 0.135644 + }, + { + "epoch": 0.10786978575861995, + "grad_norm": 1.6080430746078491, + "learning_rate": 9.90869930367935e-05, + "loss": 1.4645686149597168, + "memory(GiB)": 15.32, + "step": 315, + "token_acc": 0.6132359260850881, + "train_speed(iter/s)": 0.136001 + }, + { + "epoch": 0.10958200458018534, + "grad_norm": 1.4214388132095337, + "learning_rate": 9.903234728496341e-05, + "loss": 1.4589122772216796, + "memory(GiB)": 15.32, + "step": 320, + "token_acc": 0.6142734728852981, + "train_speed(iter/s)": 0.136331 + }, + { + "epoch": 0.11129422340175074, + "grad_norm": 1.5650954246520996, + "learning_rate": 9.897612933106061e-05, + "loss": 1.457740879058838, + "memory(GiB)": 15.32, + "step": 325, + "token_acc": 0.623991935483871, + "train_speed(iter/s)": 0.136613 + }, + { + "epoch": 0.11300644222331614, + "grad_norm": 1.3945279121398926, + "learning_rate": 9.891834097769071e-05, + "loss": 1.556583309173584, + "memory(GiB)": 15.32, + "step": 330, + "token_acc": 0.5994619638320131, + "train_speed(iter/s)": 0.13688 + }, + { + "epoch": 0.11471866104488154, + "grad_norm": 1.281740427017212, + "learning_rate": 9.885898407781351e-05, + "loss": 1.5181618690490724, + "memory(GiB)": 15.32, + "step": 335, + "token_acc": 0.6130885475209192, + "train_speed(iter/s)": 0.137026 + }, + { + "epoch": 0.11643087986644693, + "grad_norm": 1.3207279443740845, + "learning_rate": 9.879806053468361e-05, + "loss": 1.6090400695800782, + "memory(GiB)": 15.32, + "step": 340, + "token_acc": 0.6032722513089005, + "train_speed(iter/s)": 0.137113 + }, + { + "epoch": 0.11814309868801233, + "grad_norm": 1.0481092929840088, + "learning_rate": 9.873557230178942e-05, + "loss": 1.4021844863891602, + "memory(GiB)": 15.32, + "step": 345, + "token_acc": 0.6353738204693927, + "train_speed(iter/s)": 0.137288 + }, + { + "epoch": 0.11985531750957773, + "grad_norm": 1.6984754800796509, + "learning_rate": 9.867152138279043e-05, + "loss": 1.397127914428711, + "memory(GiB)": 15.32, + "step": 350, + "token_acc": 0.6340796667534496, + "train_speed(iter/s)": 0.137639 + }, + { + "epoch": 0.11985531750957773, + "eval_loss": 1.386860728263855, + "eval_runtime": 44.8796, + "eval_samples_per_second": 10.339, + "eval_steps_per_second": 10.339, + "eval_token_acc": 0.6218734132503256, + "step": 350 + }, + { + "epoch": 0.12156753633114312, + "grad_norm": 1.09983491897583, + "learning_rate": 9.860590983145307e-05, + "loss": 1.4905285835266113, + "memory(GiB)": 15.32, + "step": 355, + "token_acc": 0.6218814288386011, + "train_speed(iter/s)": 0.135057 + }, + { + "epoch": 0.12327975515270852, + "grad_norm": 1.3224064111709595, + "learning_rate": 9.853873975158476e-05, + "loss": 1.4726950645446777, + "memory(GiB)": 15.32, + "step": 360, + "token_acc": 0.6144313927972118, + "train_speed(iter/s)": 0.135393 + }, + { + "epoch": 0.12499197397427392, + "grad_norm": 1.4773943424224854, + "learning_rate": 9.847001329696653e-05, + "loss": 1.4908391952514648, + "memory(GiB)": 15.32, + "step": 365, + "token_acc": 0.6116747741487144, + "train_speed(iter/s)": 0.135601 + }, + { + "epoch": 0.1267041927958393, + "grad_norm": 1.2326139211654663, + "learning_rate": 9.839973267128389e-05, + "loss": 1.4283517837524413, + "memory(GiB)": 15.32, + "step": 370, + "token_acc": 0.6196575776791376, + "train_speed(iter/s)": 0.135823 + }, + { + "epoch": 0.1284164116174047, + "grad_norm": 1.629271149635315, + "learning_rate": 9.832790012805626e-05, + "loss": 1.414753246307373, + "memory(GiB)": 15.32, + "step": 375, + "token_acc": 0.6297824456114028, + "train_speed(iter/s)": 0.136181 + }, + { + "epoch": 0.1301286304389701, + "grad_norm": 1.4482200145721436, + "learning_rate": 9.825451797056462e-05, + "loss": 1.5005831718444824, + "memory(GiB)": 15.32, + "step": 380, + "token_acc": 0.6183285602707628, + "train_speed(iter/s)": 0.136211 + }, + { + "epoch": 0.1318408492605355, + "grad_norm": 1.4535025358200073, + "learning_rate": 9.81795885517777e-05, + "loss": 1.468778419494629, + "memory(GiB)": 15.32, + "step": 385, + "token_acc": 0.6203135650988412, + "train_speed(iter/s)": 0.136384 + }, + { + "epoch": 0.13355306808210088, + "grad_norm": 1.40886652469635, + "learning_rate": 9.810311427427653e-05, + "loss": 1.4840813636779786, + "memory(GiB)": 15.32, + "step": 390, + "token_acc": 0.6215198386012105, + "train_speed(iter/s)": 0.136682 + }, + { + "epoch": 0.1352652869036663, + "grad_norm": 1.2200162410736084, + "learning_rate": 9.80250975901774e-05, + "loss": 1.5448409080505372, + "memory(GiB)": 15.32, + "step": 395, + "token_acc": 0.6112213841471483, + "train_speed(iter/s)": 0.136925 + }, + { + "epoch": 0.13697750572523168, + "grad_norm": 1.6576746702194214, + "learning_rate": 9.794554100105324e-05, + "loss": 1.5157842636108398, + "memory(GiB)": 15.32, + "step": 400, + "token_acc": 0.601663990658298, + "train_speed(iter/s)": 0.137014 + }, + { + "epoch": 0.13697750572523168, + "eval_loss": 1.3783774375915527, + "eval_runtime": 45.0435, + "eval_samples_per_second": 10.301, + "eval_steps_per_second": 10.301, + "eval_token_acc": 0.6243018301432766, + "step": 400 + }, + { + "epoch": 0.1386897245467971, + "grad_norm": 1.212185263633728, + "learning_rate": 9.786444705785341e-05, + "loss": 1.4979464530944824, + "memory(GiB)": 15.32, + "step": 405, + "token_acc": 0.62297904907682, + "train_speed(iter/s)": 0.134778 + }, + { + "epoch": 0.14040194336836248, + "grad_norm": 1.1128188371658325, + "learning_rate": 9.778181836082185e-05, + "loss": 1.5300077438354491, + "memory(GiB)": 15.32, + "step": 410, + "token_acc": 0.6158506429277942, + "train_speed(iter/s)": 0.134974 + }, + { + "epoch": 0.14211416218992787, + "grad_norm": 1.2315031290054321, + "learning_rate": 9.769765755941383e-05, + "loss": 1.5104138374328613, + "memory(GiB)": 15.32, + "step": 415, + "token_acc": 0.6147315556618438, + "train_speed(iter/s)": 0.135097 + }, + { + "epoch": 0.14382638101149328, + "grad_norm": 1.402356505393982, + "learning_rate": 9.761196735221084e-05, + "loss": 1.5413570404052734, + "memory(GiB)": 15.32, + "step": 420, + "token_acc": 0.6145646185171877, + "train_speed(iter/s)": 0.135087 + }, + { + "epoch": 0.14553859983305867, + "grad_norm": 1.5219883918762207, + "learning_rate": 9.752475048683419e-05, + "loss": 1.5671575546264649, + "memory(GiB)": 15.32, + "step": 425, + "token_acc": 0.602426059697424, + "train_speed(iter/s)": 0.135166 + }, + { + "epoch": 0.14725081865462405, + "grad_norm": 1.7040166854858398, + "learning_rate": 9.743600975985682e-05, + "loss": 1.4261579513549805, + "memory(GiB)": 15.32, + "step": 430, + "token_acc": 0.6317440401505646, + "train_speed(iter/s)": 0.13543 + }, + { + "epoch": 0.14896303747618947, + "grad_norm": 1.167004108428955, + "learning_rate": 9.73457480167137e-05, + "loss": 1.4740997314453126, + "memory(GiB)": 15.32, + "step": 435, + "token_acc": 0.6143364928909952, + "train_speed(iter/s)": 0.135606 + }, + { + "epoch": 0.15067525629775486, + "grad_norm": 1.1588422060012817, + "learning_rate": 9.725396815161053e-05, + "loss": 1.33306941986084, + "memory(GiB)": 15.32, + "step": 440, + "token_acc": 0.6402698342865523, + "train_speed(iter/s)": 0.13594 + }, + { + "epoch": 0.15238747511932024, + "grad_norm": 1.533815860748291, + "learning_rate": 9.7160673107431e-05, + "loss": 1.4070747375488282, + "memory(GiB)": 15.32, + "step": 445, + "token_acc": 0.6401828740887187, + "train_speed(iter/s)": 0.136063 + }, + { + "epoch": 0.15409969394088566, + "grad_norm": 1.0575358867645264, + "learning_rate": 9.706586587564237e-05, + "loss": 1.5491487503051757, + "memory(GiB)": 15.32, + "step": 450, + "token_acc": 0.6109122464475539, + "train_speed(iter/s)": 0.136094 + }, + { + "epoch": 0.15409969394088566, + "eval_loss": 1.3688989877700806, + "eval_runtime": 44.5355, + "eval_samples_per_second": 10.419, + "eval_steps_per_second": 10.419, + "eval_token_acc": 0.6254939620725435, + "step": 450 + }, + { + "epoch": 0.15581191276245104, + "grad_norm": 1.4984129667282104, + "learning_rate": 9.696954949619955e-05, + "loss": 1.5960139274597167, + "memory(GiB)": 15.32, + "step": 455, + "token_acc": 0.6220436455040561, + "train_speed(iter/s)": 0.13406 + }, + { + "epoch": 0.15752413158401643, + "grad_norm": 1.2888133525848389, + "learning_rate": 9.687172705744772e-05, + "loss": 1.347306728363037, + "memory(GiB)": 15.32, + "step": 460, + "token_acc": 0.6442477876106195, + "train_speed(iter/s)": 0.134252 + }, + { + "epoch": 0.15923635040558184, + "grad_norm": 1.5194957256317139, + "learning_rate": 9.677240169602318e-05, + "loss": 1.5573819160461426, + "memory(GiB)": 15.32, + "step": 465, + "token_acc": 0.5980867514087276, + "train_speed(iter/s)": 0.134365 + }, + { + "epoch": 0.16094856922714723, + "grad_norm": 1.5082134008407593, + "learning_rate": 9.667157659675283e-05, + "loss": 1.3614291191101073, + "memory(GiB)": 15.32, + "step": 470, + "token_acc": 0.6335267569310122, + "train_speed(iter/s)": 0.134561 + }, + { + "epoch": 0.16266078804871262, + "grad_norm": 1.4609310626983643, + "learning_rate": 9.656925499255206e-05, + "loss": 1.4874407768249511, + "memory(GiB)": 15.32, + "step": 475, + "token_acc": 0.6137887413029728, + "train_speed(iter/s)": 0.134713 + }, + { + "epoch": 0.16437300687027803, + "grad_norm": 1.368520736694336, + "learning_rate": 9.646544016432108e-05, + "loss": 1.452055072784424, + "memory(GiB)": 15.32, + "step": 480, + "token_acc": 0.6144850498338871, + "train_speed(iter/s)": 0.134929 + }, + { + "epoch": 0.16608522569184342, + "grad_norm": 1.2556066513061523, + "learning_rate": 9.63601354408397e-05, + "loss": 1.4027675628662108, + "memory(GiB)": 15.32, + "step": 485, + "token_acc": 0.6283502084574152, + "train_speed(iter/s)": 0.135198 + }, + { + "epoch": 0.1677974445134088, + "grad_norm": 1.2234543561935425, + "learning_rate": 9.625334419866064e-05, + "loss": 1.4144779205322267, + "memory(GiB)": 15.32, + "step": 490, + "token_acc": 0.6263164911345154, + "train_speed(iter/s)": 0.13535 + }, + { + "epoch": 0.16950966333497422, + "grad_norm": 1.068080186843872, + "learning_rate": 9.614506986200118e-05, + "loss": 1.4950379371643066, + "memory(GiB)": 15.32, + "step": 495, + "token_acc": 0.6178814155144129, + "train_speed(iter/s)": 0.135517 + }, + { + "epoch": 0.1712218821565396, + "grad_norm": 1.3616589307785034, + "learning_rate": 9.603531590263348e-05, + "loss": 1.487041187286377, + "memory(GiB)": 15.32, + "step": 500, + "token_acc": 0.6093727469358328, + "train_speed(iter/s)": 0.135649 + }, + { + "epoch": 0.1712218821565396, + "eval_loss": 1.3609764575958252, + "eval_runtime": 44.7778, + "eval_samples_per_second": 10.362, + "eval_steps_per_second": 10.362, + "eval_token_acc": 0.6265536348985584, + "step": 500 + }, + { + "epoch": 0.172934100978105, + "grad_norm": 1.4369703531265259, + "learning_rate": 9.592408583977311e-05, + "loss": 1.4402120590209961, + "memory(GiB)": 15.32, + "step": 505, + "token_acc": 0.6260454002389486, + "train_speed(iter/s)": 0.133797 + }, + { + "epoch": 0.1746463197996704, + "grad_norm": 1.203952431678772, + "learning_rate": 9.58113832399664e-05, + "loss": 1.4445464134216308, + "memory(GiB)": 15.32, + "step": 510, + "token_acc": 0.6180904522613065, + "train_speed(iter/s)": 0.134038 + }, + { + "epoch": 0.1763585386212358, + "grad_norm": 1.2848012447357178, + "learning_rate": 9.569721171697585e-05, + "loss": 1.4175099372863769, + "memory(GiB)": 15.32, + "step": 515, + "token_acc": 0.6332924517783476, + "train_speed(iter/s)": 0.134211 + }, + { + "epoch": 0.17807075744280118, + "grad_norm": 1.4535198211669922, + "learning_rate": 9.55815749316645e-05, + "loss": 1.376161479949951, + "memory(GiB)": 15.32, + "step": 520, + "token_acc": 0.6331045906294368, + "train_speed(iter/s)": 0.134327 + }, + { + "epoch": 0.1797829762643666, + "grad_norm": 1.179976224899292, + "learning_rate": 9.546447659187833e-05, + "loss": 1.3935976028442383, + "memory(GiB)": 15.32, + "step": 525, + "token_acc": 0.6313333333333333, + "train_speed(iter/s)": 0.134641 + }, + { + "epoch": 0.18149519508593198, + "grad_norm": 1.4829586744308472, + "learning_rate": 9.53459204523275e-05, + "loss": 1.3937008857727051, + "memory(GiB)": 15.32, + "step": 530, + "token_acc": 0.6215807767870636, + "train_speed(iter/s)": 0.134945 + }, + { + "epoch": 0.18320741390749737, + "grad_norm": 1.3880908489227295, + "learning_rate": 9.522591031446595e-05, + "loss": 1.3785651206970215, + "memory(GiB)": 15.32, + "step": 535, + "token_acc": 0.631578947368421, + "train_speed(iter/s)": 0.135103 + }, + { + "epoch": 0.18491963272906278, + "grad_norm": 1.340599536895752, + "learning_rate": 9.510445002636943e-05, + "loss": 1.464602565765381, + "memory(GiB)": 15.32, + "step": 540, + "token_acc": 0.6189370132531766, + "train_speed(iter/s)": 0.135291 + }, + { + "epoch": 0.18663185155062817, + "grad_norm": 1.275866985321045, + "learning_rate": 9.498154348261216e-05, + "loss": 1.3418386459350586, + "memory(GiB)": 15.32, + "step": 545, + "token_acc": 0.6366302319381498, + "train_speed(iter/s)": 0.135445 + }, + { + "epoch": 0.18834407037219356, + "grad_norm": 1.186716914176941, + "learning_rate": 9.485719462414201e-05, + "loss": 1.4290853500366212, + "memory(GiB)": 15.32, + "step": 550, + "token_acc": 0.6228070175438597, + "train_speed(iter/s)": 0.135605 + }, + { + "epoch": 0.18834407037219356, + "eval_loss": 1.3547223806381226, + "eval_runtime": 44.6926, + "eval_samples_per_second": 10.382, + "eval_steps_per_second": 10.382, + "eval_token_acc": 0.6270834713115658, + "step": 550 + }, + { + "epoch": 0.19005628919375897, + "grad_norm": 1.2635301351547241, + "learning_rate": 9.473140743815405e-05, + "loss": 1.356753158569336, + "memory(GiB)": 15.32, + "step": 555, + "token_acc": 0.6281925343811395, + "train_speed(iter/s)": 0.134073 + }, + { + "epoch": 0.19176850801532436, + "grad_norm": 1.388697624206543, + "learning_rate": 9.460418595796268e-05, + "loss": 1.601140022277832, + "memory(GiB)": 15.32, + "step": 560, + "token_acc": 0.6036690470327506, + "train_speed(iter/s)": 0.134049 + }, + { + "epoch": 0.19348072683688974, + "grad_norm": 1.3623664379119873, + "learning_rate": 9.447553426287243e-05, + "loss": 1.455824375152588, + "memory(GiB)": 15.32, + "step": 565, + "token_acc": 0.6286046812362602, + "train_speed(iter/s)": 0.134161 + }, + { + "epoch": 0.19519294565845516, + "grad_norm": 1.2108103036880493, + "learning_rate": 9.434545647804703e-05, + "loss": 1.4541306495666504, + "memory(GiB)": 15.32, + "step": 570, + "token_acc": 0.6138205980066446, + "train_speed(iter/s)": 0.134362 + }, + { + "epoch": 0.19690516448002054, + "grad_norm": 1.2324427366256714, + "learning_rate": 9.421395677437723e-05, + "loss": 1.4434879302978516, + "memory(GiB)": 15.32, + "step": 575, + "token_acc": 0.6222222222222222, + "train_speed(iter/s)": 0.134457 + }, + { + "epoch": 0.19861738330158593, + "grad_norm": 1.462127923965454, + "learning_rate": 9.408103936834702e-05, + "loss": 1.4318212509155273, + "memory(GiB)": 15.32, + "step": 580, + "token_acc": 0.6215027977617905, + "train_speed(iter/s)": 0.134708 + }, + { + "epoch": 0.20032960212315135, + "grad_norm": 1.274062156677246, + "learning_rate": 9.394670852189839e-05, + "loss": 1.3105790138244628, + "memory(GiB)": 15.32, + "step": 585, + "token_acc": 0.650816873933187, + "train_speed(iter/s)": 0.134765 + }, + { + "epoch": 0.20204182094471673, + "grad_norm": 1.4067606925964355, + "learning_rate": 9.381096854229476e-05, + "loss": 1.463475227355957, + "memory(GiB)": 15.32, + "step": 590, + "token_acc": 0.6197799284104468, + "train_speed(iter/s)": 0.134915 + }, + { + "epoch": 0.20375403976628212, + "grad_norm": 1.2525004148483276, + "learning_rate": 9.367382378198282e-05, + "loss": 1.445170783996582, + "memory(GiB)": 15.32, + "step": 595, + "token_acc": 0.6150152771715408, + "train_speed(iter/s)": 0.135101 + }, + { + "epoch": 0.20546625858784753, + "grad_norm": 1.3698608875274658, + "learning_rate": 9.353527863845295e-05, + "loss": 1.4169211387634277, + "memory(GiB)": 15.32, + "step": 600, + "token_acc": 0.6290930165169516, + "train_speed(iter/s)": 0.135192 + }, + { + "epoch": 0.20546625858784753, + "eval_loss": 1.3479077816009521, + "eval_runtime": 44.9823, + "eval_samples_per_second": 10.315, + "eval_steps_per_second": 10.315, + "eval_token_acc": 0.6281652206547895, + "step": 600 + }, + { + "epoch": 0.20717847740941292, + "grad_norm": 1.1415245532989502, + "learning_rate": 9.339533755409828e-05, + "loss": 1.448196029663086, + "memory(GiB)": 15.32, + "step": 605, + "token_acc": 0.6278725544148623, + "train_speed(iter/s)": 0.133652 + }, + { + "epoch": 0.2088906962309783, + "grad_norm": 1.2748287916183472, + "learning_rate": 9.325400501607218e-05, + "loss": 1.3196575164794921, + "memory(GiB)": 15.32, + "step": 610, + "token_acc": 0.6364667517368495, + "train_speed(iter/s)": 0.13384 + }, + { + "epoch": 0.21060291505254372, + "grad_norm": 1.4961546659469604, + "learning_rate": 9.311128555614443e-05, + "loss": 1.419217586517334, + "memory(GiB)": 15.32, + "step": 615, + "token_acc": 0.6156094709149371, + "train_speed(iter/s)": 0.134011 + }, + { + "epoch": 0.2123151338741091, + "grad_norm": 1.054031252861023, + "learning_rate": 9.296718375055586e-05, + "loss": 1.4678228378295899, + "memory(GiB)": 15.32, + "step": 620, + "token_acc": 0.6123572313196642, + "train_speed(iter/s)": 0.13419 + }, + { + "epoch": 0.2140273526956745, + "grad_norm": 0.9468833208084106, + "learning_rate": 9.28217042198717e-05, + "loss": 1.3832430839538574, + "memory(GiB)": 15.32, + "step": 625, + "token_acc": 0.6313782219884272, + "train_speed(iter/s)": 0.134403 + }, + { + "epoch": 0.2157395715172399, + "grad_norm": 0.9741238355636597, + "learning_rate": 9.267485162883333e-05, + "loss": 1.4765690803527831, + "memory(GiB)": 15.32, + "step": 630, + "token_acc": 0.6204635387224421, + "train_speed(iter/s)": 0.134549 + }, + { + "epoch": 0.2174517903388053, + "grad_norm": 1.1441322565078735, + "learning_rate": 9.252663068620873e-05, + "loss": 1.4201568603515624, + "memory(GiB)": 15.32, + "step": 635, + "token_acc": 0.623625, + "train_speed(iter/s)": 0.134727 + }, + { + "epoch": 0.21916400916037068, + "grad_norm": 1.5122190713882446, + "learning_rate": 9.237704614464156e-05, + "loss": 1.3318071365356445, + "memory(GiB)": 15.32, + "step": 640, + "token_acc": 0.6324027755365499, + "train_speed(iter/s)": 0.134976 + }, + { + "epoch": 0.2208762279819361, + "grad_norm": 1.3553342819213867, + "learning_rate": 9.222610280049869e-05, + "loss": 1.5072690963745117, + "memory(GiB)": 15.32, + "step": 645, + "token_acc": 0.6124360094893245, + "train_speed(iter/s)": 0.135095 + }, + { + "epoch": 0.22258844680350148, + "grad_norm": 1.1465612649917603, + "learning_rate": 9.207380549371642e-05, + "loss": 1.4510274887084962, + "memory(GiB)": 15.32, + "step": 650, + "token_acc": 0.6265912305516266, + "train_speed(iter/s)": 0.135197 + }, + { + "epoch": 0.22258844680350148, + "eval_loss": 1.3450487852096558, + "eval_runtime": 44.9019, + "eval_samples_per_second": 10.334, + "eval_steps_per_second": 10.334, + "eval_token_acc": 0.6296443473077687, + "step": 650 + }, + { + "epoch": 0.2243006656250669, + "grad_norm": 1.0324286222457886, + "learning_rate": 9.192015910764534e-05, + "loss": 1.4560422897338867, + "memory(GiB)": 15.32, + "step": 655, + "token_acc": 0.628179818376671, + "train_speed(iter/s)": 0.133885 + }, + { + "epoch": 0.22601288444663228, + "grad_norm": 1.246894121170044, + "learning_rate": 9.17651685688937e-05, + "loss": 1.5424548149108888, + "memory(GiB)": 15.32, + "step": 660, + "token_acc": 0.5971602434077079, + "train_speed(iter/s)": 0.133987 + }, + { + "epoch": 0.22772510326819767, + "grad_norm": 1.2521884441375732, + "learning_rate": 9.160883884716947e-05, + "loss": 1.366903591156006, + "memory(GiB)": 15.32, + "step": 665, + "token_acc": 0.6359895248784138, + "train_speed(iter/s)": 0.134166 + }, + { + "epoch": 0.22943732208976308, + "grad_norm": 1.3640334606170654, + "learning_rate": 9.145117495512092e-05, + "loss": 1.3627840995788574, + "memory(GiB)": 15.32, + "step": 670, + "token_acc": 0.6354868542740124, + "train_speed(iter/s)": 0.134411 + }, + { + "epoch": 0.23114954091132847, + "grad_norm": 1.3016501665115356, + "learning_rate": 9.129218194817601e-05, + "loss": 1.278130054473877, + "memory(GiB)": 15.32, + "step": 675, + "token_acc": 0.6589810851856965, + "train_speed(iter/s)": 0.134591 + }, + { + "epoch": 0.23286175973289386, + "grad_norm": 1.3176417350769043, + "learning_rate": 9.113186492438018e-05, + "loss": 1.466660976409912, + "memory(GiB)": 15.32, + "step": 680, + "token_acc": 0.6237723495341224, + "train_speed(iter/s)": 0.134631 + }, + { + "epoch": 0.23457397855445927, + "grad_norm": 1.349757432937622, + "learning_rate": 9.097022902423295e-05, + "loss": 1.4492982864379882, + "memory(GiB)": 15.32, + "step": 685, + "token_acc": 0.6251443418013857, + "train_speed(iter/s)": 0.134732 + }, + { + "epoch": 0.23628619737602466, + "grad_norm": 1.112446665763855, + "learning_rate": 9.080727943052303e-05, + "loss": 1.439016056060791, + "memory(GiB)": 15.32, + "step": 690, + "token_acc": 0.6184631803628602, + "train_speed(iter/s)": 0.134842 + }, + { + "epoch": 0.23799841619759005, + "grad_norm": 1.3326408863067627, + "learning_rate": 9.064302136816221e-05, + "loss": 1.4575718879699706, + "memory(GiB)": 15.32, + "step": 695, + "token_acc": 0.6201815550585449, + "train_speed(iter/s)": 0.134928 + }, + { + "epoch": 0.23971063501915546, + "grad_norm": 1.1848535537719727, + "learning_rate": 9.04774601040178e-05, + "loss": 1.4084733963012694, + "memory(GiB)": 15.32, + "step": 700, + "token_acc": 0.6261571582346609, + "train_speed(iter/s)": 0.135033 + }, + { + "epoch": 0.23971063501915546, + "eval_loss": 1.3393454551696777, + "eval_runtime": 44.712, + "eval_samples_per_second": 10.378, + "eval_steps_per_second": 10.378, + "eval_token_acc": 0.6300858776519417, + "step": 700 + }, + { + "epoch": 0.24142285384072085, + "grad_norm": 1.1771100759506226, + "learning_rate": 9.031060094674371e-05, + "loss": 1.4020368576049804, + "memory(GiB)": 15.32, + "step": 705, + "token_acc": 0.6297300308217907, + "train_speed(iter/s)": 0.133831 + }, + { + "epoch": 0.24313507266228623, + "grad_norm": 1.1491320133209229, + "learning_rate": 9.014244924661025e-05, + "loss": 1.3213591575622559, + "memory(GiB)": 15.32, + "step": 710, + "token_acc": 0.6496647509578544, + "train_speed(iter/s)": 0.134035 + }, + { + "epoch": 0.24484729148385165, + "grad_norm": 1.3614366054534912, + "learning_rate": 8.997301039533264e-05, + "loss": 1.4100239753723145, + "memory(GiB)": 15.32, + "step": 715, + "token_acc": 0.6330726904141326, + "train_speed(iter/s)": 0.134204 + }, + { + "epoch": 0.24655951030541703, + "grad_norm": 1.3133790493011475, + "learning_rate": 8.980228982589801e-05, + "loss": 1.506552791595459, + "memory(GiB)": 15.32, + "step": 720, + "token_acc": 0.6086352733240752, + "train_speed(iter/s)": 0.134243 + }, + { + "epoch": 0.24827172912698242, + "grad_norm": 0.9936130046844482, + "learning_rate": 8.963029301239131e-05, + "loss": 1.3643118858337402, + "memory(GiB)": 15.32, + "step": 725, + "token_acc": 0.6338939197930142, + "train_speed(iter/s)": 0.134409 + }, + { + "epoch": 0.24998394794854784, + "grad_norm": 1.3456363677978516, + "learning_rate": 8.945702546981969e-05, + "loss": 1.4293991088867188, + "memory(GiB)": 15.32, + "step": 730, + "token_acc": 0.6232367825993324, + "train_speed(iter/s)": 0.134509 + }, + { + "epoch": 0.2516961667701132, + "grad_norm": 1.2577625513076782, + "learning_rate": 8.928249275393572e-05, + "loss": 1.365379810333252, + "memory(GiB)": 15.32, + "step": 735, + "token_acc": 0.637854550342788, + "train_speed(iter/s)": 0.13468 + }, + { + "epoch": 0.2534083855916786, + "grad_norm": 1.392080545425415, + "learning_rate": 8.910670046105926e-05, + "loss": 1.4698296546936036, + "memory(GiB)": 15.32, + "step": 740, + "token_acc": 0.6139221994735302, + "train_speed(iter/s)": 0.134793 + }, + { + "epoch": 0.255120604413244, + "grad_norm": 1.2273677587509155, + "learning_rate": 8.892965422789793e-05, + "loss": 1.429222297668457, + "memory(GiB)": 15.32, + "step": 745, + "token_acc": 0.6238643246517263, + "train_speed(iter/s)": 0.134936 + }, + { + "epoch": 0.2568328232348094, + "grad_norm": 1.176615834236145, + "learning_rate": 8.875135973136648e-05, + "loss": 1.4213340759277344, + "memory(GiB)": 15.32, + "step": 750, + "token_acc": 0.630563446969697, + "train_speed(iter/s)": 0.135038 + }, + { + "epoch": 0.2568328232348094, + "eval_loss": 1.337463617324829, + "eval_runtime": 44.8581, + "eval_samples_per_second": 10.344, + "eval_steps_per_second": 10.344, + "eval_token_acc": 0.6305053314789059, + "step": 750 + }, + { + "epoch": 0.2585450420563748, + "grad_norm": 1.2160909175872803, + "learning_rate": 8.857182268840472e-05, + "loss": 1.4353949546813964, + "memory(GiB)": 15.32, + "step": 755, + "token_acc": 0.6297282391647491, + "train_speed(iter/s)": 0.133908 + }, + { + "epoch": 0.2602572608779402, + "grad_norm": 1.3656470775604248, + "learning_rate": 8.839104885579413e-05, + "loss": 1.422421646118164, + "memory(GiB)": 15.32, + "step": 760, + "token_acc": 0.6235689397710303, + "train_speed(iter/s)": 0.134061 + }, + { + "epoch": 0.26196947969950557, + "grad_norm": 1.4039480686187744, + "learning_rate": 8.820904402997342e-05, + "loss": 1.3555766105651856, + "memory(GiB)": 15.32, + "step": 765, + "token_acc": 0.6265711582646304, + "train_speed(iter/s)": 0.134255 + }, + { + "epoch": 0.263681698521071, + "grad_norm": 1.5465834140777588, + "learning_rate": 8.802581404685255e-05, + "loss": 1.2774075508117675, + "memory(GiB)": 15.32, + "step": 770, + "token_acc": 0.6540712255475672, + "train_speed(iter/s)": 0.134396 + }, + { + "epoch": 0.2653939173426364, + "grad_norm": 1.1505459547042847, + "learning_rate": 8.784136478162567e-05, + "loss": 1.3418766975402832, + "memory(GiB)": 15.32, + "step": 775, + "token_acc": 0.6489575483546848, + "train_speed(iter/s)": 0.134525 + }, + { + "epoch": 0.26710613616420176, + "grad_norm": 1.2169164419174194, + "learning_rate": 8.765570214858267e-05, + "loss": 1.449625015258789, + "memory(GiB)": 15.32, + "step": 780, + "token_acc": 0.620192894640459, + "train_speed(iter/s)": 0.134593 + }, + { + "epoch": 0.26881835498576717, + "grad_norm": 1.2266579866409302, + "learning_rate": 8.746883210091963e-05, + "loss": 1.4721331596374512, + "memory(GiB)": 15.32, + "step": 785, + "token_acc": 0.6199640038765056, + "train_speed(iter/s)": 0.134791 + }, + { + "epoch": 0.2705305738073326, + "grad_norm": 1.4117833375930786, + "learning_rate": 8.728076063054785e-05, + "loss": 1.5259774208068848, + "memory(GiB)": 15.32, + "step": 790, + "token_acc": 0.6169594888921156, + "train_speed(iter/s)": 0.134932 + }, + { + "epoch": 0.272242792628898, + "grad_norm": 1.3410435914993286, + "learning_rate": 8.709149376790177e-05, + "loss": 1.3391845703125, + "memory(GiB)": 15.32, + "step": 795, + "token_acc": 0.6382920487204051, + "train_speed(iter/s)": 0.135126 + }, + { + "epoch": 0.27395501145046336, + "grad_norm": 1.0829919576644897, + "learning_rate": 8.690103758174558e-05, + "loss": 1.3880299568176269, + "memory(GiB)": 15.32, + "step": 800, + "token_acc": 0.6328417602996255, + "train_speed(iter/s)": 0.13526 + }, + { + "epoch": 0.27395501145046336, + "eval_loss": 1.3367787599563599, + "eval_runtime": 44.9261, + "eval_samples_per_second": 10.328, + "eval_steps_per_second": 10.328, + "eval_token_acc": 0.6309468618230788, + "step": 800 + }, + { + "epoch": 0.2756672302720288, + "grad_norm": 1.461214542388916, + "learning_rate": 8.670939817897865e-05, + "loss": 1.4172840118408203, + "memory(GiB)": 15.32, + "step": 805, + "token_acc": 0.6306816045716004, + "train_speed(iter/s)": 0.134157 + }, + { + "epoch": 0.2773794490935942, + "grad_norm": 1.2514489889144897, + "learning_rate": 8.651658170443971e-05, + "loss": 1.5141790390014649, + "memory(GiB)": 15.32, + "step": 810, + "token_acc": 0.60939962745379, + "train_speed(iter/s)": 0.13425 + }, + { + "epoch": 0.27909166791515955, + "grad_norm": 1.2692232131958008, + "learning_rate": 8.63225943407098e-05, + "loss": 1.5246879577636718, + "memory(GiB)": 15.32, + "step": 815, + "token_acc": 0.6184591405032287, + "train_speed(iter/s)": 0.134285 + }, + { + "epoch": 0.28080388673672496, + "grad_norm": 1.2118124961853027, + "learning_rate": 8.612744230791405e-05, + "loss": 1.2990687370300293, + "memory(GiB)": 15.32, + "step": 820, + "token_acc": 0.6427078980359382, + "train_speed(iter/s)": 0.134458 + }, + { + "epoch": 0.2825161055582904, + "grad_norm": 1.2870780229568481, + "learning_rate": 8.593113186352222e-05, + "loss": 1.4026402473449706, + "memory(GiB)": 15.32, + "step": 825, + "token_acc": 0.6355414012738854, + "train_speed(iter/s)": 0.134563 + }, + { + "epoch": 0.28422832437985573, + "grad_norm": 1.2170543670654297, + "learning_rate": 8.573366930214806e-05, + "loss": 1.4181410789489746, + "memory(GiB)": 15.32, + "step": 830, + "token_acc": 0.6260734900746163, + "train_speed(iter/s)": 0.134679 + }, + { + "epoch": 0.28594054320142115, + "grad_norm": 1.3258826732635498, + "learning_rate": 8.553506095534747e-05, + "loss": 1.4210317611694336, + "memory(GiB)": 15.32, + "step": 835, + "token_acc": 0.6327089033887603, + "train_speed(iter/s)": 0.13473 + }, + { + "epoch": 0.28765276202298656, + "grad_norm": 1.275274395942688, + "learning_rate": 8.533531319141552e-05, + "loss": 1.4135967254638673, + "memory(GiB)": 15.32, + "step": 840, + "token_acc": 0.6305059149257488, + "train_speed(iter/s)": 0.134877 + }, + { + "epoch": 0.2893649808445519, + "grad_norm": 1.228824496269226, + "learning_rate": 8.51344324151822e-05, + "loss": 1.4475007057189941, + "memory(GiB)": 15.32, + "step": 845, + "token_acc": 0.6264236902050114, + "train_speed(iter/s)": 0.134989 + }, + { + "epoch": 0.29107719966611734, + "grad_norm": 1.2104127407073975, + "learning_rate": 8.493242506780705e-05, + "loss": 1.4581039428710938, + "memory(GiB)": 15.32, + "step": 850, + "token_acc": 0.6153358681875792, + "train_speed(iter/s)": 0.135108 + }, + { + "epoch": 0.29107719966611734, + "eval_loss": 1.325344443321228, + "eval_runtime": 44.8556, + "eval_samples_per_second": 10.344, + "eval_steps_per_second": 10.344, + "eval_token_acc": 0.6316091573393382, + "step": 850 + }, + { + "epoch": 0.29278941848768275, + "grad_norm": 1.3452321290969849, + "learning_rate": 8.472929762657271e-05, + "loss": 1.5027917861938476, + "memory(GiB)": 15.32, + "step": 855, + "token_acc": 0.6276361287380781, + "train_speed(iter/s)": 0.134147 + }, + { + "epoch": 0.2945016373092481, + "grad_norm": 1.1807208061218262, + "learning_rate": 8.452505660467712e-05, + "loss": 1.3397924423217773, + "memory(GiB)": 15.32, + "step": 860, + "token_acc": 0.6366967674388169, + "train_speed(iter/s)": 0.134299 + }, + { + "epoch": 0.2962138561308135, + "grad_norm": 1.3460243940353394, + "learning_rate": 8.431970855102474e-05, + "loss": 1.3599821090698243, + "memory(GiB)": 15.32, + "step": 865, + "token_acc": 0.6371296166268783, + "train_speed(iter/s)": 0.134374 + }, + { + "epoch": 0.29792607495237894, + "grad_norm": 1.261621117591858, + "learning_rate": 8.411326005001657e-05, + "loss": 1.3572757720947266, + "memory(GiB)": 15.32, + "step": 870, + "token_acc": 0.6300678112826432, + "train_speed(iter/s)": 0.134556 + }, + { + "epoch": 0.2996382937739443, + "grad_norm": 1.522195816040039, + "learning_rate": 8.390571772133896e-05, + "loss": 1.4110926628112792, + "memory(GiB)": 15.32, + "step": 875, + "token_acc": 0.6273284461002283, + "train_speed(iter/s)": 0.134665 + }, + { + "epoch": 0.3013505125955097, + "grad_norm": 1.0426990985870361, + "learning_rate": 8.369708821975145e-05, + "loss": 1.3695934295654297, + "memory(GiB)": 15.32, + "step": 880, + "token_acc": 0.6333983830499025, + "train_speed(iter/s)": 0.13481 + }, + { + "epoch": 0.3030627314170751, + "grad_norm": 1.4226237535476685, + "learning_rate": 8.348737823487325e-05, + "loss": 1.3814892768859863, + "memory(GiB)": 15.32, + "step": 885, + "token_acc": 0.6377358490566037, + "train_speed(iter/s)": 0.134936 + }, + { + "epoch": 0.3047749502386405, + "grad_norm": 1.1524187326431274, + "learning_rate": 8.327659449096892e-05, + "loss": 1.395742130279541, + "memory(GiB)": 15.32, + "step": 890, + "token_acc": 0.6331342668863262, + "train_speed(iter/s)": 0.135009 + }, + { + "epoch": 0.3064871690602059, + "grad_norm": 1.2690176963806152, + "learning_rate": 8.306474374673259e-05, + "loss": 1.3416844367980958, + "memory(GiB)": 15.32, + "step": 895, + "token_acc": 0.6420717975890846, + "train_speed(iter/s)": 0.135193 + }, + { + "epoch": 0.3081993878817713, + "grad_norm": 0.8282930254936218, + "learning_rate": 8.285183279507136e-05, + "loss": 1.3010485649108887, + "memory(GiB)": 15.32, + "step": 900, + "token_acc": 0.6461113114557038, + "train_speed(iter/s)": 0.135402 + }, + { + "epoch": 0.3081993878817713, + "eval_loss": 1.323546290397644, + "eval_runtime": 44.8929, + "eval_samples_per_second": 10.336, + "eval_steps_per_second": 10.336, + "eval_token_acc": 0.6333532021988211, + "step": 900 + }, + { + "epoch": 0.3099116067033367, + "grad_norm": 1.4451388120651245, + "learning_rate": 8.263786846288744e-05, + "loss": 1.4001890182495118, + "memory(GiB)": 15.32, + "step": 905, + "token_acc": 0.6325559899092993, + "train_speed(iter/s)": 0.134476 + }, + { + "epoch": 0.3116238255249021, + "grad_norm": 1.1762584447860718, + "learning_rate": 8.242285761085919e-05, + "loss": 1.4805274963378907, + "memory(GiB)": 15.32, + "step": 910, + "token_acc": 0.6145933014354067, + "train_speed(iter/s)": 0.134583 + }, + { + "epoch": 0.3133360443464675, + "grad_norm": 1.0868926048278809, + "learning_rate": 8.220680713322132e-05, + "loss": 1.3921205520629882, + "memory(GiB)": 15.32, + "step": 915, + "token_acc": 0.6324491495182586, + "train_speed(iter/s)": 0.134697 + }, + { + "epoch": 0.31504826316803286, + "grad_norm": 1.2927578687667847, + "learning_rate": 8.198972395754359e-05, + "loss": 1.4358938217163086, + "memory(GiB)": 15.32, + "step": 920, + "token_acc": 0.6267158108795119, + "train_speed(iter/s)": 0.134799 + }, + { + "epoch": 0.3167604819895983, + "grad_norm": 1.0938469171524048, + "learning_rate": 8.177161504450888e-05, + "loss": 1.2609033584594727, + "memory(GiB)": 15.32, + "step": 925, + "token_acc": 0.6594454072790294, + "train_speed(iter/s)": 0.134976 + }, + { + "epoch": 0.3184727008111637, + "grad_norm": 1.3760744333267212, + "learning_rate": 8.155248738768985e-05, + "loss": 1.4250566482543945, + "memory(GiB)": 15.32, + "step": 930, + "token_acc": 0.6287696824125968, + "train_speed(iter/s)": 0.135055 + }, + { + "epoch": 0.32018491963272905, + "grad_norm": 1.4183341264724731, + "learning_rate": 8.133234801332484e-05, + "loss": 1.336586856842041, + "memory(GiB)": 15.32, + "step": 935, + "token_acc": 0.6398090554028641, + "train_speed(iter/s)": 0.135219 + }, + { + "epoch": 0.32189713845429446, + "grad_norm": 1.2634278535842896, + "learning_rate": 8.111120398009242e-05, + "loss": 1.350379180908203, + "memory(GiB)": 15.32, + "step": 940, + "token_acc": 0.6342271293375394, + "train_speed(iter/s)": 0.135371 + }, + { + "epoch": 0.3236093572758599, + "grad_norm": 1.3512084484100342, + "learning_rate": 8.088906237888517e-05, + "loss": 1.4703611373901366, + "memory(GiB)": 15.32, + "step": 945, + "token_acc": 0.6232667450058754, + "train_speed(iter/s)": 0.135406 + }, + { + "epoch": 0.32532157609742524, + "grad_norm": 1.3143584728240967, + "learning_rate": 8.06659303325823e-05, + "loss": 1.390449333190918, + "memory(GiB)": 15.32, + "step": 950, + "token_acc": 0.636208076659822, + "train_speed(iter/s)": 0.135566 + }, + { + "epoch": 0.32532157609742524, + "eval_loss": 1.3236278295516968, + "eval_runtime": 44.8984, + "eval_samples_per_second": 10.334, + "eval_steps_per_second": 10.334, + "eval_token_acc": 0.6335960438881162, + "step": 950 + }, + { + "epoch": 0.32703379491899065, + "grad_norm": 1.2162400484085083, + "learning_rate": 8.044181499582118e-05, + "loss": 1.3443108558654786, + "memory(GiB)": 15.32, + "step": 955, + "token_acc": 0.6346888926180754, + "train_speed(iter/s)": 0.134719 + }, + { + "epoch": 0.32874601374055606, + "grad_norm": 1.1429424285888672, + "learning_rate": 8.021672355476801e-05, + "loss": 1.2802536010742187, + "memory(GiB)": 15.74, + "step": 960, + "token_acc": 0.6496992133271634, + "train_speed(iter/s)": 0.134826 + }, + { + "epoch": 0.3304582325621214, + "grad_norm": 1.4166802167892456, + "learning_rate": 7.999066322688742e-05, + "loss": 1.4288368225097656, + "memory(GiB)": 15.74, + "step": 965, + "token_acc": 0.6297489595918915, + "train_speed(iter/s)": 0.134915 + }, + { + "epoch": 0.33217045138368684, + "grad_norm": 1.2539074420928955, + "learning_rate": 7.976364126071092e-05, + "loss": 1.3691092491149903, + "memory(GiB)": 15.74, + "step": 970, + "token_acc": 0.6405427859318749, + "train_speed(iter/s)": 0.135007 + }, + { + "epoch": 0.33388267020525225, + "grad_norm": 1.3559014797210693, + "learning_rate": 7.95356649356046e-05, + "loss": 1.3649935722351074, + "memory(GiB)": 15.74, + "step": 975, + "token_acc": 0.6353207949846605, + "train_speed(iter/s)": 0.135096 + }, + { + "epoch": 0.3355948890268176, + "grad_norm": 1.116381287574768, + "learning_rate": 7.930674156153568e-05, + "loss": 1.3598039627075196, + "memory(GiB)": 15.74, + "step": 980, + "token_acc": 0.6407874670733398, + "train_speed(iter/s)": 0.135224 + }, + { + "epoch": 0.337307107848383, + "grad_norm": 1.3899317979812622, + "learning_rate": 7.90768784788381e-05, + "loss": 1.438613224029541, + "memory(GiB)": 15.74, + "step": 985, + "token_acc": 0.6283283440995746, + "train_speed(iter/s)": 0.135224 + }, + { + "epoch": 0.33901932666994844, + "grad_norm": 1.627461314201355, + "learning_rate": 7.884608305797715e-05, + "loss": 1.4320734024047852, + "memory(GiB)": 15.74, + "step": 990, + "token_acc": 0.627214933044772, + "train_speed(iter/s)": 0.135256 + }, + { + "epoch": 0.3407315454915138, + "grad_norm": 1.1361817121505737, + "learning_rate": 7.861436269931321e-05, + "loss": 1.3207384109497071, + "memory(GiB)": 15.74, + "step": 995, + "token_acc": 0.6480177360459051, + "train_speed(iter/s)": 0.135287 + }, + { + "epoch": 0.3424437643130792, + "grad_norm": 1.3648505210876465, + "learning_rate": 7.838172483286442e-05, + "loss": 1.3524131774902344, + "memory(GiB)": 15.74, + "step": 1000, + "token_acc": 0.6317077148915349, + "train_speed(iter/s)": 0.135398 + }, + { + "epoch": 0.3424437643130792, + "eval_loss": 1.3165655136108398, + "eval_runtime": 45.1825, + "eval_samples_per_second": 10.269, + "eval_steps_per_second": 10.269, + "eval_token_acc": 0.6340154977150805, + "step": 1000 + }, + { + "epoch": 0.3441559831346446, + "grad_norm": 1.2596030235290527, + "learning_rate": 7.814817691806836e-05, + "loss": 1.399399757385254, + "memory(GiB)": 15.74, + "step": 1005, + "token_acc": 0.6334636043355216, + "train_speed(iter/s)": 0.134558 + }, + { + "epoch": 0.34586820195621, + "grad_norm": 1.1375610828399658, + "learning_rate": 7.791372644354296e-05, + "loss": 1.381869125366211, + "memory(GiB)": 15.74, + "step": 1010, + "token_acc": 0.6355127867123649, + "train_speed(iter/s)": 0.134638 + }, + { + "epoch": 0.3475804207777754, + "grad_norm": 1.1278328895568848, + "learning_rate": 7.767838092684638e-05, + "loss": 1.3804694175720216, + "memory(GiB)": 15.74, + "step": 1015, + "token_acc": 0.6283924843423799, + "train_speed(iter/s)": 0.134737 + }, + { + "epoch": 0.3492926395993408, + "grad_norm": 1.0611286163330078, + "learning_rate": 7.744214791423596e-05, + "loss": 1.42586669921875, + "memory(GiB)": 15.74, + "step": 1020, + "token_acc": 0.6249854600442014, + "train_speed(iter/s)": 0.13486 + }, + { + "epoch": 0.3510048584209062, + "grad_norm": 1.2869757413864136, + "learning_rate": 7.720503498042619e-05, + "loss": 1.375808048248291, + "memory(GiB)": 15.74, + "step": 1025, + "token_acc": 0.6329676071055381, + "train_speed(iter/s)": 0.134993 + }, + { + "epoch": 0.3527170772424716, + "grad_norm": 1.2440321445465088, + "learning_rate": 7.696704972834587e-05, + "loss": 1.413187026977539, + "memory(GiB)": 15.74, + "step": 1030, + "token_acc": 0.623336745138178, + "train_speed(iter/s)": 0.135076 + }, + { + "epoch": 0.354429296064037, + "grad_norm": 1.420928716659546, + "learning_rate": 7.672819978889435e-05, + "loss": 1.4541206359863281, + "memory(GiB)": 15.74, + "step": 1035, + "token_acc": 0.6163740920096852, + "train_speed(iter/s)": 0.135174 + }, + { + "epoch": 0.35614151488560236, + "grad_norm": 1.2337260246276855, + "learning_rate": 7.648849282069681e-05, + "loss": 1.4779060363769532, + "memory(GiB)": 15.74, + "step": 1040, + "token_acc": 0.6134244056219624, + "train_speed(iter/s)": 0.135232 + }, + { + "epoch": 0.3578537337071678, + "grad_norm": 1.2223258018493652, + "learning_rate": 7.624793650985873e-05, + "loss": 1.4097661018371581, + "memory(GiB)": 15.74, + "step": 1045, + "token_acc": 0.621295606850335, + "train_speed(iter/s)": 0.135327 + }, + { + "epoch": 0.3595659525287332, + "grad_norm": 1.2556880712509155, + "learning_rate": 7.600653856971937e-05, + "loss": 1.3981780052185058, + "memory(GiB)": 15.74, + "step": 1050, + "token_acc": 0.6346985499872806, + "train_speed(iter/s)": 0.135375 + }, + { + "epoch": 0.3595659525287332, + "eval_loss": 1.315061330795288, + "eval_runtime": 45.231, + "eval_samples_per_second": 10.258, + "eval_steps_per_second": 10.258, + "eval_token_acc": 0.6336843499569508, + "step": 1050 + }, + { + "epoch": 0.36127817135029855, + "grad_norm": 1.3148318529129028, + "learning_rate": 7.576430674060452e-05, + "loss": 1.3192906379699707, + "memory(GiB)": 15.74, + "step": 1055, + "token_acc": 0.6354069357329418, + "train_speed(iter/s)": 0.134563 + }, + { + "epoch": 0.36299039017186396, + "grad_norm": 1.1245882511138916, + "learning_rate": 7.552124878957829e-05, + "loss": 1.3747568130493164, + "memory(GiB)": 15.74, + "step": 1060, + "token_acc": 0.6334835801674179, + "train_speed(iter/s)": 0.134634 + }, + { + "epoch": 0.3647026089934294, + "grad_norm": 1.5016648769378662, + "learning_rate": 7.527737251019399e-05, + "loss": 1.4022554397583007, + "memory(GiB)": 15.74, + "step": 1065, + "token_acc": 0.627781192378611, + "train_speed(iter/s)": 0.134711 + }, + { + "epoch": 0.36641482781499474, + "grad_norm": 1.3347753286361694, + "learning_rate": 7.503268572224435e-05, + "loss": 1.3496832847595215, + "memory(GiB)": 15.74, + "step": 1070, + "token_acc": 0.630678077682686, + "train_speed(iter/s)": 0.134798 + }, + { + "epoch": 0.36812704663656015, + "grad_norm": 1.0668926239013672, + "learning_rate": 7.478719627151072e-05, + "loss": 1.403731632232666, + "memory(GiB)": 15.74, + "step": 1075, + "token_acc": 0.6323407284375371, + "train_speed(iter/s)": 0.134878 + }, + { + "epoch": 0.36983926545812557, + "grad_norm": 1.3083745241165161, + "learning_rate": 7.454091202951148e-05, + "loss": 1.515788745880127, + "memory(GiB)": 15.74, + "step": 1080, + "token_acc": 0.6159872689435671, + "train_speed(iter/s)": 0.134863 + }, + { + "epoch": 0.3715514842796909, + "grad_norm": 1.2629404067993164, + "learning_rate": 7.429384089324967e-05, + "loss": 1.3830952644348145, + "memory(GiB)": 15.74, + "step": 1085, + "token_acc": 0.6243265086206896, + "train_speed(iter/s)": 0.134962 + }, + { + "epoch": 0.37326370310125634, + "grad_norm": 1.1662487983703613, + "learning_rate": 7.404599078495977e-05, + "loss": 1.3385626792907714, + "memory(GiB)": 15.74, + "step": 1090, + "token_acc": 0.6372375036971311, + "train_speed(iter/s)": 0.135136 + }, + { + "epoch": 0.37497592192282175, + "grad_norm": 1.3338191509246826, + "learning_rate": 7.379736965185368e-05, + "loss": 1.446464729309082, + "memory(GiB)": 15.74, + "step": 1095, + "token_acc": 0.6128081457663451, + "train_speed(iter/s)": 0.135224 + }, + { + "epoch": 0.3766881407443871, + "grad_norm": 1.0240142345428467, + "learning_rate": 7.354798546586593e-05, + "loss": 1.3413557052612304, + "memory(GiB)": 15.74, + "step": 1100, + "token_acc": 0.6431327160493827, + "train_speed(iter/s)": 0.135314 + }, + { + "epoch": 0.3766881407443871, + "eval_loss": 1.309690237045288, + "eval_runtime": 45.2399, + "eval_samples_per_second": 10.256, + "eval_steps_per_second": 10.256, + "eval_token_acc": 0.6364880676424487, + "step": 1100 + }, + { + "epoch": 0.3784003595659525, + "grad_norm": 1.2960898876190186, + "learning_rate": 7.329784622339794e-05, + "loss": 1.3781036376953124, + "memory(GiB)": 15.74, + "step": 1105, + "token_acc": 0.6364358533033232, + "train_speed(iter/s)": 0.134517 + }, + { + "epoch": 0.38011257838751794, + "grad_norm": 1.14024019241333, + "learning_rate": 7.30469599450618e-05, + "loss": 1.424424648284912, + "memory(GiB)": 15.74, + "step": 1110, + "token_acc": 0.6228834059022739, + "train_speed(iter/s)": 0.134589 + }, + { + "epoch": 0.3818247972090833, + "grad_norm": 1.504570722579956, + "learning_rate": 7.279533467542294e-05, + "loss": 1.3886597633361817, + "memory(GiB)": 15.74, + "step": 1115, + "token_acc": 0.6313715186547556, + "train_speed(iter/s)": 0.134695 + }, + { + "epoch": 0.3835370160306487, + "grad_norm": 1.110105037689209, + "learning_rate": 7.254297848274228e-05, + "loss": 1.352193832397461, + "memory(GiB)": 15.74, + "step": 1120, + "token_acc": 0.6392083444771329, + "train_speed(iter/s)": 0.134796 + }, + { + "epoch": 0.38524923485221413, + "grad_norm": 1.1653887033462524, + "learning_rate": 7.228989945871745e-05, + "loss": 1.3307332038879394, + "memory(GiB)": 15.74, + "step": 1125, + "token_acc": 0.6407333994053518, + "train_speed(iter/s)": 0.134851 + }, + { + "epoch": 0.3869614536737795, + "grad_norm": 1.3624780178070068, + "learning_rate": 7.20361057182234e-05, + "loss": 1.3443083763122559, + "memory(GiB)": 15.74, + "step": 1130, + "token_acc": 0.6417009602194788, + "train_speed(iter/s)": 0.134877 + }, + { + "epoch": 0.3886736724953449, + "grad_norm": 1.2703291177749634, + "learning_rate": 7.178160539905215e-05, + "loss": 1.4481989860534668, + "memory(GiB)": 15.74, + "step": 1135, + "token_acc": 0.6310376704414143, + "train_speed(iter/s)": 0.134867 + }, + { + "epoch": 0.3903858913169103, + "grad_norm": 1.2308317422866821, + "learning_rate": 7.152640666165187e-05, + "loss": 1.3691473960876466, + "memory(GiB)": 15.74, + "step": 1140, + "token_acc": 0.6368863334807607, + "train_speed(iter/s)": 0.134975 + }, + { + "epoch": 0.3920981101384757, + "grad_norm": 1.4585771560668945, + "learning_rate": 7.127051768886527e-05, + "loss": 1.3355751037597656, + "memory(GiB)": 15.74, + "step": 1145, + "token_acc": 0.6386348791639451, + "train_speed(iter/s)": 0.135109 + }, + { + "epoch": 0.3938103289600411, + "grad_norm": 1.3277227878570557, + "learning_rate": 7.101394668566712e-05, + "loss": 1.3943704605102538, + "memory(GiB)": 15.74, + "step": 1150, + "token_acc": 0.631716303346638, + "train_speed(iter/s)": 0.135206 + }, + { + "epoch": 0.3938103289600411, + "eval_loss": 1.3068937063217163, + "eval_runtime": 45.1677, + "eval_samples_per_second": 10.273, + "eval_steps_per_second": 10.273, + "eval_token_acc": 0.634788175817383, + "step": 1150 + }, + { + "epoch": 0.3955225477816065, + "grad_norm": 1.3035794496536255, + "learning_rate": 7.075670187890122e-05, + "loss": 1.3599701881408692, + "memory(GiB)": 15.74, + "step": 1155, + "token_acc": 0.6343812810497175, + "train_speed(iter/s)": 0.134466 + }, + { + "epoch": 0.39723476660317186, + "grad_norm": 1.557301640510559, + "learning_rate": 7.049879151701666e-05, + "loss": 1.3485316276550292, + "memory(GiB)": 15.74, + "step": 1160, + "token_acc": 0.632473152748838, + "train_speed(iter/s)": 0.134587 + }, + { + "epoch": 0.3989469854247373, + "grad_norm": 1.376206874847412, + "learning_rate": 7.024022386980321e-05, + "loss": 1.3916336059570313, + "memory(GiB)": 15.74, + "step": 1165, + "token_acc": 0.621045197740113, + "train_speed(iter/s)": 0.134688 + }, + { + "epoch": 0.4006592042463027, + "grad_norm": 1.4144821166992188, + "learning_rate": 6.998100722812625e-05, + "loss": 1.4383736610412599, + "memory(GiB)": 15.74, + "step": 1170, + "token_acc": 0.6206663055254604, + "train_speed(iter/s)": 0.134744 + }, + { + "epoch": 0.40237142306786805, + "grad_norm": 1.2632356882095337, + "learning_rate": 6.972114990366093e-05, + "loss": 1.3934082984924316, + "memory(GiB)": 15.74, + "step": 1175, + "token_acc": 0.6260483473112974, + "train_speed(iter/s)": 0.134874 + }, + { + "epoch": 0.40408364188943346, + "grad_norm": 1.4145216941833496, + "learning_rate": 6.946066022862561e-05, + "loss": 1.4188349723815918, + "memory(GiB)": 15.74, + "step": 1180, + "token_acc": 0.624868282402529, + "train_speed(iter/s)": 0.134916 + }, + { + "epoch": 0.4057958607109989, + "grad_norm": 1.4259588718414307, + "learning_rate": 6.919954655551469e-05, + "loss": 1.3875085830688476, + "memory(GiB)": 15.74, + "step": 1185, + "token_acc": 0.634014598540146, + "train_speed(iter/s)": 0.134973 + }, + { + "epoch": 0.40750807953256424, + "grad_norm": 0.9245979189872742, + "learning_rate": 6.893781725683083e-05, + "loss": 1.3513984680175781, + "memory(GiB)": 15.74, + "step": 1190, + "token_acc": 0.6422018348623854, + "train_speed(iter/s)": 0.135071 + }, + { + "epoch": 0.40922029835412965, + "grad_norm": 1.1376724243164062, + "learning_rate": 6.867548072481648e-05, + "loss": 1.3279351234436034, + "memory(GiB)": 15.74, + "step": 1195, + "token_acc": 0.6426051902242378, + "train_speed(iter/s)": 0.135166 + }, + { + "epoch": 0.41093251717569507, + "grad_norm": 1.2558095455169678, + "learning_rate": 6.841254537118477e-05, + "loss": 1.4690144538879395, + "memory(GiB)": 15.74, + "step": 1200, + "token_acc": 0.620484429065744, + "train_speed(iter/s)": 0.135202 + }, + { + "epoch": 0.41093251717569507, + "eval_loss": 1.3049299716949463, + "eval_runtime": 45.3031, + "eval_samples_per_second": 10.242, + "eval_steps_per_second": 10.242, + "eval_token_acc": 0.6345674106452966, + "step": 1200 + }, + { + "epoch": 0.4126447359972604, + "grad_norm": 1.0464534759521484, + "learning_rate": 6.814901962684979e-05, + "loss": 1.3037472724914552, + "memory(GiB)": 15.74, + "step": 1205, + "token_acc": 0.6349928605425987, + "train_speed(iter/s)": 0.134532 + }, + { + "epoch": 0.41435695481882584, + "grad_norm": 1.3323689699172974, + "learning_rate": 6.78849119416563e-05, + "loss": 1.3335732460021972, + "memory(GiB)": 15.74, + "step": 1210, + "token_acc": 0.6469460771786353, + "train_speed(iter/s)": 0.134559 + }, + { + "epoch": 0.41606917364039125, + "grad_norm": 1.1697125434875488, + "learning_rate": 6.762023078410867e-05, + "loss": 1.2790194511413575, + "memory(GiB)": 15.74, + "step": 1215, + "token_acc": 0.658312447786132, + "train_speed(iter/s)": 0.134623 + }, + { + "epoch": 0.4177813924619566, + "grad_norm": 1.3868632316589355, + "learning_rate": 6.735498464109952e-05, + "loss": 1.4868104934692383, + "memory(GiB)": 15.74, + "step": 1220, + "token_acc": 0.6231562252180809, + "train_speed(iter/s)": 0.134668 + }, + { + "epoch": 0.419493611283522, + "grad_norm": 1.3491206169128418, + "learning_rate": 6.708918201763747e-05, + "loss": 1.4445602416992187, + "memory(GiB)": 15.74, + "step": 1225, + "token_acc": 0.6272307500311993, + "train_speed(iter/s)": 0.134727 + }, + { + "epoch": 0.42120583010508744, + "grad_norm": 0.9048284292221069, + "learning_rate": 6.682283143657445e-05, + "loss": 1.3085361480712892, + "memory(GiB)": 15.74, + "step": 1230, + "token_acc": 0.6359479925703672, + "train_speed(iter/s)": 0.134833 + }, + { + "epoch": 0.4229180489266528, + "grad_norm": 1.0411579608917236, + "learning_rate": 6.655594143833236e-05, + "loss": 1.3928863525390625, + "memory(GiB)": 15.74, + "step": 1235, + "token_acc": 0.6353584729981379, + "train_speed(iter/s)": 0.134911 + }, + { + "epoch": 0.4246302677482182, + "grad_norm": 1.4906457662582397, + "learning_rate": 6.628852058062944e-05, + "loss": 1.377931785583496, + "memory(GiB)": 15.74, + "step": 1240, + "token_acc": 0.6396691857978041, + "train_speed(iter/s)": 0.134937 + }, + { + "epoch": 0.42634248656978363, + "grad_norm": 1.1767210960388184, + "learning_rate": 6.602057743820558e-05, + "loss": 1.433674144744873, + "memory(GiB)": 15.74, + "step": 1245, + "token_acc": 0.6263031275060145, + "train_speed(iter/s)": 0.134954 + }, + { + "epoch": 0.428054705391349, + "grad_norm": 1.3877754211425781, + "learning_rate": 6.575212060254758e-05, + "loss": 1.4362284660339355, + "memory(GiB)": 15.74, + "step": 1250, + "token_acc": 0.6198518518518519, + "train_speed(iter/s)": 0.13505 + }, + { + "epoch": 0.428054705391349, + "eval_loss": 1.3038532733917236, + "eval_runtime": 45.8175, + "eval_samples_per_second": 10.127, + "eval_steps_per_second": 10.127, + "eval_token_acc": 0.6357595425745635, + "step": 1250 + }, + { + "epoch": 0.4297669242129144, + "grad_norm": 1.417815089225769, + "learning_rate": 6.54831586816136e-05, + "loss": 1.3865683555603028, + "memory(GiB)": 15.74, + "step": 1255, + "token_acc": 0.6353321281587754, + "train_speed(iter/s)": 0.134384 + }, + { + "epoch": 0.4314791430344798, + "grad_norm": 1.389393925666809, + "learning_rate": 6.521370029955713e-05, + "loss": 1.3478347778320312, + "memory(GiB)": 15.74, + "step": 1260, + "token_acc": 0.635971040264194, + "train_speed(iter/s)": 0.134475 + }, + { + "epoch": 0.4331913618560452, + "grad_norm": 1.1969691514968872, + "learning_rate": 6.494375409645048e-05, + "loss": 1.4288113594055176, + "memory(GiB)": 15.74, + "step": 1265, + "token_acc": 0.6169256453062797, + "train_speed(iter/s)": 0.134557 + }, + { + "epoch": 0.4349035806776106, + "grad_norm": 1.4028680324554443, + "learning_rate": 6.467332872800778e-05, + "loss": 1.3811750411987305, + "memory(GiB)": 15.74, + "step": 1270, + "token_acc": 0.630393227744402, + "train_speed(iter/s)": 0.134623 + }, + { + "epoch": 0.436615799499176, + "grad_norm": 1.312172770500183, + "learning_rate": 6.440243286530738e-05, + "loss": 1.4159652709960937, + "memory(GiB)": 15.74, + "step": 1275, + "token_acc": 0.6232462173314993, + "train_speed(iter/s)": 0.134676 + }, + { + "epoch": 0.43832801832074136, + "grad_norm": 1.3302096128463745, + "learning_rate": 6.413107519451382e-05, + "loss": 1.3935270309448242, + "memory(GiB)": 15.74, + "step": 1280, + "token_acc": 0.6334576451784762, + "train_speed(iter/s)": 0.134742 + }, + { + "epoch": 0.4400402371423068, + "grad_norm": 1.190392255783081, + "learning_rate": 6.385926441659933e-05, + "loss": 1.3308396339416504, + "memory(GiB)": 15.74, + "step": 1285, + "token_acc": 0.6419882865295089, + "train_speed(iter/s)": 0.134827 + }, + { + "epoch": 0.4417524559638722, + "grad_norm": 1.5505092144012451, + "learning_rate": 6.358700924706485e-05, + "loss": 1.3729815483093262, + "memory(GiB)": 15.74, + "step": 1290, + "token_acc": 0.6446088505274695, + "train_speed(iter/s)": 0.134935 + }, + { + "epoch": 0.44346467478543755, + "grad_norm": 1.2974587678909302, + "learning_rate": 6.331431841566056e-05, + "loss": 1.371195411682129, + "memory(GiB)": 15.74, + "step": 1295, + "token_acc": 0.6339886883921357, + "train_speed(iter/s)": 0.135008 + }, + { + "epoch": 0.44517689360700297, + "grad_norm": 1.3717535734176636, + "learning_rate": 6.30412006661059e-05, + "loss": 1.4003836631774902, + "memory(GiB)": 15.74, + "step": 1300, + "token_acc": 0.6253154972236245, + "train_speed(iter/s)": 0.135034 + }, + { + "epoch": 0.44517689360700297, + "eval_loss": 1.2974755764007568, + "eval_runtime": 45.5227, + "eval_samples_per_second": 10.193, + "eval_steps_per_second": 10.193, + "eval_token_acc": 0.6357374660573548, + "step": 1300 + }, + { + "epoch": 0.4468891124285684, + "grad_norm": 1.0276782512664795, + "learning_rate": 6.276766475580935e-05, + "loss": 1.3123481750488282, + "memory(GiB)": 15.74, + "step": 1305, + "token_acc": 0.6379213166497576, + "train_speed(iter/s)": 0.13435 + }, + { + "epoch": 0.4486013312501338, + "grad_norm": 1.4574421644210815, + "learning_rate": 6.249371945558751e-05, + "loss": 1.316396999359131, + "memory(GiB)": 15.74, + "step": 1310, + "token_acc": 0.6433888212207335, + "train_speed(iter/s)": 0.134452 + }, + { + "epoch": 0.45031355007169915, + "grad_norm": 1.3185852766036987, + "learning_rate": 6.221937354938386e-05, + "loss": 1.4050587654113769, + "memory(GiB)": 15.74, + "step": 1315, + "token_acc": 0.6323078971900387, + "train_speed(iter/s)": 0.134505 + }, + { + "epoch": 0.45202576889326457, + "grad_norm": 1.5602445602416992, + "learning_rate": 6.194463583398719e-05, + "loss": 1.4056368827819825, + "memory(GiB)": 15.74, + "step": 1320, + "token_acc": 0.6234857849196539, + "train_speed(iter/s)": 0.134575 + }, + { + "epoch": 0.45373798771483, + "grad_norm": 1.22947096824646, + "learning_rate": 6.166951511874948e-05, + "loss": 1.3305022239685058, + "memory(GiB)": 15.74, + "step": 1325, + "token_acc": 0.6454263565891473, + "train_speed(iter/s)": 0.134685 + }, + { + "epoch": 0.45545020653639534, + "grad_norm": 1.1468359231948853, + "learning_rate": 6.139402022530344e-05, + "loss": 1.3677234649658203, + "memory(GiB)": 15.74, + "step": 1330, + "token_acc": 0.6292073086962495, + "train_speed(iter/s)": 0.134777 + }, + { + "epoch": 0.45716242535796076, + "grad_norm": 1.2402437925338745, + "learning_rate": 6.111815998727966e-05, + "loss": 1.401898193359375, + "memory(GiB)": 15.74, + "step": 1335, + "token_acc": 0.6288182659672941, + "train_speed(iter/s)": 0.134864 + }, + { + "epoch": 0.45887464417952617, + "grad_norm": 1.2008066177368164, + "learning_rate": 6.0841943250023345e-05, + "loss": 1.3440381050109864, + "memory(GiB)": 15.74, + "step": 1340, + "token_acc": 0.6449035812672176, + "train_speed(iter/s)": 0.134928 + }, + { + "epoch": 0.46058686300109153, + "grad_norm": 1.2708277702331543, + "learning_rate": 6.056537887031069e-05, + "loss": 1.3351163864135742, + "memory(GiB)": 15.74, + "step": 1345, + "token_acc": 0.6425106124924197, + "train_speed(iter/s)": 0.135024 + }, + { + "epoch": 0.46229908182265694, + "grad_norm": 1.28602135181427, + "learning_rate": 6.028847571606493e-05, + "loss": 1.370203685760498, + "memory(GiB)": 15.74, + "step": 1350, + "token_acc": 0.6256464861575906, + "train_speed(iter/s)": 0.135124 + }, + { + "epoch": 0.46229908182265694, + "eval_loss": 1.2954517602920532, + "eval_runtime": 44.8392, + "eval_samples_per_second": 10.348, + "eval_steps_per_second": 10.348, + "eval_token_acc": 0.6359582312294413, + "step": 1350 + }, + { + "epoch": 0.46401130064422236, + "grad_norm": 1.0725177526474, + "learning_rate": 6.001124266607194e-05, + "loss": 1.4051658630371093, + "memory(GiB)": 15.74, + "step": 1355, + "token_acc": 0.635363480472438, + "train_speed(iter/s)": 0.134486 + }, + { + "epoch": 0.4657235194657877, + "grad_norm": 1.24540376663208, + "learning_rate": 5.973368860969559e-05, + "loss": 1.4290529251098634, + "memory(GiB)": 15.74, + "step": 1360, + "token_acc": 0.6273199703043801, + "train_speed(iter/s)": 0.134562 + }, + { + "epoch": 0.46743573828735313, + "grad_norm": 1.3075815439224243, + "learning_rate": 5.945582244659267e-05, + "loss": 1.4078018188476562, + "memory(GiB)": 15.74, + "step": 1365, + "token_acc": 0.637137069547603, + "train_speed(iter/s)": 0.134621 + }, + { + "epoch": 0.46914795710891855, + "grad_norm": 1.583464503288269, + "learning_rate": 5.917765308642754e-05, + "loss": 1.4400203704833985, + "memory(GiB)": 15.74, + "step": 1370, + "token_acc": 0.6118116520351157, + "train_speed(iter/s)": 0.134706 + }, + { + "epoch": 0.4708601759304839, + "grad_norm": 1.1160328388214111, + "learning_rate": 5.889918944858647e-05, + "loss": 1.4093569755554198, + "memory(GiB)": 15.74, + "step": 1375, + "token_acc": 0.6344725111441307, + "train_speed(iter/s)": 0.134749 + }, + { + "epoch": 0.4725723947520493, + "grad_norm": 1.3302032947540283, + "learning_rate": 5.8620440461891614e-05, + "loss": 1.2986675262451173, + "memory(GiB)": 15.74, + "step": 1380, + "token_acc": 0.650447497794025, + "train_speed(iter/s)": 0.134804 + }, + { + "epoch": 0.47428461357361473, + "grad_norm": 1.6198601722717285, + "learning_rate": 5.8341415064314695e-05, + "loss": 1.438179111480713, + "memory(GiB)": 15.74, + "step": 1385, + "token_acc": 0.6235736153632062, + "train_speed(iter/s)": 0.134862 + }, + { + "epoch": 0.4759968323951801, + "grad_norm": 1.1237093210220337, + "learning_rate": 5.806212220269049e-05, + "loss": 1.3448370933532714, + "memory(GiB)": 15.74, + "step": 1390, + "token_acc": 0.6310979534045565, + "train_speed(iter/s)": 0.134965 + }, + { + "epoch": 0.4777090512167455, + "grad_norm": 1.26457941532135, + "learning_rate": 5.778257083242986e-05, + "loss": 1.377913475036621, + "memory(GiB)": 15.74, + "step": 1395, + "token_acc": 0.6350752878653676, + "train_speed(iter/s)": 0.135002 + }, + { + "epoch": 0.4794212700383109, + "grad_norm": 1.3195010423660278, + "learning_rate": 5.7502769917232635e-05, + "loss": 1.3328777313232423, + "memory(GiB)": 15.74, + "step": 1400, + "token_acc": 0.6353383458646616, + "train_speed(iter/s)": 0.135112 + }, + { + "epoch": 0.4794212700383109, + "eval_loss": 1.2928467988967896, + "eval_runtime": 44.9623, + "eval_samples_per_second": 10.32, + "eval_steps_per_second": 10.32, + "eval_token_acc": 0.6374373578824205, + "step": 1400 + }, + { + "epoch": 0.4811334888598763, + "grad_norm": 1.3384417295455933, + "learning_rate": 5.722272842880023e-05, + "loss": 1.4393559455871583, + "memory(GiB)": 15.74, + "step": 1405, + "token_acc": 0.6358039938439323, + "train_speed(iter/s)": 0.134477 + }, + { + "epoch": 0.4828457076814417, + "grad_norm": 1.435585856437683, + "learning_rate": 5.6942455346547954e-05, + "loss": 1.2933756828308105, + "memory(GiB)": 15.74, + "step": 1410, + "token_acc": 0.6496201052016365, + "train_speed(iter/s)": 0.134578 + }, + { + "epoch": 0.4845579265030071, + "grad_norm": 1.1114050149917603, + "learning_rate": 5.6661959657316996e-05, + "loss": 1.3269688606262207, + "memory(GiB)": 15.74, + "step": 1415, + "token_acc": 0.6503867913635839, + "train_speed(iter/s)": 0.134634 + }, + { + "epoch": 0.48627014532457247, + "grad_norm": 1.4302539825439453, + "learning_rate": 5.638125035508642e-05, + "loss": 1.434300422668457, + "memory(GiB)": 15.74, + "step": 1420, + "token_acc": 0.6298221614227086, + "train_speed(iter/s)": 0.134678 + }, + { + "epoch": 0.4879823641461379, + "grad_norm": 0.9589481353759766, + "learning_rate": 5.610033644068471e-05, + "loss": 1.3287978172302246, + "memory(GiB)": 15.74, + "step": 1425, + "token_acc": 0.6469009272816008, + "train_speed(iter/s)": 0.134788 + }, + { + "epoch": 0.4896945829677033, + "grad_norm": 1.348445177078247, + "learning_rate": 5.58192269215011e-05, + "loss": 1.3722798347473144, + "memory(GiB)": 15.74, + "step": 1430, + "token_acc": 0.6363883600761491, + "train_speed(iter/s)": 0.134843 + }, + { + "epoch": 0.49140680178926865, + "grad_norm": 1.206146240234375, + "learning_rate": 5.5537930811196844e-05, + "loss": 1.3646371841430665, + "memory(GiB)": 15.74, + "step": 1435, + "token_acc": 0.6292120597922473, + "train_speed(iter/s)": 0.134927 + }, + { + "epoch": 0.49311902061083407, + "grad_norm": 1.356586217880249, + "learning_rate": 5.525645712941618e-05, + "loss": 1.3678319931030274, + "memory(GiB)": 15.74, + "step": 1440, + "token_acc": 0.6278630757613893, + "train_speed(iter/s)": 0.135032 + }, + { + "epoch": 0.4948312394323995, + "grad_norm": 1.2946938276290894, + "learning_rate": 5.497481490149705e-05, + "loss": 1.3993468284606934, + "memory(GiB)": 15.74, + "step": 1445, + "token_acc": 0.622295910311069, + "train_speed(iter/s)": 0.135135 + }, + { + "epoch": 0.49654345825396484, + "grad_norm": 1.357661247253418, + "learning_rate": 5.4693013158181826e-05, + "loss": 1.4459356307983398, + "memory(GiB)": 15.74, + "step": 1450, + "token_acc": 0.6195, + "train_speed(iter/s)": 0.135207 + }, + { + "epoch": 0.49654345825396484, + "eval_loss": 1.2901684045791626, + "eval_runtime": 45.1728, + "eval_samples_per_second": 10.272, + "eval_steps_per_second": 10.272, + "eval_token_acc": 0.6373490518135859, + "step": 1450 + }, + { + "epoch": 0.49825567707553026, + "grad_norm": 1.1364332437515259, + "learning_rate": 5.4411060935327616e-05, + "loss": 1.441689395904541, + "memory(GiB)": 15.74, + "step": 1455, + "token_acc": 0.6348721367331445, + "train_speed(iter/s)": 0.134552 + }, + { + "epoch": 0.49996789589709567, + "grad_norm": 0.9982820153236389, + "learning_rate": 5.4128967273616625e-05, + "loss": 1.3047595024108887, + "memory(GiB)": 15.74, + "step": 1460, + "token_acc": 0.6463554255183819, + "train_speed(iter/s)": 0.134643 + }, + { + "epoch": 0.501680114718661, + "grad_norm": 1.2560155391693115, + "learning_rate": 5.3846741218266214e-05, + "loss": 1.373036003112793, + "memory(GiB)": 15.74, + "step": 1465, + "token_acc": 0.6327571197612237, + "train_speed(iter/s)": 0.134708 + }, + { + "epoch": 0.5033923335402264, + "grad_norm": 1.2929894924163818, + "learning_rate": 5.3564391818738944e-05, + "loss": 1.3368280410766602, + "memory(GiB)": 15.74, + "step": 1470, + "token_acc": 0.6465181058495821, + "train_speed(iter/s)": 0.134782 + }, + { + "epoch": 0.5051045523617919, + "grad_norm": 1.3594943284988403, + "learning_rate": 5.3281928128452274e-05, + "loss": 1.4257681846618653, + "memory(GiB)": 15.74, + "step": 1475, + "token_acc": 0.6303446303446304, + "train_speed(iter/s)": 0.134856 + }, + { + "epoch": 0.5068167711833572, + "grad_norm": 1.0604503154754639, + "learning_rate": 5.299935920448843e-05, + "loss": 1.3852405548095703, + "memory(GiB)": 15.74, + "step": 1480, + "token_acc": 0.6341349521363531, + "train_speed(iter/s)": 0.13488 + }, + { + "epoch": 0.5085289900049226, + "grad_norm": 1.3724855184555054, + "learning_rate": 5.271669410730384e-05, + "loss": 1.448958683013916, + "memory(GiB)": 15.74, + "step": 1485, + "token_acc": 0.6159379407616361, + "train_speed(iter/s)": 0.134957 + }, + { + "epoch": 0.510241208826488, + "grad_norm": 1.1618252992630005, + "learning_rate": 5.2433941900438766e-05, + "loss": 1.3326935768127441, + "memory(GiB)": 15.74, + "step": 1490, + "token_acc": 0.638154061298217, + "train_speed(iter/s)": 0.13502 + }, + { + "epoch": 0.5119534276480534, + "grad_norm": 1.050891637802124, + "learning_rate": 5.215111165022652e-05, + "loss": 1.2615792274475097, + "memory(GiB)": 15.74, + "step": 1495, + "token_acc": 0.6487787269883287, + "train_speed(iter/s)": 0.135086 + }, + { + "epoch": 0.5136656464696188, + "grad_norm": 1.589298129081726, + "learning_rate": 5.186821242550294e-05, + "loss": 1.3608013153076173, + "memory(GiB)": 15.74, + "step": 1500, + "token_acc": 0.6363052208835341, + "train_speed(iter/s)": 0.135154 + }, + { + "epoch": 0.5136656464696188, + "eval_loss": 1.288938283920288, + "eval_runtime": 44.8863, + "eval_samples_per_second": 10.337, + "eval_steps_per_second": 10.337, + "eval_token_acc": 0.6378126586749674, + "step": 1500 + }, + { + "epoch": 0.5153778652911842, + "grad_norm": 1.0448987483978271, + "learning_rate": 5.158525329731538e-05, + "loss": 1.3373411178588868, + "memory(GiB)": 15.74, + "step": 1505, + "token_acc": 0.6383010712272129, + "train_speed(iter/s)": 0.134614 + }, + { + "epoch": 0.5170900841127496, + "grad_norm": 1.260827898979187, + "learning_rate": 5.130224333863212e-05, + "loss": 1.4426488876342773, + "memory(GiB)": 15.74, + "step": 1510, + "token_acc": 0.625866050808314, + "train_speed(iter/s)": 0.134656 + }, + { + "epoch": 0.518802302934315, + "grad_norm": 1.3069380521774292, + "learning_rate": 5.1019191624051154e-05, + "loss": 1.3473889350891113, + "memory(GiB)": 15.74, + "step": 1515, + "token_acc": 0.64722323490096, + "train_speed(iter/s)": 0.134713 + }, + { + "epoch": 0.5205145217558804, + "grad_norm": 1.2601243257522583, + "learning_rate": 5.073610722950947e-05, + "loss": 1.4510963439941407, + "memory(GiB)": 15.74, + "step": 1520, + "token_acc": 0.6192196531791907, + "train_speed(iter/s)": 0.134729 + }, + { + "epoch": 0.5222267405774458, + "grad_norm": 1.2570956945419312, + "learning_rate": 5.045299923199186e-05, + "loss": 1.427687168121338, + "memory(GiB)": 15.74, + "step": 1525, + "token_acc": 0.622303166152984, + "train_speed(iter/s)": 0.134779 + }, + { + "epoch": 0.5239389593990111, + "grad_norm": 1.2593395709991455, + "learning_rate": 5.016987670923997e-05, + "loss": 1.4130517959594726, + "memory(GiB)": 15.74, + "step": 1530, + "token_acc": 0.6258360655737705, + "train_speed(iter/s)": 0.13485 + }, + { + "epoch": 0.5256511782205766, + "grad_norm": 1.3395150899887085, + "learning_rate": 4.988674873946118e-05, + "loss": 1.3743631362915039, + "memory(GiB)": 15.74, + "step": 1535, + "token_acc": 0.6377041382590772, + "train_speed(iter/s)": 0.13494 + }, + { + "epoch": 0.527363397042142, + "grad_norm": 2.2758524417877197, + "learning_rate": 4.960362440103756e-05, + "loss": 1.269289493560791, + "memory(GiB)": 15.74, + "step": 1540, + "token_acc": 0.6483722090557565, + "train_speed(iter/s)": 0.135018 + }, + { + "epoch": 0.5290756158637073, + "grad_norm": 0.9651917219161987, + "learning_rate": 4.932051277223468e-05, + "loss": 1.3328426361083985, + "memory(GiB)": 15.74, + "step": 1545, + "token_acc": 0.6461893138052617, + "train_speed(iter/s)": 0.135097 + }, + { + "epoch": 0.5307878346852728, + "grad_norm": 1.3631515502929688, + "learning_rate": 4.9037422930910615e-05, + "loss": 1.3980073928833008, + "memory(GiB)": 15.74, + "step": 1550, + "token_acc": 0.6436149312377211, + "train_speed(iter/s)": 0.13514 + }, + { + "epoch": 0.5307878346852728, + "eval_loss": 1.2880152463912964, + "eval_runtime": 44.9207, + "eval_samples_per_second": 10.329, + "eval_steps_per_second": 10.329, + "eval_token_acc": 0.6381658829503057, + "step": 1550 + }, + { + "epoch": 0.5325000535068382, + "grad_norm": 1.4552366733551025, + "learning_rate": 4.875436395422481e-05, + "loss": 1.4057562828063965, + "memory(GiB)": 15.74, + "step": 1555, + "token_acc": 0.6365109574327762, + "train_speed(iter/s)": 0.134572 + }, + { + "epoch": 0.5342122723284035, + "grad_norm": 1.3907065391540527, + "learning_rate": 4.847134491834713e-05, + "loss": 1.3479011535644532, + "memory(GiB)": 15.74, + "step": 1560, + "token_acc": 0.6325500937004469, + "train_speed(iter/s)": 0.134683 + }, + { + "epoch": 0.535924491149969, + "grad_norm": 1.2653462886810303, + "learning_rate": 4.8188374898166635e-05, + "loss": 1.413265323638916, + "memory(GiB)": 15.74, + "step": 1565, + "token_acc": 0.6244780793319415, + "train_speed(iter/s)": 0.134734 + }, + { + "epoch": 0.5376367099715343, + "grad_norm": 1.1082566976547241, + "learning_rate": 4.790546296700081e-05, + "loss": 1.3549816131591796, + "memory(GiB)": 15.74, + "step": 1570, + "token_acc": 0.629037941971103, + "train_speed(iter/s)": 0.134812 + }, + { + "epoch": 0.5393489287930998, + "grad_norm": 1.0759904384613037, + "learning_rate": 4.762261819630447e-05, + "loss": 1.3705751419067382, + "memory(GiB)": 15.74, + "step": 1575, + "token_acc": 0.6443962922836163, + "train_speed(iter/s)": 0.134819 + }, + { + "epoch": 0.5410611476146652, + "grad_norm": 1.3438408374786377, + "learning_rate": 4.733984965537903e-05, + "loss": 1.3871797561645507, + "memory(GiB)": 15.74, + "step": 1580, + "token_acc": 0.6392115911760867, + "train_speed(iter/s)": 0.13489 + }, + { + "epoch": 0.5427733664362305, + "grad_norm": 1.0663775205612183, + "learning_rate": 4.705716641108157e-05, + "loss": 1.353131103515625, + "memory(GiB)": 15.74, + "step": 1585, + "token_acc": 0.6312800349497597, + "train_speed(iter/s)": 0.134968 + }, + { + "epoch": 0.544485585257796, + "grad_norm": 1.3968427181243896, + "learning_rate": 4.6774577527534195e-05, + "loss": 1.5423471450805664, + "memory(GiB)": 15.74, + "step": 1590, + "token_acc": 0.6110971943887775, + "train_speed(iter/s)": 0.134981 + }, + { + "epoch": 0.5461978040793614, + "grad_norm": 1.1753484010696411, + "learning_rate": 4.6492092065833345e-05, + "loss": 1.3162307739257812, + "memory(GiB)": 15.74, + "step": 1595, + "token_acc": 0.6369168356997972, + "train_speed(iter/s)": 0.135042 + }, + { + "epoch": 0.5479100229009267, + "grad_norm": 1.0700794458389282, + "learning_rate": 4.620971908375934e-05, + "loss": 1.3670080184936524, + "memory(GiB)": 15.74, + "step": 1600, + "token_acc": 0.6353012048192771, + "train_speed(iter/s)": 0.135083 + }, + { + "epoch": 0.5479100229009267, + "eval_loss": 1.2859621047973633, + "eval_runtime": 44.7365, + "eval_samples_per_second": 10.372, + "eval_steps_per_second": 10.372, + "eval_token_acc": 0.6372607457447513, + "step": 1600 + }, + { + "epoch": 0.5496222417224922, + "grad_norm": 1.36152982711792, + "learning_rate": 4.592746763548582e-05, + "loss": 1.4221895217895508, + "memory(GiB)": 15.74, + "step": 1605, + "token_acc": 0.6360437989163155, + "train_speed(iter/s)": 0.134558 + }, + { + "epoch": 0.5513344605440575, + "grad_norm": 1.155228853225708, + "learning_rate": 4.564534677128954e-05, + "loss": 1.3129186630249023, + "memory(GiB)": 15.74, + "step": 1610, + "token_acc": 0.6391490712811557, + "train_speed(iter/s)": 0.134651 + }, + { + "epoch": 0.5530466793656229, + "grad_norm": 1.2682766914367676, + "learning_rate": 4.536336553726008e-05, + "loss": 1.379691219329834, + "memory(GiB)": 15.74, + "step": 1615, + "token_acc": 0.6256661385568199, + "train_speed(iter/s)": 0.134703 + }, + { + "epoch": 0.5547588981871884, + "grad_norm": 1.4693753719329834, + "learning_rate": 4.508153297500992e-05, + "loss": 1.3871756553649903, + "memory(GiB)": 15.74, + "step": 1620, + "token_acc": 0.6318840579710145, + "train_speed(iter/s)": 0.134737 + }, + { + "epoch": 0.5564711170087537, + "grad_norm": 1.3396706581115723, + "learning_rate": 4.4799858121384344e-05, + "loss": 1.3503762245178224, + "memory(GiB)": 15.74, + "step": 1625, + "token_acc": 0.6382567306262086, + "train_speed(iter/s)": 0.134805 + }, + { + "epoch": 0.5581833358303191, + "grad_norm": 1.4248732328414917, + "learning_rate": 4.451835000817185e-05, + "loss": 1.4662213325500488, + "memory(GiB)": 15.74, + "step": 1630, + "token_acc": 0.6264982600850625, + "train_speed(iter/s)": 0.134807 + }, + { + "epoch": 0.5598955546518846, + "grad_norm": 1.3699737787246704, + "learning_rate": 4.42370176618144e-05, + "loss": 1.3409448623657227, + "memory(GiB)": 15.74, + "step": 1635, + "token_acc": 0.6422917451941199, + "train_speed(iter/s)": 0.134831 + }, + { + "epoch": 0.5616077734734499, + "grad_norm": 1.2738423347473145, + "learning_rate": 4.3955870103118145e-05, + "loss": 1.335362148284912, + "memory(GiB)": 15.74, + "step": 1640, + "token_acc": 0.6459505061867267, + "train_speed(iter/s)": 0.134907 + }, + { + "epoch": 0.5633199922950153, + "grad_norm": 1.4507850408554077, + "learning_rate": 4.367491634696405e-05, + "loss": 1.4555521965026856, + "memory(GiB)": 15.74, + "step": 1645, + "token_acc": 0.6190127970749543, + "train_speed(iter/s)": 0.134952 + }, + { + "epoch": 0.5650322111165808, + "grad_norm": 1.4329421520233154, + "learning_rate": 4.3394165402018875e-05, + "loss": 1.3959566116333009, + "memory(GiB)": 15.74, + "step": 1650, + "token_acc": 0.6274984857662023, + "train_speed(iter/s)": 0.135005 + }, + { + "epoch": 0.5650322111165808, + "eval_loss": 1.281676173210144, + "eval_runtime": 45.0822, + "eval_samples_per_second": 10.292, + "eval_steps_per_second": 10.292, + "eval_token_acc": 0.6389827140870257, + "step": 1650 + }, + { + "epoch": 0.5667444299381461, + "grad_norm": 1.3453797101974487, + "learning_rate": 4.311362627044633e-05, + "loss": 1.3569275856018066, + "memory(GiB)": 15.74, + "step": 1655, + "token_acc": 0.6398194434081922, + "train_speed(iter/s)": 0.134451 + }, + { + "epoch": 0.5684566487597115, + "grad_norm": 1.3169301748275757, + "learning_rate": 4.283330794761845e-05, + "loss": 1.4494799613952636, + "memory(GiB)": 15.74, + "step": 1660, + "token_acc": 0.6154090548054011, + "train_speed(iter/s)": 0.134498 + }, + { + "epoch": 0.5701688675812769, + "grad_norm": 1.319637417793274, + "learning_rate": 4.2553219421827066e-05, + "loss": 1.405910587310791, + "memory(GiB)": 15.74, + "step": 1665, + "token_acc": 0.6215663354763297, + "train_speed(iter/s)": 0.134562 + }, + { + "epoch": 0.5718810864028423, + "grad_norm": 1.120036244392395, + "learning_rate": 4.227336967399572e-05, + "loss": 1.329239845275879, + "memory(GiB)": 15.74, + "step": 1670, + "token_acc": 0.6467726625533935, + "train_speed(iter/s)": 0.134602 + }, + { + "epoch": 0.5735933052244077, + "grad_norm": 1.237338900566101, + "learning_rate": 4.199376767739158e-05, + "loss": 1.2887453079223632, + "memory(GiB)": 15.74, + "step": 1675, + "token_acc": 0.6471086036671369, + "train_speed(iter/s)": 0.134703 + }, + { + "epoch": 0.5753055240459731, + "grad_norm": 1.3308744430541992, + "learning_rate": 4.171442239733783e-05, + "loss": 1.342413330078125, + "memory(GiB)": 15.74, + "step": 1680, + "token_acc": 0.6384921952839588, + "train_speed(iter/s)": 0.134751 + }, + { + "epoch": 0.5770177428675385, + "grad_norm": 1.1540745496749878, + "learning_rate": 4.143534279092612e-05, + "loss": 1.3563608169555663, + "memory(GiB)": 15.74, + "step": 1685, + "token_acc": 0.6335287846481876, + "train_speed(iter/s)": 0.134841 + }, + { + "epoch": 0.5787299616891038, + "grad_norm": 1.4379019737243652, + "learning_rate": 4.1156537806729364e-05, + "loss": 1.3419925689697265, + "memory(GiB)": 15.74, + "step": 1690, + "token_acc": 0.6443872176050568, + "train_speed(iter/s)": 0.134881 + }, + { + "epoch": 0.5804421805106693, + "grad_norm": 1.5007866621017456, + "learning_rate": 4.087801638451484e-05, + "loss": 1.360206127166748, + "memory(GiB)": 15.74, + "step": 1695, + "token_acc": 0.6360088461038116, + "train_speed(iter/s)": 0.134949 + }, + { + "epoch": 0.5821543993322347, + "grad_norm": 1.3432769775390625, + "learning_rate": 4.059978745495757e-05, + "loss": 1.2789844512939452, + "memory(GiB)": 15.74, + "step": 1700, + "token_acc": 0.65403668496452, + "train_speed(iter/s)": 0.135022 + }, + { + "epoch": 0.5821543993322347, + "eval_loss": 1.2787680625915527, + "eval_runtime": 45.2625, + "eval_samples_per_second": 10.251, + "eval_steps_per_second": 10.251, + "eval_token_acc": 0.6394463209484071, + "step": 1700 + }, + { + "epoch": 0.5838666181538, + "grad_norm": 1.3832672834396362, + "learning_rate": 4.0321859939353845e-05, + "loss": 1.4007149696350099, + "memory(GiB)": 15.74, + "step": 1705, + "token_acc": 0.6372449554728892, + "train_speed(iter/s)": 0.134502 + }, + { + "epoch": 0.5855788369753655, + "grad_norm": 1.1818128824234009, + "learning_rate": 4.004424274933528e-05, + "loss": 1.404418659210205, + "memory(GiB)": 15.74, + "step": 1710, + "token_acc": 0.6255577590480912, + "train_speed(iter/s)": 0.134568 + }, + { + "epoch": 0.5872910557969309, + "grad_norm": 1.4451345205307007, + "learning_rate": 3.976694478658302e-05, + "loss": 1.4602587699890137, + "memory(GiB)": 15.74, + "step": 1715, + "token_acc": 0.6158373461744249, + "train_speed(iter/s)": 0.134602 + }, + { + "epoch": 0.5890032746184962, + "grad_norm": 1.1970713138580322, + "learning_rate": 3.9489974942542355e-05, + "loss": 1.2911714553833007, + "memory(GiB)": 15.74, + "step": 1720, + "token_acc": 0.6592741935483871, + "train_speed(iter/s)": 0.134648 + }, + { + "epoch": 0.5907154934400617, + "grad_norm": 1.4316002130508423, + "learning_rate": 3.9213342098137516e-05, + "loss": 1.350833797454834, + "memory(GiB)": 15.74, + "step": 1725, + "token_acc": 0.6399550119499507, + "train_speed(iter/s)": 0.134711 + }, + { + "epoch": 0.592427712261627, + "grad_norm": 1.465791940689087, + "learning_rate": 3.893705512348705e-05, + "loss": 1.3778098106384278, + "memory(GiB)": 15.74, + "step": 1730, + "token_acc": 0.6389973548305832, + "train_speed(iter/s)": 0.134741 + }, + { + "epoch": 0.5941399310831924, + "grad_norm": 1.1226149797439575, + "learning_rate": 3.8661122877619255e-05, + "loss": 1.3883560180664063, + "memory(GiB)": 15.74, + "step": 1735, + "token_acc": 0.6357795898196195, + "train_speed(iter/s)": 0.134771 + }, + { + "epoch": 0.5958521499047579, + "grad_norm": 1.1363505125045776, + "learning_rate": 3.838555420818827e-05, + "loss": 1.3914413452148438, + "memory(GiB)": 15.74, + "step": 1740, + "token_acc": 0.633240257375258, + "train_speed(iter/s)": 0.134813 + }, + { + "epoch": 0.5975643687263232, + "grad_norm": 1.3584070205688477, + "learning_rate": 3.811035795119029e-05, + "loss": 1.363978099822998, + "memory(GiB)": 15.74, + "step": 1745, + "token_acc": 0.6440445116087374, + "train_speed(iter/s)": 0.134826 + }, + { + "epoch": 0.5992765875478886, + "grad_norm": 1.3812576532363892, + "learning_rate": 3.78355429306802e-05, + "loss": 1.4450836181640625, + "memory(GiB)": 15.74, + "step": 1750, + "token_acc": 0.620409741012756, + "train_speed(iter/s)": 0.134895 + }, + { + "epoch": 0.5992765875478886, + "eval_loss": 1.277072548866272, + "eval_runtime": 45.0007, + "eval_samples_per_second": 10.311, + "eval_steps_per_second": 10.311, + "eval_token_acc": 0.6405501468088395, + "step": 1750 + }, + { + "epoch": 0.6009888063694541, + "grad_norm": 1.3305871486663818, + "learning_rate": 3.756111795848874e-05, + "loss": 1.1959224700927735, + "memory(GiB)": 15.74, + "step": 1755, + "token_acc": 0.6452136494035935, + "train_speed(iter/s)": 0.134401 + }, + { + "epoch": 0.6027010251910194, + "grad_norm": 1.4807337522506714, + "learning_rate": 3.7287091833939945e-05, + "loss": 1.4347740173339845, + "memory(GiB)": 15.74, + "step": 1760, + "token_acc": 0.6259541984732825, + "train_speed(iter/s)": 0.134428 + }, + { + "epoch": 0.6044132440125848, + "grad_norm": 1.2514686584472656, + "learning_rate": 3.7013473343568896e-05, + "loss": 1.3172637939453125, + "memory(GiB)": 15.74, + "step": 1765, + "token_acc": 0.6499353169469599, + "train_speed(iter/s)": 0.134479 + }, + { + "epoch": 0.6061254628341503, + "grad_norm": 1.205390453338623, + "learning_rate": 3.6740271260840095e-05, + "loss": 1.3148152351379394, + "memory(GiB)": 15.74, + "step": 1770, + "token_acc": 0.6517599899787048, + "train_speed(iter/s)": 0.134529 + }, + { + "epoch": 0.6078376816557156, + "grad_norm": 1.2843999862670898, + "learning_rate": 3.646749434586606e-05, + "loss": 1.2671688079833985, + "memory(GiB)": 15.74, + "step": 1775, + "token_acc": 0.6564948453608247, + "train_speed(iter/s)": 0.134599 + }, + { + "epoch": 0.609549900477281, + "grad_norm": 1.2573988437652588, + "learning_rate": 3.619515134512656e-05, + "loss": 1.389388656616211, + "memory(GiB)": 15.74, + "step": 1780, + "token_acc": 0.6315289648622981, + "train_speed(iter/s)": 0.134618 + }, + { + "epoch": 0.6112621192988464, + "grad_norm": 1.2170222997665405, + "learning_rate": 3.5923250991188e-05, + "loss": 1.3666535377502442, + "memory(GiB)": 15.74, + "step": 1785, + "token_acc": 0.6323827544262738, + "train_speed(iter/s)": 0.134643 + }, + { + "epoch": 0.6129743381204118, + "grad_norm": 1.1232810020446777, + "learning_rate": 3.565180200242354e-05, + "loss": 1.3694002151489257, + "memory(GiB)": 15.74, + "step": 1790, + "token_acc": 0.6320690127344927, + "train_speed(iter/s)": 0.134704 + }, + { + "epoch": 0.6146865569419772, + "grad_norm": 0.9963919520378113, + "learning_rate": 3.538081308273346e-05, + "loss": 1.3329906463623047, + "memory(GiB)": 15.74, + "step": 1795, + "token_acc": 0.645523397349202, + "train_speed(iter/s)": 0.134782 + }, + { + "epoch": 0.6163987757635426, + "grad_norm": 1.07882559299469, + "learning_rate": 3.5110292921266196e-05, + "loss": 1.3543855667114257, + "memory(GiB)": 15.74, + "step": 1800, + "token_acc": 0.6307253341342544, + "train_speed(iter/s)": 0.134862 + }, + { + "epoch": 0.6163987757635426, + "eval_loss": 1.2761130332946777, + "eval_runtime": 45.0065, + "eval_samples_per_second": 10.31, + "eval_steps_per_second": 10.31, + "eval_token_acc": 0.6402410755679184, + "step": 1800 + }, + { + "epoch": 0.618110994585108, + "grad_norm": 1.1731452941894531, + "learning_rate": 3.4840250192139574e-05, + "loss": 1.3603300094604491, + "memory(GiB)": 15.74, + "step": 1805, + "token_acc": 0.639933481991005, + "train_speed(iter/s)": 0.134375 + }, + { + "epoch": 0.6198232134066733, + "grad_norm": 1.1135847568511963, + "learning_rate": 3.4570693554162746e-05, + "loss": 1.4044634819030761, + "memory(GiB)": 15.74, + "step": 1810, + "token_acc": 0.6182817869415808, + "train_speed(iter/s)": 0.134461 + }, + { + "epoch": 0.6215354322282388, + "grad_norm": 1.272207260131836, + "learning_rate": 3.430163165055859e-05, + "loss": 1.335850143432617, + "memory(GiB)": 15.74, + "step": 1815, + "token_acc": 0.6416519363650972, + "train_speed(iter/s)": 0.134524 + }, + { + "epoch": 0.6232476510498042, + "grad_norm": 1.4287700653076172, + "learning_rate": 3.4033073108686515e-05, + "loss": 1.3434399604797362, + "memory(GiB)": 15.74, + "step": 1820, + "token_acc": 0.6466976186910289, + "train_speed(iter/s)": 0.134577 + }, + { + "epoch": 0.6249598698713695, + "grad_norm": 0.9231454133987427, + "learning_rate": 3.3765026539765834e-05, + "loss": 1.263858699798584, + "memory(GiB)": 15.74, + "step": 1825, + "token_acc": 0.6619002599858189, + "train_speed(iter/s)": 0.134616 + }, + { + "epoch": 0.626672088692935, + "grad_norm": 1.1264557838439941, + "learning_rate": 3.3497500538599665e-05, + "loss": 1.3954036712646485, + "memory(GiB)": 15.74, + "step": 1830, + "token_acc": 0.633007460766658, + "train_speed(iter/s)": 0.134686 + }, + { + "epoch": 0.6283843075145004, + "grad_norm": 1.4104242324829102, + "learning_rate": 3.323050368329931e-05, + "loss": 1.3609606742858886, + "memory(GiB)": 15.74, + "step": 1835, + "token_acc": 0.6343283582089553, + "train_speed(iter/s)": 0.134729 + }, + { + "epoch": 0.6300965263360657, + "grad_norm": 1.1132456064224243, + "learning_rate": 3.2964044535009284e-05, + "loss": 1.3382993698120118, + "memory(GiB)": 15.74, + "step": 1840, + "token_acc": 0.6495195687836888, + "train_speed(iter/s)": 0.134735 + }, + { + "epoch": 0.6318087451576312, + "grad_norm": 1.1795408725738525, + "learning_rate": 3.269813163763271e-05, + "loss": 1.342089557647705, + "memory(GiB)": 15.74, + "step": 1845, + "token_acc": 0.6300553426990209, + "train_speed(iter/s)": 0.134818 + }, + { + "epoch": 0.6335209639791965, + "grad_norm": 1.294784426689148, + "learning_rate": 3.243277351755738e-05, + "loss": 1.3871091842651366, + "memory(GiB)": 15.74, + "step": 1850, + "token_acc": 0.6299535392183657, + "train_speed(iter/s)": 0.134865 + }, + { + "epoch": 0.6335209639791965, + "eval_loss": 1.2749871015548706, + "eval_runtime": 44.8152, + "eval_samples_per_second": 10.354, + "eval_steps_per_second": 10.354, + "eval_token_acc": 0.6404839172572135, + "step": 1850 + }, + { + "epoch": 0.6352331828007619, + "grad_norm": 1.361894130706787, + "learning_rate": 3.216797868338241e-05, + "loss": 1.3356594085693358, + "memory(GiB)": 15.74, + "step": 1855, + "token_acc": 0.640483963894757, + "train_speed(iter/s)": 0.134418 + }, + { + "epoch": 0.6369454016223274, + "grad_norm": 1.2814505100250244, + "learning_rate": 3.19037556256454e-05, + "loss": 1.354970932006836, + "memory(GiB)": 15.74, + "step": 1860, + "token_acc": 0.6343245967741935, + "train_speed(iter/s)": 0.134459 + }, + { + "epoch": 0.6386576204438927, + "grad_norm": 1.360011100769043, + "learning_rate": 3.1640112816550124e-05, + "loss": 1.3589017868041993, + "memory(GiB)": 15.74, + "step": 1865, + "token_acc": 0.6399553259807343, + "train_speed(iter/s)": 0.134502 + }, + { + "epoch": 0.6403698392654581, + "grad_norm": 1.0160573720932007, + "learning_rate": 3.137705870969496e-05, + "loss": 1.381830883026123, + "memory(GiB)": 15.74, + "step": 1870, + "token_acc": 0.6278438469493278, + "train_speed(iter/s)": 0.134563 + }, + { + "epoch": 0.6420820580870236, + "grad_norm": 1.401319980621338, + "learning_rate": 3.111460173980175e-05, + "loss": 1.3153087615966796, + "memory(GiB)": 15.74, + "step": 1875, + "token_acc": 0.643502609138348, + "train_speed(iter/s)": 0.134572 + }, + { + "epoch": 0.6437942769085889, + "grad_norm": 1.1555824279785156, + "learning_rate": 3.0852750322445474e-05, + "loss": 1.3455204010009765, + "memory(GiB)": 15.74, + "step": 1880, + "token_acc": 0.6436751691827174, + "train_speed(iter/s)": 0.134618 + }, + { + "epoch": 0.6455064957301543, + "grad_norm": 1.1246591806411743, + "learning_rate": 3.059151285378421e-05, + "loss": 1.4081114768981933, + "memory(GiB)": 15.74, + "step": 1885, + "token_acc": 0.6301946344029458, + "train_speed(iter/s)": 0.134643 + }, + { + "epoch": 0.6472187145517198, + "grad_norm": 1.341294765472412, + "learning_rate": 3.033089771029009e-05, + "loss": 1.310577392578125, + "memory(GiB)": 15.74, + "step": 1890, + "token_acc": 0.6602104791759964, + "train_speed(iter/s)": 0.134689 + }, + { + "epoch": 0.6489309333732851, + "grad_norm": 1.4662624597549438, + "learning_rate": 3.0070913248480604e-05, + "loss": 1.2789539337158202, + "memory(GiB)": 15.74, + "step": 1895, + "token_acc": 0.6472800925925926, + "train_speed(iter/s)": 0.134737 + }, + { + "epoch": 0.6506431521948505, + "grad_norm": 1.320601463317871, + "learning_rate": 2.9811567804650746e-05, + "loss": 1.3369815826416016, + "memory(GiB)": 15.74, + "step": 1900, + "token_acc": 0.6425245732022763, + "train_speed(iter/s)": 0.134783 + }, + { + "epoch": 0.6506431521948505, + "eval_loss": 1.2735846042633057, + "eval_runtime": 45.8064, + "eval_samples_per_second": 10.13, + "eval_steps_per_second": 10.13, + "eval_token_acc": 0.641013753670221, + "step": 1900 + }, + { + "epoch": 0.6523553710164159, + "grad_norm": 1.5006965398788452, + "learning_rate": 2.955286969460563e-05, + "loss": 1.3669855117797851, + "memory(GiB)": 15.74, + "step": 1905, + "token_acc": 0.6413896935508462, + "train_speed(iter/s)": 0.134314 + }, + { + "epoch": 0.6540675898379813, + "grad_norm": 1.171128749847412, + "learning_rate": 2.9294827213393854e-05, + "loss": 1.3277989387512208, + "memory(GiB)": 15.74, + "step": 1910, + "token_acc": 0.6399797954287157, + "train_speed(iter/s)": 0.13436 + }, + { + "epoch": 0.6557798086595467, + "grad_norm": 1.431249976158142, + "learning_rate": 2.9037448635041575e-05, + "loss": 1.3604629516601563, + "memory(GiB)": 15.74, + "step": 1915, + "token_acc": 0.639218009478673, + "train_speed(iter/s)": 0.134421 + }, + { + "epoch": 0.6574920274811121, + "grad_norm": 1.01530122756958, + "learning_rate": 2.8780742212287194e-05, + "loss": 1.3168072700500488, + "memory(GiB)": 15.74, + "step": 1920, + "token_acc": 0.645676219205631, + "train_speed(iter/s)": 0.134471 + }, + { + "epoch": 0.6592042463026775, + "grad_norm": 0.9062061905860901, + "learning_rate": 2.8524716176316713e-05, + "loss": 1.3464289665222169, + "memory(GiB)": 15.74, + "step": 1925, + "token_acc": 0.6457772651544278, + "train_speed(iter/s)": 0.134534 + }, + { + "epoch": 0.6609164651242428, + "grad_norm": 1.0723835229873657, + "learning_rate": 2.8269378736499753e-05, + "loss": 1.3291563987731934, + "memory(GiB)": 15.74, + "step": 1930, + "token_acc": 0.6496496496496497, + "train_speed(iter/s)": 0.134569 + }, + { + "epoch": 0.6626286839458083, + "grad_norm": 1.2485538721084595, + "learning_rate": 2.8014738080126425e-05, + "loss": 1.2998151779174805, + "memory(GiB)": 15.74, + "step": 1935, + "token_acc": 0.6443284015085582, + "train_speed(iter/s)": 0.134607 + }, + { + "epoch": 0.6643409027673737, + "grad_norm": 1.4759857654571533, + "learning_rate": 2.7760802372144822e-05, + "loss": 1.3513635635375976, + "memory(GiB)": 15.74, + "step": 1940, + "token_acc": 0.6325470998368194, + "train_speed(iter/s)": 0.134672 + }, + { + "epoch": 0.666053121588939, + "grad_norm": 1.331761121749878, + "learning_rate": 2.7507579754899056e-05, + "loss": 1.3231346130371093, + "memory(GiB)": 15.74, + "step": 1945, + "token_acc": 0.6395213076474022, + "train_speed(iter/s)": 0.134723 + }, + { + "epoch": 0.6677653404105045, + "grad_norm": 1.1743348836898804, + "learning_rate": 2.725507834786833e-05, + "loss": 1.4686296463012696, + "memory(GiB)": 15.74, + "step": 1950, + "token_acc": 0.6237789203084833, + "train_speed(iter/s)": 0.13476 + }, + { + "epoch": 0.6677653404105045, + "eval_loss": 1.2724452018737793, + "eval_runtime": 45.7383, + "eval_samples_per_second": 10.145, + "eval_steps_per_second": 10.145, + "eval_token_acc": 0.6406384528776741, + "step": 1950 + }, + { + "epoch": 0.6694775592320699, + "grad_norm": 1.1701655387878418, + "learning_rate": 2.7003306247406536e-05, + "loss": 1.3695281982421874, + "memory(GiB)": 15.74, + "step": 1955, + "token_acc": 0.6401136363636364, + "train_speed(iter/s)": 0.134276 + }, + { + "epoch": 0.6711897780536352, + "grad_norm": 1.1999833583831787, + "learning_rate": 2.675227152648264e-05, + "loss": 1.3321187973022461, + "memory(GiB)": 15.74, + "step": 1960, + "token_acc": 0.6405622489959839, + "train_speed(iter/s)": 0.134333 + }, + { + "epoch": 0.6729019968752007, + "grad_norm": 1.2453978061676025, + "learning_rate": 2.6501982234421852e-05, + "loss": 1.3802963256835938, + "memory(GiB)": 15.74, + "step": 1965, + "token_acc": 0.6288160833953834, + "train_speed(iter/s)": 0.134369 + }, + { + "epoch": 0.674614215696766, + "grad_norm": 1.2830731868743896, + "learning_rate": 2.62524463966475e-05, + "loss": 1.3157149314880372, + "memory(GiB)": 15.74, + "step": 1970, + "token_acc": 0.6465297355349712, + "train_speed(iter/s)": 0.13443 + }, + { + "epoch": 0.6763264345183314, + "grad_norm": 1.0469541549682617, + "learning_rate": 2.6003672014423673e-05, + "loss": 1.301660919189453, + "memory(GiB)": 15.74, + "step": 1975, + "token_acc": 0.6482569526047787, + "train_speed(iter/s)": 0.134496 + }, + { + "epoch": 0.6780386533398969, + "grad_norm": 1.145409107208252, + "learning_rate": 2.5755667064598786e-05, + "loss": 1.341436004638672, + "memory(GiB)": 15.74, + "step": 1980, + "token_acc": 0.637641047477113, + "train_speed(iter/s)": 0.134536 + }, + { + "epoch": 0.6797508721614622, + "grad_norm": 1.6011605262756348, + "learning_rate": 2.5508439499349675e-05, + "loss": 1.3730362892150878, + "memory(GiB)": 15.74, + "step": 1985, + "token_acc": 0.6293888166449935, + "train_speed(iter/s)": 0.134599 + }, + { + "epoch": 0.6814630909830276, + "grad_norm": 1.1660436391830444, + "learning_rate": 2.5261997245926612e-05, + "loss": 1.3104318618774413, + "memory(GiB)": 15.74, + "step": 1990, + "token_acc": 0.649867374005305, + "train_speed(iter/s)": 0.134632 + }, + { + "epoch": 0.6831753098045931, + "grad_norm": 1.3876395225524902, + "learning_rate": 2.5016348206399216e-05, + "loss": 1.320692253112793, + "memory(GiB)": 15.74, + "step": 1995, + "token_acc": 0.642627345844504, + "train_speed(iter/s)": 0.134697 + }, + { + "epoch": 0.6848875286261584, + "grad_norm": 1.1503660678863525, + "learning_rate": 2.4771500257403086e-05, + "loss": 1.2939563751220704, + "memory(GiB)": 15.74, + "step": 2000, + "token_acc": 0.652891520725715, + "train_speed(iter/s)": 0.134781 + }, + { + "epoch": 0.6848875286261584, + "eval_loss": 1.2708840370178223, + "eval_runtime": 45.4536, + "eval_samples_per_second": 10.208, + "eval_steps_per_second": 10.208, + "eval_token_acc": 0.6415656666004371, + "step": 2000 + }, + { + "epoch": 0.6865997474477238, + "grad_norm": 1.3885209560394287, + "learning_rate": 2.452746124988705e-05, + "loss": 1.327939224243164, + "memory(GiB)": 15.74, + "step": 2005, + "token_acc": 0.6411502554842675, + "train_speed(iter/s)": 0.134376 + }, + { + "epoch": 0.6883119662692893, + "grad_norm": 1.598423957824707, + "learning_rate": 2.4284239008861666e-05, + "loss": 1.3783462524414063, + "memory(GiB)": 15.74, + "step": 2010, + "token_acc": 0.6400758674550932, + "train_speed(iter/s)": 0.134421 + }, + { + "epoch": 0.6900241850908546, + "grad_norm": 1.4748547077178955, + "learning_rate": 2.4041841333148168e-05, + "loss": 1.411504077911377, + "memory(GiB)": 15.74, + "step": 2015, + "token_acc": 0.6301605504587156, + "train_speed(iter/s)": 0.134459 + }, + { + "epoch": 0.69173640391242, + "grad_norm": 1.1461318731307983, + "learning_rate": 2.380027599512844e-05, + "loss": 1.2850768089294433, + "memory(GiB)": 15.74, + "step": 2020, + "token_acc": 0.6512036786583717, + "train_speed(iter/s)": 0.134522 + }, + { + "epoch": 0.6934486227339854, + "grad_norm": 1.4452093839645386, + "learning_rate": 2.3559550740495824e-05, + "loss": 1.2930370330810548, + "memory(GiB)": 15.74, + "step": 2025, + "token_acc": 0.6500584795321638, + "train_speed(iter/s)": 0.134586 + }, + { + "epoch": 0.6951608415555508, + "grad_norm": 1.2744526863098145, + "learning_rate": 2.3319673288006715e-05, + "loss": 1.349466609954834, + "memory(GiB)": 15.74, + "step": 2030, + "token_acc": 0.635091496232508, + "train_speed(iter/s)": 0.13466 + }, + { + "epoch": 0.6968730603771162, + "grad_norm": 1.002764344215393, + "learning_rate": 2.308065132923305e-05, + "loss": 1.275795841217041, + "memory(GiB)": 15.74, + "step": 2035, + "token_acc": 0.6472693531283139, + "train_speed(iter/s)": 0.134738 + }, + { + "epoch": 0.6985852791986816, + "grad_norm": 1.016951084136963, + "learning_rate": 2.2842492528315783e-05, + "loss": 1.4072299003601074, + "memory(GiB)": 15.74, + "step": 2040, + "token_acc": 0.6340669357989321, + "train_speed(iter/s)": 0.134775 + }, + { + "epoch": 0.700297498020247, + "grad_norm": 0.838635265827179, + "learning_rate": 2.2605204521719037e-05, + "loss": 1.387449359893799, + "memory(GiB)": 15.74, + "step": 2045, + "token_acc": 0.6344627082634463, + "train_speed(iter/s)": 0.134798 + }, + { + "epoch": 0.7020097168418123, + "grad_norm": 1.474275827407837, + "learning_rate": 2.236879491798522e-05, + "loss": 1.3590201377868651, + "memory(GiB)": 15.74, + "step": 2050, + "token_acc": 0.6359211183225162, + "train_speed(iter/s)": 0.134851 + }, + { + "epoch": 0.7020097168418123, + "eval_loss": 1.268646001815796, + "eval_runtime": 44.77, + "eval_samples_per_second": 10.364, + "eval_steps_per_second": 10.364, + "eval_token_acc": 0.6417422787381063, + "step": 2050 + }, + { + "epoch": 0.7037219356633778, + "grad_norm": 1.288856863975525, + "learning_rate": 2.2133271297491165e-05, + "loss": 1.481348991394043, + "memory(GiB)": 15.74, + "step": 2055, + "token_acc": 0.6384577435391446, + "train_speed(iter/s)": 0.134414 + }, + { + "epoch": 0.7054341544849432, + "grad_norm": 1.5218098163604736, + "learning_rate": 2.189864121220505e-05, + "loss": 1.410487174987793, + "memory(GiB)": 15.74, + "step": 2060, + "token_acc": 0.6285425101214575, + "train_speed(iter/s)": 0.13446 + }, + { + "epoch": 0.7071463733065085, + "grad_norm": 1.0618783235549927, + "learning_rate": 2.1664912185444124e-05, + "loss": 1.3773748397827148, + "memory(GiB)": 15.74, + "step": 2065, + "token_acc": 0.6288966498512483, + "train_speed(iter/s)": 0.134515 + }, + { + "epoch": 0.708858592128074, + "grad_norm": 1.660845160484314, + "learning_rate": 2.1432091711633633e-05, + "loss": 1.3523015975952148, + "memory(GiB)": 15.74, + "step": 2070, + "token_acc": 0.624, + "train_speed(iter/s)": 0.134549 + }, + { + "epoch": 0.7105708109496394, + "grad_norm": 1.2365968227386475, + "learning_rate": 2.1200187256066424e-05, + "loss": 1.291434383392334, + "memory(GiB)": 15.74, + "step": 2075, + "token_acc": 0.6443224879132367, + "train_speed(iter/s)": 0.134614 + }, + { + "epoch": 0.7122830297712047, + "grad_norm": 1.3232195377349854, + "learning_rate": 2.096920625466359e-05, + "loss": 1.328212070465088, + "memory(GiB)": 15.74, + "step": 2080, + "token_acc": 0.6468395945140131, + "train_speed(iter/s)": 0.134661 + }, + { + "epoch": 0.7139952485927702, + "grad_norm": 1.6611577272415161, + "learning_rate": 2.0739156113736053e-05, + "loss": 1.3698694229125976, + "memory(GiB)": 15.74, + "step": 2085, + "token_acc": 0.6351796192929726, + "train_speed(iter/s)": 0.134697 + }, + { + "epoch": 0.7157074674143356, + "grad_norm": 1.4603625535964966, + "learning_rate": 2.0510044209747074e-05, + "loss": 1.3395660400390625, + "memory(GiB)": 15.74, + "step": 2090, + "token_acc": 0.6422913719943423, + "train_speed(iter/s)": 0.134754 + }, + { + "epoch": 0.7174196862359009, + "grad_norm": 1.2293682098388672, + "learning_rate": 2.028187788907574e-05, + "loss": 1.3571725845336915, + "memory(GiB)": 15.74, + "step": 2095, + "token_acc": 0.6341330425299891, + "train_speed(iter/s)": 0.134798 + }, + { + "epoch": 0.7191319050574664, + "grad_norm": 1.2185297012329102, + "learning_rate": 2.0054664467781387e-05, + "loss": 1.2488273620605468, + "memory(GiB)": 15.74, + "step": 2100, + "token_acc": 0.6659012629161883, + "train_speed(iter/s)": 0.134859 + }, + { + "epoch": 0.7191319050574664, + "eval_loss": 1.267549991607666, + "eval_runtime": 45.1434, + "eval_samples_per_second": 10.278, + "eval_steps_per_second": 10.278, + "eval_token_acc": 0.6424266507715742, + "step": 2100 + }, + { + "epoch": 0.7208441238790317, + "grad_norm": 1.3723160028457642, + "learning_rate": 1.9828411231369037e-05, + "loss": 1.3743553161621094, + "memory(GiB)": 15.74, + "step": 2105, + "token_acc": 0.6411017836407591, + "train_speed(iter/s)": 0.134453 + }, + { + "epoch": 0.7225563427005971, + "grad_norm": 1.250078558921814, + "learning_rate": 1.9603125434555746e-05, + "loss": 1.3290465354919434, + "memory(GiB)": 15.74, + "step": 2110, + "token_acc": 0.6497062279670975, + "train_speed(iter/s)": 0.134479 + }, + { + "epoch": 0.7242685615221626, + "grad_norm": 1.366199016571045, + "learning_rate": 1.9378814301038033e-05, + "loss": 1.4561357498168945, + "memory(GiB)": 15.74, + "step": 2115, + "token_acc": 0.6267857142857143, + "train_speed(iter/s)": 0.134508 + }, + { + "epoch": 0.7259807803437279, + "grad_norm": 1.4413526058197021, + "learning_rate": 1.9155485023260296e-05, + "loss": 1.361940288543701, + "memory(GiB)": 15.74, + "step": 2120, + "token_acc": 0.6430499325236168, + "train_speed(iter/s)": 0.134512 + }, + { + "epoch": 0.7276929991652933, + "grad_norm": 1.3154184818267822, + "learning_rate": 1.893314476218403e-05, + "loss": 1.4002976417541504, + "memory(GiB)": 15.74, + "step": 2125, + "token_acc": 0.6264462809917355, + "train_speed(iter/s)": 0.13457 + }, + { + "epoch": 0.7294052179868588, + "grad_norm": 1.0294270515441895, + "learning_rate": 1.8711800647058385e-05, + "loss": 1.3695240020751953, + "memory(GiB)": 15.74, + "step": 2130, + "token_acc": 0.638149392607909, + "train_speed(iter/s)": 0.1346 + }, + { + "epoch": 0.7311174368084241, + "grad_norm": 1.475088357925415, + "learning_rate": 1.8491459775191484e-05, + "loss": 1.2333250045776367, + "memory(GiB)": 15.74, + "step": 2135, + "token_acc": 0.6610446137105549, + "train_speed(iter/s)": 0.134615 + }, + { + "epoch": 0.7328296556299895, + "grad_norm": 1.1945194005966187, + "learning_rate": 1.8272129211722855e-05, + "loss": 1.293630886077881, + "memory(GiB)": 15.74, + "step": 2140, + "token_acc": 0.650517218803195, + "train_speed(iter/s)": 0.134664 + }, + { + "epoch": 0.7345418744515549, + "grad_norm": 1.258809208869934, + "learning_rate": 1.8053815989396927e-05, + "loss": 1.2797968864440918, + "memory(GiB)": 15.74, + "step": 2145, + "token_acc": 0.6559867421626847, + "train_speed(iter/s)": 0.13472 + }, + { + "epoch": 0.7362540932731203, + "grad_norm": 1.3185150623321533, + "learning_rate": 1.783652710833748e-05, + "loss": 1.381166648864746, + "memory(GiB)": 15.74, + "step": 2150, + "token_acc": 0.6339602071409103, + "train_speed(iter/s)": 0.134754 + }, + { + "epoch": 0.7362540932731203, + "eval_loss": 1.26602303981781, + "eval_runtime": 44.9582, + "eval_samples_per_second": 10.321, + "eval_steps_per_second": 10.321, + "eval_token_acc": 0.6415435900832285, + "step": 2150 + }, + { + "epoch": 0.7379663120946857, + "grad_norm": 1.1427522897720337, + "learning_rate": 1.762026953582322e-05, + "loss": 1.4028592109680176, + "memory(GiB)": 15.74, + "step": 2155, + "token_acc": 0.6402765513743025, + "train_speed(iter/s)": 0.134348 + }, + { + "epoch": 0.7396785309162511, + "grad_norm": 1.2889795303344727, + "learning_rate": 1.7405050206064373e-05, + "loss": 1.3801240921020508, + "memory(GiB)": 15.74, + "step": 2160, + "token_acc": 0.627132800381816, + "train_speed(iter/s)": 0.134429 + }, + { + "epoch": 0.7413907497378165, + "grad_norm": 1.2826013565063477, + "learning_rate": 1.7190876019980328e-05, + "loss": 1.3547374725341796, + "memory(GiB)": 15.74, + "step": 2165, + "token_acc": 0.6294808865089989, + "train_speed(iter/s)": 0.134475 + }, + { + "epoch": 0.7431029685593818, + "grad_norm": 1.555086612701416, + "learning_rate": 1.6977753844978405e-05, + "loss": 1.3988712310791016, + "memory(GiB)": 15.74, + "step": 2170, + "token_acc": 0.6290708096803107, + "train_speed(iter/s)": 0.134526 + }, + { + "epoch": 0.7448151873809473, + "grad_norm": 1.3174877166748047, + "learning_rate": 1.676569051473353e-05, + "loss": 1.3182533264160157, + "memory(GiB)": 15.74, + "step": 2175, + "token_acc": 0.6456733897202342, + "train_speed(iter/s)": 0.134571 + }, + { + "epoch": 0.7465274062025127, + "grad_norm": 1.2853842973709106, + "learning_rate": 1.655469282896932e-05, + "loss": 1.2554004669189454, + "memory(GiB)": 15.74, + "step": 2180, + "token_acc": 0.6545940329523526, + "train_speed(iter/s)": 0.134611 + }, + { + "epoch": 0.748239625024078, + "grad_norm": 1.1283583641052246, + "learning_rate": 1.6344767553239898e-05, + "loss": 1.2716236114501953, + "memory(GiB)": 15.74, + "step": 2185, + "token_acc": 0.6541554959785523, + "train_speed(iter/s)": 0.134676 + }, + { + "epoch": 0.7499518438456435, + "grad_norm": 1.2625576257705688, + "learning_rate": 1.6135921418712956e-05, + "loss": 1.3158624649047852, + "memory(GiB)": 15.74, + "step": 2190, + "token_acc": 0.6410842960082351, + "train_speed(iter/s)": 0.134715 + }, + { + "epoch": 0.7516640626672089, + "grad_norm": 1.2527461051940918, + "learning_rate": 1.592816112195401e-05, + "loss": 1.349888229370117, + "memory(GiB)": 15.74, + "step": 2195, + "token_acc": 0.6424707510378664, + "train_speed(iter/s)": 0.134727 + }, + { + "epoch": 0.7533762814887742, + "grad_norm": 1.4173930883407593, + "learning_rate": 1.5721493324711635e-05, + "loss": 1.346248245239258, + "memory(GiB)": 15.74, + "step": 2200, + "token_acc": 0.6515235457063712, + "train_speed(iter/s)": 0.134737 + }, + { + "epoch": 0.7533762814887742, + "eval_loss": 1.265234351158142, + "eval_runtime": 45.2436, + "eval_samples_per_second": 10.256, + "eval_steps_per_second": 10.256, + "eval_token_acc": 0.6415215135660198, + "step": 2200 + }, + { + "epoch": 0.7550885003103397, + "grad_norm": 1.2005822658538818, + "learning_rate": 1.551592465370384e-05, + "loss": 1.372615432739258, + "memory(GiB)": 15.74, + "step": 2205, + "token_acc": 0.6405270588235294, + "train_speed(iter/s)": 0.134335 + }, + { + "epoch": 0.756800719131905, + "grad_norm": 1.3633263111114502, + "learning_rate": 1.5311461700405616e-05, + "loss": 1.3027960777282714, + "memory(GiB)": 15.74, + "step": 2210, + "token_acc": 0.6485837734037446, + "train_speed(iter/s)": 0.134409 + }, + { + "epoch": 0.7585129379534704, + "grad_norm": 1.408820629119873, + "learning_rate": 1.5108111020837561e-05, + "loss": 1.3253872871398926, + "memory(GiB)": 15.74, + "step": 2215, + "token_acc": 0.6470042417815483, + "train_speed(iter/s)": 0.134446 + }, + { + "epoch": 0.7602251567750359, + "grad_norm": 1.366567850112915, + "learning_rate": 1.4905879135355682e-05, + "loss": 1.2841219902038574, + "memory(GiB)": 15.74, + "step": 2220, + "token_acc": 0.6472826914294413, + "train_speed(iter/s)": 0.134498 + }, + { + "epoch": 0.7619373755966012, + "grad_norm": 1.180812954902649, + "learning_rate": 1.4704772528442306e-05, + "loss": 1.330224609375, + "memory(GiB)": 15.74, + "step": 2225, + "token_acc": 0.6459463033990213, + "train_speed(iter/s)": 0.134554 + }, + { + "epoch": 0.7636495944181666, + "grad_norm": 0.942231297492981, + "learning_rate": 1.4504797648498186e-05, + "loss": 1.263327693939209, + "memory(GiB)": 15.74, + "step": 2230, + "token_acc": 0.6575696638896869, + "train_speed(iter/s)": 0.134635 + }, + { + "epoch": 0.7653618132397321, + "grad_norm": 1.1665061712265015, + "learning_rate": 1.4305960907635641e-05, + "loss": 1.2952257156372071, + "memory(GiB)": 15.74, + "step": 2235, + "token_acc": 0.6486710963455149, + "train_speed(iter/s)": 0.134692 + }, + { + "epoch": 0.7670740320612974, + "grad_norm": 1.3659709692001343, + "learning_rate": 1.4108268681473135e-05, + "loss": 1.2894656181335449, + "memory(GiB)": 15.74, + "step": 2240, + "token_acc": 0.6427033294682789, + "train_speed(iter/s)": 0.134748 + }, + { + "epoch": 0.7687862508828628, + "grad_norm": 1.1231846809387207, + "learning_rate": 1.3911727308930683e-05, + "loss": 1.2451932907104493, + "memory(GiB)": 15.74, + "step": 2245, + "token_acc": 0.6614796614796615, + "train_speed(iter/s)": 0.134794 + }, + { + "epoch": 0.7704984697044283, + "grad_norm": 1.277587652206421, + "learning_rate": 1.3716343092026629e-05, + "loss": 1.3834097862243653, + "memory(GiB)": 15.74, + "step": 2250, + "token_acc": 0.6296296296296297, + "train_speed(iter/s)": 0.134817 + }, + { + "epoch": 0.7704984697044283, + "eval_loss": 1.2641695737838745, + "eval_runtime": 44.9754, + "eval_samples_per_second": 10.317, + "eval_steps_per_second": 10.317, + "eval_token_acc": 0.6423604212199483, + "step": 2250 + }, + { + "epoch": 0.7722106885259936, + "grad_norm": 1.4374316930770874, + "learning_rate": 1.3522122295675616e-05, + "loss": 1.3191619873046876, + "memory(GiB)": 15.74, + "step": 2255, + "token_acc": 0.6424810386450472, + "train_speed(iter/s)": 0.134457 + }, + { + "epoch": 0.773922907347559, + "grad_norm": 1.4037755727767944, + "learning_rate": 1.3329071147487743e-05, + "loss": 1.3683411598205566, + "memory(GiB)": 15.74, + "step": 2260, + "token_acc": 0.6373897249610794, + "train_speed(iter/s)": 0.134478 + }, + { + "epoch": 0.7756351261691244, + "grad_norm": 1.2788645029067993, + "learning_rate": 1.3137195837568716e-05, + "loss": 1.3210175514221192, + "memory(GiB)": 15.74, + "step": 2265, + "token_acc": 0.6475143081584558, + "train_speed(iter/s)": 0.134508 + }, + { + "epoch": 0.7773473449906898, + "grad_norm": 1.1605987548828125, + "learning_rate": 1.294650251832154e-05, + "loss": 1.215747356414795, + "memory(GiB)": 15.74, + "step": 2270, + "token_acc": 0.660352922130484, + "train_speed(iter/s)": 0.134597 + }, + { + "epoch": 0.7790595638122552, + "grad_norm": 1.4906803369522095, + "learning_rate": 1.2756997304249163e-05, + "loss": 1.4509859085083008, + "memory(GiB)": 15.74, + "step": 2275, + "token_acc": 0.6196606202457577, + "train_speed(iter/s)": 0.134622 + }, + { + "epoch": 0.7807717826338206, + "grad_norm": 1.493390440940857, + "learning_rate": 1.2568686271758423e-05, + "loss": 1.2567498207092285, + "memory(GiB)": 15.74, + "step": 2280, + "token_acc": 0.6535477500352659, + "train_speed(iter/s)": 0.134676 + }, + { + "epoch": 0.782484001455386, + "grad_norm": 1.3492743968963623, + "learning_rate": 1.2381575458965216e-05, + "loss": 1.3606427192687989, + "memory(GiB)": 15.74, + "step": 2285, + "token_acc": 0.6342772782176889, + "train_speed(iter/s)": 0.134704 + }, + { + "epoch": 0.7841962202769514, + "grad_norm": 1.4635789394378662, + "learning_rate": 1.2195670865500897e-05, + "loss": 1.3723998069763184, + "memory(GiB)": 15.74, + "step": 2290, + "token_acc": 0.6362095749187968, + "train_speed(iter/s)": 0.134753 + }, + { + "epoch": 0.7859084390985168, + "grad_norm": 1.1486222743988037, + "learning_rate": 1.2010978452319843e-05, + "loss": 1.4245205879211427, + "memory(GiB)": 15.74, + "step": 2295, + "token_acc": 0.6227587180573203, + "train_speed(iter/s)": 0.134798 + }, + { + "epoch": 0.7876206579200822, + "grad_norm": 1.1271564960479736, + "learning_rate": 1.1827504141508456e-05, + "loss": 1.366541290283203, + "memory(GiB)": 15.74, + "step": 2300, + "token_acc": 0.6488962472406181, + "train_speed(iter/s)": 0.134815 + }, + { + "epoch": 0.7876206579200822, + "eval_loss": 1.2632496356964111, + "eval_runtime": 45.4924, + "eval_samples_per_second": 10.2, + "eval_steps_per_second": 10.2, + "eval_token_acc": 0.6424708038059915, + "step": 2300 + }, + { + "epoch": 0.7893328767416475, + "grad_norm": 1.1487739086151123, + "learning_rate": 1.1645253816095131e-05, + "loss": 1.342410945892334, + "memory(GiB)": 15.74, + "step": 2305, + "token_acc": 0.6417007083161563, + "train_speed(iter/s)": 0.134399 + }, + { + "epoch": 0.791045095563213, + "grad_norm": 1.1937812566757202, + "learning_rate": 1.1464233319861662e-05, + "loss": 1.2644430160522462, + "memory(GiB)": 15.74, + "step": 2310, + "token_acc": 0.6519286476597043, + "train_speed(iter/s)": 0.134451 + }, + { + "epoch": 0.7927573143847784, + "grad_norm": 1.2581509351730347, + "learning_rate": 1.1284448457155894e-05, + "loss": 1.367478084564209, + "memory(GiB)": 15.74, + "step": 2315, + "token_acc": 0.6354970494779846, + "train_speed(iter/s)": 0.134486 + }, + { + "epoch": 0.7944695332063437, + "grad_norm": 1.0667152404785156, + "learning_rate": 1.1105904992705629e-05, + "loss": 1.3414364814758302, + "memory(GiB)": 15.74, + "step": 2320, + "token_acc": 0.6301060070671378, + "train_speed(iter/s)": 0.134546 + }, + { + "epoch": 0.7961817520279092, + "grad_norm": 1.0880553722381592, + "learning_rate": 1.0928608651433676e-05, + "loss": 1.3535715103149415, + "memory(GiB)": 15.74, + "step": 2325, + "token_acc": 0.6367917586460633, + "train_speed(iter/s)": 0.134602 + }, + { + "epoch": 0.7978939708494746, + "grad_norm": 1.4338234663009644, + "learning_rate": 1.0752565118274382e-05, + "loss": 1.399939727783203, + "memory(GiB)": 15.74, + "step": 2330, + "token_acc": 0.6275309145264302, + "train_speed(iter/s)": 0.134641 + }, + { + "epoch": 0.7996061896710399, + "grad_norm": 1.5409365892410278, + "learning_rate": 1.0577780037991331e-05, + "loss": 1.340895652770996, + "memory(GiB)": 15.74, + "step": 2335, + "token_acc": 0.6382455124493341, + "train_speed(iter/s)": 0.134676 + }, + { + "epoch": 0.8013184084926054, + "grad_norm": 1.2550970315933228, + "learning_rate": 1.040425901499631e-05, + "loss": 1.3813422203063965, + "memory(GiB)": 15.74, + "step": 2340, + "token_acc": 0.631074009469225, + "train_speed(iter/s)": 0.134723 + }, + { + "epoch": 0.8030306273141707, + "grad_norm": 1.1674302816390991, + "learning_rate": 1.0232007613169636e-05, + "loss": 1.4022876739501953, + "memory(GiB)": 15.74, + "step": 2345, + "token_acc": 0.6294227188081937, + "train_speed(iter/s)": 0.13474 + }, + { + "epoch": 0.8047428461357361, + "grad_norm": 1.1433930397033691, + "learning_rate": 1.0061031355681766e-05, + "loss": 1.3312711715698242, + "memory(GiB)": 15.74, + "step": 2350, + "token_acc": 0.6397420147420148, + "train_speed(iter/s)": 0.134763 + }, + { + "epoch": 0.8047428461357361, + "eval_loss": 1.2625855207443237, + "eval_runtime": 44.9696, + "eval_samples_per_second": 10.318, + "eval_steps_per_second": 10.318, + "eval_token_acc": 0.6427136454952866, + "step": 2350 + }, + { + "epoch": 0.8064550649573016, + "grad_norm": 1.3173065185546875, + "learning_rate": 9.89133572481612e-06, + "loss": 1.3352961540222168, + "memory(GiB)": 15.74, + "step": 2355, + "token_acc": 0.6409483167377904, + "train_speed(iter/s)": 0.134421 + }, + { + "epoch": 0.8081672837788669, + "grad_norm": 1.6041507720947266, + "learning_rate": 9.722926161793416e-06, + "loss": 1.3231005668640137, + "memory(GiB)": 15.74, + "step": 2360, + "token_acc": 0.6341386884217132, + "train_speed(iter/s)": 0.134465 + }, + { + "epoch": 0.8098795026004323, + "grad_norm": 1.283277153968811, + "learning_rate": 9.555808066597121e-06, + "loss": 1.3785671234130858, + "memory(GiB)": 15.74, + "step": 2365, + "token_acc": 0.6402934241550956, + "train_speed(iter/s)": 0.134493 + }, + { + "epoch": 0.8115917214219978, + "grad_norm": 1.3088126182556152, + "learning_rate": 9.38998679780027e-06, + "loss": 1.26265869140625, + "memory(GiB)": 15.74, + "step": 2370, + "token_acc": 0.6558107167710509, + "train_speed(iter/s)": 0.134527 + }, + { + "epoch": 0.8133039402435631, + "grad_norm": 1.314698338508606, + "learning_rate": 9.225467672393729e-06, + "loss": 1.3656015396118164, + "memory(GiB)": 15.74, + "step": 2375, + "token_acc": 0.6421039417524479, + "train_speed(iter/s)": 0.134581 + }, + { + "epoch": 0.8150161590651285, + "grad_norm": 1.0786126852035522, + "learning_rate": 9.062255965615701e-06, + "loss": 1.299867820739746, + "memory(GiB)": 15.74, + "step": 2380, + "token_acc": 0.6503754446054538, + "train_speed(iter/s)": 0.134615 + }, + { + "epoch": 0.816728377886694, + "grad_norm": 1.502505898475647, + "learning_rate": 8.900356910782487e-06, + "loss": 1.3907661437988281, + "memory(GiB)": 15.74, + "step": 2385, + "token_acc": 0.6405990016638935, + "train_speed(iter/s)": 0.134654 + }, + { + "epoch": 0.8184405967082593, + "grad_norm": 1.1903234720230103, + "learning_rate": 8.739775699120773e-06, + "loss": 1.288747787475586, + "memory(GiB)": 15.74, + "step": 2390, + "token_acc": 0.6546951437775922, + "train_speed(iter/s)": 0.134687 + }, + { + "epoch": 0.8201528155298247, + "grad_norm": 1.4061367511749268, + "learning_rate": 8.580517479601146e-06, + "loss": 1.4231932640075684, + "memory(GiB)": 15.74, + "step": 2395, + "token_acc": 0.6211269635394149, + "train_speed(iter/s)": 0.134698 + }, + { + "epoch": 0.8218650343513901, + "grad_norm": 1.4037688970565796, + "learning_rate": 8.422587358773e-06, + "loss": 1.470430850982666, + "memory(GiB)": 15.74, + "step": 2400, + "token_acc": 0.6177246839838867, + "train_speed(iter/s)": 0.134744 + }, + { + "epoch": 0.8218650343513901, + "eval_loss": 1.261520266532898, + "eval_runtime": 45.1192, + "eval_samples_per_second": 10.284, + "eval_steps_per_second": 10.284, + "eval_token_acc": 0.642691568978078, + "step": 2400 + }, + { + "epoch": 0.8235772531729555, + "grad_norm": 1.3261678218841553, + "learning_rate": 8.265990400600759e-06, + "loss": 1.2831815719604491, + "memory(GiB)": 15.74, + "step": 2405, + "token_acc": 0.6444738084001888, + "train_speed(iter/s)": 0.134386 + }, + { + "epoch": 0.8252894719945209, + "grad_norm": 1.2294526100158691, + "learning_rate": 8.110731626301577e-06, + "loss": 1.3113703727722168, + "memory(GiB)": 15.74, + "step": 2410, + "token_acc": 0.6479945615227736, + "train_speed(iter/s)": 0.134417 + }, + { + "epoch": 0.8270016908160863, + "grad_norm": 1.3722708225250244, + "learning_rate": 7.956816014184255e-06, + "loss": 1.3398893356323243, + "memory(GiB)": 15.74, + "step": 2415, + "token_acc": 0.652123663681017, + "train_speed(iter/s)": 0.134443 + }, + { + "epoch": 0.8287139096376517, + "grad_norm": 1.3788970708847046, + "learning_rate": 7.80424849948967e-06, + "loss": 1.4234814643859863, + "memory(GiB)": 15.74, + "step": 2420, + "token_acc": 0.6247325631151048, + "train_speed(iter/s)": 0.13448 + }, + { + "epoch": 0.830426128459217, + "grad_norm": 1.4291893243789673, + "learning_rate": 7.653033974232505e-06, + "loss": 1.3413211822509765, + "memory(GiB)": 15.74, + "step": 2425, + "token_acc": 0.6429787892862672, + "train_speed(iter/s)": 0.134529 + }, + { + "epoch": 0.8321383472807825, + "grad_norm": 1.078174352645874, + "learning_rate": 7.5031772870444e-06, + "loss": 1.3063885688781738, + "memory(GiB)": 15.74, + "step": 2430, + "token_acc": 0.6550611021443394, + "train_speed(iter/s)": 0.134555 + }, + { + "epoch": 0.8338505661023479, + "grad_norm": 1.2182468175888062, + "learning_rate": 7.3546832430184394e-06, + "loss": 1.3086478233337402, + "memory(GiB)": 15.74, + "step": 2435, + "token_acc": 0.6496636085626911, + "train_speed(iter/s)": 0.134595 + }, + { + "epoch": 0.8355627849239132, + "grad_norm": 1.253243327140808, + "learning_rate": 7.207556603555188e-06, + "loss": 1.2322759628295898, + "memory(GiB)": 15.74, + "step": 2440, + "token_acc": 0.6602930221008194, + "train_speed(iter/s)": 0.134623 + }, + { + "epoch": 0.8372750037454787, + "grad_norm": 1.3736135959625244, + "learning_rate": 7.061802086209857e-06, + "loss": 1.3849195480346679, + "memory(GiB)": 15.74, + "step": 2445, + "token_acc": 0.6308674985145574, + "train_speed(iter/s)": 0.134672 + }, + { + "epoch": 0.838987222567044, + "grad_norm": 1.473252534866333, + "learning_rate": 6.917424364541181e-06, + "loss": 1.3837077140808105, + "memory(GiB)": 15.74, + "step": 2450, + "token_acc": 0.6207843137254901, + "train_speed(iter/s)": 0.134701 + }, + { + "epoch": 0.838987222567044, + "eval_loss": 1.261076807975769, + "eval_runtime": 45.0632, + "eval_samples_per_second": 10.297, + "eval_steps_per_second": 10.297, + "eval_token_acc": 0.6432214053910855, + "step": 2450 + }, + { + "epoch": 0.8406994413886094, + "grad_norm": 1.1063319444656372, + "learning_rate": 6.774428067961503e-06, + "loss": 1.3658445358276368, + "memory(GiB)": 15.74, + "step": 2455, + "token_acc": 0.641900352900998, + "train_speed(iter/s)": 0.134368 + }, + { + "epoch": 0.8424116602101749, + "grad_norm": 1.4162750244140625, + "learning_rate": 6.632817781588313e-06, + "loss": 1.3392247200012206, + "memory(GiB)": 15.74, + "step": 2460, + "token_acc": 0.639528354856822, + "train_speed(iter/s)": 0.134405 + }, + { + "epoch": 0.8441238790317402, + "grad_norm": 1.3479872941970825, + "learning_rate": 6.492598046097281e-06, + "loss": 1.2609476089477538, + "memory(GiB)": 15.74, + "step": 2465, + "token_acc": 0.6578986476445237, + "train_speed(iter/s)": 0.134432 + }, + { + "epoch": 0.8458360978533056, + "grad_norm": 1.3336763381958008, + "learning_rate": 6.353773357576614e-06, + "loss": 1.3712710380554198, + "memory(GiB)": 15.74, + "step": 2470, + "token_acc": 0.630315336837076, + "train_speed(iter/s)": 0.134495 + }, + { + "epoch": 0.8475483166748711, + "grad_norm": 1.7451952695846558, + "learning_rate": 6.216348167382929e-06, + "loss": 1.3323819160461425, + "memory(GiB)": 15.74, + "step": 2475, + "token_acc": 0.646716316858497, + "train_speed(iter/s)": 0.134505 + }, + { + "epoch": 0.8492605354964364, + "grad_norm": 1.313570261001587, + "learning_rate": 6.080326881998483e-06, + "loss": 1.4253095626831054, + "memory(GiB)": 15.74, + "step": 2480, + "token_acc": 0.6181412444211078, + "train_speed(iter/s)": 0.134535 + }, + { + "epoch": 0.8509727543180018, + "grad_norm": 1.312538743019104, + "learning_rate": 5.9457138628899176e-06, + "loss": 1.2920021057128905, + "memory(GiB)": 15.74, + "step": 2485, + "token_acc": 0.6491350446428571, + "train_speed(iter/s)": 0.13459 + }, + { + "epoch": 0.8526849731395673, + "grad_norm": 1.245670199394226, + "learning_rate": 5.812513426368399e-06, + "loss": 1.3483725547790528, + "memory(GiB)": 15.74, + "step": 2490, + "token_acc": 0.6404642574518596, + "train_speed(iter/s)": 0.134619 + }, + { + "epoch": 0.8543971919611326, + "grad_norm": 1.3241406679153442, + "learning_rate": 5.680729843451172e-06, + "loss": 1.3472556114196776, + "memory(GiB)": 15.74, + "step": 2495, + "token_acc": 0.6408905282571467, + "train_speed(iter/s)": 0.134648 + }, + { + "epoch": 0.856109410782698, + "grad_norm": 1.4123140573501587, + "learning_rate": 5.550367339724721e-06, + "loss": 1.4184735298156739, + "memory(GiB)": 15.74, + "step": 2500, + "token_acc": 0.628310502283105, + "train_speed(iter/s)": 0.134671 + }, + { + "epoch": 0.856109410782698, + "eval_loss": 1.2605270147323608, + "eval_runtime": 44.7733, + "eval_samples_per_second": 10.363, + "eval_steps_per_second": 10.363, + "eval_token_acc": 0.6436187827008412, + "step": 2500 + }, + { + "epoch": 0.8578216296042634, + "grad_norm": 1.3209956884384155, + "learning_rate": 5.421430095209174e-06, + "loss": 1.3851608276367187, + "memory(GiB)": 15.74, + "step": 2505, + "token_acc": 0.6410227143020203, + "train_speed(iter/s)": 0.134339 + }, + { + "epoch": 0.8595338484258288, + "grad_norm": 1.3596477508544922, + "learning_rate": 5.293922244224275e-06, + "loss": 1.3824989318847656, + "memory(GiB)": 15.74, + "step": 2510, + "token_acc": 0.6239235737351991, + "train_speed(iter/s)": 0.13438 + }, + { + "epoch": 0.8612460672473942, + "grad_norm": 1.281273603439331, + "learning_rate": 5.167847875256903e-06, + "loss": 1.3345693588256835, + "memory(GiB)": 15.74, + "step": 2515, + "token_acc": 0.6422333571939871, + "train_speed(iter/s)": 0.134431 + }, + { + "epoch": 0.8629582860689596, + "grad_norm": 1.1840367317199707, + "learning_rate": 5.043211030829919e-06, + "loss": 1.3493609428405762, + "memory(GiB)": 15.74, + "step": 2520, + "token_acc": 0.6385588348026063, + "train_speed(iter/s)": 0.134453 + }, + { + "epoch": 0.864670504890525, + "grad_norm": 1.3820964097976685, + "learning_rate": 4.9200157073725215e-06, + "loss": 1.3996575355529786, + "memory(GiB)": 15.74, + "step": 2525, + "token_acc": 0.6411019141231247, + "train_speed(iter/s)": 0.13445 + }, + { + "epoch": 0.8663827237120904, + "grad_norm": 1.571694016456604, + "learning_rate": 4.7982658550921485e-06, + "loss": 1.418349552154541, + "memory(GiB)": 15.74, + "step": 2530, + "token_acc": 0.6322903629536921, + "train_speed(iter/s)": 0.134464 + }, + { + "epoch": 0.8680949425336558, + "grad_norm": 1.4280903339385986, + "learning_rate": 4.677965377847799e-06, + "loss": 1.338277816772461, + "memory(GiB)": 15.74, + "step": 2535, + "token_acc": 0.6435658009921214, + "train_speed(iter/s)": 0.134504 + }, + { + "epoch": 0.8698071613552212, + "grad_norm": 1.3451240062713623, + "learning_rate": 4.559118133024853e-06, + "loss": 1.3945571899414062, + "memory(GiB)": 15.74, + "step": 2540, + "token_acc": 0.6330536409949056, + "train_speed(iter/s)": 0.134548 + }, + { + "epoch": 0.8715193801767865, + "grad_norm": 1.3829066753387451, + "learning_rate": 4.441727931411399e-06, + "loss": 1.2137330055236817, + "memory(GiB)": 15.74, + "step": 2545, + "token_acc": 0.66168876482903, + "train_speed(iter/s)": 0.134595 + }, + { + "epoch": 0.873231598998352, + "grad_norm": 1.1971951723098755, + "learning_rate": 4.325798537076032e-06, + "loss": 1.3273194313049317, + "memory(GiB)": 15.74, + "step": 2550, + "token_acc": 0.6455197636798424, + "train_speed(iter/s)": 0.134647 + }, + { + "epoch": 0.873231598998352, + "eval_loss": 1.2600239515304565, + "eval_runtime": 45.1148, + "eval_samples_per_second": 10.285, + "eval_steps_per_second": 10.285, + "eval_token_acc": 0.6430889462878336, + "step": 2550 + }, + { + "epoch": 0.8749438178199174, + "grad_norm": 1.2988086938858032, + "learning_rate": 4.2113336672471245e-06, + "loss": 1.409194564819336, + "memory(GiB)": 15.74, + "step": 2555, + "token_acc": 0.6405340312150348, + "train_speed(iter/s)": 0.134314 + }, + { + "epoch": 0.8766560366414827, + "grad_norm": 1.3488688468933105, + "learning_rate": 4.098336992193741e-06, + "loss": 1.2962044715881347, + "memory(GiB)": 15.74, + "step": 2560, + "token_acc": 0.6490477504830251, + "train_speed(iter/s)": 0.134368 + }, + { + "epoch": 0.8783682554630482, + "grad_norm": 1.4351921081542969, + "learning_rate": 3.986812135107843e-06, + "loss": 1.286128616333008, + "memory(GiB)": 15.74, + "step": 2565, + "token_acc": 0.6498240675580577, + "train_speed(iter/s)": 0.134412 + }, + { + "epoch": 0.8800804742846136, + "grad_norm": 1.4418823719024658, + "learning_rate": 3.876762671988132e-06, + "loss": 1.2905272483825683, + "memory(GiB)": 15.74, + "step": 2570, + "token_acc": 0.6540350877192982, + "train_speed(iter/s)": 0.134447 + }, + { + "epoch": 0.8817926931061789, + "grad_norm": 0.951321542263031, + "learning_rate": 3.7681921315254443e-06, + "loss": 1.460329532623291, + "memory(GiB)": 15.74, + "step": 2575, + "token_acc": 0.6318591763555362, + "train_speed(iter/s)": 0.134454 + }, + { + "epoch": 0.8835049119277444, + "grad_norm": 1.3315798044204712, + "learning_rate": 3.661103994989573e-06, + "loss": 1.3015137672424317, + "memory(GiB)": 15.74, + "step": 2580, + "token_acc": 0.6482994990772476, + "train_speed(iter/s)": 0.134489 + }, + { + "epoch": 0.8852171307493097, + "grad_norm": 1.3303027153015137, + "learning_rate": 3.555501696117608e-06, + "loss": 1.28341064453125, + "memory(GiB)": 15.74, + "step": 2585, + "token_acc": 0.6480354879594423, + "train_speed(iter/s)": 0.134538 + }, + { + "epoch": 0.8869293495708751, + "grad_norm": 1.4677505493164062, + "learning_rate": 3.451388621003887e-06, + "loss": 1.3655584335327149, + "memory(GiB)": 15.74, + "step": 2590, + "token_acc": 0.6361934926576447, + "train_speed(iter/s)": 0.134577 + }, + { + "epoch": 0.8886415683924406, + "grad_norm": 1.3708767890930176, + "learning_rate": 3.3487681079913903e-06, + "loss": 1.2949275016784667, + "memory(GiB)": 15.74, + "step": 2595, + "token_acc": 0.6404906021025805, + "train_speed(iter/s)": 0.134613 + }, + { + "epoch": 0.8903537872140059, + "grad_norm": 1.368079423904419, + "learning_rate": 3.2476434475647078e-06, + "loss": 1.340726089477539, + "memory(GiB)": 15.74, + "step": 2600, + "token_acc": 0.6478278093225934, + "train_speed(iter/s)": 0.134643 + }, + { + "epoch": 0.8903537872140059, + "eval_loss": 1.2596690654754639, + "eval_runtime": 45.637, + "eval_samples_per_second": 10.167, + "eval_steps_per_second": 10.167, + "eval_token_acc": 0.6433538644943374, + "step": 2600 + }, + { + "epoch": 0.8920660060355713, + "grad_norm": 1.255418300628662, + "learning_rate": 3.148017882244536e-06, + "loss": 1.3733959197998047, + "memory(GiB)": 15.74, + "step": 2605, + "token_acc": 0.6420357230242232, + "train_speed(iter/s)": 0.134297 + }, + { + "epoch": 0.8937782248571368, + "grad_norm": 1.0704141855239868, + "learning_rate": 3.0498946064837053e-06, + "loss": 1.1653754234313964, + "memory(GiB)": 15.74, + "step": 2610, + "token_acc": 0.6709594333547971, + "train_speed(iter/s)": 0.134332 + }, + { + "epoch": 0.8954904436787021, + "grad_norm": 1.3268773555755615, + "learning_rate": 2.953276766564722e-06, + "loss": 1.353217315673828, + "memory(GiB)": 15.74, + "step": 2615, + "token_acc": 0.6409261056425898, + "train_speed(iter/s)": 0.134376 + }, + { + "epoch": 0.8972026625002676, + "grad_norm": 1.2994683980941772, + "learning_rate": 2.8581674604989563e-06, + "loss": 1.3654675483703613, + "memory(GiB)": 15.74, + "step": 2620, + "token_acc": 0.6417600664176006, + "train_speed(iter/s)": 0.134418 + }, + { + "epoch": 0.898914881321833, + "grad_norm": 1.2852952480316162, + "learning_rate": 2.7645697379272304e-06, + "loss": 1.4031280517578124, + "memory(GiB)": 15.74, + "step": 2625, + "token_acc": 0.6333874796951143, + "train_speed(iter/s)": 0.134429 + }, + { + "epoch": 0.9006271001433983, + "grad_norm": 1.0720856189727783, + "learning_rate": 2.6724866000220562e-06, + "loss": 1.3376687049865723, + "memory(GiB)": 15.74, + "step": 2630, + "token_acc": 0.6374572795625427, + "train_speed(iter/s)": 0.134493 + }, + { + "epoch": 0.9023393189649638, + "grad_norm": 1.2691612243652344, + "learning_rate": 2.5819209993914184e-06, + "loss": 1.3927826881408691, + "memory(GiB)": 15.74, + "step": 2635, + "token_acc": 0.6334841628959276, + "train_speed(iter/s)": 0.134518 + }, + { + "epoch": 0.9040515377865291, + "grad_norm": 1.2793500423431396, + "learning_rate": 2.4928758399841214e-06, + "loss": 1.3870887756347656, + "memory(GiB)": 15.74, + "step": 2640, + "token_acc": 0.6293124844874659, + "train_speed(iter/s)": 0.134543 + }, + { + "epoch": 0.9057637566080945, + "grad_norm": 1.5122885704040527, + "learning_rate": 2.4053539769965993e-06, + "loss": 1.364862060546875, + "memory(GiB)": 15.74, + "step": 2645, + "token_acc": 0.6329818394844757, + "train_speed(iter/s)": 0.134573 + }, + { + "epoch": 0.90747597542966, + "grad_norm": 1.4883878231048584, + "learning_rate": 2.319358216781442e-06, + "loss": 1.3078365325927734, + "memory(GiB)": 15.74, + "step": 2650, + "token_acc": 0.6428915662650603, + "train_speed(iter/s)": 0.134633 + }, + { + "epoch": 0.90747597542966, + "eval_loss": 1.2594929933547974, + "eval_runtime": 44.7712, + "eval_samples_per_second": 10.364, + "eval_steps_per_second": 10.364, + "eval_token_acc": 0.6434642470803806, + "step": 2650 + }, + { + "epoch": 0.9091881942512253, + "grad_norm": 1.4252084493637085, + "learning_rate": 2.2348913167573705e-06, + "loss": 1.3416409492492676, + "memory(GiB)": 15.74, + "step": 2655, + "token_acc": 0.6441969630741747, + "train_speed(iter/s)": 0.134295 + }, + { + "epoch": 0.9109004130727907, + "grad_norm": 1.3295289278030396, + "learning_rate": 2.1519559853208305e-06, + "loss": 1.3008309364318849, + "memory(GiB)": 15.74, + "step": 2660, + "token_acc": 0.6566976231701558, + "train_speed(iter/s)": 0.134317 + }, + { + "epoch": 0.9126126318943562, + "grad_norm": 1.3812507390975952, + "learning_rate": 2.0705548817591624e-06, + "loss": 1.3543438911437988, + "memory(GiB)": 15.74, + "step": 2665, + "token_acc": 0.63641349054017, + "train_speed(iter/s)": 0.134352 + }, + { + "epoch": 0.9143248507159215, + "grad_norm": 1.4405107498168945, + "learning_rate": 1.9906906161653083e-06, + "loss": 1.3670412063598634, + "memory(GiB)": 15.74, + "step": 2670, + "token_acc": 0.6387388519030273, + "train_speed(iter/s)": 0.134359 + }, + { + "epoch": 0.9160370695374869, + "grad_norm": 1.1116528511047363, + "learning_rate": 1.9123657493541282e-06, + "loss": 1.3644820213317872, + "memory(GiB)": 15.74, + "step": 2675, + "token_acc": 0.6404174573055028, + "train_speed(iter/s)": 0.1344 + }, + { + "epoch": 0.9177492883590523, + "grad_norm": 1.3267613649368286, + "learning_rate": 1.8355827927803048e-06, + "loss": 1.3995431900024413, + "memory(GiB)": 15.74, + "step": 2680, + "token_acc": 0.6395732791465583, + "train_speed(iter/s)": 0.134432 + }, + { + "epoch": 0.9194615071806177, + "grad_norm": 1.1363773345947266, + "learning_rate": 1.7603442084578092e-06, + "loss": 1.3316152572631836, + "memory(GiB)": 15.74, + "step": 2685, + "token_acc": 0.6465589300640847, + "train_speed(iter/s)": 0.13446 + }, + { + "epoch": 0.9211737260021831, + "grad_norm": 1.3298614025115967, + "learning_rate": 1.6866524088809177e-06, + "loss": 1.3133241653442382, + "memory(GiB)": 15.74, + "step": 2690, + "token_acc": 0.6507722481000245, + "train_speed(iter/s)": 0.134463 + }, + { + "epoch": 0.9228859448237485, + "grad_norm": 1.1017136573791504, + "learning_rate": 1.6145097569469193e-06, + "loss": 1.349873161315918, + "memory(GiB)": 15.74, + "step": 2695, + "token_acc": 0.637273064097814, + "train_speed(iter/s)": 0.134493 + }, + { + "epoch": 0.9245981636453139, + "grad_norm": 1.1092779636383057, + "learning_rate": 1.5439185658803256e-06, + "loss": 1.3335725784301757, + "memory(GiB)": 15.74, + "step": 2700, + "token_acc": 0.6352576335877863, + "train_speed(iter/s)": 0.134527 + }, + { + "epoch": 0.9245981636453139, + "eval_loss": 1.2592636346817017, + "eval_runtime": 45.4231, + "eval_samples_per_second": 10.215, + "eval_steps_per_second": 10.215, + "eval_token_acc": 0.6436629357352585, + "step": 2700 + }, + { + "epoch": 0.9263103824668792, + "grad_norm": 1.4030323028564453, + "learning_rate": 1.4748810991586537e-06, + "loss": 1.3629247665405273, + "memory(GiB)": 15.74, + "step": 2705, + "token_acc": 0.6427411807399201, + "train_speed(iter/s)": 0.134186 + }, + { + "epoch": 0.9280226012884447, + "grad_norm": 1.4495233297348022, + "learning_rate": 1.4073995704399267e-06, + "loss": 1.4466259002685546, + "memory(GiB)": 15.74, + "step": 2710, + "token_acc": 0.6214079367492777, + "train_speed(iter/s)": 0.134214 + }, + { + "epoch": 0.9297348201100101, + "grad_norm": 1.3950856924057007, + "learning_rate": 1.3414761434916435e-06, + "loss": 1.360568904876709, + "memory(GiB)": 15.74, + "step": 2715, + "token_acc": 0.6382387022016223, + "train_speed(iter/s)": 0.134254 + }, + { + "epoch": 0.9314470389315754, + "grad_norm": 1.2819833755493164, + "learning_rate": 1.2771129321213992e-06, + "loss": 1.4097044944763184, + "memory(GiB)": 15.74, + "step": 2720, + "token_acc": 0.6269491727175336, + "train_speed(iter/s)": 0.134265 + }, + { + "epoch": 0.9331592577531409, + "grad_norm": 1.3099892139434814, + "learning_rate": 1.2143120001091301e-06, + "loss": 1.4392906188964845, + "memory(GiB)": 15.74, + "step": 2725, + "token_acc": 0.6256742179072277, + "train_speed(iter/s)": 0.134297 + }, + { + "epoch": 0.9348714765747063, + "grad_norm": 1.5486177206039429, + "learning_rate": 1.153075361140915e-06, + "loss": 1.3564040184020996, + "memory(GiB)": 15.74, + "step": 2730, + "token_acc": 0.6316229782487451, + "train_speed(iter/s)": 0.134352 + }, + { + "epoch": 0.9365836953962716, + "grad_norm": 1.194618821144104, + "learning_rate": 1.0934049787444389e-06, + "loss": 1.384223461151123, + "memory(GiB)": 15.74, + "step": 2735, + "token_acc": 0.6341066341066341, + "train_speed(iter/s)": 0.134379 + }, + { + "epoch": 0.9382959142178371, + "grad_norm": 1.243384599685669, + "learning_rate": 1.0353027662259873e-06, + "loss": 1.3777570724487305, + "memory(GiB)": 15.74, + "step": 2740, + "token_acc": 0.6239466733744183, + "train_speed(iter/s)": 0.134412 + }, + { + "epoch": 0.9400081330394024, + "grad_norm": 1.0930519104003906, + "learning_rate": 9.787705866091457e-07, + "loss": 1.3696574211120605, + "memory(GiB)": 15.74, + "step": 2745, + "token_acc": 0.6416593115622242, + "train_speed(iter/s)": 0.134444 + }, + { + "epoch": 0.9417203518609678, + "grad_norm": 1.5372965335845947, + "learning_rate": 9.238102525750247e-07, + "loss": 1.3618303298950196, + "memory(GiB)": 15.74, + "step": 2750, + "token_acc": 0.6381128848346637, + "train_speed(iter/s)": 0.134481 + }, + { + "epoch": 0.9417203518609678, + "eval_loss": 1.2589329481124878, + "eval_runtime": 44.9364, + "eval_samples_per_second": 10.326, + "eval_steps_per_second": 10.326, + "eval_token_acc": 0.6434200940459633, + "step": 2750 + }, + { + "epoch": 0.9434325706825333, + "grad_norm": 1.4653621912002563, + "learning_rate": 8.704235264041461e-07, + "loss": 1.2044730186462402, + "memory(GiB)": 15.74, + "step": 2755, + "token_acc": 0.6464527187886868, + "train_speed(iter/s)": 0.134186 + }, + { + "epoch": 0.9451447895040986, + "grad_norm": 1.4948737621307373, + "learning_rate": 8.186121199199758e-07, + "loss": 1.351217555999756, + "memory(GiB)": 15.74, + "step": 2760, + "token_acc": 0.6444909344490934, + "train_speed(iter/s)": 0.134203 + }, + { + "epoch": 0.946857008325664, + "grad_norm": 1.4061533212661743, + "learning_rate": 7.683776944339638e-07, + "loss": 1.4163630485534668, + "memory(GiB)": 15.74, + "step": 2765, + "token_acc": 0.6219227066428277, + "train_speed(iter/s)": 0.134239 + }, + { + "epoch": 0.9485692271472295, + "grad_norm": 1.4904981851577759, + "learning_rate": 7.197218606923239e-07, + "loss": 1.231451416015625, + "memory(GiB)": 15.74, + "step": 2770, + "token_acc": 0.6612065521374351, + "train_speed(iter/s)": 0.134265 + }, + { + "epoch": 0.9502814459687948, + "grad_norm": 1.4157965183258057, + "learning_rate": 6.72646178824382e-07, + "loss": 1.3163187026977539, + "memory(GiB)": 15.74, + "step": 2775, + "token_acc": 0.643201402073901, + "train_speed(iter/s)": 0.134292 + }, + { + "epoch": 0.9519936647903602, + "grad_norm": 1.3248724937438965, + "learning_rate": 6.271521582925433e-07, + "loss": 1.4009407997131347, + "memory(GiB)": 15.74, + "step": 2780, + "token_acc": 0.629837067209776, + "train_speed(iter/s)": 0.134321 + }, + { + "epoch": 0.9537058836119257, + "grad_norm": 1.1285121440887451, + "learning_rate": 5.83241257843875e-07, + "loss": 1.3576522827148438, + "memory(GiB)": 15.74, + "step": 2785, + "token_acc": 0.6369898622303093, + "train_speed(iter/s)": 0.134335 + }, + { + "epoch": 0.955418102433491, + "grad_norm": 1.1170588731765747, + "learning_rate": 5.409148854633672e-07, + "loss": 1.3034517288208007, + "memory(GiB)": 15.74, + "step": 2790, + "token_acc": 0.635017866258295, + "train_speed(iter/s)": 0.134364 + }, + { + "epoch": 0.9571303212550564, + "grad_norm": 1.387988805770874, + "learning_rate": 5.001743983287622e-07, + "loss": 1.3006996154785155, + "memory(GiB)": 15.74, + "step": 2795, + "token_acc": 0.6467991169977925, + "train_speed(iter/s)": 0.134424 + }, + { + "epoch": 0.9588425400766218, + "grad_norm": 1.4129979610443115, + "learning_rate": 4.610211027670397e-07, + "loss": 1.4134355545043946, + "memory(GiB)": 15.74, + "step": 2800, + "token_acc": 0.6224546722454672, + "train_speed(iter/s)": 0.134454 + }, + { + "epoch": 0.9588425400766218, + "eval_loss": 1.2588714361190796, + "eval_runtime": 45.8018, + "eval_samples_per_second": 10.131, + "eval_steps_per_second": 10.131, + "eval_token_acc": 0.6438837009073448, + "step": 2800 + }, + { + "epoch": 0.9605547588981872, + "grad_norm": 1.126222848892212, + "learning_rate": 4.234562542125453e-07, + "loss": 1.3093680381774901, + "memory(GiB)": 15.74, + "step": 2805, + "token_acc": 0.6450396076952094, + "train_speed(iter/s)": 0.134144 + }, + { + "epoch": 0.9622669777197526, + "grad_norm": 1.5096651315689087, + "learning_rate": 3.874810571667109e-07, + "loss": 1.4375462532043457, + "memory(GiB)": 15.74, + "step": 2810, + "token_acc": 0.6240186457311089, + "train_speed(iter/s)": 0.134145 + }, + { + "epoch": 0.963979196541318, + "grad_norm": 1.1924041509628296, + "learning_rate": 3.5309666515944694e-07, + "loss": 1.2375926971435547, + "memory(GiB)": 15.74, + "step": 2815, + "token_acc": 0.6601916601916602, + "train_speed(iter/s)": 0.134196 + }, + { + "epoch": 0.9656914153628834, + "grad_norm": 1.4358608722686768, + "learning_rate": 3.203041807121665e-07, + "loss": 1.3173930168151855, + "memory(GiB)": 15.74, + "step": 2820, + "token_acc": 0.655339135239674, + "train_speed(iter/s)": 0.134231 + }, + { + "epoch": 0.9674036341844487, + "grad_norm": 1.3502205610275269, + "learning_rate": 2.8910465530240793e-07, + "loss": 1.3274590492248535, + "memory(GiB)": 15.74, + "step": 2825, + "token_acc": 0.641396933560477, + "train_speed(iter/s)": 0.13424 + }, + { + "epoch": 0.9691158530060142, + "grad_norm": 1.2146613597869873, + "learning_rate": 2.5949908933012854e-07, + "loss": 1.3345701217651367, + "memory(GiB)": 15.74, + "step": 2830, + "token_acc": 0.6464673193966658, + "train_speed(iter/s)": 0.134275 + }, + { + "epoch": 0.9708280718275796, + "grad_norm": 1.109632134437561, + "learning_rate": 2.3148843208564696e-07, + "loss": 1.4304051399230957, + "memory(GiB)": 15.74, + "step": 2835, + "token_acc": 0.6225751697381183, + "train_speed(iter/s)": 0.134287 + }, + { + "epoch": 0.9725402906491449, + "grad_norm": 1.2578781843185425, + "learning_rate": 2.05073581719184e-07, + "loss": 1.3986212730407714, + "memory(GiB)": 15.74, + "step": 2840, + "token_acc": 0.6226165546125868, + "train_speed(iter/s)": 0.134308 + }, + { + "epoch": 0.9742525094707104, + "grad_norm": 1.3672924041748047, + "learning_rate": 1.8025538521206363e-07, + "loss": 1.4606602668762207, + "memory(GiB)": 15.74, + "step": 2845, + "token_acc": 0.6228070175438597, + "train_speed(iter/s)": 0.134322 + }, + { + "epoch": 0.9759647282922758, + "grad_norm": 1.4471640586853027, + "learning_rate": 1.5703463834955135e-07, + "loss": 1.3435998916625977, + "memory(GiB)": 15.74, + "step": 2850, + "token_acc": 0.6415640839971035, + "train_speed(iter/s)": 0.134355 + }, + { + "epoch": 0.9759647282922758, + "eval_loss": 1.259412169456482, + "eval_runtime": 44.9384, + "eval_samples_per_second": 10.325, + "eval_steps_per_second": 10.325, + "eval_token_acc": 0.6431330993222509, + "step": 2850 + }, + { + "epoch": 0.9776769471138411, + "grad_norm": 1.1818948984146118, + "learning_rate": 1.3541208569536335e-07, + "loss": 1.3029318809509278, + "memory(GiB)": 15.74, + "step": 2855, + "token_acc": 0.6433390987018418, + "train_speed(iter/s)": 0.134053 + }, + { + "epoch": 0.9793891659354066, + "grad_norm": 1.4115979671478271, + "learning_rate": 1.1538842056777466e-07, + "loss": 1.3996414184570312, + "memory(GiB)": 15.74, + "step": 2860, + "token_acc": 0.6264330799948473, + "train_speed(iter/s)": 0.134097 + }, + { + "epoch": 0.981101384756972, + "grad_norm": 1.3347187042236328, + "learning_rate": 9.696428501736465e-08, + "loss": 1.421574878692627, + "memory(GiB)": 15.74, + "step": 2865, + "token_acc": 0.6272295649942256, + "train_speed(iter/s)": 0.134111 + }, + { + "epoch": 0.9828136035785373, + "grad_norm": 1.3180640935897827, + "learning_rate": 8.014026980648904e-08, + "loss": 1.3679192543029786, + "memory(GiB)": 15.74, + "step": 2870, + "token_acc": 0.6328891492165764, + "train_speed(iter/s)": 0.134137 + }, + { + "epoch": 0.9845258224001028, + "grad_norm": 1.4273744821548462, + "learning_rate": 6.49169143902728e-08, + "loss": 1.3558645248413086, + "memory(GiB)": 15.74, + "step": 2875, + "token_acc": 0.6349938916791096, + "train_speed(iter/s)": 0.134196 + }, + { + "epoch": 0.9862380412216681, + "grad_norm": 1.3805627822875977, + "learning_rate": 5.129470689935745e-08, + "loss": 1.3488743782043457, + "memory(GiB)": 15.74, + "step": 2880, + "token_acc": 0.6409814323607427, + "train_speed(iter/s)": 0.134211 + }, + { + "epoch": 0.9879502600432335, + "grad_norm": 1.4824892282485962, + "learning_rate": 3.927408412422451e-08, + "loss": 1.3672945022583007, + "memory(GiB)": 15.74, + "step": 2885, + "token_acc": 0.6348511383537653, + "train_speed(iter/s)": 0.134247 + }, + { + "epoch": 0.989662478864799, + "grad_norm": 1.0865414142608643, + "learning_rate": 2.8855431501212395e-08, + "loss": 1.2805446624755858, + "memory(GiB)": 15.74, + "step": 2890, + "token_acc": 0.6411664705113116, + "train_speed(iter/s)": 0.134309 + }, + { + "epoch": 0.9913746976863643, + "grad_norm": 1.4725053310394287, + "learning_rate": 2.0039083100137358e-08, + "loss": 1.4713275909423829, + "memory(GiB)": 15.74, + "step": 2895, + "token_acc": 0.6193840067815767, + "train_speed(iter/s)": 0.134325 + }, + { + "epoch": 0.9930869165079297, + "grad_norm": 1.1291801929473877, + "learning_rate": 1.2825321613585407e-08, + "loss": 1.2731526374816895, + "memory(GiB)": 15.74, + "step": 2900, + "token_acc": 0.6518362511517705, + "train_speed(iter/s)": 0.134361 + }, + { + "epoch": 0.9930869165079297, + "eval_loss": 1.2591519355773926, + "eval_runtime": 44.9178, + "eval_samples_per_second": 10.33, + "eval_steps_per_second": 10.33, + "eval_token_acc": 0.6441265425966399, + "step": 2900 + }, + { + "epoch": 0.9947991353294952, + "grad_norm": 1.5716931819915771, + "learning_rate": 7.214378347852879e-09, + "loss": 1.3714940071105957, + "memory(GiB)": 15.74, + "step": 2905, + "token_acc": 0.643627781523938, + "train_speed(iter/s)": 0.134047 + }, + { + "epoch": 0.9965113541510605, + "grad_norm": 1.4367042779922485, + "learning_rate": 3.206433215535709e-09, + "loss": 1.3602388381958008, + "memory(GiB)": 15.74, + "step": 2910, + "token_acc": 0.6421131886058477, + "train_speed(iter/s)": 0.13408 + }, + { + "epoch": 0.9982235729726259, + "grad_norm": 1.1470365524291992, + "learning_rate": 8.016147297451593e-10, + "loss": 1.3899625778198241, + "memory(GiB)": 15.74, + "step": 2915, + "token_acc": 0.6254579836898712, + "train_speed(iter/s)": 0.134118 + }, + { + "epoch": 0.9999357917941913, + "grad_norm": 1.3943896293640137, + "learning_rate": 0.0, + "loss": 1.2591609954833984, + "memory(GiB)": 15.74, + "step": 2920, + "token_acc": 0.6577408433364113, + "train_speed(iter/s)": 0.134151 + }, + { + "epoch": 0.9999357917941913, + "eval_loss": 1.259133219718933, + "eval_runtime": 45.0562, + "eval_samples_per_second": 10.298, + "eval_steps_per_second": 10.298, + "eval_token_acc": 0.6443473077687264, + "step": 2920 + } + ], + "logging_steps": 5, + "max_steps": 2920, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.2347656024450662e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}