{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.030517578125,
"eval_steps": 500,
"global_step": 8000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003814697265625,
"grad_norm": 1.8202160596847534,
"learning_rate": 4e-05,
"loss": 1.7732,
"step": 100
},
{
"epoch": 0.000762939453125,
"grad_norm": 1.0439223051071167,
"learning_rate": 8e-05,
"loss": 0.3055,
"step": 200
},
{
"epoch": 0.0011444091796875,
"grad_norm": 0.49549755454063416,
"learning_rate": 0.00012,
"loss": 0.2596,
"step": 300
},
{
"epoch": 0.00152587890625,
"grad_norm": 0.9390599727630615,
"learning_rate": 0.00016,
"loss": 0.2547,
"step": 400
},
{
"epoch": 0.0019073486328125,
"grad_norm": 0.4973730444908142,
"learning_rate": 0.0002,
"loss": 0.2534,
"step": 500
},
{
"epoch": 0.002288818359375,
"grad_norm": 0.45400354266166687,
"learning_rate": 0.000199748427672956,
"loss": 0.2446,
"step": 600
},
{
"epoch": 0.0026702880859375,
"grad_norm": 0.39165329933166504,
"learning_rate": 0.00019949685534591195,
"loss": 0.2411,
"step": 700
},
{
"epoch": 0.0030517578125,
"grad_norm": 0.4248354434967041,
"learning_rate": 0.00019924528301886794,
"loss": 0.2392,
"step": 800
},
{
"epoch": 0.0034332275390625,
"grad_norm": 0.35752373933792114,
"learning_rate": 0.0001989937106918239,
"loss": 0.236,
"step": 900
},
{
"epoch": 0.003814697265625,
"grad_norm": 0.39206448197364807,
"learning_rate": 0.00019874213836477988,
"loss": 0.2322,
"step": 1000
},
{
"epoch": 0.0041961669921875,
"grad_norm": 0.3509558439254761,
"learning_rate": 0.00019849056603773587,
"loss": 0.2312,
"step": 1100
},
{
"epoch": 0.00457763671875,
"grad_norm": 0.3513820171356201,
"learning_rate": 0.00019823899371069183,
"loss": 0.2308,
"step": 1200
},
{
"epoch": 0.0049591064453125,
"grad_norm": 0.434176504611969,
"learning_rate": 0.0001979874213836478,
"loss": 0.2284,
"step": 1300
},
{
"epoch": 0.005340576171875,
"grad_norm": 0.37612399458885193,
"learning_rate": 0.0001977358490566038,
"loss": 0.2289,
"step": 1400
},
{
"epoch": 0.0057220458984375,
"grad_norm": 0.3991953134536743,
"learning_rate": 0.00019748427672955975,
"loss": 0.23,
"step": 1500
},
{
"epoch": 0.006103515625,
"grad_norm": 0.4121605157852173,
"learning_rate": 0.00019723270440251574,
"loss": 0.2284,
"step": 1600
},
{
"epoch": 0.0064849853515625,
"grad_norm": 0.3937987983226776,
"learning_rate": 0.0001969811320754717,
"loss": 0.2249,
"step": 1700
},
{
"epoch": 0.006866455078125,
"grad_norm": 0.2995181083679199,
"learning_rate": 0.00019672955974842768,
"loss": 0.2257,
"step": 1800
},
{
"epoch": 0.0072479248046875,
"grad_norm": 0.5119357705116272,
"learning_rate": 0.00019647798742138367,
"loss": 0.2292,
"step": 1900
},
{
"epoch": 0.00762939453125,
"grad_norm": 0.31295427680015564,
"learning_rate": 0.00019622641509433963,
"loss": 0.2289,
"step": 2000
},
{
"epoch": 0.0080108642578125,
"grad_norm": 0.2797456979751587,
"learning_rate": 0.0001959748427672956,
"loss": 0.2232,
"step": 2100
},
{
"epoch": 0.008392333984375,
"grad_norm": 0.45458996295928955,
"learning_rate": 0.00019572327044025157,
"loss": 0.2244,
"step": 2200
},
{
"epoch": 0.0087738037109375,
"grad_norm": 0.29631954431533813,
"learning_rate": 0.00019547169811320755,
"loss": 0.2234,
"step": 2300
},
{
"epoch": 0.0091552734375,
"grad_norm": 0.5060445070266724,
"learning_rate": 0.00019522012578616354,
"loss": 0.2265,
"step": 2400
},
{
"epoch": 0.0095367431640625,
"grad_norm": 0.28566980361938477,
"learning_rate": 0.0001949685534591195,
"loss": 0.2279,
"step": 2500
},
{
"epoch": 0.009918212890625,
"grad_norm": 0.24325500428676605,
"learning_rate": 0.00019471698113207548,
"loss": 0.2306,
"step": 2600
},
{
"epoch": 0.0102996826171875,
"grad_norm": 0.3140350878238678,
"learning_rate": 0.00019446540880503147,
"loss": 0.2234,
"step": 2700
},
{
"epoch": 0.01068115234375,
"grad_norm": 0.4366394877433777,
"learning_rate": 0.00019421383647798743,
"loss": 0.2224,
"step": 2800
},
{
"epoch": 0.0110626220703125,
"grad_norm": 0.27782708406448364,
"learning_rate": 0.0001939622641509434,
"loss": 0.2236,
"step": 2900
},
{
"epoch": 0.011444091796875,
"grad_norm": 0.3332788944244385,
"learning_rate": 0.00019371069182389937,
"loss": 0.2241,
"step": 3000
},
{
"epoch": 0.0118255615234375,
"grad_norm": 0.3888827860355377,
"learning_rate": 0.00019345911949685536,
"loss": 0.2217,
"step": 3100
},
{
"epoch": 0.01220703125,
"grad_norm": 0.24029745161533356,
"learning_rate": 0.00019320754716981134,
"loss": 0.2216,
"step": 3200
},
{
"epoch": 0.0125885009765625,
"grad_norm": 1.8477509021759033,
"learning_rate": 0.0001929559748427673,
"loss": 0.2252,
"step": 3300
},
{
"epoch": 0.012969970703125,
"grad_norm": 0.5924927592277527,
"learning_rate": 0.00019270440251572328,
"loss": 0.2352,
"step": 3400
},
{
"epoch": 0.0133514404296875,
"grad_norm": 0.33940935134887695,
"learning_rate": 0.00019245283018867927,
"loss": 0.2253,
"step": 3500
},
{
"epoch": 0.01373291015625,
"grad_norm": 0.3898316025733948,
"learning_rate": 0.00019220125786163523,
"loss": 0.2216,
"step": 3600
},
{
"epoch": 0.0141143798828125,
"grad_norm": 0.2601265609264374,
"learning_rate": 0.0001919496855345912,
"loss": 0.2261,
"step": 3700
},
{
"epoch": 0.014495849609375,
"grad_norm": 0.32615959644317627,
"learning_rate": 0.00019169811320754717,
"loss": 0.2225,
"step": 3800
},
{
"epoch": 0.0148773193359375,
"grad_norm": 0.2891947627067566,
"learning_rate": 0.00019144654088050316,
"loss": 0.2216,
"step": 3900
},
{
"epoch": 0.0152587890625,
"grad_norm": 0.2846430242061615,
"learning_rate": 0.00019119496855345914,
"loss": 0.2197,
"step": 4000
},
{
"epoch": 0.0156402587890625,
"grad_norm": 0.2938269078731537,
"learning_rate": 0.0001909433962264151,
"loss": 0.2212,
"step": 4100
},
{
"epoch": 0.016021728515625,
"grad_norm": 0.2718958258628845,
"learning_rate": 0.00019069182389937108,
"loss": 0.2205,
"step": 4200
},
{
"epoch": 0.0164031982421875,
"grad_norm": 0.3561397194862366,
"learning_rate": 0.00019044025157232704,
"loss": 0.2205,
"step": 4300
},
{
"epoch": 0.01678466796875,
"grad_norm": 0.4546607732772827,
"learning_rate": 0.00019018867924528303,
"loss": 0.2234,
"step": 4400
},
{
"epoch": 0.0171661376953125,
"grad_norm": 0.29250577092170715,
"learning_rate": 0.00018993710691823901,
"loss": 0.2197,
"step": 4500
},
{
"epoch": 0.017547607421875,
"grad_norm": 1.6952908039093018,
"learning_rate": 0.00018968553459119497,
"loss": 0.2217,
"step": 4600
},
{
"epoch": 0.0179290771484375,
"grad_norm": 0.3261864483356476,
"learning_rate": 0.00018943396226415096,
"loss": 0.2269,
"step": 4700
},
{
"epoch": 0.018310546875,
"grad_norm": 0.2668060064315796,
"learning_rate": 0.00018918238993710694,
"loss": 0.2203,
"step": 4800
},
{
"epoch": 0.0186920166015625,
"grad_norm": 0.31689000129699707,
"learning_rate": 0.0001889308176100629,
"loss": 0.2201,
"step": 4900
},
{
"epoch": 0.019073486328125,
"grad_norm": 0.26320216059684753,
"learning_rate": 0.00018867924528301889,
"loss": 0.2214,
"step": 5000
},
{
"epoch": 0.0194549560546875,
"grad_norm": 0.26768413186073303,
"learning_rate": 0.00018842767295597484,
"loss": 0.2225,
"step": 5100
},
{
"epoch": 0.01983642578125,
"grad_norm": 0.2808452248573303,
"learning_rate": 0.00018817610062893083,
"loss": 0.2208,
"step": 5200
},
{
"epoch": 0.0202178955078125,
"grad_norm": 0.25958341360092163,
"learning_rate": 0.00018792452830188681,
"loss": 0.2207,
"step": 5300
},
{
"epoch": 0.020599365234375,
"grad_norm": 0.22953402996063232,
"learning_rate": 0.00018767295597484277,
"loss": 0.2193,
"step": 5400
},
{
"epoch": 0.0209808349609375,
"grad_norm": 0.9375737905502319,
"learning_rate": 0.00018742138364779876,
"loss": 0.2206,
"step": 5500
},
{
"epoch": 0.0213623046875,
"grad_norm": 0.2852359712123871,
"learning_rate": 0.00018716981132075472,
"loss": 0.2211,
"step": 5600
},
{
"epoch": 0.0217437744140625,
"grad_norm": 0.25367122888565063,
"learning_rate": 0.0001869182389937107,
"loss": 0.2191,
"step": 5700
},
{
"epoch": 0.022125244140625,
"grad_norm": 0.2215207815170288,
"learning_rate": 0.0001866666666666667,
"loss": 0.2218,
"step": 5800
},
{
"epoch": 0.0225067138671875,
"grad_norm": 0.24178574979305267,
"learning_rate": 0.00018641509433962264,
"loss": 0.2283,
"step": 5900
},
{
"epoch": 0.02288818359375,
"grad_norm": 0.3638046979904175,
"learning_rate": 0.00018616352201257863,
"loss": 0.2217,
"step": 6000
},
{
"epoch": 0.0232696533203125,
"grad_norm": 0.40834301710128784,
"learning_rate": 0.00018591194968553462,
"loss": 0.2204,
"step": 6100
},
{
"epoch": 0.023651123046875,
"grad_norm": 0.24277737736701965,
"learning_rate": 0.00018566037735849057,
"loss": 0.2178,
"step": 6200
},
{
"epoch": 0.0240325927734375,
"grad_norm": 0.3276098370552063,
"learning_rate": 0.00018540880503144656,
"loss": 0.2245,
"step": 6300
},
{
"epoch": 0.0244140625,
"grad_norm": 0.40407466888427734,
"learning_rate": 0.00018515723270440252,
"loss": 0.2258,
"step": 6400
},
{
"epoch": 0.0247955322265625,
"grad_norm": 0.31675395369529724,
"learning_rate": 0.0001849056603773585,
"loss": 0.223,
"step": 6500
},
{
"epoch": 0.025177001953125,
"grad_norm": 0.2858389616012573,
"learning_rate": 0.0001846540880503145,
"loss": 0.2201,
"step": 6600
},
{
"epoch": 0.0255584716796875,
"grad_norm": 0.2711004912853241,
"learning_rate": 0.00018440251572327045,
"loss": 0.2175,
"step": 6700
},
{
"epoch": 0.02593994140625,
"grad_norm": 0.24398334324359894,
"learning_rate": 0.00018415094339622643,
"loss": 0.2195,
"step": 6800
},
{
"epoch": 0.0263214111328125,
"grad_norm": 0.29580453038215637,
"learning_rate": 0.0001838993710691824,
"loss": 0.2198,
"step": 6900
},
{
"epoch": 0.026702880859375,
"grad_norm": 0.2624952495098114,
"learning_rate": 0.00018364779874213837,
"loss": 0.217,
"step": 7000
},
{
"epoch": 0.0270843505859375,
"grad_norm": 0.2129925936460495,
"learning_rate": 0.00018339622641509436,
"loss": 0.2188,
"step": 7100
},
{
"epoch": 0.0274658203125,
"grad_norm": 0.27471479773521423,
"learning_rate": 0.00018314465408805032,
"loss": 0.2174,
"step": 7200
},
{
"epoch": 0.0278472900390625,
"grad_norm": 1.0204274654388428,
"learning_rate": 0.0001828930817610063,
"loss": 0.2186,
"step": 7300
},
{
"epoch": 0.028228759765625,
"grad_norm": 0.5174055695533752,
"learning_rate": 0.0001826415094339623,
"loss": 0.2198,
"step": 7400
},
{
"epoch": 0.0286102294921875,
"grad_norm": 1.7667677402496338,
"learning_rate": 0.00018238993710691825,
"loss": 0.2221,
"step": 7500
},
{
"epoch": 0.02899169921875,
"grad_norm": 0.34651100635528564,
"learning_rate": 0.00018213836477987423,
"loss": 0.2215,
"step": 7600
},
{
"epoch": 0.0293731689453125,
"grad_norm": 0.2900320589542389,
"learning_rate": 0.0001818867924528302,
"loss": 0.2184,
"step": 7700
},
{
"epoch": 0.029754638671875,
"grad_norm": 0.21523432433605194,
"learning_rate": 0.00018163522012578617,
"loss": 0.2171,
"step": 7800
},
{
"epoch": 0.0301361083984375,
"grad_norm": 0.28846126794815063,
"learning_rate": 0.00018138364779874216,
"loss": 0.2175,
"step": 7900
},
{
"epoch": 0.030517578125,
"grad_norm": 0.27318933606147766,
"learning_rate": 0.00018113207547169812,
"loss": 0.218,
"step": 8000
}
],
"logging_steps": 100,
"max_steps": 80000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.909320422780109e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}