Sankhaya_Indic_ITN / trainer_state.json

bijoy

Adding model dependencies

ac66417 19 days ago

14.6 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.030517578125,
	"eval_steps": 500,
	"global_step": 8000,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.0003814697265625,
	"grad_norm": 1.8202160596847534,
	"learning_rate": 4e-05,
	"loss": 1.7732,
	"step": 100
	},
	{
	"epoch": 0.000762939453125,
	"grad_norm": 1.0439223051071167,
	"learning_rate": 8e-05,
	"loss": 0.3055,
	"step": 200
	},
	{
	"epoch": 0.0011444091796875,
	"grad_norm": 0.49549755454063416,
	"learning_rate": 0.00012,
	"loss": 0.2596,
	"step": 300
	},
	{
	"epoch": 0.00152587890625,
	"grad_norm": 0.9390599727630615,
	"learning_rate": 0.00016,
	"loss": 0.2547,
	"step": 400
	},
	{
	"epoch": 0.0019073486328125,
	"grad_norm": 0.4973730444908142,
	"learning_rate": 0.0002,
	"loss": 0.2534,
	"step": 500
	},
	{
	"epoch": 0.002288818359375,
	"grad_norm": 0.45400354266166687,
	"learning_rate": 0.000199748427672956,
	"loss": 0.2446,
	"step": 600
	},
	{
	"epoch": 0.0026702880859375,
	"grad_norm": 0.39165329933166504,
	"learning_rate": 0.00019949685534591195,
	"loss": 0.2411,
	"step": 700
	},
	{
	"epoch": 0.0030517578125,
	"grad_norm": 0.4248354434967041,
	"learning_rate": 0.00019924528301886794,
	"loss": 0.2392,
	"step": 800
	},
	{
	"epoch": 0.0034332275390625,
	"grad_norm": 0.35752373933792114,
	"learning_rate": 0.0001989937106918239,
	"loss": 0.236,
	"step": 900
	},
	{
	"epoch": 0.003814697265625,
	"grad_norm": 0.39206448197364807,
	"learning_rate": 0.00019874213836477988,
	"loss": 0.2322,
	"step": 1000
	},
	{
	"epoch": 0.0041961669921875,
	"grad_norm": 0.3509558439254761,
	"learning_rate": 0.00019849056603773587,
	"loss": 0.2312,
	"step": 1100
	},
	{
	"epoch": 0.00457763671875,
	"grad_norm": 0.3513820171356201,
	"learning_rate": 0.00019823899371069183,
	"loss": 0.2308,
	"step": 1200
	},
	{
	"epoch": 0.0049591064453125,
	"grad_norm": 0.434176504611969,
	"learning_rate": 0.0001979874213836478,
	"loss": 0.2284,
	"step": 1300
	},
	{
	"epoch": 0.005340576171875,
	"grad_norm": 0.37612399458885193,
	"learning_rate": 0.0001977358490566038,
	"loss": 0.2289,
	"step": 1400
	},
	{
	"epoch": 0.0057220458984375,
	"grad_norm": 0.3991953134536743,
	"learning_rate": 0.00019748427672955975,
	"loss": 0.23,
	"step": 1500
	},
	{
	"epoch": 0.006103515625,
	"grad_norm": 0.4121605157852173,
	"learning_rate": 0.00019723270440251574,
	"loss": 0.2284,
	"step": 1600
	},
	{
	"epoch": 0.0064849853515625,
	"grad_norm": 0.3937987983226776,
	"learning_rate": 0.0001969811320754717,
	"loss": 0.2249,
	"step": 1700
	},
	{
	"epoch": 0.006866455078125,
	"grad_norm": 0.2995181083679199,
	"learning_rate": 0.00019672955974842768,
	"loss": 0.2257,
	"step": 1800
	},
	{
	"epoch": 0.0072479248046875,
	"grad_norm": 0.5119357705116272,
	"learning_rate": 0.00019647798742138367,
	"loss": 0.2292,
	"step": 1900
	},
	{
	"epoch": 0.00762939453125,
	"grad_norm": 0.31295427680015564,
	"learning_rate": 0.00019622641509433963,
	"loss": 0.2289,
	"step": 2000
	},
	{
	"epoch": 0.0080108642578125,
	"grad_norm": 0.2797456979751587,
	"learning_rate": 0.0001959748427672956,
	"loss": 0.2232,
	"step": 2100
	},
	{
	"epoch": 0.008392333984375,
	"grad_norm": 0.45458996295928955,
	"learning_rate": 0.00019572327044025157,
	"loss": 0.2244,
	"step": 2200
	},
	{
	"epoch": 0.0087738037109375,
	"grad_norm": 0.29631954431533813,
	"learning_rate": 0.00019547169811320755,
	"loss": 0.2234,
	"step": 2300
	},
	{
	"epoch": 0.0091552734375,
	"grad_norm": 0.5060445070266724,
	"learning_rate": 0.00019522012578616354,
	"loss": 0.2265,
	"step": 2400
	},
	{
	"epoch": 0.0095367431640625,
	"grad_norm": 0.28566980361938477,
	"learning_rate": 0.0001949685534591195,
	"loss": 0.2279,
	"step": 2500
	},
	{
	"epoch": 0.009918212890625,
	"grad_norm": 0.24325500428676605,
	"learning_rate": 0.00019471698113207548,
	"loss": 0.2306,
	"step": 2600
	},
	{
	"epoch": 0.0102996826171875,
	"grad_norm": 0.3140350878238678,
	"learning_rate": 0.00019446540880503147,
	"loss": 0.2234,
	"step": 2700
	},
	{
	"epoch": 0.01068115234375,
	"grad_norm": 0.4366394877433777,
	"learning_rate": 0.00019421383647798743,
	"loss": 0.2224,
	"step": 2800
	},
	{
	"epoch": 0.0110626220703125,
	"grad_norm": 0.27782708406448364,
	"learning_rate": 0.0001939622641509434,
	"loss": 0.2236,
	"step": 2900
	},
	{
	"epoch": 0.011444091796875,
	"grad_norm": 0.3332788944244385,
	"learning_rate": 0.00019371069182389937,
	"loss": 0.2241,
	"step": 3000
	},
	{
	"epoch": 0.0118255615234375,
	"grad_norm": 0.3888827860355377,
	"learning_rate": 0.00019345911949685536,
	"loss": 0.2217,
	"step": 3100
	},
	{
	"epoch": 0.01220703125,
	"grad_norm": 0.24029745161533356,
	"learning_rate": 0.00019320754716981134,
	"loss": 0.2216,
	"step": 3200
	},
	{
	"epoch": 0.0125885009765625,
	"grad_norm": 1.8477509021759033,
	"learning_rate": 0.0001929559748427673,
	"loss": 0.2252,
	"step": 3300
	},
	{
	"epoch": 0.012969970703125,
	"grad_norm": 0.5924927592277527,
	"learning_rate": 0.00019270440251572328,
	"loss": 0.2352,
	"step": 3400
	},
	{
	"epoch": 0.0133514404296875,
	"grad_norm": 0.33940935134887695,
	"learning_rate": 0.00019245283018867927,
	"loss": 0.2253,
	"step": 3500
	},
	{
	"epoch": 0.01373291015625,
	"grad_norm": 0.3898316025733948,
	"learning_rate": 0.00019220125786163523,
	"loss": 0.2216,
	"step": 3600
	},
	{
	"epoch": 0.0141143798828125,
	"grad_norm": 0.2601265609264374,
	"learning_rate": 0.0001919496855345912,
	"loss": 0.2261,
	"step": 3700
	},
	{
	"epoch": 0.014495849609375,
	"grad_norm": 0.32615959644317627,
	"learning_rate": 0.00019169811320754717,
	"loss": 0.2225,
	"step": 3800
	},
	{
	"epoch": 0.0148773193359375,
	"grad_norm": 0.2891947627067566,
	"learning_rate": 0.00019144654088050316,
	"loss": 0.2216,
	"step": 3900
	},
	{
	"epoch": 0.0152587890625,
	"grad_norm": 0.2846430242061615,
	"learning_rate": 0.00019119496855345914,
	"loss": 0.2197,
	"step": 4000
	},
	{
	"epoch": 0.0156402587890625,
	"grad_norm": 0.2938269078731537,
	"learning_rate": 0.0001909433962264151,
	"loss": 0.2212,
	"step": 4100
	},
	{
	"epoch": 0.016021728515625,
	"grad_norm": 0.2718958258628845,
	"learning_rate": 0.00019069182389937108,
	"loss": 0.2205,
	"step": 4200
	},
	{
	"epoch": 0.0164031982421875,
	"grad_norm": 0.3561397194862366,
	"learning_rate": 0.00019044025157232704,
	"loss": 0.2205,
	"step": 4300
	},
	{
	"epoch": 0.01678466796875,
	"grad_norm": 0.4546607732772827,
	"learning_rate": 0.00019018867924528303,
	"loss": 0.2234,
	"step": 4400
	},
	{
	"epoch": 0.0171661376953125,
	"grad_norm": 0.29250577092170715,
	"learning_rate": 0.00018993710691823901,
	"loss": 0.2197,
	"step": 4500
	},
	{
	"epoch": 0.017547607421875,
	"grad_norm": 1.6952908039093018,
	"learning_rate": 0.00018968553459119497,
	"loss": 0.2217,
	"step": 4600
	},
	{
	"epoch": 0.0179290771484375,
	"grad_norm": 0.3261864483356476,
	"learning_rate": 0.00018943396226415096,
	"loss": 0.2269,
	"step": 4700
	},
	{
	"epoch": 0.018310546875,
	"grad_norm": 0.2668060064315796,
	"learning_rate": 0.00018918238993710694,
	"loss": 0.2203,
	"step": 4800
	},
	{
	"epoch": 0.0186920166015625,
	"grad_norm": 0.31689000129699707,
	"learning_rate": 0.0001889308176100629,
	"loss": 0.2201,
	"step": 4900
	},
	{
	"epoch": 0.019073486328125,
	"grad_norm": 0.26320216059684753,
	"learning_rate": 0.00018867924528301889,
	"loss": 0.2214,
	"step": 5000
	},
	{
	"epoch": 0.0194549560546875,
	"grad_norm": 0.26768413186073303,
	"learning_rate": 0.00018842767295597484,
	"loss": 0.2225,
	"step": 5100
	},
	{
	"epoch": 0.01983642578125,
	"grad_norm": 0.2808452248573303,
	"learning_rate": 0.00018817610062893083,
	"loss": 0.2208,
	"step": 5200
	},
	{
	"epoch": 0.0202178955078125,
	"grad_norm": 0.25958341360092163,
	"learning_rate": 0.00018792452830188681,
	"loss": 0.2207,
	"step": 5300
	},
	{
	"epoch": 0.020599365234375,
	"grad_norm": 0.22953402996063232,
	"learning_rate": 0.00018767295597484277,
	"loss": 0.2193,
	"step": 5400
	},
	{
	"epoch": 0.0209808349609375,
	"grad_norm": 0.9375737905502319,
	"learning_rate": 0.00018742138364779876,
	"loss": 0.2206,
	"step": 5500
	},
	{
	"epoch": 0.0213623046875,
	"grad_norm": 0.2852359712123871,
	"learning_rate": 0.00018716981132075472,
	"loss": 0.2211,
	"step": 5600
	},
	{
	"epoch": 0.0217437744140625,
	"grad_norm": 0.25367122888565063,
	"learning_rate": 0.0001869182389937107,
	"loss": 0.2191,
	"step": 5700
	},
	{
	"epoch": 0.022125244140625,
	"grad_norm": 0.2215207815170288,
	"learning_rate": 0.0001866666666666667,
	"loss": 0.2218,
	"step": 5800
	},
	{
	"epoch": 0.0225067138671875,
	"grad_norm": 0.24178574979305267,
	"learning_rate": 0.00018641509433962264,
	"loss": 0.2283,
	"step": 5900
	},
	{
	"epoch": 0.02288818359375,
	"grad_norm": 0.3638046979904175,
	"learning_rate": 0.00018616352201257863,
	"loss": 0.2217,
	"step": 6000
	},
	{
	"epoch": 0.0232696533203125,
	"grad_norm": 0.40834301710128784,
	"learning_rate": 0.00018591194968553462,
	"loss": 0.2204,
	"step": 6100
	},
	{
	"epoch": 0.023651123046875,
	"grad_norm": 0.24277737736701965,
	"learning_rate": 0.00018566037735849057,
	"loss": 0.2178,
	"step": 6200
	},
	{
	"epoch": 0.0240325927734375,
	"grad_norm": 0.3276098370552063,
	"learning_rate": 0.00018540880503144656,
	"loss": 0.2245,
	"step": 6300
	},
	{
	"epoch": 0.0244140625,
	"grad_norm": 0.40407466888427734,
	"learning_rate": 0.00018515723270440252,
	"loss": 0.2258,
	"step": 6400
	},
	{
	"epoch": 0.0247955322265625,
	"grad_norm": 0.31675395369529724,
	"learning_rate": 0.0001849056603773585,
	"loss": 0.223,
	"step": 6500
	},
	{
	"epoch": 0.025177001953125,
	"grad_norm": 0.2858389616012573,
	"learning_rate": 0.0001846540880503145,
	"loss": 0.2201,
	"step": 6600
	},
	{
	"epoch": 0.0255584716796875,
	"grad_norm": 0.2711004912853241,
	"learning_rate": 0.00018440251572327045,
	"loss": 0.2175,
	"step": 6700
	},
	{
	"epoch": 0.02593994140625,
	"grad_norm": 0.24398334324359894,
	"learning_rate": 0.00018415094339622643,
	"loss": 0.2195,
	"step": 6800
	},
	{
	"epoch": 0.0263214111328125,
	"grad_norm": 0.29580453038215637,
	"learning_rate": 0.0001838993710691824,
	"loss": 0.2198,
	"step": 6900
	},
	{
	"epoch": 0.026702880859375,
	"grad_norm": 0.2624952495098114,
	"learning_rate": 0.00018364779874213837,
	"loss": 0.217,
	"step": 7000
	},
	{
	"epoch": 0.0270843505859375,
	"grad_norm": 0.2129925936460495,
	"learning_rate": 0.00018339622641509436,
	"loss": 0.2188,
	"step": 7100
	},
	{
	"epoch": 0.0274658203125,
	"grad_norm": 0.27471479773521423,
	"learning_rate": 0.00018314465408805032,
	"loss": 0.2174,
	"step": 7200
	},
	{
	"epoch": 0.0278472900390625,
	"grad_norm": 1.0204274654388428,
	"learning_rate": 0.0001828930817610063,
	"loss": 0.2186,
	"step": 7300
	},
	{
	"epoch": 0.028228759765625,
	"grad_norm": 0.5174055695533752,
	"learning_rate": 0.0001826415094339623,
	"loss": 0.2198,
	"step": 7400
	},
	{
	"epoch": 0.0286102294921875,
	"grad_norm": 1.7667677402496338,
	"learning_rate": 0.00018238993710691825,
	"loss": 0.2221,
	"step": 7500
	},
	{
	"epoch": 0.02899169921875,
	"grad_norm": 0.34651100635528564,
	"learning_rate": 0.00018213836477987423,
	"loss": 0.2215,
	"step": 7600
	},
	{
	"epoch": 0.0293731689453125,
	"grad_norm": 0.2900320589542389,
	"learning_rate": 0.0001818867924528302,
	"loss": 0.2184,
	"step": 7700
	},
	{
	"epoch": 0.029754638671875,
	"grad_norm": 0.21523432433605194,
	"learning_rate": 0.00018163522012578617,
	"loss": 0.2171,
	"step": 7800
	},
	{
	"epoch": 0.0301361083984375,
	"grad_norm": 0.28846126794815063,
	"learning_rate": 0.00018138364779874216,
	"loss": 0.2175,
	"step": 7900
	},
	{
	"epoch": 0.030517578125,
	"grad_norm": 0.27318933606147766,
	"learning_rate": 0.00018113207547169812,
	"loss": 0.218,
	"step": 8000
	}
	],
	"logging_steps": 100,
	"max_steps": 80000,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 1,
	"save_steps": 500,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 9.909320422780109e+16,
	"train_batch_size": 2,
	"trial_name": null,
	"trial_params": null
	}