{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9985228951255539,
"eval_steps": 500,
"global_step": 338,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014771048744460856,
"grad_norm": 2.5938093662261963,
"learning_rate": 0.0002,
"loss": 0.9809,
"step": 5
},
{
"epoch": 0.029542097488921712,
"grad_norm": 0.9637813568115234,
"learning_rate": 0.0002,
"loss": 0.4769,
"step": 10
},
{
"epoch": 0.04431314623338257,
"grad_norm": 1.666123628616333,
"learning_rate": 0.0002,
"loss": 0.4077,
"step": 15
},
{
"epoch": 0.059084194977843424,
"grad_norm": 0.5881379246711731,
"learning_rate": 0.0002,
"loss": 0.4035,
"step": 20
},
{
"epoch": 0.07385524372230429,
"grad_norm": 0.5366687774658203,
"learning_rate": 0.0002,
"loss": 0.3695,
"step": 25
},
{
"epoch": 0.08862629246676514,
"grad_norm": 0.5429549217224121,
"learning_rate": 0.0002,
"loss": 0.3579,
"step": 30
},
{
"epoch": 0.103397341211226,
"grad_norm": 0.5302016735076904,
"learning_rate": 0.0002,
"loss": 0.3623,
"step": 35
},
{
"epoch": 0.11816838995568685,
"grad_norm": 0.4706576466560364,
"learning_rate": 0.0002,
"loss": 0.3423,
"step": 40
},
{
"epoch": 0.1329394387001477,
"grad_norm": 0.47507619857788086,
"learning_rate": 0.0002,
"loss": 0.3301,
"step": 45
},
{
"epoch": 0.14771048744460857,
"grad_norm": 0.487821102142334,
"learning_rate": 0.0002,
"loss": 0.3262,
"step": 50
},
{
"epoch": 0.16248153618906944,
"grad_norm": 0.46188947558403015,
"learning_rate": 0.0002,
"loss": 0.3381,
"step": 55
},
{
"epoch": 0.17725258493353027,
"grad_norm": 0.49672871828079224,
"learning_rate": 0.0002,
"loss": 0.3474,
"step": 60
},
{
"epoch": 0.19202363367799113,
"grad_norm": 0.45688968896865845,
"learning_rate": 0.0002,
"loss": 0.3356,
"step": 65
},
{
"epoch": 0.206794682422452,
"grad_norm": 0.5083580017089844,
"learning_rate": 0.0002,
"loss": 0.317,
"step": 70
},
{
"epoch": 0.22156573116691286,
"grad_norm": 0.4326242506504059,
"learning_rate": 0.0002,
"loss": 0.3107,
"step": 75
},
{
"epoch": 0.2363367799113737,
"grad_norm": 0.7657620906829834,
"learning_rate": 0.0002,
"loss": 0.3055,
"step": 80
},
{
"epoch": 0.2511078286558346,
"grad_norm": 0.4073372483253479,
"learning_rate": 0.0002,
"loss": 0.3041,
"step": 85
},
{
"epoch": 0.2658788774002954,
"grad_norm": 0.4194050431251526,
"learning_rate": 0.0002,
"loss": 0.3121,
"step": 90
},
{
"epoch": 0.28064992614475626,
"grad_norm": 0.4937780499458313,
"learning_rate": 0.0002,
"loss": 0.3065,
"step": 95
},
{
"epoch": 0.29542097488921715,
"grad_norm": 0.39246585965156555,
"learning_rate": 0.0002,
"loss": 0.3081,
"step": 100
},
{
"epoch": 0.310192023633678,
"grad_norm": 0.4153652787208557,
"learning_rate": 0.0002,
"loss": 0.3074,
"step": 105
},
{
"epoch": 0.3249630723781389,
"grad_norm": 0.39885184168815613,
"learning_rate": 0.0002,
"loss": 0.3016,
"step": 110
},
{
"epoch": 0.3397341211225997,
"grad_norm": 0.3999512195587158,
"learning_rate": 0.0002,
"loss": 0.302,
"step": 115
},
{
"epoch": 0.35450516986706054,
"grad_norm": 0.40937578678131104,
"learning_rate": 0.0002,
"loss": 0.2964,
"step": 120
},
{
"epoch": 0.36927621861152143,
"grad_norm": 1.0849940776824951,
"learning_rate": 0.0002,
"loss": 0.3098,
"step": 125
},
{
"epoch": 0.38404726735598227,
"grad_norm": 0.36466699838638306,
"learning_rate": 0.0002,
"loss": 0.2964,
"step": 130
},
{
"epoch": 0.3988183161004431,
"grad_norm": 0.32518795132637024,
"learning_rate": 0.0002,
"loss": 0.2788,
"step": 135
},
{
"epoch": 0.413589364844904,
"grad_norm": 0.3508060872554779,
"learning_rate": 0.0002,
"loss": 0.2758,
"step": 140
},
{
"epoch": 0.42836041358936483,
"grad_norm": 0.34023162722587585,
"learning_rate": 0.0002,
"loss": 0.2955,
"step": 145
},
{
"epoch": 0.4431314623338257,
"grad_norm": 0.3429297208786011,
"learning_rate": 0.0002,
"loss": 0.2812,
"step": 150
},
{
"epoch": 0.45790251107828656,
"grad_norm": 0.3394342064857483,
"learning_rate": 0.0002,
"loss": 0.2751,
"step": 155
},
{
"epoch": 0.4726735598227474,
"grad_norm": 0.3172396421432495,
"learning_rate": 0.0002,
"loss": 0.2813,
"step": 160
},
{
"epoch": 0.4874446085672083,
"grad_norm": 0.5636305809020996,
"learning_rate": 0.0002,
"loss": 0.2714,
"step": 165
},
{
"epoch": 0.5022156573116692,
"grad_norm": 0.33329370617866516,
"learning_rate": 0.0002,
"loss": 0.2759,
"step": 170
},
{
"epoch": 0.51698670605613,
"grad_norm": 0.34862470626831055,
"learning_rate": 0.0002,
"loss": 0.2875,
"step": 175
},
{
"epoch": 0.5317577548005908,
"grad_norm": 0.41521379351615906,
"learning_rate": 0.0002,
"loss": 0.2744,
"step": 180
},
{
"epoch": 0.5465288035450517,
"grad_norm": 0.3359523117542267,
"learning_rate": 0.0002,
"loss": 0.282,
"step": 185
},
{
"epoch": 0.5612998522895125,
"grad_norm": 0.3089170455932617,
"learning_rate": 0.0002,
"loss": 0.2628,
"step": 190
},
{
"epoch": 0.5760709010339734,
"grad_norm": 0.36551329493522644,
"learning_rate": 0.0002,
"loss": 0.2776,
"step": 195
},
{
"epoch": 0.5908419497784343,
"grad_norm": 0.32992231845855713,
"learning_rate": 0.0002,
"loss": 0.2599,
"step": 200
},
{
"epoch": 0.6056129985228951,
"grad_norm": 0.3119284510612488,
"learning_rate": 0.0002,
"loss": 0.2699,
"step": 205
},
{
"epoch": 0.620384047267356,
"grad_norm": 0.2953311800956726,
"learning_rate": 0.0002,
"loss": 0.2705,
"step": 210
},
{
"epoch": 0.6351550960118169,
"grad_norm": 0.3757329285144806,
"learning_rate": 0.0002,
"loss": 0.2918,
"step": 215
},
{
"epoch": 0.6499261447562777,
"grad_norm": 0.36705055832862854,
"learning_rate": 0.0002,
"loss": 0.2545,
"step": 220
},
{
"epoch": 0.6646971935007385,
"grad_norm": 0.3092058002948761,
"learning_rate": 0.0002,
"loss": 0.2624,
"step": 225
},
{
"epoch": 0.6794682422451994,
"grad_norm": 0.31742286682128906,
"learning_rate": 0.0002,
"loss": 0.2602,
"step": 230
},
{
"epoch": 0.6942392909896603,
"grad_norm": 0.2955617308616638,
"learning_rate": 0.0002,
"loss": 0.256,
"step": 235
},
{
"epoch": 0.7090103397341211,
"grad_norm": 0.3345969617366791,
"learning_rate": 0.0002,
"loss": 0.2687,
"step": 240
},
{
"epoch": 0.723781388478582,
"grad_norm": 0.2796613276004791,
"learning_rate": 0.0002,
"loss": 0.2526,
"step": 245
},
{
"epoch": 0.7385524372230429,
"grad_norm": 0.5415365695953369,
"learning_rate": 0.0002,
"loss": 0.2545,
"step": 250
},
{
"epoch": 0.7533234859675036,
"grad_norm": 0.3844436705112457,
"learning_rate": 0.0002,
"loss": 0.2599,
"step": 255
},
{
"epoch": 0.7680945347119645,
"grad_norm": 0.3186696171760559,
"learning_rate": 0.0002,
"loss": 0.2477,
"step": 260
},
{
"epoch": 0.7828655834564254,
"grad_norm": 0.38170936703681946,
"learning_rate": 0.0002,
"loss": 0.2582,
"step": 265
},
{
"epoch": 0.7976366322008862,
"grad_norm": 0.29369300603866577,
"learning_rate": 0.0002,
"loss": 0.2505,
"step": 270
},
{
"epoch": 0.8124076809453471,
"grad_norm": 0.29856300354003906,
"learning_rate": 0.0002,
"loss": 0.2675,
"step": 275
},
{
"epoch": 0.827178729689808,
"grad_norm": 0.2721855342388153,
"learning_rate": 0.0002,
"loss": 0.2489,
"step": 280
},
{
"epoch": 0.8419497784342689,
"grad_norm": 0.3029973804950714,
"learning_rate": 0.0002,
"loss": 0.2575,
"step": 285
},
{
"epoch": 0.8567208271787297,
"grad_norm": 0.2983309030532837,
"learning_rate": 0.0002,
"loss": 0.2628,
"step": 290
},
{
"epoch": 0.8714918759231906,
"grad_norm": 0.5093730092048645,
"learning_rate": 0.0002,
"loss": 0.2552,
"step": 295
},
{
"epoch": 0.8862629246676514,
"grad_norm": 0.28230157494544983,
"learning_rate": 0.0002,
"loss": 0.2592,
"step": 300
},
{
"epoch": 0.9010339734121122,
"grad_norm": 0.371902197599411,
"learning_rate": 0.0002,
"loss": 0.2596,
"step": 305
},
{
"epoch": 0.9158050221565731,
"grad_norm": 0.3786104619503021,
"learning_rate": 0.0002,
"loss": 0.25,
"step": 310
},
{
"epoch": 0.930576070901034,
"grad_norm": 0.4518865942955017,
"learning_rate": 0.0002,
"loss": 0.2546,
"step": 315
},
{
"epoch": 0.9453471196454948,
"grad_norm": 0.29951682686805725,
"learning_rate": 0.0002,
"loss": 0.2433,
"step": 320
},
{
"epoch": 0.9601181683899557,
"grad_norm": 0.2999703884124756,
"learning_rate": 0.0002,
"loss": 0.2419,
"step": 325
},
{
"epoch": 0.9748892171344166,
"grad_norm": 0.2904799282550812,
"learning_rate": 0.0002,
"loss": 0.2474,
"step": 330
},
{
"epoch": 0.9896602658788775,
"grad_norm": 0.28127652406692505,
"learning_rate": 0.0002,
"loss": 0.2458,
"step": 335
},
{
"epoch": 0.9985228951255539,
"step": 338,
"total_flos": 2.6821312997071258e+17,
"train_loss": 0.3007896387365443,
"train_runtime": 3313.8561,
"train_samples_per_second": 1.632,
"train_steps_per_second": 0.102
}
],
"logging_steps": 5,
"max_steps": 338,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.6821312997071258e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}