inter-play-sim-assistant-dpo / trainer_state.json
jeromeramos's picture
Model save
63bb3e0 verified
raw
history blame
7.66 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9938900203665988,
"eval_steps": 500,
"global_step": 122,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016293279022403257,
"grad_norm": 1.4111297130584717,
"learning_rate": 3.846153846153846e-08,
"logits/chosen": -3.2578125,
"logits/rejected": -3.19140625,
"logps/chosen": -46.375,
"logps/rejected": -45.75,
"loss": 0.6914,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.1629327902240326,
"grad_norm": 0.7594385147094727,
"learning_rate": 3.8461538461538463e-07,
"logits/chosen": -3.2265625,
"logits/rejected": -3.2200520038604736,
"logps/chosen": -42.77604293823242,
"logps/rejected": -41.88541793823242,
"loss": 0.6924,
"rewards/accuracies": 0.2222222238779068,
"rewards/chosen": 0.0023810069542378187,
"rewards/margins": -0.0003809928894042969,
"rewards/rejected": 0.0027594566345214844,
"step": 10
},
{
"epoch": 0.3258655804480652,
"grad_norm": 0.7135093808174133,
"learning_rate": 4.949291683053768e-07,
"logits/chosen": -3.2763671875,
"logits/rejected": -3.2466797828674316,
"logps/chosen": -40.720314025878906,
"logps/rejected": -39.4296875,
"loss": 0.6872,
"rewards/accuracies": 0.421875,
"rewards/chosen": 0.033158015459775925,
"rewards/margins": 0.012033844366669655,
"rewards/rejected": 0.02114267274737358,
"step": 20
},
{
"epoch": 0.48879837067209775,
"grad_norm": 0.699894368648529,
"learning_rate": 4.70586371748506e-07,
"logits/chosen": -3.2841796875,
"logits/rejected": -3.2529296875,
"logps/chosen": -40.8515625,
"logps/rejected": -40.571876525878906,
"loss": 0.6765,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.0396418571472168,
"rewards/margins": 0.034914396703243256,
"rewards/rejected": 0.004716300871223211,
"step": 30
},
{
"epoch": 0.6517311608961304,
"grad_norm": 1.1233805418014526,
"learning_rate": 4.280458575653296e-07,
"logits/chosen": -3.195117235183716,
"logits/rejected": -3.162890672683716,
"logps/chosen": -43.59375,
"logps/rejected": -44.532814025878906,
"loss": 0.6415,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -0.06923361122608185,
"rewards/margins": 0.11445312201976776,
"rewards/rejected": -0.1837112456560135,
"step": 40
},
{
"epoch": 0.814663951120163,
"grad_norm": 1.2118208408355713,
"learning_rate": 3.7081709127108767e-07,
"logits/chosen": -3.0589842796325684,
"logits/rejected": NaN,
"logps/chosen": -50.13593673706055,
"logps/rejected": -52.04999923706055,
"loss": 0.6138,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.3750244081020355,
"rewards/margins": 0.18903808295726776,
"rewards/rejected": -0.5643554925918579,
"step": 50
},
{
"epoch": 0.9775967413441955,
"grad_norm": 1.2914516925811768,
"learning_rate": 3.0362127536287636e-07,
"logits/chosen": -3.135546922683716,
"logits/rejected": -3.056835889816284,
"logps/chosen": -50.14531326293945,
"logps/rejected": -56.23749923706055,
"loss": 0.566,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.4345153868198395,
"rewards/margins": 0.36333543062210083,
"rewards/rejected": -0.7979736328125,
"step": 60
},
{
"epoch": 1.1466395112016294,
"grad_norm": 1.4273715019226074,
"learning_rate": 2.3200186419770823e-07,
"logits/chosen": -3.161651134490967,
"logits/rejected": -3.0788965225219727,
"logps/chosen": -53.49691390991211,
"logps/rejected": -62.85802459716797,
"loss": 0.5371,
"rewards/accuracies": 0.6095678806304932,
"rewards/chosen": -0.5622889995574951,
"rewards/margins": 0.5249747037887573,
"rewards/rejected": -1.0871431827545166,
"step": 70
},
{
"epoch": 1.309572301425662,
"grad_norm": 1.3244953155517578,
"learning_rate": 1.6186724554503237e-07,
"logits/chosen": -3.133984327316284,
"logits/rejected": -3.0191407203674316,
"logps/chosen": -55.234375,
"logps/rejected": -68.234375,
"loss": 0.4905,
"rewards/accuracies": 0.660937488079071,
"rewards/chosen": -0.6110439300537109,
"rewards/margins": 0.7057861089706421,
"rewards/rejected": -1.3170897960662842,
"step": 80
},
{
"epoch": 1.4725050916496945,
"grad_norm": 1.8561619520187378,
"learning_rate": 9.900331622138063e-08,
"logits/chosen": -3.107617139816284,
"logits/rejected": -2.9839844703674316,
"logps/chosen": -55.421875,
"logps/rejected": -68.80937194824219,
"loss": 0.4936,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.6593307256698608,
"rewards/margins": 0.737597644329071,
"rewards/rejected": -1.3976562023162842,
"step": 90
},
{
"epoch": 1.635437881873727,
"grad_norm": 0.9447304606437683,
"learning_rate": 4.859616286322094e-08,
"logits/chosen": -3.1148438453674316,
"logits/rejected": -2.9876952171325684,
"logps/chosen": -53.092185974121094,
"logps/rejected": -68.9312515258789,
"loss": 0.468,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -0.5302764773368835,
"rewards/margins": 0.887438952922821,
"rewards/rejected": -1.417944312095642,
"step": 100
},
{
"epoch": 1.7983706720977597,
"grad_norm": 1.593487024307251,
"learning_rate": 1.4804225250339281e-08,
"logits/chosen": -3.1207032203674316,
"logits/rejected": NaN,
"logps/chosen": -54.428123474121094,
"logps/rejected": -68.76249694824219,
"loss": 0.4758,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.6090973019599915,
"rewards/margins": 0.817614734172821,
"rewards/rejected": -1.4268066883087158,
"step": 110
},
{
"epoch": 1.9613034623217924,
"grad_norm": 2.265634059906006,
"learning_rate": 4.152374292708538e-10,
"logits/chosen": -3.109179735183716,
"logits/rejected": -2.985156297683716,
"logps/chosen": -54.99687576293945,
"logps/rejected": -71.015625,
"loss": 0.4654,
"rewards/accuracies": 0.667187511920929,
"rewards/chosen": -0.6471847295761108,
"rewards/margins": 0.863818347454071,
"rewards/rejected": -1.510644555091858,
"step": 120
},
{
"epoch": 1.9938900203665988,
"step": 122,
"total_flos": 0.0,
"train_loss": 0.5645458033827485,
"train_runtime": 476.0531,
"train_samples_per_second": 16.473,
"train_steps_per_second": 0.256
}
],
"logging_steps": 10,
"max_steps": 122,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}