Upload LoRA adapter model with rank = 64, trained with oversampling to address class imbalance.
a03b535 verified
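For context on the commit message, below is a minimal sketch of how an adapter like this might be configured and how random oversampling could be applied to the training data before fine-tuning. Only the rank r=64 is taken from the commit message; the library choice (peft), target modules, alpha/dropout values, and the label field name are assumptions for illustration, not details recorded in this repository.

```python
# Minimal sketch, assuming a PEFT-style setup; only r=64 comes from the commit
# message. All other hyperparameters and the label field name are illustrative.
import random
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,          # assumed task type
    r=64,                                  # LoRA rank stated in the commit message
    lora_alpha=128,                        # assumed scaling factor
    lora_dropout=0.05,                     # assumed dropout
    target_modules=["q_proj", "v_proj"],   # assumed target modules
)

def oversample(examples, label_key="label", seed=0):
    """Random oversampling: duplicate minority-class examples until every
    class reaches the majority-class count. `label_key` is an assumption."""
    rng = random.Random(seed)
    by_label = {}
    for ex in examples:
        by_label.setdefault(ex[label_key], []).append(ex)
    target = max(len(items) for items in by_label.values())
    balanced = []
    for items in by_label.values():
        balanced.extend(items)
        if len(items) < target:
            balanced.extend(rng.choices(items, k=target - len(items)))
    rng.shuffle(balanced)
    return balanced
```

The balanced example list would then be wrapped back into a dataset and passed to the trainer that produced the log below; the exact training pipeline used here is not part of this file.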
{
"best_global_step": 2500,
"best_metric": 0.3258962035179138,
"best_model_checkpoint": "output/checkpoint-2500",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2607,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11514104778353483,
"grad_norm": 0.36587607860565186,
"learning_rate": 7.586206896551724e-05,
"loss": 1.271,
"mean_token_accuracy": 0.7241690966486931,
"num_tokens": 1095302.0,
"step": 100
},
{
"epoch": 0.23028209556706966,
"grad_norm": 0.41935256123542786,
"learning_rate": 0.0001524904214559387,
"loss": 0.8833,
"mean_token_accuracy": 0.7880920493602752,
"num_tokens": 2219842.0,
"step": 200
},
{
"epoch": 0.3454231433506045,
"grad_norm": 0.4040184020996094,
"learning_rate": 0.0001998705544249015,
"loss": 0.6926,
"mean_token_accuracy": 0.8305454832315445,
"num_tokens": 3321210.0,
"step": 300
},
{
"epoch": 0.4605641911341393,
"grad_norm": 0.3564732074737549,
"learning_rate": 0.0001982973099683902,
"loss": 0.5373,
"mean_token_accuracy": 0.8680069527029991,
"num_tokens": 4464927.0,
"step": 400
},
{
"epoch": 0.5757052389176741,
"grad_norm": 0.4126855731010437,
"learning_rate": 0.00019496396989003193,
"loss": 0.453,
"mean_token_accuracy": 0.8878908574581146,
"num_tokens": 5583328.0,
"step": 500
},
{
"epoch": 0.5757052389176741,
"eval_loss": 0.6618072986602783,
"eval_mean_token_accuracy": 0.8427139545351731,
"eval_num_tokens": 5583328.0,
"eval_runtime": 303.7163,
"eval_samples_per_second": 5.084,
"eval_steps_per_second": 1.271,
"step": 500
},
{
"epoch": 0.690846286701209,
"grad_norm": 0.3499382436275482,
"learning_rate": 0.0001899302204343428,
"loss": 0.3756,
"mean_token_accuracy": 0.9073699393868446,
"num_tokens": 6704920.0,
"step": 600
},
{
"epoch": 0.8059873344847438,
"grad_norm": 0.38613247871398926,
"learning_rate": 0.00018328619509919044,
"loss": 0.332,
"mean_token_accuracy": 0.9180173775553704,
"num_tokens": 7818546.0,
"step": 700
},
{
"epoch": 0.9211283822682786,
"grad_norm": 0.3983283042907715,
"learning_rate": 0.00017515086072006204,
"loss": 0.285,
"mean_token_accuracy": 0.9301327157020569,
"num_tokens": 8936663.0,
"step": 800
},
{
"epoch": 1.035693724812896,
"grad_norm": 0.4746868312358856,
"learning_rate": 0.00016566988726928513,
"loss": 0.2345,
"mean_token_accuracy": 0.9422861801919027,
"num_tokens": 10063248.0,
"step": 900
},
{
"epoch": 1.1508347725964305,
"grad_norm": 0.3728397488594055,
"learning_rate": 0.00015501303951322943,
"loss": 0.2199,
"mean_token_accuracy": 0.9462704074382782,
"num_tokens": 11183984.0,
"step": 1000
},
{
"epoch": 1.1508347725964305,
"eval_loss": 0.4751618206501007,
"eval_mean_token_accuracy": 0.8873606966567164,
"eval_num_tokens": 11183984.0,
"eval_runtime": 304.4089,
"eval_samples_per_second": 5.072,
"eval_steps_per_second": 1.268,
"step": 1000
},
{
"epoch": 1.2659758203799654,
"grad_norm": 0.38173842430114746,
"learning_rate": 0.00014337113723205126,
"loss": 0.2083,
"mean_token_accuracy": 0.9480547454953193,
"num_tokens": 12290743.0,
"step": 1100
},
{
"epoch": 1.3811168681635002,
"grad_norm": 0.32138150930404663,
"learning_rate": 0.00013095263843179028,
"loss": 0.1801,
"mean_token_accuracy": 0.9549228352308273,
"num_tokens": 13386739.0,
"step": 1200
},
{
"epoch": 1.496257915947035,
"grad_norm": 0.27364546060562134,
"learning_rate": 0.00011797990672926652,
"loss": 0.1706,
"mean_token_accuracy": 0.9577119436860084,
"num_tokens": 14529074.0,
"step": 1300
},
{
"epoch": 1.61139896373057,
"grad_norm": 0.3040049970149994,
"learning_rate": 0.00010468522974537567,
"loss": 0.1571,
"mean_token_accuracy": 0.9611357498168945,
"num_tokens": 15631838.0,
"step": 1400
},
{
"epoch": 1.7265400115141047,
"grad_norm": 0.30038943886756897,
"learning_rate": 9.130665980078394e-05,
"loss": 0.1403,
"mean_token_accuracy": 0.9660706561803818,
"num_tokens": 16764080.0,
"step": 1500
},
{
"epoch": 1.7265400115141047,
"eval_loss": 0.38552024960517883,
"eval_mean_token_accuracy": 0.9092679991932113,
"eval_num_tokens": 16764080.0,
"eval_runtime": 301.8929,
"eval_samples_per_second": 5.114,
"eval_steps_per_second": 1.279,
"step": 1500
},
{
"epoch": 1.8416810592976396,
"grad_norm": 0.09564235061407089,
"learning_rate": 7.808375138984745e-05,
"loss": 0.132,
"mean_token_accuracy": 0.967877941429615,
"num_tokens": 17861001.0,
"step": 1600
},
{
"epoch": 1.9568221070811744,
"grad_norm": 0.29065728187561035,
"learning_rate": 6.525327175685459e-05,
"loss": 0.1325,
"mean_token_accuracy": 0.9674064460396766,
"num_tokens": 18955484.0,
"step": 1700
},
{
"epoch": 2.071387449625792,
"grad_norm": 0.14087554812431335,
"learning_rate": 5.304496138031373e-05,
"loss": 0.128,
"mean_token_accuracy": 0.9687896742293584,
"num_tokens": 20065177.0,
"step": 1800
},
{
"epoch": 2.186528497409326,
"grad_norm": 0.3411475718021393,
"learning_rate": 4.167742027736482e-05,
"loss": 0.1113,
"mean_token_accuracy": 0.9726087141036988,
"num_tokens": 21183569.0,
"step": 1900
},
{
"epoch": 2.301669545192861,
"grad_norm": 0.12446445226669312,
"learning_rate": 3.135419378747742e-05,
"loss": 0.0979,
"mean_token_accuracy": 0.9754682299494744,
"num_tokens": 22291098.0,
"step": 2000
},
{
"epoch": 2.301669545192861,
"eval_loss": 0.3393489718437195,
"eval_mean_token_accuracy": 0.9214305071633097,
"eval_num_tokens": 22291098.0,
"eval_runtime": 279.3526,
"eval_samples_per_second": 5.527,
"eval_steps_per_second": 1.382,
"step": 2000
},
{
"epoch": 2.416810592976396,
"grad_norm": 0.3788171708583832,
"learning_rate": 2.226012792275538e-05,
"loss": 0.0892,
"mean_token_accuracy": 0.9781047487258911,
"num_tokens": 23410315.0,
"step": 2100
},
{
"epoch": 2.5319516407599307,
"grad_norm": 0.17727908492088318,
"learning_rate": 1.4558059545351143e-05,
"loss": 0.0908,
"mean_token_accuracy": 0.9779339152574539,
"num_tokens": 24545121.0,
"step": 2200
},
{
"epoch": 2.6470926885434656,
"grad_norm": 0.16450349986553192,
"learning_rate": 8.385900637134792e-06,
"loss": 0.0924,
"mean_token_accuracy": 0.9776055815815926,
"num_tokens": 25675956.0,
"step": 2300
},
{
"epoch": 2.7622337363270004,
"grad_norm": 0.2806933522224426,
"learning_rate": 3.85416887020934e-06,
"loss": 0.1021,
"mean_token_accuracy": 0.9756536969542503,
"num_tokens": 26787936.0,
"step": 2400
},
{
"epoch": 2.8773747841105353,
"grad_norm": 0.23432117700576782,
"learning_rate": 1.0440086954749517e-06,
"loss": 0.0969,
"mean_token_accuracy": 0.9766348168253899,
"num_tokens": 27883884.0,
"step": 2500
},
{
"epoch": 2.8773747841105353,
"eval_loss": 0.3258962035179138,
"eval_mean_token_accuracy": 0.9253271395060683,
"eval_num_tokens": 27883884.0,
"eval_runtime": 278.746,
"eval_samples_per_second": 5.539,
"eval_steps_per_second": 1.385,
"step": 2500
},
{
"epoch": 2.99251583189407,
"grad_norm": 0.3019677996635437,
"learning_rate": 5.738383307818396e-09,
"loss": 0.0937,
"mean_token_accuracy": 0.9774355563521385,
"num_tokens": 28979973.0,
"step": 2600
}
],
"logging_steps": 100,
"max_steps": 2607,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.979053737195946e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}