|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.08587376556462001, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0021468441391155, |
|
"grad_norm": 1.7282733917236328, |
|
"learning_rate": 0.0004989265779304422, |
|
"loss": 1.4129, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004293688278231, |
|
"grad_norm": 2.1508498191833496, |
|
"learning_rate": 0.0004978531558608846, |
|
"loss": 1.2225, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.006440532417346501, |
|
"grad_norm": 1.6386512517929077, |
|
"learning_rate": 0.0004967797337913268, |
|
"loss": 1.1663, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.008587376556462, |
|
"grad_norm": 1.2367421388626099, |
|
"learning_rate": 0.000495706311721769, |
|
"loss": 1.1373, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.010734220695577501, |
|
"grad_norm": 1.2300989627838135, |
|
"learning_rate": 0.0004946328896522112, |
|
"loss": 1.1143, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.012881064834693002, |
|
"grad_norm": 1.1807990074157715, |
|
"learning_rate": 0.0004935594675826536, |
|
"loss": 1.0937, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.015027908973808502, |
|
"grad_norm": 0.9375188946723938, |
|
"learning_rate": 0.0004924860455130958, |
|
"loss": 1.0732, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.017174753112924, |
|
"grad_norm": 0.9801538586616516, |
|
"learning_rate": 0.000491412623443538, |
|
"loss": 1.0369, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.019321597252039503, |
|
"grad_norm": 0.9229792356491089, |
|
"learning_rate": 0.0004903392013739802, |
|
"loss": 1.0093, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.021468441391155002, |
|
"grad_norm": 1.011305570602417, |
|
"learning_rate": 0.0004892657793044225, |
|
"loss": 1.0161, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0236152855302705, |
|
"grad_norm": 0.9356452822685242, |
|
"learning_rate": 0.00048819235723486477, |
|
"loss": 0.9939, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.025762129669386003, |
|
"grad_norm": 1.0092449188232422, |
|
"learning_rate": 0.00048711893516530704, |
|
"loss": 0.9647, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.027908973808501502, |
|
"grad_norm": 0.9663442373275757, |
|
"learning_rate": 0.0004860455130957492, |
|
"loss": 0.9595, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.030055817947617004, |
|
"grad_norm": 1.1502243280410767, |
|
"learning_rate": 0.0004849720910261915, |
|
"loss": 0.9422, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0322026620867325, |
|
"grad_norm": 0.970102846622467, |
|
"learning_rate": 0.00048389866895663376, |
|
"loss": 0.945, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.034349506225848, |
|
"grad_norm": 1.2466392517089844, |
|
"learning_rate": 0.00048282524688707604, |
|
"loss": 0.9385, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0364963503649635, |
|
"grad_norm": 1.0010186433792114, |
|
"learning_rate": 0.00048175182481751826, |
|
"loss": 0.9301, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.038643194504079006, |
|
"grad_norm": 1.2516905069351196, |
|
"learning_rate": 0.0004806784027479605, |
|
"loss": 0.919, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.040790038643194505, |
|
"grad_norm": 0.8497525453567505, |
|
"learning_rate": 0.00047960498067840275, |
|
"loss": 0.9054, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.042936882782310004, |
|
"grad_norm": 1.0371205806732178, |
|
"learning_rate": 0.00047853155860884503, |
|
"loss": 0.9109, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0450837269214255, |
|
"grad_norm": 1.3313541412353516, |
|
"learning_rate": 0.00047745813653928725, |
|
"loss": 0.9131, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.047230571060541, |
|
"grad_norm": 0.9448315501213074, |
|
"learning_rate": 0.0004763847144697295, |
|
"loss": 0.9014, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.04937741519965651, |
|
"grad_norm": 1.274882435798645, |
|
"learning_rate": 0.00047531129240017175, |
|
"loss": 0.8786, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.051524259338772006, |
|
"grad_norm": 1.3116368055343628, |
|
"learning_rate": 0.000474237870330614, |
|
"loss": 0.9075, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.053671103477887505, |
|
"grad_norm": 0.9970440864562988, |
|
"learning_rate": 0.00047316444826105624, |
|
"loss": 0.8932, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.055817947617003004, |
|
"grad_norm": 1.698472499847412, |
|
"learning_rate": 0.0004720910261914985, |
|
"loss": 0.8838, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0579647917561185, |
|
"grad_norm": 1.0129982233047485, |
|
"learning_rate": 0.0004710176041219408, |
|
"loss": 0.8779, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.06011163589523401, |
|
"grad_norm": 1.0594947338104248, |
|
"learning_rate": 0.00046994418205238296, |
|
"loss": 0.8631, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.06225848003434951, |
|
"grad_norm": 0.7768178582191467, |
|
"learning_rate": 0.00046887075998282524, |
|
"loss": 0.8666, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.064405324173465, |
|
"grad_norm": 0.9108049869537354, |
|
"learning_rate": 0.0004677973379132675, |
|
"loss": 0.8676, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06655216831258051, |
|
"grad_norm": 1.4127992391586304, |
|
"learning_rate": 0.0004667239158437098, |
|
"loss": 0.8951, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.068699012451696, |
|
"grad_norm": 1.1507939100265503, |
|
"learning_rate": 0.000465650493774152, |
|
"loss": 0.863, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.07084585659081151, |
|
"grad_norm": 1.1579265594482422, |
|
"learning_rate": 0.00046457707170459423, |
|
"loss": 0.8716, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.072992700729927, |
|
"grad_norm": 0.9873006343841553, |
|
"learning_rate": 0.0004635036496350365, |
|
"loss": 0.8569, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.07513954486904251, |
|
"grad_norm": 1.1990203857421875, |
|
"learning_rate": 0.0004624302275654788, |
|
"loss": 0.8776, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07728638900815801, |
|
"grad_norm": 1.1173065900802612, |
|
"learning_rate": 0.000461356805495921, |
|
"loss": 0.865, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0794332331472735, |
|
"grad_norm": 1.2493510246276855, |
|
"learning_rate": 0.0004602833834263633, |
|
"loss": 0.8609, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.08158007728638901, |
|
"grad_norm": 1.1254737377166748, |
|
"learning_rate": 0.0004592099613568055, |
|
"loss": 0.8697, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0837269214255045, |
|
"grad_norm": 1.1009331941604614, |
|
"learning_rate": 0.0004581365392872477, |
|
"loss": 0.8653, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.08587376556462001, |
|
"grad_norm": 1.3970990180969238, |
|
"learning_rate": 0.00045706311721769, |
|
"loss": 0.8542, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4658, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"total_flos": 2.976872891771059e+16, |
|
"train_batch_size": 3, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|