{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08587376556462001, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021468441391155, "grad_norm": 1.7282733917236328, "learning_rate": 0.0004989265779304422, "loss": 1.4129, "step": 10 }, { "epoch": 0.004293688278231, "grad_norm": 2.1508498191833496, "learning_rate": 0.0004978531558608846, "loss": 1.2225, "step": 20 }, { "epoch": 0.006440532417346501, "grad_norm": 1.6386512517929077, "learning_rate": 0.0004967797337913268, "loss": 1.1663, "step": 30 }, { "epoch": 0.008587376556462, "grad_norm": 1.2367421388626099, "learning_rate": 0.000495706311721769, "loss": 1.1373, "step": 40 }, { "epoch": 0.010734220695577501, "grad_norm": 1.2300989627838135, "learning_rate": 0.0004946328896522112, "loss": 1.1143, "step": 50 }, { "epoch": 0.012881064834693002, "grad_norm": 1.1807990074157715, "learning_rate": 0.0004935594675826536, "loss": 1.0937, "step": 60 }, { "epoch": 0.015027908973808502, "grad_norm": 0.9375188946723938, "learning_rate": 0.0004924860455130958, "loss": 1.0732, "step": 70 }, { "epoch": 0.017174753112924, "grad_norm": 0.9801538586616516, "learning_rate": 0.000491412623443538, "loss": 1.0369, "step": 80 }, { "epoch": 0.019321597252039503, "grad_norm": 0.9229792356491089, "learning_rate": 0.0004903392013739802, "loss": 1.0093, "step": 90 }, { "epoch": 0.021468441391155002, "grad_norm": 1.011305570602417, "learning_rate": 0.0004892657793044225, "loss": 1.0161, "step": 100 }, { "epoch": 0.0236152855302705, "grad_norm": 0.9356452822685242, "learning_rate": 0.00048819235723486477, "loss": 0.9939, "step": 110 }, { "epoch": 0.025762129669386003, "grad_norm": 1.0092449188232422, "learning_rate": 0.00048711893516530704, "loss": 0.9647, "step": 120 }, { "epoch": 0.027908973808501502, "grad_norm": 0.9663442373275757, "learning_rate": 0.0004860455130957492, "loss": 0.9595, "step": 130 }, { "epoch": 0.030055817947617004, "grad_norm": 1.1502243280410767, "learning_rate": 0.0004849720910261915, "loss": 0.9422, "step": 140 }, { "epoch": 0.0322026620867325, "grad_norm": 0.970102846622467, "learning_rate": 0.00048389866895663376, "loss": 0.945, "step": 150 }, { "epoch": 0.034349506225848, "grad_norm": 1.2466392517089844, "learning_rate": 0.00048282524688707604, "loss": 0.9385, "step": 160 }, { "epoch": 0.0364963503649635, "grad_norm": 1.0010186433792114, "learning_rate": 0.00048175182481751826, "loss": 0.9301, "step": 170 }, { "epoch": 0.038643194504079006, "grad_norm": 1.2516905069351196, "learning_rate": 0.0004806784027479605, "loss": 0.919, "step": 180 }, { "epoch": 0.040790038643194505, "grad_norm": 0.8497525453567505, "learning_rate": 0.00047960498067840275, "loss": 0.9054, "step": 190 }, { "epoch": 0.042936882782310004, "grad_norm": 1.0371205806732178, "learning_rate": 0.00047853155860884503, "loss": 0.9109, "step": 200 }, { "epoch": 0.0450837269214255, "grad_norm": 1.3313541412353516, "learning_rate": 0.00047745813653928725, "loss": 0.9131, "step": 210 }, { "epoch": 0.047230571060541, "grad_norm": 0.9448315501213074, "learning_rate": 0.0004763847144697295, "loss": 0.9014, "step": 220 }, { "epoch": 0.04937741519965651, "grad_norm": 1.274882435798645, "learning_rate": 0.00047531129240017175, "loss": 0.8786, "step": 230 }, { "epoch": 0.051524259338772006, "grad_norm": 1.3116368055343628, "learning_rate": 0.000474237870330614, "loss": 0.9075, "step": 240 }, { "epoch": 0.053671103477887505, "grad_norm": 0.9970440864562988, 
"learning_rate": 0.00047316444826105624, "loss": 0.8932, "step": 250 }, { "epoch": 0.055817947617003004, "grad_norm": 1.698472499847412, "learning_rate": 0.0004720910261914985, "loss": 0.8838, "step": 260 }, { "epoch": 0.0579647917561185, "grad_norm": 1.0129982233047485, "learning_rate": 0.0004710176041219408, "loss": 0.8779, "step": 270 }, { "epoch": 0.06011163589523401, "grad_norm": 1.0594947338104248, "learning_rate": 0.00046994418205238296, "loss": 0.8631, "step": 280 }, { "epoch": 0.06225848003434951, "grad_norm": 0.7768178582191467, "learning_rate": 0.00046887075998282524, "loss": 0.8666, "step": 290 }, { "epoch": 0.064405324173465, "grad_norm": 0.9108049869537354, "learning_rate": 0.0004677973379132675, "loss": 0.8676, "step": 300 }, { "epoch": 0.06655216831258051, "grad_norm": 1.4127992391586304, "learning_rate": 0.0004667239158437098, "loss": 0.8951, "step": 310 }, { "epoch": 0.068699012451696, "grad_norm": 1.1507939100265503, "learning_rate": 0.000465650493774152, "loss": 0.863, "step": 320 }, { "epoch": 0.07084585659081151, "grad_norm": 1.1579265594482422, "learning_rate": 0.00046457707170459423, "loss": 0.8716, "step": 330 }, { "epoch": 0.072992700729927, "grad_norm": 0.9873006343841553, "learning_rate": 0.0004635036496350365, "loss": 0.8569, "step": 340 }, { "epoch": 0.07513954486904251, "grad_norm": 1.1990203857421875, "learning_rate": 0.0004624302275654788, "loss": 0.8776, "step": 350 }, { "epoch": 0.07728638900815801, "grad_norm": 1.1173065900802612, "learning_rate": 0.000461356805495921, "loss": 0.865, "step": 360 }, { "epoch": 0.0794332331472735, "grad_norm": 1.2493510246276855, "learning_rate": 0.0004602833834263633, "loss": 0.8609, "step": 370 }, { "epoch": 0.08158007728638901, "grad_norm": 1.1254737377166748, "learning_rate": 0.0004592099613568055, "loss": 0.8697, "step": 380 }, { "epoch": 0.0837269214255045, "grad_norm": 1.1009331941604614, "learning_rate": 0.0004581365392872477, "loss": 0.8653, "step": 390 }, { "epoch": 0.08587376556462001, "grad_norm": 1.3970990180969238, "learning_rate": 0.00045706311721769, "loss": 0.8542, "step": 400 } ], "logging_steps": 10, "max_steps": 4658, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "total_flos": 2.976872891771059e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }