sandernotenbaert commited on
Commit
05372a9
·
verified ·
1 Parent(s): 8515fe9

Training in progress, step 808, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7629bac28a01650e0c47aae68065dd14a3871a16dc19535a5f47d97ef7f593e3
3
  size 362303176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fea8921cf12d4a5d9e2fddeda14297240eb814e6b2f5cb6284bdf0760cf8536
3
  size 362303176
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56c5762af4816dd09402e536fddd14c4f886af7c31765384de8fda2510100c78
3
  size 724761914
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b0a27fa7ceb085a02c901324986360a90328487e76a2815c4e3396f9977dca4
3
  size 724761914
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f29f1a356a90bc512795986655867fd11582b804d45eacb9816a4ff5d2939220
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11ddf4fb347b7d3d57d8a71705558f16da3c5a60f302bda9ec16f52d333df642
3
  size 14244
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18b984273ea2d45b7ffb1d047bb359d93111e41fcad70d16a1b453fd38f72636
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98c7a0a1cf08c2ee753de1f791f907cd20c65ab05d5d3b10185646939c648d81
3
  size 988
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ee86abee0989df8e1c5461d5ecfc6d42b43c8879a30063d7c1ee114f0c589f6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9e16866d142d224011d2a90c46b632f4935f23811dc8dbb420e91a6b2340c0e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.4764267990074442,
6
  "eval_steps": 500,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -366,6 +366,216 @@
366
  "eval_samples_per_second": 37.43,
367
  "eval_steps_per_second": 4.687,
368
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  }
370
  ],
371
  "logging_steps": 10,
@@ -380,12 +590,12 @@
380
  "should_evaluate": false,
381
  "should_log": false,
382
  "should_save": true,
383
- "should_training_stop": false
384
  },
385
  "attributes": {}
386
  }
387
  },
388
- "total_flos": 5.182556306040422e+16,
389
  "train_batch_size": 24,
390
  "trial_name": null,
391
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
  "eval_steps": 500,
7
+ "global_step": 808,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
366
  "eval_samples_per_second": 37.43,
367
  "eval_steps_per_second": 4.687,
368
  "step": 500
369
+ },
370
+ {
371
+ "epoch": 2.5260545905707197,
372
+ "grad_norm": 1.716925859451294,
373
+ "learning_rate": 0.0001,
374
+ "loss": 1.5984,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 2.575682382133995,
379
+ "grad_norm": 1.4898698329925537,
380
+ "learning_rate": 0.0001,
381
+ "loss": 1.5929,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 2.6253101736972706,
386
+ "grad_norm": 1.3290361166000366,
387
+ "learning_rate": 0.0001,
388
+ "loss": 1.5847,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 2.674937965260546,
393
+ "grad_norm": 1.2288880348205566,
394
+ "learning_rate": 0.0001,
395
+ "loss": 1.5859,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 2.7245657568238215,
400
+ "grad_norm": 1.0679349899291992,
401
+ "learning_rate": 0.0001,
402
+ "loss": 1.5851,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 2.774193548387097,
407
+ "grad_norm": 1.0576269626617432,
408
+ "learning_rate": 0.0001,
409
+ "loss": 1.5777,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 2.8238213399503724,
414
+ "grad_norm": 1.5344107151031494,
415
+ "learning_rate": 0.0001,
416
+ "loss": 1.5861,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 2.873449131513648,
421
+ "grad_norm": 1.2966816425323486,
422
+ "learning_rate": 0.0001,
423
+ "loss": 1.5907,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 2.9230769230769234,
428
+ "grad_norm": 1.2389014959335327,
429
+ "learning_rate": 0.0001,
430
+ "loss": 1.5683,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 2.9727047146401984,
435
+ "grad_norm": 1.6558314561843872,
436
+ "learning_rate": 0.0001,
437
+ "loss": 1.5772,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 3.0198511166253104,
442
+ "grad_norm": 1.3844249248504639,
443
+ "learning_rate": 0.0001,
444
+ "loss": 1.4848,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 3.069478908188586,
449
+ "grad_norm": 1.4529865980148315,
450
+ "learning_rate": 0.0001,
451
+ "loss": 1.5532,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 3.119106699751861,
456
+ "grad_norm": 2.1029598712921143,
457
+ "learning_rate": 0.0001,
458
+ "loss": 1.5762,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 3.1687344913151363,
463
+ "grad_norm": 1.028609275817871,
464
+ "learning_rate": 0.0001,
465
+ "loss": 1.5452,
466
+ "step": 640
467
+ },
468
+ {
469
+ "epoch": 3.2183622828784118,
470
+ "grad_norm": 1.214414358139038,
471
+ "learning_rate": 0.0001,
472
+ "loss": 1.5548,
473
+ "step": 650
474
+ },
475
+ {
476
+ "epoch": 3.267990074441687,
477
+ "grad_norm": 1.6931719779968262,
478
+ "learning_rate": 0.0001,
479
+ "loss": 1.545,
480
+ "step": 660
481
+ },
482
+ {
483
+ "epoch": 3.3176178660049627,
484
+ "grad_norm": 1.1534652709960938,
485
+ "learning_rate": 0.0001,
486
+ "loss": 1.5385,
487
+ "step": 670
488
+ },
489
+ {
490
+ "epoch": 3.367245657568238,
491
+ "grad_norm": 1.2802734375,
492
+ "learning_rate": 0.0001,
493
+ "loss": 1.5327,
494
+ "step": 680
495
+ },
496
+ {
497
+ "epoch": 3.4168734491315136,
498
+ "grad_norm": 1.7800501585006714,
499
+ "learning_rate": 0.0001,
500
+ "loss": 1.5577,
501
+ "step": 690
502
+ },
503
+ {
504
+ "epoch": 3.466501240694789,
505
+ "grad_norm": 1.2474421262741089,
506
+ "learning_rate": 0.0001,
507
+ "loss": 1.5394,
508
+ "step": 700
509
+ },
510
+ {
511
+ "epoch": 3.5161290322580645,
512
+ "grad_norm": 1.0985565185546875,
513
+ "learning_rate": 0.0001,
514
+ "loss": 1.557,
515
+ "step": 710
516
+ },
517
+ {
518
+ "epoch": 3.56575682382134,
519
+ "grad_norm": 1.0926990509033203,
520
+ "learning_rate": 0.0001,
521
+ "loss": 1.566,
522
+ "step": 720
523
+ },
524
+ {
525
+ "epoch": 3.6153846153846154,
526
+ "grad_norm": 1.0945656299591064,
527
+ "learning_rate": 0.0001,
528
+ "loss": 1.5391,
529
+ "step": 730
530
+ },
531
+ {
532
+ "epoch": 3.665012406947891,
533
+ "grad_norm": 1.6432324647903442,
534
+ "learning_rate": 0.0001,
535
+ "loss": 1.5572,
536
+ "step": 740
537
+ },
538
+ {
539
+ "epoch": 3.7146401985111663,
540
+ "grad_norm": 1.3223881721496582,
541
+ "learning_rate": 0.0001,
542
+ "loss": 1.538,
543
+ "step": 750
544
+ },
545
+ {
546
+ "epoch": 3.764267990074442,
547
+ "grad_norm": 1.4920518398284912,
548
+ "learning_rate": 0.0001,
549
+ "loss": 1.5552,
550
+ "step": 760
551
+ },
552
+ {
553
+ "epoch": 3.8138957816377173,
554
+ "grad_norm": 0.9920836687088013,
555
+ "learning_rate": 0.0001,
556
+ "loss": 1.5224,
557
+ "step": 770
558
+ },
559
+ {
560
+ "epoch": 3.8635235732009927,
561
+ "grad_norm": 1.6817526817321777,
562
+ "learning_rate": 0.0001,
563
+ "loss": 1.5647,
564
+ "step": 780
565
+ },
566
+ {
567
+ "epoch": 3.9131513647642677,
568
+ "grad_norm": 1.54438054561615,
569
+ "learning_rate": 0.0001,
570
+ "loss": 1.5519,
571
+ "step": 790
572
+ },
573
+ {
574
+ "epoch": 3.962779156327543,
575
+ "grad_norm": 1.1157947778701782,
576
+ "learning_rate": 0.0001,
577
+ "loss": 1.5455,
578
+ "step": 800
579
  }
580
  ],
581
  "logging_steps": 10,
 
590
  "should_evaluate": false,
591
  "should_log": false,
592
  "should_save": true,
593
+ "should_training_stop": true
594
  },
595
  "attributes": {}
596
  }
597
  },
598
+ "total_flos": 8.36943973711872e+16,
599
  "train_batch_size": 24,
600
  "trial_name": null,
601
  "trial_params": null