Model save
Browse files- README.md +5 -4
- all_results.json +5 -5
- train_results.json +5 -5
- trainer_state.json +509 -12
README.md
CHANGED
@@ -1,16 +1,17 @@
|
|
1 |
---
|
2 |
base_model: Qwen/Qwen2.5-Math-1.5B-Instruct
|
3 |
-
datasets: GAIR/LIMO
|
4 |
library_name: transformers
|
|
|
5 |
tags:
|
6 |
- generated_from_trainer
|
7 |
-
-
|
|
|
8 |
licence: license
|
9 |
---
|
10 |
|
11 |
-
# Model Card for
|
12 |
|
13 |
-
This model is a fine-tuned version of [Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)
|
14 |
It has been trained using [TRL](https://github.com/huggingface/trl).
|
15 |
|
16 |
## Quick start
|
|
|
1 |
---
|
2 |
base_model: Qwen/Qwen2.5-Math-1.5B-Instruct
|
|
|
3 |
library_name: transformers
|
4 |
+
model_name: Qwen2.5-1.5B-Math-Instruct-LIMO
|
5 |
tags:
|
6 |
- generated_from_trainer
|
7 |
+
- trl
|
8 |
+
- sft
|
9 |
licence: license
|
10 |
---
|
11 |
|
12 |
+
# Model Card for Qwen2.5-1.5B-Math-Instruct-LIMO
|
13 |
|
14 |
+
This model is a fine-tuned version of [Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct).
|
15 |
It has been trained using [TRL](https://github.com/huggingface/trl).
|
16 |
|
17 |
## Quick start
|
all_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"total_flos":
|
3 |
-
"train_loss": 0.
|
4 |
-
"train_runtime":
|
5 |
"train_samples": 817,
|
6 |
-
"train_samples_per_second":
|
7 |
-
"train_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"total_flos": 40566502981632.0,
|
3 |
+
"train_loss": 0.25444099340545995,
|
4 |
+
"train_runtime": 2064.0536,
|
5 |
"train_samples": 817,
|
6 |
+
"train_samples_per_second": 22.042,
|
7 |
+
"train_steps_per_second": 0.345
|
8 |
}
|
train_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
-
"total_flos":
|
3 |
-
"train_loss": 0.
|
4 |
-
"train_runtime":
|
5 |
"train_samples": 817,
|
6 |
-
"train_samples_per_second":
|
7 |
-
"train_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"total_flos": 40566502981632.0,
|
3 |
+
"train_loss": 0.25444099340545995,
|
4 |
+
"train_runtime": 2064.0536,
|
5 |
"train_samples": 817,
|
6 |
+
"train_samples_per_second": 22.042,
|
7 |
+
"train_steps_per_second": 0.345
|
8 |
}
|
trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch":
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -506,19 +506,516 @@
|
|
506 |
"step": 355
|
507 |
},
|
508 |
{
|
509 |
-
"epoch": 4.
|
510 |
-
"
|
511 |
-
"
|
512 |
-
"
|
513 |
-
"
|
514 |
-
|
515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
516 |
}
|
517 |
],
|
518 |
"logging_steps": 5,
|
519 |
-
"max_steps":
|
520 |
"num_input_tokens_seen": 0,
|
521 |
-
"num_train_epochs":
|
522 |
"save_steps": 200,
|
523 |
"stateful_callbacks": {
|
524 |
"TrainerControl": {
|
@@ -532,7 +1029,7 @@
|
|
532 |
"attributes": {}
|
533 |
}
|
534 |
},
|
535 |
-
"total_flos":
|
536 |
"train_batch_size": 8,
|
537 |
"trial_name": null,
|
538 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 8.0,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 712,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
506 |
"step": 355
|
507 |
},
|
508 |
{
|
509 |
+
"epoch": 4.044943820224719,
|
510 |
+
"grad_norm": 0.4074217412543327,
|
511 |
+
"learning_rate": 1.1585150086919896e-05,
|
512 |
+
"loss": 0.6009,
|
513 |
+
"step": 360
|
514 |
+
},
|
515 |
+
{
|
516 |
+
"epoch": 4.101123595505618,
|
517 |
+
"grad_norm": 0.4271665264127198,
|
518 |
+
"learning_rate": 1.137632369284973e-05,
|
519 |
+
"loss": 0.6207,
|
520 |
+
"step": 365
|
521 |
+
},
|
522 |
+
{
|
523 |
+
"epoch": 4.157303370786517,
|
524 |
+
"grad_norm": 0.43109911858630096,
|
525 |
+
"learning_rate": 1.1167294115304992e-05,
|
526 |
+
"loss": 0.5911,
|
527 |
+
"step": 370
|
528 |
+
},
|
529 |
+
{
|
530 |
+
"epoch": 4.213483146067416,
|
531 |
+
"grad_norm": 0.43839912783355806,
|
532 |
+
"learning_rate": 1.0958174212861062e-05,
|
533 |
+
"loss": 0.6011,
|
534 |
+
"step": 375
|
535 |
+
},
|
536 |
+
{
|
537 |
+
"epoch": 4.269662921348314,
|
538 |
+
"grad_norm": 0.4190687745455898,
|
539 |
+
"learning_rate": 1.0749076892861227e-05,
|
540 |
+
"loss": 0.5983,
|
541 |
+
"step": 380
|
542 |
+
},
|
543 |
+
{
|
544 |
+
"epoch": 4.325842696629214,
|
545 |
+
"grad_norm": 0.4272292754884261,
|
546 |
+
"learning_rate": 1.0540115050456152e-05,
|
547 |
+
"loss": 0.6298,
|
548 |
+
"step": 385
|
549 |
+
},
|
550 |
+
{
|
551 |
+
"epoch": 4.382022471910112,
|
552 |
+
"grad_norm": 0.4188166195248199,
|
553 |
+
"learning_rate": 1.0331401507649868e-05,
|
554 |
+
"loss": 0.614,
|
555 |
+
"step": 390
|
556 |
+
},
|
557 |
+
{
|
558 |
+
"epoch": 4.438202247191011,
|
559 |
+
"grad_norm": 0.46031386712411543,
|
560 |
+
"learning_rate": 1.012304895238529e-05,
|
561 |
+
"loss": 0.5934,
|
562 |
+
"step": 395
|
563 |
+
},
|
564 |
+
{
|
565 |
+
"epoch": 4.49438202247191,
|
566 |
+
"grad_norm": 0.4198160623739865,
|
567 |
+
"learning_rate": 9.915169877702096e-06,
|
568 |
+
"loss": 0.5816,
|
569 |
+
"step": 400
|
570 |
+
},
|
571 |
+
{
|
572 |
+
"epoch": 4.550561797752809,
|
573 |
+
"grad_norm": 0.41877479862746064,
|
574 |
+
"learning_rate": 9.707876520999864e-06,
|
575 |
+
"loss": 0.6071,
|
576 |
+
"step": 405
|
577 |
+
},
|
578 |
+
{
|
579 |
+
"epoch": 4.606741573033708,
|
580 |
+
"grad_norm": 0.445785010864897,
|
581 |
+
"learning_rate": 9.501280803439204e-06,
|
582 |
+
"loss": 0.5816,
|
583 |
+
"step": 410
|
584 |
+
},
|
585 |
+
{
|
586 |
+
"epoch": 4.662921348314606,
|
587 |
+
"grad_norm": 0.4481588101026953,
|
588 |
+
"learning_rate": 9.295494269513659e-06,
|
589 |
+
"loss": 0.6006,
|
590 |
+
"step": 415
|
591 |
+
},
|
592 |
+
{
|
593 |
+
"epoch": 4.719101123595506,
|
594 |
+
"grad_norm": 0.4260960071459342,
|
595 |
+
"learning_rate": 9.090628026824941e-06,
|
596 |
+
"loss": 0.6357,
|
597 |
+
"step": 420
|
598 |
+
},
|
599 |
+
{
|
600 |
+
"epoch": 4.775280898876405,
|
601 |
+
"grad_norm": 0.4192037207998735,
|
602 |
+
"learning_rate": 8.886792686094096e-06,
|
603 |
+
"loss": 0.6006,
|
604 |
+
"step": 425
|
605 |
+
},
|
606 |
+
{
|
607 |
+
"epoch": 4.831460674157303,
|
608 |
+
"grad_norm": 0.43566168980415704,
|
609 |
+
"learning_rate": 8.684098301440903e-06,
|
610 |
+
"loss": 0.6267,
|
611 |
+
"step": 430
|
612 |
+
},
|
613 |
+
{
|
614 |
+
"epoch": 4.887640449438202,
|
615 |
+
"grad_norm": 0.42117325698617175,
|
616 |
+
"learning_rate": 8.482654310963817e-06,
|
617 |
+
"loss": 0.595,
|
618 |
+
"step": 435
|
619 |
+
},
|
620 |
+
{
|
621 |
+
"epoch": 4.943820224719101,
|
622 |
+
"grad_norm": 0.4414097385180421,
|
623 |
+
"learning_rate": 8.2825694776525e-06,
|
624 |
+
"loss": 0.5871,
|
625 |
+
"step": 440
|
626 |
+
},
|
627 |
+
{
|
628 |
+
"epoch": 5.0,
|
629 |
+
"grad_norm": 0.41448225752045526,
|
630 |
+
"learning_rate": 8.083951830664867e-06,
|
631 |
+
"loss": 0.5838,
|
632 |
+
"step": 445
|
633 |
+
},
|
634 |
+
{
|
635 |
+
"epoch": 5.056179775280899,
|
636 |
+
"grad_norm": 0.44467080989005614,
|
637 |
+
"learning_rate": 7.886908607000321e-06,
|
638 |
+
"loss": 0.5612,
|
639 |
+
"step": 450
|
640 |
+
},
|
641 |
+
{
|
642 |
+
"epoch": 5.112359550561798,
|
643 |
+
"grad_norm": 0.48267692647228083,
|
644 |
+
"learning_rate": 7.691546193600702e-06,
|
645 |
+
"loss": 0.5438,
|
646 |
+
"step": 455
|
647 |
+
},
|
648 |
+
{
|
649 |
+
"epoch": 5.168539325842697,
|
650 |
+
"grad_norm": 0.46882770217949304,
|
651 |
+
"learning_rate": 7.497970069910192e-06,
|
652 |
+
"loss": 0.5145,
|
653 |
+
"step": 460
|
654 |
+
},
|
655 |
+
{
|
656 |
+
"epoch": 5.224719101123595,
|
657 |
+
"grad_norm": 0.45016082689033277,
|
658 |
+
"learning_rate": 7.306284750925192e-06,
|
659 |
+
"loss": 0.5392,
|
660 |
+
"step": 465
|
661 |
+
},
|
662 |
+
{
|
663 |
+
"epoch": 5.280898876404494,
|
664 |
+
"grad_norm": 0.4904333208280953,
|
665 |
+
"learning_rate": 7.116593730764929e-06,
|
666 |
+
"loss": 0.5172,
|
667 |
+
"step": 470
|
668 |
+
},
|
669 |
+
{
|
670 |
+
"epoch": 5.337078651685394,
|
671 |
+
"grad_norm": 0.5047681974857045,
|
672 |
+
"learning_rate": 6.928999426793234e-06,
|
673 |
+
"loss": 0.5408,
|
674 |
+
"step": 475
|
675 |
+
},
|
676 |
+
{
|
677 |
+
"epoch": 5.393258426966292,
|
678 |
+
"grad_norm": 0.46939981998911995,
|
679 |
+
"learning_rate": 6.743603124321712e-06,
|
680 |
+
"loss": 0.5408,
|
681 |
+
"step": 480
|
682 |
+
},
|
683 |
+
{
|
684 |
+
"epoch": 5.449438202247191,
|
685 |
+
"grad_norm": 0.48583542280728176,
|
686 |
+
"learning_rate": 6.5605049219240635e-06,
|
687 |
+
"loss": 0.5564,
|
688 |
+
"step": 485
|
689 |
+
},
|
690 |
+
{
|
691 |
+
"epoch": 5.50561797752809,
|
692 |
+
"grad_norm": 0.445507769617856,
|
693 |
+
"learning_rate": 6.379803677391223e-06,
|
694 |
+
"loss": 0.5272,
|
695 |
+
"step": 490
|
696 |
+
},
|
697 |
+
{
|
698 |
+
"epoch": 5.561797752808989,
|
699 |
+
"grad_norm": 0.48127792996455504,
|
700 |
+
"learning_rate": 6.201596954356362e-06,
|
701 |
+
"loss": 0.5221,
|
702 |
+
"step": 495
|
703 |
+
},
|
704 |
+
{
|
705 |
+
"epoch": 5.617977528089888,
|
706 |
+
"grad_norm": 0.48763800709480937,
|
707 |
+
"learning_rate": 6.02598096961865e-06,
|
708 |
+
"loss": 0.5017,
|
709 |
+
"step": 500
|
710 |
+
},
|
711 |
+
{
|
712 |
+
"epoch": 5.674157303370786,
|
713 |
+
"grad_norm": 0.45141538643328166,
|
714 |
+
"learning_rate": 5.853050541194187e-06,
|
715 |
+
"loss": 0.5301,
|
716 |
+
"step": 505
|
717 |
+
},
|
718 |
+
{
|
719 |
+
"epoch": 5.730337078651686,
|
720 |
+
"grad_norm": 0.46888316703181054,
|
721 |
+
"learning_rate": 5.682899037122178e-06,
|
722 |
+
"loss": 0.5088,
|
723 |
+
"step": 510
|
724 |
+
},
|
725 |
+
{
|
726 |
+
"epoch": 5.786516853932584,
|
727 |
+
"grad_norm": 0.49578196304683,
|
728 |
+
"learning_rate": 5.515618325053952e-06,
|
729 |
+
"loss": 0.5051,
|
730 |
+
"step": 515
|
731 |
+
},
|
732 |
+
{
|
733 |
+
"epoch": 5.842696629213483,
|
734 |
+
"grad_norm": 0.4766026956952897,
|
735 |
+
"learning_rate": 5.351298722652064e-06,
|
736 |
+
"loss": 0.509,
|
737 |
+
"step": 520
|
738 |
+
},
|
739 |
+
{
|
740 |
+
"epoch": 5.898876404494382,
|
741 |
+
"grad_norm": 0.5255442360578876,
|
742 |
+
"learning_rate": 5.190028948826304e-06,
|
743 |
+
"loss": 0.5465,
|
744 |
+
"step": 525
|
745 |
+
},
|
746 |
+
{
|
747 |
+
"epoch": 5.955056179775281,
|
748 |
+
"grad_norm": 0.5102538127102871,
|
749 |
+
"learning_rate": 5.031896075832846e-06,
|
750 |
+
"loss": 0.5186,
|
751 |
+
"step": 530
|
752 |
+
},
|
753 |
+
{
|
754 |
+
"epoch": 6.01123595505618,
|
755 |
+
"grad_norm": 0.5022519456849229,
|
756 |
+
"learning_rate": 4.876985482262482e-06,
|
757 |
+
"loss": 0.5204,
|
758 |
+
"step": 535
|
759 |
+
},
|
760 |
+
{
|
761 |
+
"epoch": 6.067415730337078,
|
762 |
+
"grad_norm": 0.46834498920984025,
|
763 |
+
"learning_rate": 4.725380806943299e-06,
|
764 |
+
"loss": 0.4707,
|
765 |
+
"step": 540
|
766 |
+
},
|
767 |
+
{
|
768 |
+
"epoch": 6.123595505617978,
|
769 |
+
"grad_norm": 0.5403111606747254,
|
770 |
+
"learning_rate": 4.577163903782655e-06,
|
771 |
+
"loss": 0.4491,
|
772 |
+
"step": 545
|
773 |
+
},
|
774 |
+
{
|
775 |
+
"epoch": 6.179775280898877,
|
776 |
+
"grad_norm": 0.5964108863972761,
|
777 |
+
"learning_rate": 4.432414797572894e-06,
|
778 |
+
"loss": 0.4709,
|
779 |
+
"step": 550
|
780 |
+
},
|
781 |
+
{
|
782 |
+
"epoch": 6.235955056179775,
|
783 |
+
"grad_norm": 0.4994302712693112,
|
784 |
+
"learning_rate": 4.291211640784608e-06,
|
785 |
+
"loss": 0.4708,
|
786 |
+
"step": 555
|
787 |
+
},
|
788 |
+
{
|
789 |
+
"epoch": 6.292134831460674,
|
790 |
+
"grad_norm": 0.49243733113864285,
|
791 |
+
"learning_rate": 4.153630671370821e-06,
|
792 |
+
"loss": 0.4926,
|
793 |
+
"step": 560
|
794 |
+
},
|
795 |
+
{
|
796 |
+
"epoch": 6.348314606741573,
|
797 |
+
"grad_norm": 0.49658882787661995,
|
798 |
+
"learning_rate": 4.019746171604824e-06,
|
799 |
+
"loss": 0.502,
|
800 |
+
"step": 565
|
801 |
+
},
|
802 |
+
{
|
803 |
+
"epoch": 6.404494382022472,
|
804 |
+
"grad_norm": 0.5418840142061598,
|
805 |
+
"learning_rate": 3.889630427973951e-06,
|
806 |
+
"loss": 0.4602,
|
807 |
+
"step": 570
|
808 |
+
},
|
809 |
+
{
|
810 |
+
"epoch": 6.460674157303371,
|
811 |
+
"grad_norm": 0.5008443330249364,
|
812 |
+
"learning_rate": 3.763353692150864e-06,
|
813 |
+
"loss": 0.4665,
|
814 |
+
"step": 575
|
815 |
+
},
|
816 |
+
{
|
817 |
+
"epoch": 6.51685393258427,
|
818 |
+
"grad_norm": 0.48589030681678613,
|
819 |
+
"learning_rate": 3.6409841430635166e-06,
|
820 |
+
"loss": 0.4497,
|
821 |
+
"step": 580
|
822 |
+
},
|
823 |
+
{
|
824 |
+
"epoch": 6.573033707865169,
|
825 |
+
"grad_norm": 0.5820871330261026,
|
826 |
+
"learning_rate": 3.522587850084197e-06,
|
827 |
+
"loss": 0.4629,
|
828 |
+
"step": 585
|
829 |
+
},
|
830 |
+
{
|
831 |
+
"epoch": 6.629213483146067,
|
832 |
+
"grad_norm": 0.49834548702881326,
|
833 |
+
"learning_rate": 3.408228737357575e-06,
|
834 |
+
"loss": 0.483,
|
835 |
+
"step": 590
|
836 |
+
},
|
837 |
+
{
|
838 |
+
"epoch": 6.685393258426966,
|
839 |
+
"grad_norm": 0.5052799618260936,
|
840 |
+
"learning_rate": 3.297968549286974e-06,
|
841 |
+
"loss": 0.4808,
|
842 |
+
"step": 595
|
843 |
+
},
|
844 |
+
{
|
845 |
+
"epoch": 6.741573033707866,
|
846 |
+
"grad_norm": 0.492591898344833,
|
847 |
+
"learning_rate": 3.191866817197539e-06,
|
848 |
+
"loss": 0.472,
|
849 |
+
"step": 600
|
850 |
+
},
|
851 |
+
{
|
852 |
+
"epoch": 6.797752808988764,
|
853 |
+
"grad_norm": 0.49032082408314204,
|
854 |
+
"learning_rate": 3.089980827194276e-06,
|
855 |
+
"loss": 0.4611,
|
856 |
+
"step": 605
|
857 |
+
},
|
858 |
+
{
|
859 |
+
"epoch": 6.853932584269663,
|
860 |
+
"grad_norm": 0.4959126643842184,
|
861 |
+
"learning_rate": 2.9923655892323144e-06,
|
862 |
+
"loss": 0.4726,
|
863 |
+
"step": 610
|
864 |
+
},
|
865 |
+
{
|
866 |
+
"epoch": 6.910112359550562,
|
867 |
+
"grad_norm": 0.5335774103596451,
|
868 |
+
"learning_rate": 2.8990738074161196e-06,
|
869 |
+
"loss": 0.4792,
|
870 |
+
"step": 615
|
871 |
+
},
|
872 |
+
{
|
873 |
+
"epoch": 6.966292134831461,
|
874 |
+
"grad_norm": 0.5201706543464556,
|
875 |
+
"learning_rate": 2.8101558515436506e-06,
|
876 |
+
"loss": 0.4711,
|
877 |
+
"step": 620
|
878 |
+
},
|
879 |
+
{
|
880 |
+
"epoch": 7.022471910112359,
|
881 |
+
"grad_norm": 0.4623475019520009,
|
882 |
+
"learning_rate": 2.725659729910878e-06,
|
883 |
+
"loss": 0.4545,
|
884 |
+
"step": 625
|
885 |
+
},
|
886 |
+
{
|
887 |
+
"epoch": 7.078651685393258,
|
888 |
+
"grad_norm": 0.4834608390252829,
|
889 |
+
"learning_rate": 2.645631063391285e-06,
|
890 |
+
"loss": 0.443,
|
891 |
+
"step": 630
|
892 |
+
},
|
893 |
+
{
|
894 |
+
"epoch": 7.134831460674158,
|
895 |
+
"grad_norm": 0.5359592571307704,
|
896 |
+
"learning_rate": 2.570113060804401e-06,
|
897 |
+
"loss": 0.441,
|
898 |
+
"step": 635
|
899 |
+
},
|
900 |
+
{
|
901 |
+
"epoch": 7.191011235955056,
|
902 |
+
"grad_norm": 0.5433362481606256,
|
903 |
+
"learning_rate": 2.4991464955866314e-06,
|
904 |
+
"loss": 0.4264,
|
905 |
+
"step": 640
|
906 |
+
},
|
907 |
+
{
|
908 |
+
"epoch": 7.247191011235955,
|
909 |
+
"grad_norm": 0.5225953610583668,
|
910 |
+
"learning_rate": 2.432769683776995e-06,
|
911 |
+
"loss": 0.4145,
|
912 |
+
"step": 645
|
913 |
+
},
|
914 |
+
{
|
915 |
+
"epoch": 7.303370786516854,
|
916 |
+
"grad_norm": 0.5205000317985969,
|
917 |
+
"learning_rate": 2.371018463329651e-06,
|
918 |
+
"loss": 0.435,
|
919 |
+
"step": 650
|
920 |
+
},
|
921 |
+
{
|
922 |
+
"epoch": 7.359550561797753,
|
923 |
+
"grad_norm": 0.5007819526288158,
|
924 |
+
"learning_rate": 2.3139261747643925e-06,
|
925 |
+
"loss": 0.41,
|
926 |
+
"step": 655
|
927 |
+
},
|
928 |
+
{
|
929 |
+
"epoch": 7.415730337078652,
|
930 |
+
"grad_norm": 0.518942310039327,
|
931 |
+
"learning_rate": 2.261523643165532e-06,
|
932 |
+
"loss": 0.4502,
|
933 |
+
"step": 660
|
934 |
+
},
|
935 |
+
{
|
936 |
+
"epoch": 7.47191011235955,
|
937 |
+
"grad_norm": 0.49873700284836797,
|
938 |
+
"learning_rate": 2.2138391615389306e-06,
|
939 |
+
"loss": 0.4324,
|
940 |
+
"step": 665
|
941 |
+
},
|
942 |
+
{
|
943 |
+
"epoch": 7.52808988764045,
|
944 |
+
"grad_norm": 0.5208414783528387,
|
945 |
+
"learning_rate": 2.1708984755361205e-06,
|
946 |
+
"loss": 0.4449,
|
947 |
+
"step": 670
|
948 |
+
},
|
949 |
+
{
|
950 |
+
"epoch": 7.584269662921348,
|
951 |
+
"grad_norm": 0.5791900845785815,
|
952 |
+
"learning_rate": 2.1327247695538015e-06,
|
953 |
+
"loss": 0.4484,
|
954 |
+
"step": 675
|
955 |
+
},
|
956 |
+
{
|
957 |
+
"epoch": 7.640449438202247,
|
958 |
+
"grad_norm": 0.5438940813210439,
|
959 |
+
"learning_rate": 2.0993386542161944e-06,
|
960 |
+
"loss": 0.4685,
|
961 |
+
"step": 680
|
962 |
+
},
|
963 |
+
{
|
964 |
+
"epoch": 7.696629213483146,
|
965 |
+
"grad_norm": 0.5127534158556458,
|
966 |
+
"learning_rate": 2.070758155247017e-06,
|
967 |
+
"loss": 0.4339,
|
968 |
+
"step": 685
|
969 |
+
},
|
970 |
+
{
|
971 |
+
"epoch": 7.752808988764045,
|
972 |
+
"grad_norm": 0.5415209164546988,
|
973 |
+
"learning_rate": 2.0469987037371005e-06,
|
974 |
+
"loss": 0.4103,
|
975 |
+
"step": 690
|
976 |
+
},
|
977 |
+
{
|
978 |
+
"epoch": 7.808988764044944,
|
979 |
+
"grad_norm": 0.5136419675345572,
|
980 |
+
"learning_rate": 2.028073127812876e-06,
|
981 |
+
"loss": 0.4169,
|
982 |
+
"step": 695
|
983 |
+
},
|
984 |
+
{
|
985 |
+
"epoch": 7.865168539325842,
|
986 |
+
"grad_norm": 0.5319921953080505,
|
987 |
+
"learning_rate": 2.013991645710262e-06,
|
988 |
+
"loss": 0.4286,
|
989 |
+
"step": 700
|
990 |
+
},
|
991 |
+
{
|
992 |
+
"epoch": 7.921348314606742,
|
993 |
+
"grad_norm": 0.5733506841186511,
|
994 |
+
"learning_rate": 2.0047618602576594e-06,
|
995 |
+
"loss": 0.4505,
|
996 |
+
"step": 705
|
997 |
+
},
|
998 |
+
{
|
999 |
+
"epoch": 7.97752808988764,
|
1000 |
+
"grad_norm": 0.5142139821667071,
|
1001 |
+
"learning_rate": 2.0003887547710647e-06,
|
1002 |
+
"loss": 0.4202,
|
1003 |
+
"step": 710
|
1004 |
+
},
|
1005 |
+
{
|
1006 |
+
"epoch": 8.0,
|
1007 |
+
"step": 712,
|
1008 |
+
"total_flos": 40566502981632.0,
|
1009 |
+
"train_loss": 0.25444099340545995,
|
1010 |
+
"train_runtime": 2064.0536,
|
1011 |
+
"train_samples_per_second": 22.042,
|
1012 |
+
"train_steps_per_second": 0.345
|
1013 |
}
|
1014 |
],
|
1015 |
"logging_steps": 5,
|
1016 |
+
"max_steps": 712,
|
1017 |
"num_input_tokens_seen": 0,
|
1018 |
+
"num_train_epochs": 8,
|
1019 |
"save_steps": 200,
|
1020 |
"stateful_callbacks": {
|
1021 |
"TrainerControl": {
|
|
|
1029 |
"attributes": {}
|
1030 |
}
|
1031 |
},
|
1032 |
+
"total_flos": 40566502981632.0,
|
1033 |
"train_batch_size": 8,
|
1034 |
"trial_name": null,
|
1035 |
"trial_params": null
|