li-muyang committed on
Commit
dda55a0
·
verified ·
1 Parent(s): ce4b95c

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.0243
24
 
25
  ## Model description
26
 
@@ -44,39 +44,34 @@ The following hyperparameters were used during training:
44
  - eval_batch_size: 16
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
- - num_devices: 16
 
48
  - total_train_batch_size: 128
49
- - total_eval_batch_size: 256
50
- - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
51
  - lr_scheduler_type: cosine
52
  - lr_scheduler_warmup_ratio: 0.1
53
- - num_epochs: 3
54
 
55
  ### Training results
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:------:|:----:|:---------------:|
59
- | 0.9838 | 0.1845 | 200 | 0.9937 |
60
- | 1.0162 | 0.3690 | 400 | 1.0329 |
61
- | 1.0095 | 0.5535 | 600 | 1.0302 |
62
- | 0.9857 | 0.7380 | 800 | 1.0204 |
63
- | 0.9803 | 0.9225 | 1000 | 1.0051 |
64
- | 0.736 | 1.1070 | 1200 | 1.0061 |
65
- | 0.7249 | 1.2915 | 1400 | 1.0004 |
66
- | 0.7355 | 1.4760 | 1600 | 0.9855 |
67
- | 0.7151 | 1.6605 | 1800 | 0.9713 |
68
- | 0.7023 | 1.8450 | 2000 | 0.9557 |
69
- | 0.3925 | 2.0295 | 2200 | 1.0150 |
70
- | 0.3871 | 2.2140 | 2400 | 1.0319 |
71
- | 0.3927 | 2.3985 | 2600 | 1.0269 |
72
- | 0.3872 | 2.5830 | 2800 | 1.0267 |
73
- | 0.3918 | 2.7675 | 3000 | 1.0242 |
74
- | 0.3764 | 2.9520 | 3200 | 1.0243 |
75
 
76
 
77
  ### Framework versions
78
 
79
- - Transformers 4.51.3
80
  - Pytorch 2.5.1+rocm6.2
81
  - Datasets 3.5.0
82
- - Tokenizers 0.21.1
 
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.9423
24
 
25
  ## Model description
26
 
 
44
  - eval_batch_size: 16
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
+ - num_devices: 8
48
+ - gradient_accumulation_steps: 2
49
  - total_train_batch_size: 128
50
+ - total_eval_batch_size: 128
51
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 1
55
 
56
  ### Training results
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.0186 | 0.0923 | 100 | 1.0212 |
61
+ | 1.029 | 0.1846 | 200 | 1.0410 |
62
+ | 1.0367 | 0.2769 | 300 | 1.0391 |
63
+ | 1.0094 | 0.3692 | 400 | 1.0263 |
64
+ | 1.0163 | 0.4615 | 500 | 1.0116 |
65
+ | 0.9715 | 0.5538 | 600 | 0.9919 |
66
+ | 0.9408 | 0.6461 | 700 | 0.9743 |
67
+ | 0.925 | 0.7383 | 800 | 0.9587 |
68
+ | 0.936 | 0.8306 | 900 | 0.9477 |
69
+ | 0.9192 | 0.9229 | 1000 | 0.9423 |
 
 
 
 
 
 
70
 
71
 
72
  ### Framework versions
73
 
74
+ - Transformers 4.45.2
75
  - Pytorch 2.5.1+rocm6.2
76
  - Datasets 3.5.0
77
+ - Tokenizers 0.20.3
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 1361805280542720.0,
4
- "train_loss": 0.7043063408920832,
5
- "train_runtime": 81819.3933,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 5.085,
8
- "train_steps_per_second": 0.04
9
  }
 
1
  {
2
+ "epoch": 0.9995385325334564,
3
+ "total_flos": 453306954547200.0,
4
+ "train_loss": 0.9835369018966802,
5
+ "train_runtime": 35743.0085,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 3.88,
8
+ "train_steps_per_second": 0.03
9
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.51.3"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.45.2"
6
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 3.0,
3
- "total_flos": 1361805280542720.0,
4
- "train_loss": 0.7043063408920832,
5
- "train_runtime": 81819.3933,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 5.085,
8
- "train_steps_per_second": 0.04
9
  }
 
1
  {
2
+ "epoch": 0.9995385325334564,
3
+ "total_flos": 453306954547200.0,
4
+ "train_loss": 0.9835369018966802,
5
+ "train_runtime": 35743.0085,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 3.88,
8
+ "train_steps_per_second": 0.03
9
  }
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff