Set `gradient_clipping` to `auto` in DeepSpeed configs (#1382) [skip ci]
Browse files
deepspeed_configs/zero1.json
CHANGED
@@ -16,6 +16,7 @@
     "min_loss_scale": 1
   },
   "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
deepspeed_configs/zero2.json
CHANGED
@@ -20,6 +20,7 @@
     "min_loss_scale": 1
   },
   "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
deepspeed_configs/zero3.json
CHANGED
@@ -24,6 +24,7 @@
     "min_loss_scale": 1
   },
   "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
deepspeed_configs/zero3_bf16.json
CHANGED
@@ -24,6 +24,7 @@
     "min_loss_scale": 1
   },
   "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false