{
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 0,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 0,
    "stage3_max_reuse_distance": 0,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": [
        0.9,
        0.999
      ],
      "eps": 1e-8,
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "OneCycle",
    "params": {
      "cycle_min_lr": 0.00001,
      "cycle_max_lr": 0.00003,
      "cycle_first_step_size": 120
    }
  },
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}