mtasic85 committed on
Commit 5286198 · 1 Parent(s): 2f59073
Files changed (2):
  1. README.md +33 -13
  2. scripts/pretrain_base_model_0.yaml +4 -4
README.md CHANGED
@@ -49,14 +49,34 @@ tags:
![logo](./misc/logo.jpg)

```bash
- time python -B prepare_core_datasets.py
+ time python -B prepare_base_datasets.py
```

```
+ i=0, min_len=0, max_len=1073741824, block_size=8193, chunk_size=16386000, len(dataset)=1496631, len(dataset) * block_size=12261897783
+ Total number of tokens in the optimized dataset '../base-data-0-0-1073741824-8193-2000' is 12261897783
+
+ i=1, min_len=8193, max_len=16385, block_size=16385, chunk_size=16385000, len(dataset)=78802, len(dataset) * block_size=1291170770
+ Total number of tokens in the optimized dataset '../base-data-1-8193-16385-16385-1000' is 1291170770
+
+ i=2, min_len=16385, max_len=32769, block_size=32769, chunk_size=16384500, len(dataset)=23511, len(dataset) * block_size=770431959
+ Total number of tokens in the optimized dataset '../base-data-2-16385-32769-32769-500' is 770431959
+
+ i=3, min_len=32769, max_len=65537, block_size=65537, chunk_size=16384250, len(dataset)=5128, len(dataset) * block_size=336073736
+ Total number of tokens in the optimized dataset '../base-data-3-32769-65537-65537-250' is 336073736
+
+ i=4, min_len=65537, max_len=131073, block_size=131073, chunk_size=16384125, len(dataset)=1169, len(dataset) * block_size=153224337
+ Total number of tokens in the optimized dataset '../base-data-4-65537-131073-131073-125' is 153224337
+
+ 46G ../base-data-0-0-1073741824-8193-2000
+ 4.9G ../base-data-1-8193-16385-16385-1000
+ 2.9G ../base-data-2-16385-32769-32769-500
+ 1.3G ../base-data-3-32769-65537-65537-250
+ 589M ../base-data-4-65537-131073-131073-125
  ```
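The bucket layout in this log is systematic: each `block_size` is a power of two plus one (e.g. 8193 = 8192 + 1, presumably one extra token so every window yields a shifted next-token target), `chunk_size` is `block_size` times the trailing count in the directory name, and each total is simply `len(dataset) * block_size`. A minimal sketch that re-derives the logged numbers (values copied from the output above; the formulas are inferred, not taken from `prepare_base_datasets.py` itself):

```python
# Re-derive the bucket stats printed above. Sample counts are copied from
# the log; chunk_size and token totals are computed from the inferred rules.
buckets = [
    # (min_len, max_len, block_size, blocks_per_chunk, num_samples)
    (0,     1073741824, 8193,   2000, 1496631),
    (8193,  16385,      16385,  1000, 78802),
    (16385, 32769,      32769,  500,  23511),
    (32769, 65537,      65537,  250,  5128),
    (65537, 131073,     131073, 125,  1169),
]

for i, (min_len, max_len, block_size, blocks_per_chunk, n) in enumerate(buckets):
    chunk_size = block_size * blocks_per_chunk   # e.g. 8193 * 2000 = 16386000
    total_tokens = n * block_size                # e.g. 1496631 * 8193 = 12261897783
    name = f'../base-data-{i}-{min_len}-{max_len}-{block_size}-{blocks_per_chunk}'
    print(f'{name}: chunk_size={chunk_size}, total_tokens={total_tokens}')
```

For i ≥ 1 each bucket's `block_size` equals its `max_len`, so longer documents are re-packed at full length, while bucket 0 (min_len 0, max_len 2^30) appears to be the catch-all split into 8193-token windows.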

```bash
- CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_0.yaml
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_base_model_0.yaml
```

```
 
@@ -65,54 +85,54 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
Backup `wandb`:

```bash
- mv wandb wandb-pretrain-core-0
+ mv wandb wandb-pretrain-base-0
```

Copy config:

```bash
- cp ../config-0.json ../out/pretrain-core-0/final/config.json
+ cp ../config-0.json ../out/pretrain-base-0/final/config.json
```

Chat with model:

```bash
- CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-core-0/final
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-base-0/final
```

```bash
- CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-core-0/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-core-0/final'
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-base-0/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-base-0/final'
```

```
```

```bash
- litgpt convert_pretrained_checkpoint ../out/pretrain-core-0/final ../out/pretrain-core-0/checkpoint
+ litgpt convert_pretrained_checkpoint ../out/pretrain-base-0/final ../out/pretrain-base-0/checkpoint
```

```bash
- CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_1.yaml
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_base_model_1.yaml
```

```bash
- litgpt convert_pretrained_checkpoint ../out/pretrain-core-1/final ../out/pretrain-core-1/checkpoint
+ litgpt convert_pretrained_checkpoint ../out/pretrain-base-1/final ../out/pretrain-base-1/checkpoint
```

```bash
- CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_2.yaml
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_base_model_2.yaml
```

```bash
- litgpt convert_pretrained_checkpoint ../out/pretrain-core-2/final ../out/pretrain-core-2/checkpoint
+ litgpt convert_pretrained_checkpoint ../out/pretrain-base-2/final ../out/pretrain-base-2/checkpoint
```

```bash
- CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_3.yaml
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_base_model_3.yaml
```

```bash
- CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-core-3/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-core-3/final'
+ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-base-3/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-base-3/final'
```

  ```
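Stages 0–3 above repeat one pattern: pretrain with the stage config, convert the stage's final checkpoint, move on (only stage 3 skips conversion in favor of evaluation). A hedged sketch of driving the whole sequence from Python — it runs exactly the commands shown in this README, and assumes each `pretrain_base_model_{i}.yaml` already points at the previous stage's converted checkpoint, since that wiring isn't shown here:

```python
# Hedged automation sketch: single GPU, stage-to-stage checkpoint wiring
# assumed to live inside the YAML configs rather than on the command line.
import os
import subprocess

env = {
    **os.environ,
    'CUDA_VISIBLE_DEVICES': '0',
    'CUDA_LAUNCH_BLOCKING': '0',
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
}

for i in range(4):
    subprocess.run(
        ['litgpt', 'pretrain', '--config', f'pretrain_base_model_{i}.yaml'],
        env=env, check=True,
    )
    if i < 3:
        # Consolidate this stage's final checkpoint for the next stage to load.
        subprocess.run(
            ['litgpt', 'convert_pretrained_checkpoint',
             f'../out/pretrain-base-{i}/final',
             f'../out/pretrain-base-{i}/checkpoint'],
            check=True,
        )
```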
scripts/pretrain_base_model_0.yaml CHANGED
@@ -58,19 +58,19 @@ train:
  log_interval: 1

  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
- global_batch_size: 512
+ global_batch_size: 64

  # Number of samples per data-parallel rank (type: int, default: 4)
- micro_batch_size: 1
+ micro_batch_size: 4

  # Number of iterations with learning rate warmup active (type: int, default: 2000)
- lr_warmup_steps: 2000
+ lr_warmup_steps: 100

  # Number of epochs to train on (type: Optional[int], default: null)
  epochs:

  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
- max_tokens: 32706456
+ max_tokens: 12261897783

  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  max_steps:
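The new values are mutually consistent: `global_batch_size / micro_batch_size = 64 / 4` gives 16 gradient-accumulation steps on the single visible GPU, and `max_tokens: 12261897783` is exactly the token count of the first bucket (`../base-data-0-0-1073741824-8193-2000`), i.e. one pass over that dataset. A quick check:

```python
# Sanity-check the updated config values (single GPU assumed, matching the
# CUDA_VISIBLE_DEVICES=0 commands above).
global_batch_size = 64
micro_batch_size = 4
grad_accum_steps = global_batch_size // micro_batch_size   # 16

max_tokens = 12_261_897_783            # new max_tokens
bucket0_tokens = 1_496_631 * 8_193     # len(dataset) * block_size for bucket 0
assert max_tokens == bucket0_tokens    # exactly one epoch over bucket 0

tokens_per_step = global_batch_size * 8_193   # one optimizer step of 8193-token blocks
print(max_tokens // tokens_per_step)          # -> 23384 optimizer steps in total
```

By contrast, the old combination could never have finished warmup: at 512 × 8193 ≈ 4.2M tokens per optimizer step (assuming the same 8193-token blocks), `max_tokens: 32706456` allowed fewer than 8 steps against 2000 warmup steps.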