pretrain
scripts/base_datasets.py
CHANGED
@@ -147,15 +147,3 @@ base_datasets = [
     # 15.6 MB, 24,926
     {'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
 ]
-
-base_datasets = [
-    #
-    # light instructions
-    #
-    # 44.3 MB, 51,760
-    {'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
-    # 11 MB, 12,564
-    {'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
-    # 15.6 MB, 24,926
-    {'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
-]
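For reference, each `base_datasets` entry pairs a Hugging Face dataset path with a `format` template that maps the dataset's columns into one plain training string. A minimal sketch of that idea, assuming the Hugging Face `datasets` library and a hypothetical `render_example` helper (the repo's actual preparation pipeline may differ):

# Minimal sketch: render one base_datasets entry into plain training text.
# `render_example` is illustrative, not necessarily this repo's helper.
from datasets import load_dataset

entry = {'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train',
         'format': '{instruction}\n{output}'}

def render_example(row: dict, fmt: str) -> str:
    # Fill the {instruction}/{output} placeholders from the row's columns.
    return fmt.format(**row)

ds = load_dataset(entry['path'], split=entry['split'])
print(render_example(ds[0], entry['format'])[:200])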
scripts/base_instruct_datasets.py
CHANGED
@@ -13,7 +13,7 @@ You are an AI assistant.
 Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
 
 Formatting Requirements:
--
+- Structure your replies using: <think>{reasoning}</think>{answer}
 - The <think></think> block should contain at least six reasoning steps when applicable.
 - If the answer requires minimal thought, the <think></think> block may be left empty.
 - The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
@@ -24,7 +24,10 @@ Response Guidelines:
 - Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
 - Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
 - Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
-- Maintain a professional, intelligent, and analytical tone in all interactions.
+- Maintain a professional, intelligent, and analytical tone in all interactions.
+
+If user provides <question>...</question> and expects <answer>...</answer>:
+- Structure your replies using: <question>{User’s exact input}</question><think>{reasoning}</think><answer>{answer}</answer>'''
 
 base_instruct_datasets = [
     # 65.7 MB, 11,578
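The expanded system prompt defines two reply shapes: the default <think>{reasoning}</think>{answer} layout, and a wrapped <question>...</question><think>...</think><answer>...</answer> layout when the user supplies a tagged question. A rough illustration of composing targets in those shapes (the tag layout comes from the prompt above; the helper itself is hypothetical, not this repo's data code):

# Illustrative only: build reply strings in the two shapes the system prompt describes.
def build_reply(reasoning: str, answer: str, question: str | None = None) -> str:
    if question is not None:
        # User supplied <question>...</question> and expects <answer>...</answer>.
        return f"<question>{question}</question><think>{reasoning}</think><answer>{answer}</answer>"
    # Default shape; the <think> block may stay empty for trivial answers.
    return f"<think>{reasoning}</think>{answer}"

print(build_reply("1) 6 times 7 ... 6) therefore 42.", "42"))
print(build_reply("1) parse ... 6) conclude.", "42", question="What is 6 x 7?"))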
scripts/prepare_base_datasets.py
CHANGED
@@ -20,7 +20,12 @@ seqs = [
     # (16385, 32769, 32769, 500),
     # (32769, 65537, 65537, 250),
     # (65537, 131073, 131073, 125),
+
     (0, 1073741824, 8193, 2000),
+    (8193, 16385, 16385, 1000),
+    (16385, 32769, 32769, 500),
+    (32769, 65537, 65537, 250),
+    (65537, 131073, 131073, 125),
 ]
 
 #
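The newly enabled `seqs` tuples appear to follow a pattern of (min_tokens, max_tokens, sequence_length, count): each doubling of sequence length halves the count, keeping the per-bucket token budget at roughly 16.4M tokens (for example, 8193 x 2000 and 16385 x 1000). That field reading is inferred from the values, not confirmed by the script; under that assumption, the buckets can be summarized as:

# Assumed reading of the tuples: (min_tokens, max_tokens, sequence_length, count).
# prepare_base_datasets.py may interpret these fields differently.
seqs = [
    (0, 1073741824, 8193, 2000),
    (8193, 16385, 16385, 1000),
    (16385, 32769, 32769, 500),
    (32769, 65537, 65537, 250),
    (65537, 131073, 131073, 125),
]

for min_len, max_len, seq_len, count in seqs:
    # Each bucket contributes roughly seq_len * count ~ 16.4M tokens.
    print(f"tokens in [{min_len}, {max_len}): sequence length {seq_len}, {count} sequences "
          f"(~{seq_len * count / 1e6:.1f}M tokens)")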
scripts/pretrain_base_model_0.yaml
CHANGED
@@ -61,7 +61,7 @@ train:
   global_batch_size: 512
 
   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size:
+  micro_batch_size: 1
 
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
   lr_warmup_steps: 2000
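Filling in `micro_batch_size: 1` against `global_batch_size: 512` implies the rest of the batch is made up through gradient accumulation. In the usual data-parallel scheme (assumed here, not taken from this trainer's source), the accumulation factor per rank is global / (micro x world size):

# Assumed relationship between the YAML fields above; exact behavior depends on the trainer.
global_batch_size = 512
micro_batch_size = 1

def grad_accum_steps(world_size: int) -> int:
    # Micro-batches each rank runs before a single optimizer step.
    return global_batch_size // (micro_batch_size * world_size)

for world_size in (1, 8):
    print(f"{world_size} device(s): {grad_accum_steps(world_size)} accumulation steps")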