pretrain
Browse files
scripts/base_instruct_datasets.py
CHANGED
@@ -38,9 +38,14 @@ base_instruct_datasets = [
|
|
38 |
]},
|
39 |
|
40 |
# 21.1 MB, 1,000
|
41 |
-
{'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train', 'transform': lambda r: [
|
42 |
{'role': 'system', 'content': R1_SYSTEM_PROMPT},
|
43 |
{'role': 'user', 'content': r.get('question') or ''},
|
44 |
{'role': 'assistant', 'content': '<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n' + (r.get('solution') or '')},
|
45 |
-
]}
|
|
|
|
|
|
|
|
|
|
|
46 |
]
|
|
|
38 |
]},
|
39 |
|
40 |
# 21.1 MB, 1,000
|
41 |
+
{'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train[0%:50%]', 'transform': lambda r: [
|
42 |
{'role': 'system', 'content': R1_SYSTEM_PROMPT},
|
43 |
{'role': 'user', 'content': r.get('question') or ''},
|
44 |
{'role': 'assistant', 'content': '<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n' + (r.get('solution') or '')},
|
45 |
+
]},
|
46 |
+
{'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train[50%:100%]', 'transform': lambda r: [
|
47 |
+
{'role': 'system', 'content': R1_SYSTEM_PROMPT},
|
48 |
+
{'role': 'user', 'content': r.get('question') or ''},
|
49 |
+
{'role': 'assistant', 'content': '<question>\n' + (r.get('question') or '') + '\n</question>\n<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n<answer>\n' + (r.get('solution') or '') + '\n</answer>'},
|
50 |
+
]},
|
51 |
]
|