pretrain
Browse files
scripts/base_instruct_datasets.py
CHANGED
|
@@ -38,9 +38,14 @@ base_instruct_datasets = [
|
|
| 38 |
]},
|
| 39 |
|
| 40 |
# 21.1 MB, 1,000
|
| 41 |
-
{'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train', 'transform': lambda r: [
|
| 42 |
{'role': 'system', 'content': R1_SYSTEM_PROMPT},
|
| 43 |
{'role': 'user', 'content': r.get('question') or ''},
|
| 44 |
{'role': 'assistant', 'content': '<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n' + (r.get('solution') or '')},
|
| 45 |
-
]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
]
|
|
|
|
| 38 |
]},
|
| 39 |
|
| 40 |
# 21.1 MB, 1,000
|
| 41 |
+
{'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train[0%:50%]', 'transform': lambda r: [
|
| 42 |
{'role': 'system', 'content': R1_SYSTEM_PROMPT},
|
| 43 |
{'role': 'user', 'content': r.get('question') or ''},
|
| 44 |
{'role': 'assistant', 'content': '<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n' + (r.get('solution') or '')},
|
| 45 |
+
]},
|
| 46 |
+
{'kind': 'instruct', 'path': 'simplescaling/s1K-1.1', 'split': 'train[50%:100%]', 'transform': lambda r: [
|
| 47 |
+
{'role': 'system', 'content': R1_SYSTEM_PROMPT},
|
| 48 |
+
{'role': 'user', 'content': r.get('question') or ''},
|
| 49 |
+
{'role': 'assistant', 'content': '<question>\n' + (r.get('question') or '') + '\n</question>\n<think>\n' + (r.get('deepseek_thinking_trajectory') or '') + '\n</think>\n<answer>\n' + (r.get('solution') or '') + '\n</answer>'},
|
| 50 |
+
]},
|
| 51 |
]
|