Fix local path loading and custom strategy type
Browse files
README.md
CHANGED
|
@@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
|
| 237 |
#### How to add custom prompts
|
| 238 |
|
| 239 |
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see the other files there for examples.
|
| 240 |
-
2. Use your custom file name as the dataset type
|
| 241 |
|
| 242 |
Optionally, download some datasets, see [data/README.md](data/README.md)
|
| 243 |
|
|
@@ -255,10 +255,18 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
|
|
| 255 |
|
| 256 |
- dataset
|
| 257 |
```yaml
|
|
|
|
|
|
|
|
|
|
| 258 |
datasets:
|
| 259 |
-
- path: vicgalle/alpaca-gpt4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
type: alpaca # format from earlier
|
| 261 |
-
sequence_len: 2048 # max token length / prompt
|
| 262 |
```
|
| 263 |
|
| 264 |
- loading
|
|
@@ -328,10 +336,10 @@ tf32: true # require >=ampere
|
|
| 328 |
|
| 329 |
# a list of one or more datasets to finetune the model with
|
| 330 |
datasets:
|
| 331 |
-
#
|
| 332 |
- path: vicgalle/alpaca-gpt4
|
| 333 |
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
| 334 |
-
type: alpaca # format
|
| 335 |
data_files: # path to source data files
|
| 336 |
shards: # number of shards to split data into
|
| 337 |
|
|
|
|
| 237 |
#### How to add custom prompts
|
| 238 |
|
| 239 |
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see the other files there for examples.
|
| 240 |
+
2. Use your custom file name as the dataset type, in the form `<prompt_strategies_file>.load_<load_fn>`.
|
| 241 |
|
| 242 |
Optionally, download some datasets, see [data/README.md](data/README.md)
|
| 243 |
|
|
|
|
| 255 |
|
| 256 |
- dataset
|
| 257 |
```yaml
|
| 258 |
+
sequence_len: 2048 # max token length for prompt
|
| 259 |
+
|
| 260 |
+
# huggingface repo
|
| 261 |
datasets:
|
| 262 |
+
- path: vicgalle/alpaca-gpt4
|
| 263 |
+
type: alpaca # format from earlier
|
| 264 |
+
|
| 265 |
+
# local
|
| 266 |
+
datasets:
|
| 267 |
+
- path: json
|
| 268 |
+
data_files: data.jsonl # or json
|
| 269 |
type: alpaca # format from earlier
|
|
|
|
| 270 |
```
|
| 271 |
|
| 272 |
- loading
|
|
|
|
| 336 |
|
| 337 |
# a list of one or more datasets to finetune the model with
|
| 338 |
datasets:
|
| 339 |
+
# HF dataset repo, or "json" for a local dataset (in that case, make sure to fill in data_files)
|
| 340 |
- path: vicgalle/alpaca-gpt4
|
| 341 |
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
| 342 |
+
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
| 343 |
data_files: # path to source data files
|
| 344 |
shards: # number of shards to split data into
|
| 345 |
|