Merge pull request #277 from cg123/dataset-name
Files changed:
- README.md +7 -0
- src/axolotl/utils/data.py +13 -14
README.md

```diff
@@ -262,6 +262,12 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
   - path: vicgalle/alpaca-gpt4
     type: alpaca # format from earlier
 
+# huggingface repo with specific configuration/subset
+datasets:
+  - path: EleutherAI/pile
+    name: enron_emails
+    type: completion # format from earlier
+
 # local
 datasets:
   - path: json
@@ -344,6 +350,7 @@ datasets:
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
     data_files: # path to source data files
     shards: # number of shards to split data into
+    name: # name of dataset configuration to load
 
 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
```
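For reference, the new `name:` key is forwarded to the `name` argument of `datasets.load_dataset`, which selects a named configuration (subset) of a Hub dataset. A minimal sketch of the equivalent direct call, using the values from the README example above (`streaming=True` is used here only to avoid downloading the full Pile):

```python
from datasets import load_dataset

# `path:` maps to the first positional argument, `name:` maps to name=.
# This selects only the "enron_emails" configuration of EleutherAI/pile.
ds = load_dataset("EleutherAI/pile", name="enron_emails", streaming=True)
```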
src/axolotl/utils/data.py

```diff
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
         try:
             load_dataset(
                 d.path,
+                name=d.name,
                 streaming=True,
                 use_auth_token=use_auth_token,
             )
@@ -107,6 +108,7 @@ def load_tokenized_prepared_datasets(
             if local_path.is_dir():
                 ds = load_dataset(
                     d.path,
+                    name=d.name,
                     data_files=d.data_files,
                     streaming=False,
                     split=None,
@@ -114,6 +116,7 @@ def load_tokenized_prepared_datasets(
             elif local_path.is_file():
                 ds = load_dataset(
                     "json",
+                    name=d.name,
                     data_files=d.path,
                     streaming=False,
                     split=None,
@@ -123,26 +126,22 @@ def load_tokenized_prepared_datasets(
                 "unhandled dataset load: local path exists, but is neither a directory or a file"
             )
         elif ds_from_hub:
-            if d.data_files:
-                ds = load_dataset(
-                    d.path,
-                    streaming=False,
-                    data_files=d.data_files,
-                    use_auth_token=use_auth_token,
-                )
-            else:
-                ds = load_dataset(
-                    d.path,
-                    streaming=False,
-                    use_auth_token=use_auth_token,
-                )
+            ds = load_dataset(
+                d.path,
+                name=d.name,
+                streaming=False,
+                data_files=d.data_files,
+                use_auth_token=use_auth_token,
+            )
         else:
             fp = hf_hub_download(
                 repo_id=d.path,
                 repo_type="dataset",
                 filename=d.data_files,
             )
-            ds = load_dataset("json", data_files=fp, streaming=False, split=None)
+            ds = load_dataset(
+                "json", name=d.name, data_files=fp, streaming=False, split=None
+            )
         if not ds:
             raise ValueError("unhandled dataset load")
         # support for using a subset of the data
```
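Every call site now passes `name=d.name` unconditionally. Since `None` is the default value of `load_dataset`'s `name` parameter, dataset entries that omit the `name:` key keep loading the dataset's default configuration as before. A minimal sketch of that behavior, with `SimpleNamespace` standing in for axolotl's parsed dataset entry (assumed here to read unset keys as `None`):

```python
from types import SimpleNamespace

from datasets import load_dataset

# Hypothetical stand-ins for two YAML dataset entries.
with_subset = SimpleNamespace(path="EleutherAI/pile", name="enron_emails")
without_subset = SimpleNamespace(path="vicgalle/alpaca-gpt4", name=None)

for d in (with_subset, without_subset):
    # name=None matches load_dataset's default, so the second entry
    # resolves to the dataset's default configuration, unchanged by this PR.
    ds = load_dataset(d.path, name=d.name, streaming=True)
```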
|