|
from dataclasses import dataclass, field |
|
from typing import Optional |
|
|
|
|
|
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Every field has a default, so the class can be instantiated without
    arguments. Each field carries a ``metadata["help"]`` string describing it;
    those strings are the authoritative per-field description.
    """

    # --- Task / dataset selection -------------------------------------------
    task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )

    # --- Local data files ---------------------------------------------------
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
    )

    # --- Preprocessing ------------------------------------------------------
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to model maximum sentence length. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
            "efficient on GPU but very bad for TPU."
        },
    )

    # --- Optional sample caps (None means "not set") ------------------------
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_val_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
            "value if set."
        },
    )
    max_test_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
            "value if set."
        },
    )

    # --- Labelling / evaluation options -------------------------------------
    label_all_tokens: bool = field(
        default=False,
        metadata={
            "help": "Whether to put the label for one word on all tokens of generated by that word or just on the "
            "one (in which case the other tokens will have a padding index)."
        },
    )
    return_entity_level_metrics: bool = field(
        default=False,
        metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
    )
|
|
|
|
|
@dataclass
class XFUNDataTrainingArguments(DataTrainingArguments):
    """
    Data arguments that extend ``DataTrainingArguments`` with two extra fields.

    All inherited fields keep their defaults; this subclass only adds
    ``lang`` and ``additional_langs``.
    """

    # Defaults to "en".
    # NOTE(review): presumably a language identifier; confirm accepted values with the caller.
    lang: Optional[str] = field(default="en")
    # Unset (None) by default.
    # NOTE(review): the expected format (single value vs. delimited list) is not visible here.
    additional_langs: Optional[str] = field(default=None)
|
|