from dataclasses import dataclass, field from typing import Optional @dataclass class DataTrainingArguments: """ Arguments pertaining to what data we are going to input our model for training and eval. """ task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."}) dataset_name: Optional[str] = field( default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} ) dataset_config_name: Optional[str] = field( default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} ) train_file: Optional[str] = field( default=None, metadata={"help": "The input training data file (a csv or JSON file)."} ) validation_file: Optional[str] = field( default=None, metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."}, ) test_file: Optional[str] = field( default=None, metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, ) overwrite_cache: bool = field( default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} ) preprocessing_num_workers: Optional[int] = field( default=None, metadata={"help": "The number of processes to use for the preprocessing."}, ) pad_to_max_length: bool = field( default=True, metadata={ "help": "Whether to pad all samples to model maximum sentence length. " "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " "efficient on GPU but very bad for TPU." }, ) max_train_samples: Optional[int] = field( default=None, metadata={ "help": "For debugging purposes or quicker training, truncate the number of training examples to this " "value if set." }, ) max_val_samples: Optional[int] = field( default=None, metadata={ "help": "For debugging purposes or quicker training, truncate the number of validation examples to this " "value if set." }, ) max_test_samples: Optional[int] = field( default=None, metadata={ "help": "For debugging purposes or quicker training, truncate the number of test examples to this " "value if set." }, ) label_all_tokens: bool = field( default=False, metadata={ "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " "one (in which case the other tokens will have a padding index)." }, ) return_entity_level_metrics: bool = field( default=False, metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."}, ) @dataclass class XFUNDataTrainingArguments(DataTrainingArguments): lang: Optional[str] = field(default="en") additional_langs: Optional[str] = field(default=None)