from pathlib import Path
import io
import json
import math
import statistics
import sys
import time

from datasets import concatenate_datasets, Dataset
from datasets import load_dataset
from huggingface_hub import hf_hub_url
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load

# 1. record each file name included
# 1.1 read different file formats depending on parameters (i.e., filetype)
# 2. determine column types and report how many rows there are for each type (format check)
#    (in a well-formatted dataset, each column should only have one type)
# 3. report on the null values
# 4. for certain column types, report statistics
# 4.1 uniqueness: if all rows fall into a small number of unique values (< 10), treat the column as 'categorical'
# 4.2 strings: length ranges
# 4.3 lists: length ranges
# 4.4 int/float/double: their percentiles, min, max, mean

# Cell types are recorded as str(type(cell)): strings and lists get length statistics,
# ints and floats get numeric statistics.
CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]


def read_data(all_files, filetype):
    df = None
    func_name = ""
    if filetype in ["parquet", "csv", "json"]:
        if filetype == "parquet":
            func_name = pd.read_parquet
        elif filetype == "csv":
            func_name = pd.read_csv
        elif filetype == "json":
            func_name = pd.read_json
        df = pd.concat(func_name(f) for f in all_files)
    elif filetype == "arrow":
        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
        df = pd.DataFrame(data=ds)
    elif filetype == "jsonl":
        all_lines = []
        for fname in all_files:
            with open(fname, "r") as f:
                all_lines.extend(f.readlines())
        df = pd.concat([pd.DataFrame.from_dict([json.loads(line)]) for line in all_lines])
    return df


def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
    cell_length_ranges = {}
    string_categorical = {}
    # this is probably a 'categorical' (i.e., 'classes' in HuggingFace) value
    # with few unique items (need to check that while reading the cell),
    # so no need to treat it as a normal string
    if len(cell_unique_string_values) > 0 and len(cell_unique_string_values) <= 10:
        string_categorical = str(len(cell_unique_string_values)) + " class(es)"
    elif cell_lengths:
        # bucket the observed lengths into roughly ten equal-width ranges
        cell_lengths = sorted(cell_lengths)
        min_val = cell_lengths[0]
        max_val = cell_lengths[-1]
        distance = math.ceil((max_val - min_val) / 10.0)
        ranges = []
        if min_val != max_val:
            for j in range(min_val, max_val, distance):
                ranges.append(j)
            for j in range(len(ranges) - 1):
                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
            ranges.append(max_val)
            j = 1
            c = 0
            for k in cell_lengths:
                if j == len(ranges):
                    c += 1
                elif k < ranges[j]:
                    c += 1
                else:
                    cell_length_ranges[str(ranges[j - 1]) + "-" + str(ranges[j])] = c
                    j += 1
                    c = 1
            cell_length_ranges[str(ranges[j - 1]) + "-" + str(max_val)] = c
        else:
            # all cells have the same length
            ranges = [min_val]
            c = 0
            for k in cell_lengths:
                c += 1
            cell_length_ranges[str(min_val)] = c
    return cell_length_ranges, string_categorical


def _compute_percentiles(values, percentiles=PERCENTILES):
    # percentiles are expressed in tenths of a percent (e.g. 25 -> 2.5th percentile);
    # statistics.quantiles with n=1000 returns the 999 corresponding cut points
    result = {}
    quantiles = statistics.quantiles(values, n=max(percentiles) + 1, method="inclusive")
    for p in percentiles:
        result[p / 10] = quantiles[p - 1]
    return result


def compute_cell_value_statistics(cell_values):
    stats = {}
    if cell_values:
        cell_values = sorted(cell_values)
        stats["min"] = cell_values[0]
        stats["max"] = cell_values[-1]
        stats["mean"] = statistics.mean(cell_values)
        stats["stdev"] = statistics.stdev(cell_values)
        stats["variance"] = statistics.variance(cell_values)
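        # also record the percentile cut points alongside the basic summary statistics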
        stats["percentiles"] = _compute_percentiles(cell_values)
    return stats


def check_null(cell, cell_type):
    # NaN cells show up as floats; the None check covers missing values of any other type
    if cell_type == "<class 'float'>":
        if math.isnan(cell):
            return True
    elif cell is None:
        return True
    return False


def compute_property(data_path, glob, filetype):
    output = {}
    data_dir = Path(data_path)
    filenames = []
    all_files = list(data_dir.glob(glob))
    for f in all_files:
        print(str(f))
        base_fname = str(f)[len(str(data_path)):]
        if not data_path.endswith("/"):
            base_fname = base_fname[1:]
        filenames.append(base_fname)
    output["filenames"] = filenames

    df = read_data(all_files, filetype)

    column_info = {}
    for col_name in df.columns:
        if col_name not in column_info:
            column_info[col_name] = {}
        cell_types = {}
        cell_lengths = {}
        cell_unique_string_values = {}
        cell_values = {}
        null_count = 0

        col_values = df[col_name].to_list()
        for cell in col_values:
            # for index, row in df.iterrows():
            #     cell = row[col_name]
            cell_type = str(type(cell))
            # print(cell, cell_type)
            if check_null(cell, cell_type):
                null_count += 1
                continue
            if cell_type not in cell_types:
                cell_types[cell_type] = 1
            else:
                cell_types[cell_type] += 1

            if cell_type in CELL_TYPES_LENGTH:
                cell_length = len(cell)
                if cell_type not in cell_lengths:
                    cell_lengths[cell_type] = []
                cell_lengths[cell_type].append(cell_length)
                if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
                    cell_unique_string_values[cell] = True
            elif cell_type in CELL_TYPES_NUMERIC:
                if cell_type not in cell_values:
                    cell_values[cell_type] = []
                cell_values[cell_type].append(cell)
            else:
                print(cell_type)

        clrs = {}
        ccs = {}
        for cell_type in CELL_TYPES_LENGTH:
            if cell_type in cell_lengths:
                clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
                clrs[cell_type] = clr
                ccs[cell_type] = cc

        css = {}
        for cell_type in CELL_TYPES_NUMERIC:
            if cell_type in cell_values:
                cell_stats = compute_cell_value_statistics(cell_values[cell_type])
                css[cell_type] = cell_stats

        column_info[col_name]["cell_types"] = cell_types
        column_info[col_name]["cell_length_ranges"] = clrs
        column_info[col_name]["cell_categories"] = ccs
        column_info[col_name]["cell_stats"] = css
        column_info[col_name]["cell_missing"] = null_count

    output["column_info"] = column_info
    output["number_of_items"] = len(df)
    output["timestamp"] = time.time()
    return output


def preprocess_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none",
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    result = trainer.evaluate()
    return result


if __name__ == "__main__":
    in_container = True
    if len(sys.argv) > 1:
        model_checkpoint = sys.argv[1]
        dataset_name = sys.argv[2]
        metric = sys.argv[3]
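        # explicit command-line arguments: still treated as a local (non-container) run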
        in_container = False
    else:
        model_checkpoint = "sgugger/glue-mrpc"
        dataset_name = "nyu-mll/glue"
        metric = ["glue", "mrpc"]
        in_container = False
    print(model_checkpoint, dataset_name, metric)

    raw_datasets = load_dataset(dataset_name, "mrpc")
    metric = load("glue", "mrpc")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
    print(json.dumps(output))

    if in_container:
        with open("/tmp/outputs/computation_result.json", "w") as f:
            json.dump(output, f, indent=4, sort_keys=True)
    else:
        print(json.dumps(output, indent=4, sort_keys=True))
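
# A minimal sketch of how compute_property could be invoked on its own, assuming a local
# directory of parquet files; the directory path and glob pattern below are hypothetical
# placeholders, not values used by this script:
#
#     profile = compute_property("/data/my_dataset", "*.parquet", "parquet")
#     print(json.dumps(profile, indent=4, sort_keys=True))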