from pathlib import Path
import io
import json
import math
import statistics
import sys
import time

from datasets import Dataset, concatenate_datasets, load_dataset
from huggingface_hub import hf_hub_url
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load
# 1. record the name of each file included
# 1.1 read different file formats depending on a parameter (i.e., filetype)
# 2. determine column types and report how many rows there are of each type (format check)
#    (in a well-formatted dataset, each column should only have one type)
# 3. report the number of null values per column
# 4. for certain column types, report statistics
# 4.1 uniqueness: if all rows of a <string> column take a small number (<= 10) of distinct values,
#     treat the column as 'categorical'
# 4.2 strings: length ranges
# 4.3 lists: length ranges
# 4.4 int/float/double: percentiles, min, max, mean
CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
# Percentile labels in tenths of a percent: 1 -> 0.1th percentile, 500 -> 50th, 999 -> 99.9th.
PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
def read_data(all_files, filetype):
    """Read and concatenate all files of the given type into a single DataFrame."""
    df = None
    if filetype in ["parquet", "csv", "json"]:
        if filetype == "parquet":
            read_func = pd.read_parquet
        elif filetype == "csv":
            read_func = pd.read_csv
        else:
            read_func = pd.read_json
        df = pd.concat(read_func(f) for f in all_files)
    elif filetype == "arrow":
        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
        df = pd.DataFrame(data=ds)
    elif filetype == "jsonl":
        # JSON Lines: one JSON object per line.
        all_lines = []
        for fname in all_files:
            with open(fname, "r") as f:
                all_lines.extend(f.readlines())
        df = pd.concat([pd.DataFrame([json.loads(line)]) for line in all_lines])
    return df
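# Illustrative usage only (the paths here are made up; the real call is made from compute_property below):
#   files = sorted(Path("some_dataset_dir").glob("**/*.parquet"))
#   df = read_data(files, "parquet")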
def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
    cell_length_ranges = {}
    string_categorical = {}
    if 0 < len(cell_unique_string_values) <= 10:
        # A string column with at most 10 distinct values is probably 'categorical'
        # (i.e., 'classes' in Hugging Face terms), so it is not treated as free text.
        string_categorical = str(len(cell_unique_string_values)) + " class(es)"
    elif cell_lengths:
        cell_lengths = sorted(cell_lengths)
        min_val = cell_lengths[0]
        max_val = cell_lengths[-1]
        if min_val != max_val:
            # Split [min_val, max_val] into at most 10 equal-width buckets and count
            # how many cells fall into each "lo-hi" range.
            distance = math.ceil((max_val - min_val) / 10.0)
            ranges = list(range(min_val, max_val, distance)) + [max_val]
            for j in range(len(ranges) - 1):
                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
            for k in cell_lengths:
                j = min((k - min_val) // distance, len(ranges) - 2)
                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] += 1
        else:
            # All lengths are identical: report a single degenerate range.
            cell_length_ranges[str(min_val)] = len(cell_lengths)
    return cell_length_ranges, string_categorical
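# Illustrative only: for lengths spread between 0 and 100 this returns roughly
#   ({"0-10": ..., "10-20": ..., ..., "90-100": ...}, {})
# and for a string column with, say, 2 distinct values it returns ({}, "2 class(es)").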
def _compute_percentiles(values, percentiles=PERCENTILES):
    # With n = max(percentiles) + 1 = 1000, statistics.quantiles returns 999 cut points;
    # cut point p-1 is the (p/10)-th percentile, so the result keys run from 0.1 to 99.9.
    result = {}
    quantiles = statistics.quantiles(values, n=max(percentiles) + 1, method="inclusive")
    for p in percentiles:
        result[p / 10] = quantiles[p - 1]
    return result
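# Illustrative only:
#   _compute_percentiles(list(range(1, 1001)))[50.0]   # -> roughly 500 (the median)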
def compute_cell_value_statistics(cell_values):
    stats = {}
    if cell_values:
        cell_values = sorted(cell_values)
        stats["min"] = cell_values[0]
        stats["max"] = cell_values[-1]
        stats["mean"] = statistics.mean(cell_values)
        # stdev, variance and quantiles all require at least two data points.
        if len(cell_values) > 1:
            stats["stdev"] = statistics.stdev(cell_values)
            stats["variance"] = statistics.variance(cell_values)
            stats["percentiles"] = _compute_percentiles(cell_values)
    return stats
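# Illustrative only:
#   compute_cell_value_statistics([1, 2, 3, 4])
#   # -> {"min": 1, "max": 4, "mean": 2.5, "stdev": ..., "variance": ..., "percentiles": {...}}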
def check_null(cell, cell_type):
    # NaN floats and None cells are both counted as missing values.
    if cell_type == "<class 'float'>":
        return math.isnan(cell)
    return cell is None
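# Illustrative only:
#   check_null(float("nan"), "<class 'float'>")  # -> True
#   check_null(None, "<class 'NoneType'>")       # -> True
#   check_null("abc", "<class 'str'>")           # -> False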
def compute_property(data_path, glob, filetype):
    """Scan the files matched by `glob` under `data_path` and report per-column statistics."""
    output = {}
    data_dir = Path(data_path)
    filenames = []
    all_files = list(data_dir.glob(glob))
    for f in all_files:
        print(str(f))
        # Record the file name relative to data_path.
        base_fname = str(f)[len(str(data_path)):]
        if not data_path.endswith("/"):
            base_fname = base_fname[1:]
        filenames.append(base_fname)
    output["filenames"] = filenames
    df = read_data(all_files, filetype)
    column_info = {}
    for col_name in df.columns:
        if col_name not in column_info:
            column_info[col_name] = {}
        cell_types = {}
        cell_lengths = {}
        cell_unique_string_values = {}
        cell_values = {}
        null_count = 0
        col_values = df[col_name].to_list()
        for cell in col_values:
            cell_type = str(type(cell))
            if check_null(cell, cell_type):
                null_count += 1
                continue
            if cell_type not in cell_types:
                cell_types[cell_type] = 1
            else:
                cell_types[cell_type] += 1
            if cell_type in CELL_TYPES_LENGTH:
                # Track lengths of strings and lists, and collect distinct string values
                # so that low-cardinality string columns can be flagged as categorical.
                cell_length = len(cell)
                if cell_type not in cell_lengths:
                    cell_lengths[cell_type] = []
                cell_lengths[cell_type].append(cell_length)
                if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
                    cell_unique_string_values[cell] = True
            elif cell_type in CELL_TYPES_NUMERIC:
                if cell_type not in cell_values:
                    cell_values[cell_type] = []
                cell_values[cell_type].append(cell)
            else:
                print(cell_type)
        clrs = {}
        ccs = {}
        for cell_type in CELL_TYPES_LENGTH:
            if cell_type in cell_lengths:
                clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
                clrs[cell_type] = clr
                ccs[cell_type] = cc
        css = {}
        for cell_type in CELL_TYPES_NUMERIC:
            if cell_type in cell_values:
                css[cell_type] = compute_cell_value_statistics(cell_values[cell_type])
        column_info[col_name]["cell_types"] = cell_types
        column_info[col_name]["cell_length_ranges"] = clrs
        column_info[col_name]["cell_categories"] = ccs
        column_info[col_name]["cell_stats"] = css
        column_info[col_name]["cell_missing"] = null_count
    output["column_info"] = column_info
    output["number_of_items"] = len(df)
    output["timestamp"] = time.time()
    return output
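# Illustrative only: compute_property returns a JSON-serializable dict shaped roughly like
#   {
#     "filenames": ["train.parquet", ...],
#     "column_info": {"<column>": {"cell_types": {...}, "cell_length_ranges": {...},
#                                  "cell_categories": {...}, "cell_stats": {...}, "cell_missing": 0}},
#     "number_of_items": 1000,
#     "timestamp": 1700000000.0
#   }
# (file names and counts above are made up for illustration).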
def preprocess_function(examples):
    # Relies on the module-level `tokenizer` created in the __main__ block below.
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)


def compute_metrics(eval_pred):
    # Relies on the module-level `metric` created in the __main__ block below.
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none",
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    return trainer.evaluate()
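# Illustrative only (mirrors the __main__ block below):
#   tokenizer = AutoTokenizer.from_pretrained("sgugger/glue-mrpc")
#   raw_datasets = load_dataset("nyu-mll/glue", "mrpc")
#   metric = load("glue", "mrpc")
#   results = compute_model_card_evaluation_results(tokenizer, "sgugger/glue-mrpc", raw_datasets, metric)
#   # `results` is the dict returned by Trainer.evaluate(), e.g. eval_loss, eval_accuracy, ...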
if __name__ == "__main__":
    in_container = True
    if len(sys.argv) > 1:
        model_checkpoint = sys.argv[1]
        dataset_name = sys.argv[2]
        metric = sys.argv[3]
        in_container = False
    else:
        model_checkpoint = "sgugger/glue-mrpc"
        dataset_name = "nyu-mll/glue"
        metric = ["glue", "mrpc"]
        in_container = False
    print(model_checkpoint, dataset_name, metric)
    raw_datasets = load_dataset(dataset_name, "mrpc")
    metric = load("glue", "mrpc")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
    print(json.dumps(output))
    # Both branches above currently set in_container to False, so the /tmp/outputs path is not taken.
    if in_container:
        with open("/tmp/outputs/computation_result.json", "w") as f:
            json.dump(output, f, indent=4, sort_keys=True)
    else:
        print(json.dumps(output, indent=4, sort_keys=True))
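# Example invocation (the script name is illustrative); arguments: <model_checkpoint> <dataset_name> <metric>:
#   python compute_evaluation_results.py sgugger/glue-mrpc nyu-mll/glue glue
# Note that the dataset config and metric are currently hard-coded to GLUE/MRPC regardless of the arguments.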