# klsTestSpace / app.py
from pathlib import Path
import json
import math
import statistics
import sys
import time

from datasets import Dataset, concatenate_datasets, load_dataset
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load
# 1. record each file name included
# 1.1 read different file formats depending on a parameter (filetype)
# 2. determine column types and report how many rows hold each type (format check)
#    (in a well-formatted dataset, each column should only have one type)
# 3. report on the null values
# 4. for certain column types, report statistics
# 4.1 uniqueness: if a string column has at most 10 unique values, treat it as 'categorical'
# 4.2 strings: length ranges
# 4.3 lists: length ranges
# 4.4 int/float: percentiles, min, max, mean
# (a sketch of the resulting report shape follows)
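#
# Illustrative sketch of the report this produces (the values are made up;
# the keys are the ones set in compute_property below):
#
#   {
#     "filenames": ["train.parquet"],
#     "column_info": {
#       "sentence1": {
#         "cell_types": {"<class 'str'>": 3668},
#         "cell_length_ranges": {"<class 'str'>": {"7-29": 120, ...}},
#         "cell_categories": {"<class 'str'>": {}},
#         "cell_stats": {},
#         "cell_missing": 0
#       }
#     },
#     "number_of_items": 3668,
#     "timestamp": 1700000000.0
#   }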
CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
# per-mille cut points: 500 -> the 50th percentile (median), 999 -> the 99.9th
PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
def read_data(all_files, filetype):
    df = None
    if filetype in ["parquet", "csv", "json"]:
        if filetype == "parquet":
            read_func = pd.read_parquet
        elif filetype == "csv":
            read_func = pd.read_csv
        elif filetype == "json":
            read_func = pd.read_json
        df = pd.concat(read_func(f) for f in all_files)
    elif filetype == "arrow":
        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
        df = pd.DataFrame(data=ds)
    elif filetype == "jsonl":
        # read_json handles JSON Lines natively; one DataFrame per file, then concatenate
        df = pd.concat(pd.read_json(fname, lines=True) for fname in all_files)
    return df
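# Usage sketch (hypothetical paths; assumes the files exist locally):
#
#   files = sorted(Path("data").glob("*.csv"))
#   df = read_data(files, "csv")
#
# For "jsonl", each file is read as JSON Lines; for "arrow", the files are
# loaded with datasets and converted into a single pandas DataFrame.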
def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
    cell_length_ranges = {}
    string_categorical = {}
    # a column with few unique string values is probably 'categorical'
    # (i.e., 'classes' in HuggingFace terms), so there is no need to treat
    # it as a normal string and report length ranges
    if 0 < len(cell_unique_string_values) <= 10:
        string_categorical = str(len(cell_unique_string_values)) + " class(es)"
    elif cell_lengths:
        cell_lengths = sorted(cell_lengths)
        min_val = cell_lengths[0]
        max_val = cell_lengths[-1]
        if min_val != max_val:
            # split [min_val, max_val] into at most 10 equal-width bins
            distance = math.ceil((max_val - min_val) / 10.0)
            ranges = list(range(min_val, max_val, distance))
            for j in range(len(ranges) - 1):
                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
            ranges.append(max_val)
            j = 1
            c = 0
            for k in cell_lengths:
                # advance past any bins this value has outgrown, recording
                # their counts (possibly zero) so values cannot be attributed
                # to the wrong bin when a bin is empty
                while j < len(ranges) - 1 and k >= ranges[j]:
                    cell_length_ranges[str(ranges[j - 1]) + "-" + str(ranges[j])] = c
                    j += 1
                    c = 0
                c += 1
            cell_length_ranges[str(ranges[j - 1]) + "-" + str(ranges[-1])] = c
        else:
            # all lengths identical: a single degenerate bin
            cell_length_ranges[str(min_val)] = len(cell_lengths)
    return cell_length_ranges, string_categorical
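# Worked example (illustrative): for cell_lengths = [1, 2, 3, 40],
# min=1, max=40, distance=ceil(39/10)=4, so the bins are "1-5", "5-9", ...,
# "33-37", "37-40"; the three short values land in "1-5" (count 3), the long
# one in "37-40" (count 1), and every bin in between stays at 0.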
def _compute_percentiles(values, percentiles=PERCENTILES):
    # percentiles are per-mille cut points; the result keys are p / 10,
    # i.e. plain percentages
    result = {}
    quantiles = statistics.quantiles(values, n=max(percentiles) + 1, method='inclusive')
    for p in percentiles:
        result[p / 10] = quantiles[p - 1]
    return result
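# Example: _compute_percentiles(list(range(1, 1001))) maps 50.0 -> 500.5
# (the median under the 'inclusive' method) and 99.9 -> 999.001.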
def compute_cell_value_statistics(cell_values):
    stats = {}
    if cell_values:
        cell_values = sorted(cell_values)
        stats["min"] = cell_values[0]
        stats["max"] = cell_values[-1]
        stats["mean"] = statistics.mean(cell_values)
        if len(cell_values) > 1:
            # stdev, variance, and quantiles all require at least two data points
            stats["stdev"] = statistics.stdev(cell_values)
            stats["variance"] = statistics.variance(cell_values)
            stats["percentiles"] = _compute_percentiles(cell_values)
    return stats
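# Example: compute_cell_value_statistics([1, 2, 3, 4]) yields min=1, max=4,
# mean=2.5, stdev≈1.291, variance≈1.667 (sample statistics), plus the
# percentile table from _compute_percentiles.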
def check_null(cell, cell_type):
if cell_type == "<class 'float'>":
if math.isnan(cell):
return True
elif cell is None:
return True
return False
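# Example: check_null(float("nan"), "<class 'float'>") and
# check_null(None, "<class 'NoneType'>") both return True; any other
# well-formed cell returns False.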
def compute_property(data_path, glob, filetype):
output = {}
data_dir = Path(data_path)
filenames = []
all_files = list(data_dir.glob(glob))
for f in all_files:
print(str(f))
base_fname = str(f)[len(str(data_path)):]
if not data_path.endswith("/"):
base_fname = base_fname[1:]
filenames.append(base_fname)
output["filenames"] = filenames
df = read_data(all_files, filetype)
column_info = {}
for col_name in df.columns:
if col_name not in column_info:
column_info[col_name] = {}
cell_types = {}
cell_lengths = {}
cell_unique_string_values = {}
cell_values = {}
null_count = 0
col_values = df[col_name].to_list()
        for cell in col_values:
            cell_type = str(type(cell))
if check_null(cell, cell_type):
null_count += 1
continue
if cell_type not in cell_types:
cell_types[cell_type] = 1
else:
cell_types[cell_type] += 1
if cell_type in CELL_TYPES_LENGTH:
cell_length = len(cell)
if cell_type not in cell_lengths:
cell_lengths[cell_type] = []
cell_lengths[cell_type].append(cell_length)
if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
cell_unique_string_values[cell] = True
elif cell_type in CELL_TYPES_NUMERIC:
if cell_type not in cell_values:
cell_values[cell_type] = []
cell_values[cell_type].append(cell)
            else:
                print("unhandled cell type:", cell_type)
clrs = {}
ccs = {}
for cell_type in CELL_TYPES_LENGTH:
if cell_type in cell_lengths:
clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
clrs[cell_type] = clr
ccs[cell_type] = cc
css = {}
for cell_type in CELL_TYPES_NUMERIC:
if cell_type in cell_values:
cell_stats = compute_cell_value_statistics(cell_values[cell_type])
css[cell_type] = cell_stats
column_info[col_name]["cell_types"] = cell_types
column_info[col_name]["cell_length_ranges"] = clrs
column_info[col_name]["cell_categories"] = ccs
column_info[col_name]["cell_stats"] = css
column_info[col_name]["cell_missing"] = null_count
output["column_info"] = column_info
output["number_of_items"] = len(df)
output["timestamp"] = time.time()
return output
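# Usage sketch (hypothetical local dataset; the path and glob are assumptions):
#
#   report = compute_property("data", "**/*.parquet", "parquet")
#   print(json.dumps(report, indent=4, sort_keys=True))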
def preprocess_function(examples):
return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
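# preprocess_function assumes the MRPC-style "sentence1"/"sentence2" columns
# and reads the module-level tokenizer created in the __main__ block below.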
def compute_metrics(eval_pred):
predictions, labels = eval_pred
predictions = np.argmax(predictions, axis=1)
return metric.compute(predictions=predictions, references=labels)
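# eval_pred is a (logits, labels) pair; argmax over the two MRPC classes
# turns logits of shape (n_examples, 2) into label ids, and the GLUE/MRPC
# metric then reports accuracy and F1.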
def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
batch_size = 16
args = TrainingArguments(
"test-glue",
        evaluation_strategy="epoch",
learning_rate=5e-5,
seed=42,
lr_scheduler_type="linear",
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
num_train_epochs=3,
weight_decay=0.01,
load_best_model_at_end=False,
metric_for_best_model="accuracy",
report_to="none"
)
trainer = Trainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
tokenizer=tokenizer,
compute_metrics=compute_metrics
)
result = trainer.evaluate()
return result
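# Trainer.evaluate() returns a dict of "eval_"-prefixed values, e.g.
# {"eval_loss": ..., "eval_accuracy": ..., "eval_f1": ..., "eval_runtime": ...},
# which is what gets serialized below.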
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # explicit CLI arguments: evaluate the given model/dataset locally
        model_checkpoint = sys.argv[1]
        dataset_name = sys.argv[2]
        metric = sys.argv[3]
        in_container = False
    else:
        # no arguments: fall back to the defaults and assume we are running
        # inside the container (which provides /tmp/outputs)
        model_checkpoint = "sgugger/glue-mrpc"
        dataset_name = "nyu-mll/glue"
        metric = ["glue", "mrpc"]
        in_container = True
    print(model_checkpoint, dataset_name, metric)
    # NOTE: the dataset config and metric are hardcoded to GLUE/MRPC here,
    # regardless of the CLI values parsed above
    raw_datasets = load_dataset(dataset_name, "mrpc")
    metric = load("glue", "mrpc")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
    if in_container:
        with open("/tmp/outputs/computation_result.json", "w") as f:
            json.dump(output, f, indent=4, sort_keys=True)
    else:
        print(json.dumps(output, indent=4, sort_keys=True))