from pathlib import Path
import io
import json
import math
import statistics
import sys
import time

from datasets import Dataset, concatenate_datasets, load_dataset
from huggingface_hub import hf_hub_url
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load
# 1. record the name of each file included
# 1.1 read different file formats depending on a parameter (i.e., filetype)
# 2. determine column types and report how many rows there are of each type (format check)
#    (in a well-formatted dataset, each column should only have one type)
# 3. report the number of null values per column
# 4. for certain column types, report statistics
# 4.1 uniqueness: if all rows of a <string> column take a small number (<= 10) of distinct values,
#     treat the column as 'categorical'
# 4.2 strings: length ranges
# 4.3 lists: length ranges
# 4.4 int/float/double: percentiles, min, max, mean
CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]
# Percentile labels in tenths of a percent: 1 -> 0.1th percentile, 500 -> 50th, 999 -> 99.9th.
PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]
def read_data(all_files, filetype):
    """Read and concatenate all files of the given type into a single DataFrame."""
    df = None
    if filetype in ["parquet", "csv", "json"]:
        if filetype == "parquet":
            read_func = pd.read_parquet
        elif filetype == "csv":
            read_func = pd.read_csv
        else:
            read_func = pd.read_json
        df = pd.concat(read_func(f) for f in all_files)
    elif filetype == "arrow":
        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
        df = pd.DataFrame(data=ds)
    elif filetype == "jsonl":
        # JSON Lines: one JSON object per line.
        all_lines = []
        for fname in all_files:
            with open(fname, "r") as f:
                all_lines.extend(f.readlines())
        df = pd.concat([pd.DataFrame([json.loads(line)]) for line in all_lines])
    return df
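# Illustrative usage only (the paths here are made up; the real call is made from compute_property below):
#   files = sorted(Path("some_dataset_dir").glob("**/*.parquet"))
#   df = read_data(files, "parquet")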
def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
    cell_length_ranges = {}
    string_categorical = {}
    if 0 < len(cell_unique_string_values) <= 10:
        # A string column with at most 10 distinct values is probably 'categorical'
        # (i.e., 'classes' in Hugging Face terms), so it is not treated as free text.
        string_categorical = str(len(cell_unique_string_values)) + " class(es)"
    elif cell_lengths:
        cell_lengths = sorted(cell_lengths)
        min_val = cell_lengths[0]
        max_val = cell_lengths[-1]
        if min_val != max_val:
            # Split [min_val, max_val] into at most 10 equal-width buckets and count
            # how many cells fall into each "lo-hi" range.
            distance = math.ceil((max_val - min_val) / 10.0)
            ranges = list(range(min_val, max_val, distance)) + [max_val]
            for j in range(len(ranges) - 1):
                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
            for k in cell_lengths:
                j = min((k - min_val) // distance, len(ranges) - 2)
                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] += 1
        else:
            # All lengths are identical: report a single degenerate range.
            cell_length_ranges[str(min_val)] = len(cell_lengths)
    return cell_length_ranges, string_categorical
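# Illustrative only: for lengths spread between 0 and 100 this returns roughly
#   ({"0-10": ..., "10-20": ..., ..., "90-100": ...}, {})
# and for a string column with, say, 2 distinct values it returns ({}, "2 class(es)").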
def _compute_percentiles(values, percentiles=PERCENTILES):
    # With n = max(percentiles) + 1 = 1000, statistics.quantiles returns 999 cut points;
    # cut point p-1 is the (p/10)-th percentile, so the result keys run from 0.1 to 99.9.
    result = {}
    quantiles = statistics.quantiles(values, n=max(percentiles) + 1, method="inclusive")
    for p in percentiles:
        result[p / 10] = quantiles[p - 1]
    return result
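# Illustrative only:
#   _compute_percentiles(list(range(1, 1001)))[50.0]   # -> roughly 500 (the median)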
def compute_cell_value_statistics(cell_values):
    stats = {}
    if cell_values:
        cell_values = sorted(cell_values)
        stats["min"] = cell_values[0]
        stats["max"] = cell_values[-1]
        stats["mean"] = statistics.mean(cell_values)
        # stdev, variance and quantiles all require at least two data points.
        if len(cell_values) > 1:
            stats["stdev"] = statistics.stdev(cell_values)
            stats["variance"] = statistics.variance(cell_values)
            stats["percentiles"] = _compute_percentiles(cell_values)
    return stats
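# Illustrative only:
#   compute_cell_value_statistics([1, 2, 3, 4])
#   # -> {"min": 1, "max": 4, "mean": 2.5, "stdev": ..., "variance": ..., "percentiles": {...}}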
def check_null(cell, cell_type):
    # NaN floats and None cells are both counted as missing values.
    if cell_type == "<class 'float'>":
        return math.isnan(cell)
    return cell is None
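# Illustrative only:
#   check_null(float("nan"), "<class 'float'>")  # -> True
#   check_null(None, "<class 'NoneType'>")       # -> True
#   check_null("abc", "<class 'str'>")           # -> False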
def compute_property(data_path, glob, filetype):
    """Scan the files matched by `glob` under `data_path` and report per-column statistics."""
    output = {}
    data_dir = Path(data_path)
    filenames = []
    all_files = list(data_dir.glob(glob))
    for f in all_files:
        print(str(f))
        # Record the file name relative to data_path.
        base_fname = str(f)[len(str(data_path)):]
        if not data_path.endswith("/"):
            base_fname = base_fname[1:]
        filenames.append(base_fname)
    output["filenames"] = filenames
    df = read_data(all_files, filetype)
    column_info = {}
    for col_name in df.columns:
        if col_name not in column_info:
            column_info[col_name] = {}
        cell_types = {}
        cell_lengths = {}
        cell_unique_string_values = {}
        cell_values = {}
        null_count = 0
        col_values = df[col_name].to_list()
        for cell in col_values:
            cell_type = str(type(cell))
            if check_null(cell, cell_type):
                null_count += 1
                continue
            if cell_type not in cell_types:
                cell_types[cell_type] = 1
            else:
                cell_types[cell_type] += 1
            if cell_type in CELL_TYPES_LENGTH:
                # Track lengths of strings and lists, and collect distinct string values
                # so that low-cardinality string columns can be flagged as categorical.
                cell_length = len(cell)
                if cell_type not in cell_lengths:
                    cell_lengths[cell_type] = []
                cell_lengths[cell_type].append(cell_length)
                if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
                    cell_unique_string_values[cell] = True
            elif cell_type in CELL_TYPES_NUMERIC:
                if cell_type not in cell_values:
                    cell_values[cell_type] = []
                cell_values[cell_type].append(cell)
            else:
                print(cell_type)
        clrs = {}
        ccs = {}
        for cell_type in CELL_TYPES_LENGTH:
            if cell_type in cell_lengths:
                clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
                clrs[cell_type] = clr
                ccs[cell_type] = cc
        css = {}
        for cell_type in CELL_TYPES_NUMERIC:
            if cell_type in cell_values:
                css[cell_type] = compute_cell_value_statistics(cell_values[cell_type])
        column_info[col_name]["cell_types"] = cell_types
        column_info[col_name]["cell_length_ranges"] = clrs
        column_info[col_name]["cell_categories"] = ccs
        column_info[col_name]["cell_stats"] = css
        column_info[col_name]["cell_missing"] = null_count
    output["column_info"] = column_info
    output["number_of_items"] = len(df)
    output["timestamp"] = time.time()
    return output
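# Illustrative only: compute_property returns a JSON-serializable dict shaped roughly like
#   {
#     "filenames": ["train.parquet", ...],
#     "column_info": {"<column>": {"cell_types": {...}, "cell_length_ranges": {...},
#                                  "cell_categories": {...}, "cell_stats": {...}, "cell_missing": 0}},
#     "number_of_items": 1000,
#     "timestamp": 1700000000.0
#   }
# (file names and counts above are made up for illustration).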
def preprocess_function(examples):
    # Relies on the module-level `tokenizer` created in the __main__ block below.
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)


def compute_metrics(eval_pred):
    # Relies on the module-level `metric` created in the __main__ block below.
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none",
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    return trainer.evaluate()
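# Illustrative only (mirrors the __main__ block below):
#   tokenizer = AutoTokenizer.from_pretrained("sgugger/glue-mrpc")
#   raw_datasets = load_dataset("nyu-mll/glue", "mrpc")
#   metric = load("glue", "mrpc")
#   results = compute_model_card_evaluation_results(tokenizer, "sgugger/glue-mrpc", raw_datasets, metric)
#   # `results` is the dict returned by Trainer.evaluate(), e.g. eval_loss, eval_accuracy, ...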
if __name__ == "__main__":
    in_container = True
    if len(sys.argv) > 1:
        model_checkpoint = sys.argv[1]
        dataset_name = sys.argv[2]
        metric = sys.argv[3]
        in_container = False
    else:
        model_checkpoint = "sgugger/glue-mrpc"
        dataset_name = "nyu-mll/glue"
        metric = ["glue", "mrpc"]
        in_container = False
    print(model_checkpoint, dataset_name, metric)
    raw_datasets = load_dataset(dataset_name, "mrpc")
    metric = load("glue", "mrpc")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
    print(json.dumps(output))
    # Both branches above currently set in_container to False, so the /tmp/outputs path is not taken.
    if in_container:
        with open("/tmp/outputs/computation_result.json", "w") as f:
            json.dump(output, f, indent=4, sort_keys=True)
    else:
        print(json.dumps(output, indent=4, sort_keys=True))
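# Example invocation (the script name is illustrative); arguments: <model_checkpoint> <dataset_name> <metric>:
#   python compute_evaluation_results.py sgugger/glue-mrpc nyu-mll/glue glue
# Note that the dataset config and metric are currently hard-coded to GLUE/MRPC regardless of the arguments.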