Spaces:

ksatzke
/

klsTestSpace

Running

App Files Files Community

ksatzke committed 1 day ago

Commit

2be9de8

verified ·

1 Parent(s): 85150f8

Create app.py

Browse files

Files changed (1) hide show

app.py +299 -0

app.py ADDED Viewed

	@@ -0,0 +1,299 @@

+from pathlib import Path
+import io
+import json
+import math
+import statistics
+import sys
+import time
+from datasets import concatenate_datasets, Dataset
+from datasets import load_dataset
+from huggingface_hub import hf_hub_url
+import pandas as pd
+import numpy as np
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+from evaluate import load
+# 1. record each file name included
+# 1.1 read different file formats depending on parameters (i.e., filetype)
+# 2. determine column types and report how many rows for each type (format check)
+# (in a well-formatted dataset, each column should only have one type)
+# 3. report on the null values
+# 4. for certain column types, report statistics
+# 4.1 uniqueness: if a column holds only a small number (<= 10) of distinct <string> values, treat the column as 'categorical'.
+# 4.2 strings: length ranges
+# 4.3 lists: length ranges
+# 4.4 int/float/double: their percentiles, min, max, mean
# Stringified Python types (the form produced by ``str(type(cell))``) whose
# cells are profiled by their length.
CELL_TYPES_LENGTH = [str(kind) for kind in (str, list)]
# Stringified Python types whose cells are profiled by their numeric value.
CELL_TYPES_NUMERIC = [str(kind) for kind in (int, float)]
# Ranks expressed in per-mille (1 -> 0.1 %, 500 -> 50 %, 999 -> 99.9 %).
PERCENTILES = [
    1, 5, 10, 25, 50,
    100, 250, 500, 750, 900,
    950, 975, 990, 995, 999,
]
def read_data(all_files, filetype):
    """Load every file in *all_files* into one pandas DataFrame.

    Args:
        all_files: Iterable of file paths (strings or ``Path`` objects).
        filetype: One of ``"parquet"``, ``"csv"``, ``"json"``, ``"arrow"``
            or ``"jsonl"``.

    Returns:
        The combined DataFrame, or ``None`` when *filetype* is not one of the
        supported values. For ``"jsonl"`` the frame has a fresh 0..n-1 index
        and an empty input yields an empty frame.
    """
    pandas_readers = {
        "parquet": pd.read_parquet,
        "csv": pd.read_csv,
        "json": pd.read_json,
    }
    if filetype in pandas_readers:
        reader = pandas_readers[filetype]
        return pd.concat([reader(fname) for fname in all_files])

    if filetype == "arrow":
        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
        return pd.DataFrame(data=ds)

    if filetype == "jsonl":
        records = []
        for fname in all_files:
            # JSON text is UTF-8; do not depend on the platform's locale.
            with open(fname, "r", encoding="utf-8") as f:
                # Blank lines (e.g. a trailing newline) are not JSON documents.
                records.extend(json.loads(line) for line in f if line.strip())
        # One frame from all records instead of one single-row frame per line.
        return pd.DataFrame(records)

    return None
def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
    """Summarise the lengths of a column's cells as a histogram.

    Args:
        cell_lengths: List of integer cell lengths (any order).
        cell_unique_string_values: Collection of the distinct string values
            seen in the column; only its size is used.

    Returns:
        A ``(cell_length_ranges, string_categorical)`` tuple.

        * If there are between 1 and 10 distinct string values the column is
          treated as categorical: ``cell_length_ranges`` is ``{}`` and
          ``string_categorical`` is the string ``"<n> class(es)"``.
        * Otherwise ``string_categorical`` is ``{}`` and
          ``cell_length_ranges`` maps bucket labels to counts. When all
          lengths are equal there is a single ``"<length>"`` bucket.
          Otherwise the span is cut into at most ten half-open
          ``"<lo>-<hi>"`` buckets (``lo <= length < hi``) plus a final
          ``"<max>-<max>"`` bucket holding the cells of maximum length.
    """
    cell_length_ranges = {}
    string_categorical = {}

    # A handful of distinct strings is probably a 'categorical' column
    # ('classes' in HuggingFace terms), so no length histogram is built.
    if 0 < len(cell_unique_string_values) <= 10:
        string_categorical = str(len(cell_unique_string_values)) + " class(es)"
        return cell_length_ranges, string_categorical

    if not cell_lengths:
        return cell_length_ranges, string_categorical

    min_val = min(cell_lengths)
    max_val = max(cell_lengths)
    if min_val == max_val:
        cell_length_ranges[str(min_val)] = len(cell_lengths)
        return cell_length_ranges, string_categorical

    step = math.ceil((max_val - min_val) / 10.0)
    edges = list(range(min_val, max_val, step)) + [max_val]
    labels = [f"{lo}-{hi}" for lo, hi in zip(edges, edges[1:])]
    top_label = f"{max_val}-{max_val}"

    # Every bucket is present in the result, including empty ones.
    cell_length_ranges = dict.fromkeys(labels, 0)
    cell_length_ranges[top_label] = 0
    for length in cell_lengths:
        if length == max_val:
            cell_length_ranges[top_label] += 1
        else:
            # Direct index computation: empty buckets in between can no
            # longer shift a value into the wrong bucket.
            cell_length_ranges[labels[(length - min_val) // step]] += 1
    return cell_length_ranges, string_categorical
def _compute_percentiles(values, percentiles=None):
    """Return selected percentiles of *values*.

    Args:
        values: Non-empty sequence of numbers.
        percentiles: Ranks in per-mille (integers from 1 to 999, e.g. 500 for
            the median). Defaults to the module-level ``PERCENTILES``.

    Returns:
        Dict mapping each rank expressed in percent (``rank / 10``) to the
        corresponding cut point, computed with the 'inclusive' method.

    Raises:
        ValueError: If a rank is outside 1..999.
        statistics.StatisticsError: If *values* is empty.
    """
    per_mille = 1000
    if percentiles is None:
        percentiles = PERCENTILES
    for rank in percentiles:
        # rank 0 would silently wrap around to the last cut point.
        if not 1 <= rank < per_mille:
            raise ValueError(f"percentile rank must be in 1..{per_mille - 1}, got {rank!r}")

    data = list(values)
    if len(data) == 1:
        # statistics.quantiles rejects a single data point before Python 3.13;
        # every cut point of a one-value sample is that value.
        cut_points = data * (per_mille - 1)
    else:
        cut_points = statistics.quantiles(data, n=per_mille, method="inclusive")
    return {rank / 10: cut_points[rank - 1] for rank in percentiles}
def compute_cell_value_statistics(cell_values):
    """Compute summary statistics for a column's numeric cells.

    Args:
        cell_values: List of numbers (may be empty).

    Returns:
        ``{}`` for empty input; otherwise a dict with the keys ``"min"``,
        ``"max"``, ``"mean"``, ``"stdev"``, ``"variance"`` and
        ``"percentiles"``. With a single value the sample standard deviation
        and variance are undefined and reported as ``None``, and every
        percentile equals that value.
    """
    stats = {}
    if not cell_values:
        return stats

    ordered = sorted(cell_values)
    stats["min"] = ordered[0]
    stats["max"] = ordered[-1]
    stats["mean"] = statistics.mean(ordered)
    if len(ordered) == 1:
        # statistics.stdev / variance need at least two data points and would
        # raise StatisticsError, aborting the whole profiling run.
        stats["stdev"] = None
        stats["variance"] = None
        stats["percentiles"] = {rank / 10: ordered[0] for rank in PERCENTILES}
    else:
        stats["stdev"] = statistics.stdev(ordered)
        stats["variance"] = statistics.variance(ordered)
        stats["percentiles"] = _compute_percentiles(ordered)
    return stats
def check_null(cell, cell_type):
    """Report whether *cell* is a missing value.

    Args:
        cell: The cell value.
        cell_type: ``str(type(cell))`` as computed by the caller.

    Returns:
        ``True`` when *cell* is ``None`` or a float NaN, ``False`` otherwise.
    """
    if cell is None:
        return True
    # isinstance also covers float subclasses (e.g. numpy.float64), whose
    # stringified type is not "<class 'float'>".
    if cell_type == "<class 'float'>" or isinstance(cell, float):
        return math.isnan(cell)
    return False
def _profile_column(col_values):
    """Build the per-column report (types, length ranges, stats, nulls)."""
    cell_types = {}
    cell_lengths = {}
    cell_values = {}
    unique_strings = set()
    null_count = 0

    for cell in col_values:
        cell_type = str(type(cell))
        if check_null(cell, cell_type):
            null_count += 1
            continue
        cell_types[cell_type] = cell_types.get(cell_type, 0) + 1
        if cell_type in CELL_TYPES_LENGTH:
            cell_lengths.setdefault(cell_type, []).append(len(cell))
            if cell_type == "<class 'str'>":
                unique_strings.add(cell)
        elif cell_type in CELL_TYPES_NUMERIC:
            cell_values.setdefault(cell_type, []).append(cell)
        else:
            # Types without dedicated statistics are only echoed to stdout.
            print(cell_type)

    length_ranges = {}
    categories = {}
    for cell_type in CELL_TYPES_LENGTH:
        if cell_type in cell_lengths:
            ranges, category = compute_cell_length_ranges(cell_lengths[cell_type], unique_strings)
            length_ranges[cell_type] = ranges
            categories[cell_type] = category

    value_stats = {
        cell_type: compute_cell_value_statistics(cell_values[cell_type])
        for cell_type in CELL_TYPES_NUMERIC
        if cell_type in cell_values
    }

    return {
        "cell_types": cell_types,
        "cell_length_ranges": length_ranges,
        "cell_categories": categories,
        "cell_stats": value_stats,
        "cell_missing": null_count,
    }


def compute_property(data_path, glob, filetype):
    """Profile the data files found under *data_path*.

    Args:
        data_path: Directory containing the data files.
        glob: Glob pattern, relative to *data_path*, selecting the files.
        filetype: File format passed on to ``read_data``.

    Returns:
        Dict with the keys ``"filenames"`` (paths relative to *data_path*),
        ``"column_info"`` (per-column report with ``cell_types``,
        ``cell_length_ranges``, ``cell_categories``, ``cell_stats`` and
        ``cell_missing``), ``"number_of_items"`` (row count) and
        ``"timestamp"`` (``time.time()`` at completion).

    Raises:
        ValueError: If ``read_data`` does not recognise *filetype*.
    """
    output = {}
    data_dir = Path(data_path)
    all_files = list(data_dir.glob(glob))

    filenames = []
    for f in all_files:
        print(str(f))
        # relative_to is robust where string slicing is not: it copes with
        # "./data", trailing slashes and Path arguments alike.
        filenames.append(str(f.relative_to(data_dir)))
    output["filenames"] = filenames

    df = read_data(all_files, filetype)
    if df is None:
        raise ValueError(f"unsupported filetype: {filetype!r}")

    column_info = {}
    for col_name in df.columns:
        column_info[col_name] = _profile_column(df[col_name].to_list())

    output["column_info"] = column_info
    output["number_of_items"] = len(df)
    output["timestamp"] = time.time()
    return output
def preprocess_function(examples):
    """Tokenize the sentence pairs of *examples* with truncation enabled.

    Reads the ``"sentence1"`` and ``"sentence2"`` entries of *examples* and
    passes them to the module-level ``tokenizer``, which must be defined
    before this function is called.
    """
    text_columns = ("sentence1", "sentence2")
    return tokenizer(*(examples[name] for name in text_columns), truncation=True)
def compute_metrics(eval_pred):
    """Score model outputs with the module-level ``metric``.

    *eval_pred* unpacks into ``(predictions, labels)``; the index of the
    largest score along axis 1 of the predictions is compared against the
    labels through ``metric.compute``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)
def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
    """Evaluate a sequence-classification checkpoint on a dataset.

    Args:
        tokenizer: Tokenizer called on the ``sentence1``/``sentence2`` columns.
        model_checkpoint: Name or path given to
            ``AutoModelForSequenceClassification.from_pretrained``.
        raw_datasets: Dataset dict with ``"train"`` and ``"validation"``
            splits containing ``sentence1`` and ``sentence2`` columns.
        metric: Object whose ``compute(predictions=..., references=...)``
            scores the predictions.

    Returns:
        The dict returned by ``Trainer.evaluate()`` on the validation split.
    """
    # Local closures so the ``tokenizer`` and ``metric`` arguments are really
    # used; the function no longer depends on same-named module globals that
    # only exist when the file is run as a script.
    def _tokenize_pairs(examples):
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

    def _score(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)

    tokenized_datasets = raw_datasets.map(_tokenize_pairs, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    batch_size = 16
    # Only evaluate() is called below; the training hyper-parameters are kept
    # as they were.
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none",
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=_score,
    )
    return trainer.evaluate()
if __name__ == "__main__":
    # Usage: app.py [MODEL_CHECKPOINT DATASET_NAME METRIC]
    # Without arguments the GLUE/MRPC defaults below are used.
    #
    # NOTE(review): the original set in_container to False on both the
    # CLI and the default path, so the branch writing
    # /tmp/outputs/computation_result.json never runs. Kept as is; confirm
    # whether the default path was meant to run "in container".
    in_container = False
    if len(sys.argv) > 1:
        if len(sys.argv) < 4:
            # Fail with a usage message instead of an IndexError.
            sys.exit(f"usage: {sys.argv[0]} MODEL_CHECKPOINT DATASET_NAME METRIC")
        model_checkpoint, dataset_name, metric_name = sys.argv[1:4]
    else:
        model_checkpoint = "sgugger/glue-mrpc"
        dataset_name = "nyu-mll/glue"
        metric_name = ["glue", "mrpc"]
    print(model_checkpoint, dataset_name, metric_name)

    # NOTE(review): the dataset config and the metric are hard-coded to
    # GLUE/MRPC; the requested metric is only echoed above.
    raw_datasets = load_dataset(dataset_name, "mrpc")
    # ``metric`` and ``tokenizer`` are module-level names that
    # compute_metrics() and preprocess_function() read.
    metric = load("glue", "mrpc")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
    print(json.dumps(output))
    if in_container:
        with open("/tmp/outputs/computation_result.json", "w") as f:
            json.dump(output, f, indent=4, sort_keys=True)
    else:
        print(json.dumps(output, indent=4, sort_keys=True))