ksatzke committed
Commit 2be9de8 · verified · 1 Parent(s): 85150f8

Create app.py

Files changed (1)
  1. app.py +299 -0
app.py ADDED
@@ -0,0 +1,299 @@
from pathlib import Path
import json
import math
import statistics
import sys
import time

from datasets import concatenate_datasets, Dataset, load_dataset

import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load

# 1. record each file name included
# 1.1 read different file formats depending on parameters (i.e., filetype)
# 2. determine column types and report how many rows hold each type (format check)
#    (in a well-formatted dataset, each column should only have one type)
# 3. report on the null values
# 4. for certain column types, report statistics
# 4.1 uniqueness: if all rows take a small number (<= 10) of <string> values, treat the column as 'categorical'
# 4.2 strings: length ranges
# 4.3 lists: length ranges
# 4.4 int/float/double: percentiles, min, max, mean
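
# For illustration (hypothetical column and counts), a per-column entry of the
# report assembled below looks roughly like:
#   {"cell_types": {"<class 'str'>": 3668},
#    "cell_length_ranges": {"<class 'str'>": {"38-59": 120, "59-80": 641}},
#    "cell_categories": {"<class 'str'>": {}}, "cell_stats": {},
#    "cell_missing": 0}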

# cell types profiled by length vs. treated as numeric values
CELL_TYPES_LENGTH = ["<class 'str'>", "<class 'list'>"]
CELL_TYPES_NUMERIC = ["<class 'int'>", "<class 'float'>"]

# per-mille ranks: 500 is the median, 999 the 99.9th percentile
PERCENTILES = [1, 5, 10, 25, 50, 100, 250, 500, 750, 900, 950, 975, 990, 995, 999]

def read_data(all_files, filetype):
    df = None

    if filetype in ["parquet", "csv", "json"]:
        if filetype == "parquet":
            read_func = pd.read_parquet
        elif filetype == "csv":
            read_func = pd.read_csv
        elif filetype == "json":
            read_func = pd.read_json

        df = pd.concat(read_func(f) for f in all_files)

    elif filetype == "arrow":
        ds = concatenate_datasets([Dataset.from_file(str(fname)) for fname in all_files])
        df = pd.DataFrame(data=ds)

    elif filetype == "jsonl":
        # one JSON object per line; collect the lines of all files first
        all_lines = []
        for fname in all_files:
            with open(fname, "r") as f:
                all_lines.extend(f.readlines())

        df = pd.concat([pd.DataFrame.from_dict([json.loads(line)]) for line in all_lines])

    return df
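
# Example usage (hypothetical paths): load every parquet shard under data/
#   df = read_data(list(Path("data").glob("*.parquet")), "parquet")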

def compute_cell_length_ranges(cell_lengths, cell_unique_string_values):
    cell_length_ranges = {}
    string_categorical = {}
    # a string column with few unique values is probably 'categorical'
    # (i.e., 'classes' in HuggingFace), so there is no need to treat it
    # as a normal string
    if 0 < len(cell_unique_string_values) <= 10:
        string_categorical = str(len(cell_unique_string_values)) + " class(es)"

    elif cell_lengths:
        cell_lengths = sorted(cell_lengths)
        min_val = cell_lengths[0]
        max_val = cell_lengths[-1]
        distance = math.ceil((max_val - min_val) / 10.0)
        ranges = []
        if min_val != max_val:
            # split [min_val, max_val] into roughly 10 equal-width buckets
            for j in range(min_val, max_val, distance):
                ranges.append(j)
            for j in range(len(ranges) - 1):
                cell_length_ranges[str(ranges[j]) + "-" + str(ranges[j + 1])] = 0
            ranges.append(max_val)

            j = 1
            c = 0
            for k in cell_lengths:
                # close (possibly empty) buckets until k fits into the current
                # one; the last bucket is right-closed at max_val
                while j < len(ranges) - 1 and k >= ranges[j]:
                    cell_length_ranges[str(ranges[j - 1]) + "-" + str(ranges[j])] = c
                    j += 1
                    c = 0
                c += 1

            cell_length_ranges[str(ranges[j - 1]) + "-" + str(max_val)] = c

        else:
            # all cells share the same length
            cell_length_ranges[str(min_val)] = len(cell_lengths)

    return cell_length_ranges, string_categorical
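
# Worked example (hypothetical lengths): for lengths 1..100 the code above
# computes distance = ceil(99 / 10) = 10 and yields the bucket keys
# "1-11", "11-21", ..., "81-91", "91-100", each mapped to its count.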

def _compute_percentiles(values, percentiles=PERCENTILES):
    result = {}
    # n=1000 yields 999 cut points, so quantiles[p-1] is the p-per-mille
    # quantile; result keys are expressed as percentages (p / 10)
    quantiles = statistics.quantiles(values, n=max(percentiles) + 1, method='inclusive')
    for p in percentiles:
        result[p / 10] = quantiles[p - 1]
    return result
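
# Worked example: PERCENTILES entry 500 maps to quantiles[499], the median,
# stored under the key 50.0 (i.e., the 50th percentile).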

def compute_cell_value_statistics(cell_values):
    stats = {}
    if cell_values:
        cell_values = sorted(cell_values)

        stats["min"] = cell_values[0]
        stats["max"] = cell_values[-1]
        stats["mean"] = statistics.mean(cell_values)
        if len(cell_values) > 1:
            # stdev, variance and quantiles need at least two data points
            stats["stdev"] = statistics.stdev(cell_values)
            stats["variance"] = statistics.variance(cell_values)
            stats["percentiles"] = _compute_percentiles(cell_values)

    return stats
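
# Example (hypothetical values):
#   compute_cell_value_statistics([1, 2, 2, 3])
#   -> {"min": 1, "max": 3, "mean": 2, "stdev": ..., "percentiles": {...}}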

def check_null(cell, cell_type):
    # pandas encodes missing values as NaN floats; anything else counts
    # as null only when it is literally None
    if cell_type == "<class 'float'>":
        if math.isnan(cell):
            return True
    elif cell is None:
        return True
    return False
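
# Example: check_null(float("nan"), "<class 'float'>") and
# check_null(None, "<class 'NoneType'>") both return True.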

def compute_property(data_path, glob, filetype):
    output = {}

    data_dir = Path(data_path)

    # 1. record each file name included
    filenames = []
    all_files = list(data_dir.glob(glob))
    for f in all_files:
        print(str(f))
        base_fname = str(f)[len(str(data_path)):]
        if not data_path.endswith("/"):
            base_fname = base_fname[1:]
        filenames.append(base_fname)

    output["filenames"] = filenames

    df = read_data(all_files, filetype)

    column_info = {}

    for col_name in df.columns:
        if col_name not in column_info:
            column_info[col_name] = {}

        cell_types = {}

        cell_lengths = {}
        cell_unique_string_values = {}
        cell_values = {}
        null_count = 0
        col_values = df[col_name].to_list()
        for cell in col_values:
            cell_type = str(type(cell))
            # 3. count null values separately from the type tally
            if check_null(cell, cell_type):
                null_count += 1
                continue

            # 2. tally the cell types seen in this column
            if cell_type not in cell_types:
                cell_types[cell_type] = 1
            else:
                cell_types[cell_type] += 1

            if cell_type in CELL_TYPES_LENGTH:
                cell_length = len(cell)
                if cell_type not in cell_lengths:
                    cell_lengths[cell_type] = []

                cell_lengths[cell_type].append(cell_length)
                if cell_type == "<class 'str'>" and cell not in cell_unique_string_values:
                    cell_unique_string_values[cell] = True

            elif cell_type in CELL_TYPES_NUMERIC:
                if cell_type not in cell_values:
                    cell_values[cell_type] = []

                cell_values[cell_type].append(cell)

            else:
                print("unhandled cell type:", cell_type)

        # 4.1/4.2/4.3: length ranges or categorical classes per type
        clrs = {}
        ccs = {}
        for cell_type in CELL_TYPES_LENGTH:
            if cell_type in cell_lengths:
                clr, cc = compute_cell_length_ranges(cell_lengths[cell_type], cell_unique_string_values)
                clrs[cell_type] = clr
                ccs[cell_type] = cc

        # 4.4: numeric statistics per type
        css = {}
        for cell_type in CELL_TYPES_NUMERIC:
            if cell_type in cell_values:
                cell_stats = compute_cell_value_statistics(cell_values[cell_type])
                css[cell_type] = cell_stats

        column_info[col_name]["cell_types"] = cell_types
        column_info[col_name]["cell_length_ranges"] = clrs
        column_info[col_name]["cell_categories"] = ccs
        column_info[col_name]["cell_stats"] = css
        column_info[col_name]["cell_missing"] = null_count

    output["column_info"] = column_info
    output["number_of_items"] = len(df)
    output["timestamp"] = time.time()

    return output
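
# Example usage (hypothetical layout): profile all parquet shards of a local
# dataset copy and pretty-print the report:
#   report = compute_property("data/my_dataset", "*.parquet", "parquet")
#   print(json.dumps(report, indent=2))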

def preprocess_function(examples):
    # relies on the module-level `tokenizer` defined in __main__
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

def compute_metrics(eval_pred):
    # relies on the module-level `metric` defined in __main__
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none",
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    # no training pass here: evaluate the already fine-tuned checkpoint as-is
    result = trainer.evaluate()
    return result
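
# trainer.evaluate() returns a metrics dict; for GLUE MRPC this includes
# eval_loss, eval_accuracy and eval_f1 alongside runtime fields.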

if __name__ == "__main__":

    # default to container mode; explicit CLI arguments imply a local run
    in_container = True
    if len(sys.argv) > 1:
        model_checkpoint = sys.argv[1]
        dataset_name = sys.argv[2]
        metric = sys.argv[3]
        in_container = False
    else:
        model_checkpoint = "sgugger/glue-mrpc"
        dataset_name = "nyu-mll/glue"
        metric = ["glue", "mrpc"]

    print(model_checkpoint, dataset_name, metric)

    # the MRPC dataset config and metric are currently hardcoded here,
    # regardless of the `metric` value parsed above
    raw_datasets = load_dataset(dataset_name, "mrpc")
    metric = load("glue", "mrpc")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    output = compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric)
    print(json.dumps(output))

    if in_container:
        with open("/tmp/outputs/computation_result.json", "w") as f:
            json.dump(output, f, indent=4, sort_keys=True)
    else:
        print(json.dumps(output, indent=4, sort_keys=True))
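
# Usage sketch (the trailing "mrpc" metric argument is illustrative; the MRPC
# metric is hardcoded above regardless):
#   python app.py sgugger/glue-mrpc nyu-mll/glue mrpc
# Without arguments the MRPC defaults apply and, in container mode, the report
# is written to /tmp/outputs/computation_result.json.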