Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
·
7c5239c
1
Parent(s):
ff8aca1
Adds a flag for live deployment so that cache files are not recalculated or rewritten while the app is live.
Browse files
- app.py +2 -0
- data_measurements/dataset_statistics.py +119 -99
- data_measurements/streamlit_utils.py +6 -2
app.py
CHANGED
|
@@ -157,6 +157,8 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
|
| 157 |
dstats.load_or_prepare_text_duplicates()
|
| 158 |
dstats.load_or_prepare_npmi()
|
| 159 |
dstats.load_or_prepare_zipf()
|
|
|
|
|
|
|
| 160 |
|
| 161 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=True):
|
| 162 |
"""
|
|
|
|
| 157 |
dstats.load_or_prepare_text_duplicates()
|
| 158 |
dstats.load_or_prepare_npmi()
|
| 159 |
dstats.load_or_prepare_zipf()
|
| 160 |
+
# Don't recalculate; we're live
|
| 161 |
+
dstats.set_deployment(True)
|
| 162 |
|
| 163 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=True):
|
| 164 |
"""
|
data_measurements/dataset_statistics.py
CHANGED
|
@@ -299,6 +299,15 @@ class DatasetStatisticsCacheClass:
|
|
| 299 |
# Needed for UI
|
| 300 |
self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
|
| 301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
def get_base_dataset(self):
|
| 303 |
"""Gets a pointer to the truncated base dataset object."""
|
| 304 |
if not self.dset:
|
|
@@ -378,31 +387,34 @@ class DatasetStatisticsCacheClass:
|
|
| 378 |
write_json(self.length_stats_dict, self.length_stats_json_fid)
|
| 379 |
|
| 380 |
def prepare_length_df(self):
|
| 381 |
-
if self.
|
| 382 |
-
self.tokenized_df
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
|
|
|
| 389 |
|
| 390 |
def prepare_text_length_stats(self):
|
| 391 |
-
if
|
| 392 |
-
self.
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
|
|
|
| 401 |
|
| 402 |
def prepare_fig_text_lengths(self):
|
| 403 |
-
if
|
| 404 |
-
self.
|
| 405 |
-
|
|
|
|
| 406 |
|
| 407 |
def load_or_prepare_embeddings(self, save=True):
|
| 408 |
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
|
|
@@ -489,39 +501,41 @@ class DatasetStatisticsCacheClass:
|
|
| 489 |
self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
|
| 490 |
|
| 491 |
def prepare_general_stats(self):
|
| 492 |
-
if self.
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
|
|
|
| 512 |
|
| 513 |
def prepare_text_duplicates(self):
|
| 514 |
-
if self.
|
| 515 |
-
self.
|
| 516 |
-
|
| 517 |
-
self.tokenized_df
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
|
|
|
| 525 |
|
| 526 |
def load_or_prepare_dataset(self, save=True):
|
| 527 |
"""
|
|
@@ -557,12 +571,13 @@ class DatasetStatisticsCacheClass:
|
|
| 557 |
if (self.use_cache and exists(self.tokenized_df_fid)):
|
| 558 |
self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
|
| 559 |
else:
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
|
|
|
| 566 |
|
| 567 |
def load_or_prepare_text_dset(self, save=True):
|
| 568 |
if (self.use_cache and exists(self.text_dset_fid)):
|
|
@@ -572,22 +587,24 @@ class DatasetStatisticsCacheClass:
|
|
| 572 |
logs.info(self.text_dset)
|
| 573 |
# ...Or load it from the server and store it anew
|
| 574 |
else:
|
| 575 |
-
self.
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
|
|
|
| 580 |
|
| 581 |
def prepare_text_dset(self):
|
| 582 |
-
self.
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
examples
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
|
|
|
| 591 |
|
| 592 |
def do_tokenization(self):
|
| 593 |
"""
|
|
@@ -646,25 +663,27 @@ class DatasetStatisticsCacheClass:
|
|
| 646 |
if save:
|
| 647 |
write_plotly(self.fig_labels, self.fig_labels_json_fid)
|
| 648 |
else:
|
| 649 |
-
self.
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
|
|
|
| 654 |
|
| 655 |
def prepare_labels(self):
|
| 656 |
-
self.
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
examples
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
self.
|
| 667 |
-
|
|
|
|
| 668 |
|
| 669 |
def load_or_prepare_npmi(self):
|
| 670 |
self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
|
|
@@ -784,16 +803,17 @@ class nPMIStatisticsCacheClass:
|
|
| 784 |
joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
|
| 785 |
# When maybe some things have been computed for the selected subgroups.
|
| 786 |
else:
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
|
|
|
| 797 |
logs.info("The joint npmi df is")
|
| 798 |
logs.info(joint_npmi_df)
|
| 799 |
return joint_npmi_df
|
|
|
|
| 299 |
# Needed for UI
|
| 300 |
self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
|
| 301 |
|
| 302 |
+
self.live = False
|
| 303 |
+
|
| 304 |
+
def set_deployment(self, live=True):
|
| 305 |
+
"""
|
| 306 |
+
Function that we can hit when we deploy, so that cache files are not
|
| 307 |
+
written out/recalculated, but instead that part of the UI can be punted.
|
| 308 |
+
"""
|
| 309 |
+
self.live = live
|
| 310 |
+
|
| 311 |
def get_base_dataset(self):
|
| 312 |
"""Gets a pointer to the truncated base dataset object."""
|
| 313 |
if not self.dset:
|
|
|
|
| 387 |
write_json(self.length_stats_dict, self.length_stats_json_fid)
|
| 388 |
|
| 389 |
def prepare_length_df(self):
|
| 390 |
+
if not self.live:
|
| 391 |
+
if self.tokenized_df is None:
|
| 392 |
+
self.tokenized_df = self.do_tokenization()
|
| 393 |
+
self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
|
| 394 |
+
TOKENIZED_FIELD].apply(len)
|
| 395 |
+
self.length_df = self.tokenized_df[
|
| 396 |
+
[LENGTH_FIELD, OUR_TEXT_FIELD]].sort_values(
|
| 397 |
+
by=[LENGTH_FIELD], ascending=True
|
| 398 |
+
)
|
| 399 |
|
| 400 |
def prepare_text_length_stats(self):
|
| 401 |
+
if not self.live:
|
| 402 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
|
| 403 |
+
self.prepare_length_df()
|
| 404 |
+
avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
|
| 405 |
+
self.avg_length = round(avg_length, 1)
|
| 406 |
+
std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
|
| 407 |
+
self.std_length = round(std_length, 1)
|
| 408 |
+
self.num_uniq_lengths = len(self.length_df["length"].unique())
|
| 409 |
+
self.length_stats_dict = {"avg length": self.avg_length,
|
| 410 |
+
"std length": self.std_length,
|
| 411 |
+
"num lengths": self.num_uniq_lengths}
|
| 412 |
|
| 413 |
def prepare_fig_text_lengths(self):
|
| 414 |
+
if not self.live:
|
| 415 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
| 416 |
+
self.prepare_length_df()
|
| 417 |
+
self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
|
| 418 |
|
| 419 |
def load_or_prepare_embeddings(self, save=True):
|
| 420 |
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
|
|
|
|
| 501 |
self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
|
| 502 |
|
| 503 |
def prepare_general_stats(self):
|
| 504 |
+
if not self.live:
|
| 505 |
+
if self.tokenized_df is None:
|
| 506 |
+
logs.warning("Tokenized dataset not yet loaded; doing so.")
|
| 507 |
+
self.load_or_prepare_dataset()
|
| 508 |
+
if self.vocab_counts_df is None:
|
| 509 |
+
logs.warning("Vocab not yet loaded; doing so.")
|
| 510 |
+
self.load_or_prepare_vocab()
|
| 511 |
+
self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
|
| 512 |
+
"count", ascending=False
|
| 513 |
+
).head(_TOP_N)
|
| 514 |
+
self.total_words = len(self.vocab_counts_df)
|
| 515 |
+
self.total_open_words = len(self.vocab_counts_filtered_df)
|
| 516 |
+
self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
|
| 517 |
+
self.prepare_text_duplicates()
|
| 518 |
+
self.dedup_total = sum(self.dup_counts_df[CNT])
|
| 519 |
+
self.general_stats_dict = {
|
| 520 |
+
TOT_WORDS: self.total_words,
|
| 521 |
+
TOT_OPEN_WORDS: self.total_open_words,
|
| 522 |
+
TEXT_NAN_CNT: self.text_nan_count,
|
| 523 |
+
DEDUP_TOT: self.dedup_total,
|
| 524 |
+
}
|
| 525 |
|
| 526 |
def prepare_text_duplicates(self):
|
| 527 |
+
if not self.live:
|
| 528 |
+
if self.tokenized_df is None:
|
| 529 |
+
self.load_or_prepare_tokenized_df()
|
| 530 |
+
dup_df = self.tokenized_df[
|
| 531 |
+
self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
|
| 532 |
+
self.dup_counts_df = pd.DataFrame(
|
| 533 |
+
dup_df.pivot_table(
|
| 534 |
+
columns=[OUR_TEXT_FIELD], aggfunc="size"
|
| 535 |
+
).sort_values(ascending=False),
|
| 536 |
+
columns=[CNT],
|
| 537 |
+
)
|
| 538 |
+
self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
|
| 539 |
|
| 540 |
def load_or_prepare_dataset(self, save=True):
|
| 541 |
"""
|
|
|
|
| 571 |
if (self.use_cache and exists(self.tokenized_df_fid)):
|
| 572 |
self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
|
| 573 |
else:
|
| 574 |
+
if not self.live:
|
| 575 |
+
# tokenize all text instances
|
| 576 |
+
self.tokenized_df = self.do_tokenization()
|
| 577 |
+
if save:
|
| 578 |
+
logs.warning("Saving tokenized dataset to disk")
|
| 579 |
+
# save tokenized text
|
| 580 |
+
write_df(self.tokenized_df, self.tokenized_df_fid)
|
| 581 |
|
| 582 |
def load_or_prepare_text_dset(self, save=True):
|
| 583 |
if (self.use_cache and exists(self.text_dset_fid)):
|
|
|
|
| 587 |
logs.info(self.text_dset)
|
| 588 |
# ...Or load it from the server and store it anew
|
| 589 |
else:
|
| 590 |
+
if not self.live:
|
| 591 |
+
self.prepare_text_dset()
|
| 592 |
+
if save:
|
| 593 |
+
# save extracted text instances
|
| 594 |
+
logs.warning("Saving dataset to disk")
|
| 595 |
+
self.text_dset.save_to_disk(self.text_dset_fid)
|
| 596 |
|
| 597 |
def prepare_text_dset(self):
|
| 598 |
+
if not self.live:
|
| 599 |
+
self.get_base_dataset()
|
| 600 |
+
# extract all text instances
|
| 601 |
+
self.text_dset = self.dset.map(
|
| 602 |
+
lambda examples: extract_field(
|
| 603 |
+
examples, self.text_field, OUR_TEXT_FIELD
|
| 604 |
+
),
|
| 605 |
+
batched=True,
|
| 606 |
+
remove_columns=list(self.dset.features),
|
| 607 |
+
)
|
| 608 |
|
| 609 |
def do_tokenization(self):
|
| 610 |
"""
|
|
|
|
| 663 |
if save:
|
| 664 |
write_plotly(self.fig_labels, self.fig_labels_json_fid)
|
| 665 |
else:
|
| 666 |
+
if not self.live:
|
| 667 |
+
self.prepare_labels()
|
| 668 |
+
if save:
|
| 669 |
+
# save extracted label instances
|
| 670 |
+
self.label_dset.save_to_disk(self.label_dset_fid)
|
| 671 |
+
write_plotly(self.fig_labels, self.fig_labels_json_fid)
|
| 672 |
|
| 673 |
def prepare_labels(self):
|
| 674 |
+
if not self.live:
|
| 675 |
+
self.get_base_dataset()
|
| 676 |
+
self.label_dset = self.dset.map(
|
| 677 |
+
lambda examples: extract_field(
|
| 678 |
+
examples, self.label_field, OUR_LABEL_FIELD
|
| 679 |
+
),
|
| 680 |
+
batched=True,
|
| 681 |
+
remove_columns=list(self.dset.features),
|
| 682 |
+
)
|
| 683 |
+
self.label_df = self.label_dset.to_pandas()
|
| 684 |
+
self.fig_labels = make_fig_labels(
|
| 685 |
+
self.label_df, self.label_names, OUR_LABEL_FIELD
|
| 686 |
+
)
|
| 687 |
|
| 688 |
def load_or_prepare_npmi(self):
|
| 689 |
self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
|
|
|
|
| 803 |
joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
|
| 804 |
# When maybe some things have been computed for the selected subgroups.
|
| 805 |
else:
|
| 806 |
+
if not self.live:
|
| 807 |
+
logs.info("Preparing new joint npmi")
|
| 808 |
+
joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
|
| 809 |
+
subgroup_pair, subgroup_files
|
| 810 |
+
)
|
| 811 |
+
# Cache new results
|
| 812 |
+
logs.info("Writing out.")
|
| 813 |
+
for subgroup in subgroup_pair:
|
| 814 |
+
write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
|
| 815 |
+
with open(joint_npmi_fid, "w+") as f:
|
| 816 |
+
joint_npmi_df.to_csv(f)
|
| 817 |
logs.info("The joint npmi df is")
|
| 818 |
logs.info(joint_npmi_df)
|
| 819 |
return joint_npmi_df
|
data_measurements/streamlit_utils.py
CHANGED
|
@@ -178,7 +178,11 @@ def expander_text_lengths(dstats, column_id):
|
|
| 178 |
value=0,
|
| 179 |
step=1,
|
| 180 |
)
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
|
| 184 |
### Third, use a sentence embedding model
|
|
@@ -273,7 +277,7 @@ def expander_text_duplicates(dstats, column_id):
|
|
| 273 |
st.write(
|
| 274 |
"### Here is the list of all the duplicated items and their counts in your dataset:"
|
| 275 |
)
|
| 276 |
-
if dstats.dup_counts_df is None:
|
| 277 |
st.write("There are no duplicates in this dataset! 🥳")
|
| 278 |
else:
|
| 279 |
gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)
|
|
|
|
| 178 |
value=0,
|
| 179 |
step=1,
|
| 180 |
)
|
| 181 |
+
|
| 182 |
+
# This is quite a large file and is breaking our ability to navigate the app development.
|
| 183 |
+
# Just passing if it's not already there for launch v0
|
| 184 |
+
if dstats.length_df is not None:
|
| 185 |
+
st.dataframe(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
|
| 186 |
|
| 187 |
|
| 188 |
### Third, use a sentence embedding model
|
|
|
|
| 277 |
st.write(
|
| 278 |
"### Here is the list of all the duplicated items and their counts in your dataset:"
|
| 279 |
)
|
| 280 |
+
if dstats.dup_counts_df is None or dstats.dup_counts_df.empty:
|
| 281 |
st.write("There are no duplicates in this dataset! 🥳")
|
| 282 |
else:
|
| 283 |
gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)
|