meg-huggingface committed · Commit d3c28ec
Parent(s): e3f7160

Loading per-widget. Various changes to streamlit interactions for efficiency.
Files changed:
- app.py +13 -14
- data_measurements/dataset_statistics.py +33 -23
- data_measurements/streamlit_utils.py +11 -6
app.py CHANGED

@@ -101,20 +101,18 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
     if use_cache:
         logs.warning("Using cache")
     dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
-    logs.warning("Loading
+    logs.warning("Loading dataset")
     dstats.load_or_prepare_dataset()
-    logs.warning("
+    logs.warning("Loading labels")
     dstats.load_or_prepare_labels()
-    logs.warning("
+    logs.warning("Loading text lengths")
     dstats.load_or_prepare_text_lengths()
-    logs.warning("
+    logs.warning("Loading duplicates")
     dstats.load_or_prepare_text_duplicates()
-    logs.warning("
+    logs.warning("Loading vocabulary")
     dstats.load_or_prepare_vocab()
-    logs.warning("
+    logs.warning("Loading general statistics...")
     dstats.load_or_prepare_general_stats()
-    logs.warning("Completed Calculation.")
-    logs.warning("Calculating Fine-Grained Statistics...")
     if show_embeddings:
         logs.warning("Loading Embeddings")
         dstats.load_or_prepare_embeddings()

@@ -135,6 +133,7 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
     Returns:

     """
+
     if not isdir(CACHE_DIR):
         logs.warning("Creating cache")
         # We need to preprocess everything.

@@ -143,6 +142,8 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
     if use_cache:
         logs.warning("Using cache")
     dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+    # Don't recalculate; we're live
+    dstats.set_deployment(True)
     # Header widget
     dstats.load_or_prepare_dset_peek()
     # General stats widget

@@ -157,23 +158,21 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
     dstats.load_or_prepare_text_duplicates()
     dstats.load_or_prepare_npmi()
     dstats.load_or_prepare_zipf()
-
-    dstats.set_deployment(True)
+    return dstats

-def show_column(dstats, ds_name_to_dict, show_embeddings, column_id
+def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
     """
     Function for displaying the elements in the right column of the streamlit app.
     Args:
         ds_name_to_dict (dict): the dataset name and options in dictionary form
         show_embeddings (Bool): whether embeddings should we loaded and displayed for this dataset
         column_id (str): what column of the dataset the analysis is done on
-        use_cache (Bool): whether the cache is used by default or not
     Returns:
         The function displays the information using the functions defined in the st_utils class.
     """
     # Note that at this point we assume we can use cache; default value is True.
     # start showing stuff
-    title_str = f"### Showing{column_id}: {dstats.dset_name} - {dstats.dset_config} - {'-'.join(dstats.text_field)}"
+    title_str = f"### Showing{column_id}: {dstats.dset_name} - {dstats.dset_config} - {dstats.split_name} - {'-'.join(dstats.text_field)}"
     st.markdown(title_str)
     logs.info("showing header")
     st_utils.expander_header(dstats, ds_name_to_dict, column_id)

@@ -230,7 +229,7 @@ def main():
     else:
         logs.warning("Using Single Dataset Mode")
         dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
-        dstats =
+        dstats = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
         show_column(dstats, ds_name_to_dict, show_embeddings, "")
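The net effect of the app.py changes: load_or_prepare_widgets now flags the statistics object as deployed before any per-widget load, and returns the object so main() can hand it to show_column. A minimal sketch of that control flow, using a simplified stand-in class rather than the repo's actual DatasetStatisticsCacheClass API:

# WidgetStats is a hypothetical stand-in; only the control flow mirrors
# the commit, not the real class.
class WidgetStats:
    def __init__(self):
        self.live = False          # True => cache reads only, no recomputation
        self.general_stats = None

    def set_deployment(self, deployment_mode):
        self.live = deployment_mode

    def load_or_prepare_general_stats(self):
        # In live mode this would only read a cached file; here we simply
        # skip the placeholder computation when live.
        if self.general_stats is None and not self.live:
            self.general_stats = {"total words": 0}

def load_or_prepare_widgets(stats):
    # Flag deployment *before* the per-widget loads, as the diff does, so
    # every load_or_prepare_* call is cache-only; then return the object.
    stats.set_deployment(True)
    stats.load_or_prepare_general_stats()
    return stats

dstats = load_or_prepare_widgets(WidgetStats())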
data_measurements/dataset_statistics.py CHANGED

@@ -178,6 +178,7 @@ class DatasetStatisticsCacheClass:
         self.dset_config = dset_config
         # name of the split to analyze
         self.split_name = split_name
+        # TODO: Chould this be "feature" ?
         # which text fields are we analysing?
         self.text_field = text_field
         # which label fields are we analysing?

@@ -207,6 +208,7 @@ class DatasetStatisticsCacheClass:
         self.vocab_counts_df = None
         # Vocabulary filtered to remove stopwords
         self.vocab_counts_filtered_df = None
+        self.sorted_top_vocab_df = None
         ## General statistics and duplicates
         self.total_words = 0
         self.total_open_words = 0

@@ -340,12 +342,13 @@
             logs.info('Loading cached general stats')
             self.load_general_stats()
         else:
-
-
-
-
-
-
+            if not self.live:
+                logs.info('Preparing general stats')
+                self.prepare_general_stats()
+                if save:
+                    write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
+                    write_json(self.general_stats_dict, self.general_stats_json_fid)


     def load_or_prepare_text_lengths(self, save=True):

@@ -362,17 +365,19 @@
         if (self.use_cache and exists(self.fig_tok_length_fid)):
             self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
         else:
-            self.
-
-
+            if not self.live:
+                self.prepare_fig_text_lengths()
+                if save:
+                    write_plotly(self.fig_tok_length, self.fig_tok_length_fid)

         # Text length dataframe
         if self.use_cache and exists(self.length_df_fid):
             self.length_df = feather.read_feather(self.length_df_fid)
         else:
-            self.
-
-
+            if not self.live:
+                self.prepare_length_df()
+                if save:
+                    write_df(self.length_df, self.length_df_fid)

         # Text length stats.
         if self.use_cache and exists(self.length_stats_json_fid):

@@ -382,9 +387,10 @@
             self.std_length = self.length_stats_dict["std length"]
             self.num_uniq_lengths = self.length_stats_dict["num lengths"]
         else:
-            self.
-
-
+            if not self.live:
+                self.prepare_text_length_stats()
+                if save:
+                    write_json(self.length_stats_dict, self.length_stats_json_fid)

     def prepare_length_df(self):
         if not self.live:

@@ -481,15 +487,17 @@
             with open(self.dup_counts_df_fid, "rb") as f:
                 self.dup_counts_df = feather.read_feather(f)
         elif self.dup_counts_df is None:
-            self.
-
-
+            if not self.live:
+                self.prepare_text_duplicates()
+                if save:
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
         else:
-
-
-
-
-
+            if not self.live:
+                # This happens when self.dup_counts_df is already defined;
+                # This happens when general_statistics were calculated first,
+                # since general statistics requires the number of duplicates
+                if save:
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)

     def load_general_stats(self):
         self.general_stats_dict = json.load(open(self.general_stats_json_fid, encoding="utf-8"))

@@ -815,6 +823,8 @@ class nPMIStatisticsCacheClass:
             write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
             with open(joint_npmi_fid, "w+") as f:
                 joint_npmi_df.to_csv(f)
+        else:
+            joint_npmi_df = pd.DataFrame()
         logs.info("The joint npmi df is")
         logs.info(joint_npmi_df)
         return joint_npmi_df
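Every hunk above instantiates the same guard: read the cached artifact if it exists; otherwise compute it only when not live; persist the fresh result when save is set. A minimal generic sketch of that pattern, with plain JSON standing in for the class's feather/plotly/json writers (load_or_prepare, _load, and _write here are illustrative names, not the repo's):

import json
from os.path import exists

def load_or_prepare(fid, load, prepare, write, use_cache=True, live=False, save=True):
    """Generic form of the guard used throughout dataset_statistics.py:
    a cached artifact wins; otherwise compute only when not live, and
    optionally persist the result for the next run."""
    if use_cache and exists(fid):
        return load(fid)
    result = None
    if not live:  # live (deployed) instances never recompute
        result = prepare()
        if save:
            write(result, fid)
    return result

# Usage, with JSON as the stand-in serialization format:
def _load(fid):
    with open(fid, encoding="utf-8") as f:
        return json.load(f)

def _write(obj, fid):
    with open(fid, "w", encoding="utf-8") as f:
        json.dump(obj, f)

stats = load_or_prepare("general_stats.json", _load, lambda: {"total words": 0}, _write)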
data_measurements/streamlit_utils.py CHANGED

@@ -126,13 +126,18 @@ def expander_general_stats(dstats, column_id):
                     str(dstats.text_nan_count)
                 )
             )
-
-
-
-
-
+            if dstats.dedup_total > 0:
+                st.markdown(
+                    "There are {0} duplicate items in the dataset. "
+                    "For more information about the duplicates, "
+                    "click the 'Duplicates' tab below.".format(
+                        str(dstats.dedup_total)
+                    )
                 )
-
+            else:
+                st.markdown(
+                    "There are 0 duplicate items in the dataset. ")
+


     ### Show the label distribution from the datasets
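Taken together, the last two files show how the app degrades in live mode: a cache miss yields an empty result rather than a recomputation (the joint_npmi_df = pd.DataFrame() fallback), and the widget layer branches on the computed value (dedup_total) before rendering. A small sketch of how those two pieces interact; load_npmi_or_empty and duplicates_message are hypothetical helpers, with st.markdown replaced by a returned string:

import pandas as pd

def load_npmi_or_empty(cache_hit: bool) -> pd.DataFrame:
    # Mirrors the new fallback: on a cache miss while live, hand back an
    # empty frame so downstream rendering degrades instead of crashing.
    if cache_hit:
        return pd.DataFrame({"npmi": [0.1, -0.2]})
    return pd.DataFrame()

def duplicates_message(dedup_total: int) -> str:
    # Mirrors the new conditional in expander_general_stats.
    if dedup_total > 0:
        return (f"There are {dedup_total} duplicate items in the dataset. "
                "For more information about the duplicates, "
                "click the 'Duplicates' tab below.")
    return "There are 0 duplicate items in the dataset."

print(load_npmi_or_empty(cache_hit=False).empty)  # True
print(duplicates_message(0))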
|