Spaces:
Build error
Build error
Merging from rollback
Browse files- app.py +8 -0
- data_measurements/embeddings.py +3 -4
- data_measurements/streamlit_utils.py +14 -12
app.py
CHANGED
@@ -122,6 +122,12 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
|
|
122 |
dstats.load_or_prepare_zipf()
|
123 |
return dstats
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
126 |
"""
|
127 |
Loader specifically for the widgets used in the app.
|
@@ -144,6 +150,8 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
|
144 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
145 |
# Don't recalculate; we're live
|
146 |
dstats.set_deployment(True)
|
|
|
|
|
147 |
# Header widget
|
148 |
dstats.load_or_prepare_dset_peek()
|
149 |
# General stats widget
|
|
|
122 |
dstats.load_or_prepare_zipf()
|
123 |
return dstats
|
124 |
|
125 |
+
@st.cache(
|
126 |
+
hash_funcs={
|
127 |
+
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
128 |
+
},
|
129 |
+
allow_output_mutation=True,
|
130 |
+
)
|
131 |
def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
132 |
"""
|
133 |
Loader specifically for the widgets used in the app.
|
|
|
150 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
151 |
# Don't recalculate; we're live
|
152 |
dstats.set_deployment(True)
|
153 |
+
# We need the text_dset loaded before the further load_or_prepare calls below
|
154 |
+
dstats.load_or_prepare_dataset()
|
155 |
# Header widget
|
156 |
dstats.load_or_prepare_dset_peek()
|
157 |
# General stats widget
|
data_measurements/embeddings.py
CHANGED
@@ -146,11 +146,12 @@ class Embeddings:
|
|
146 |
[(node["nid"], nid) for nid, node in enumerate(self.node_list)]
|
147 |
)
|
148 |
torch.save((self.node_list, self.nid_map), self.node_list_fid)
|
|
|
149 |
if self.use_cache and exists(self.fig_tree_fid):
|
150 |
self.fig_tree = read_json(self.fig_tree_fid)
|
151 |
else:
|
152 |
self.fig_tree = make_tree_plot(
|
153 |
-
self.node_list, self.text_dset, self.text_field_name
|
154 |
)
|
155 |
self.fig_tree.write_json(self.fig_tree_fid)
|
156 |
|
@@ -460,14 +461,12 @@ def fast_cluster(
|
|
460 |
return node_list
|
461 |
|
462 |
|
463 |
-
def make_tree_plot(node_list, text_dset, text_field_name):
|
464 |
"""
|
465 |
Makes a graphical representation of the tree encoded
|
466 |
in node-list. The hover label for each node shows the number
|
467 |
of descendants and the 5 examples that are closest to the centroid
|
468 |
"""
|
469 |
-
nid_map = dict([(node["nid"], nid) for nid, node in enumerate(node_list)])
|
470 |
-
|
471 |
for nid, node in enumerate(node_list):
|
472 |
# collect the examples associated with this node (presumably for the hover label — confirm)
|
473 |
node_examples = {}
|
|
|
146 |
[(node["nid"], nid) for nid, node in enumerate(self.node_list)]
|
147 |
)
|
148 |
torch.save((self.node_list, self.nid_map), self.node_list_fid)
|
149 |
+
print(exists(self.fig_tree_fid), self.fig_tree_fid)
|
150 |
if self.use_cache and exists(self.fig_tree_fid):
|
151 |
self.fig_tree = read_json(self.fig_tree_fid)
|
152 |
else:
|
153 |
self.fig_tree = make_tree_plot(
|
154 |
+
self.node_list, self.nid_map, self.text_dset, self.text_field_name
|
155 |
)
|
156 |
self.fig_tree.write_json(self.fig_tree_fid)
|
157 |
|
|
|
461 |
return node_list
|
462 |
|
463 |
|
464 |
+
def make_tree_plot(node_list, nid_map, text_dset, text_field_name):
|
465 |
"""
|
466 |
Makes a graphical representation of the tree encoded
|
467 |
in node-list. The hover label for each node shows the number
|
468 |
of descendants and the 5 examples that are closest to the centroid
|
469 |
"""
|
|
|
|
|
470 |
for nid, node in enumerate(node_list):
|
471 |
# collect the examples associated with this node (presumably for the hover label — confirm)
|
472 |
node_examples = {}
|
data_measurements/streamlit_utils.py
CHANGED
@@ -21,6 +21,7 @@ from st_aggrid import AgGrid, GridOptionsBuilder
|
|
21 |
|
22 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
23 |
|
|
|
24 |
def sidebar_header():
|
25 |
st.sidebar.markdown(
|
26 |
"""
|
@@ -107,9 +108,7 @@ def expander_general_stats(dstats, column_id):
|
|
107 |
"Use this widget to check whether the terms you see most represented"
|
108 |
" in the dataset make sense for the goals of the dataset."
|
109 |
)
|
110 |
-
st.markdown(
|
111 |
-
"There are {0} total words".format(str(dstats.total_words))
|
112 |
-
)
|
113 |
st.markdown(
|
114 |
"There are {0} words after removing closed "
|
115 |
"class words".format(str(dstats.total_open_words))
|
@@ -129,14 +128,10 @@ def expander_general_stats(dstats, column_id):
|
|
129 |
st.markdown(
|
130 |
"There are {0} duplicate items in the dataset. "
|
131 |
"For more information about the duplicates, "
|
132 |
-
"click the 'Duplicates' tab below.".format(
|
133 |
-
str(dstats.dedup_total)
|
134 |
-
)
|
135 |
)
|
136 |
else:
|
137 |
-
st.markdown(
|
138 |
-
"There are 0 duplicate items in the dataset. ")
|
139 |
-
|
140 |
|
141 |
|
142 |
### Show the label distribution from the datasets
|
@@ -166,7 +161,6 @@ def expander_text_lengths(dstats, column_id):
|
|
166 |
st.markdown(
|
167 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
168 |
)
|
169 |
-
# TODO: figure out a more elegant way to do this:
|
170 |
try:
|
171 |
st.image(dstats.fig_tok_length_png)
|
172 |
except:
|
@@ -181,8 +175,16 @@ def expander_text_lengths(dstats, column_id):
|
|
181 |
# This is quite a large file and is breaking our ability to navigate the app development.
|
182 |
# Just passing if it's not already there for launch v0
|
183 |
if dstats.length_df is not None:
|
184 |
-
start_id_show_lengths= st.selectbox(
|
185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
|
188 |
### Third, use a sentence embedding model
|
|
|
21 |
|
22 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
23 |
|
24 |
+
|
25 |
def sidebar_header():
|
26 |
st.sidebar.markdown(
|
27 |
"""
|
|
|
108 |
"Use this widget to check whether the terms you see most represented"
|
109 |
" in the dataset make sense for the goals of the dataset."
|
110 |
)
|
111 |
+
st.markdown("There are {0} total words".format(str(dstats.total_words)))
|
|
|
|
|
112 |
st.markdown(
|
113 |
"There are {0} words after removing closed "
|
114 |
"class words".format(str(dstats.total_open_words))
|
|
|
128 |
st.markdown(
|
129 |
"There are {0} duplicate items in the dataset. "
|
130 |
"For more information about the duplicates, "
|
131 |
+
"click the 'Duplicates' tab below.".format(str(dstats.dedup_total))
|
|
|
|
|
132 |
)
|
133 |
else:
|
134 |
+
st.markdown("There are 0 duplicate items in the dataset. ")
|
|
|
|
|
135 |
|
136 |
|
137 |
### Show the label distribution from the datasets
|
|
|
161 |
st.markdown(
|
162 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
163 |
)
|
|
|
164 |
try:
|
165 |
st.image(dstats.fig_tok_length_png)
|
166 |
except:
|
|
|
175 |
# This is quite a large file and is breaking our ability to navigate the app development.
|
176 |
# Just passing if it's not already there for launch v0
|
177 |
if dstats.length_df is not None:
|
178 |
+
start_id_show_lengths = st.selectbox(
|
179 |
+
"Show examples of length:",
|
180 |
+
sorted(dstats.length_df["length"].unique().tolist()),
|
181 |
+
key=f"select_show_length_{column_id}",
|
182 |
+
)
|
183 |
+
st.table(
|
184 |
+
dstats.length_df[
|
185 |
+
dstats.length_df["length"] == start_id_show_lengths
|
186 |
+
].set_index("length")
|
187 |
+
)
|
188 |
|
189 |
|
190 |
### Third, use a sentence embedding model
|