Spaces:

huggingface
/

data-measurements-tool

Build error

App Files Community

Yacine Jernite committed on Dec 7, 2021

Commit

d437368

2 Parent(s): 2af16ed dffdb92

Merge branch 'main' of https://huggingface.co/spaces/huggingface/data-measurements-tool-2 into main

Browse files

Files changed (6) hide show

cache_dir/wikitext_wikitext-103-v1_train_text/fig_tok_length.png +3 -0
cache_dir/wikitext_wikitext-103-v1_validation_text/fig_tok_length.png +3 -0
cache_dir/wikitext_wikitext-2-v1_train_text/fig_tok_length.png +3 -0
cache_dir/wikitext_wikitext-2-v1_validation_text/fig_tok_length.png +3 -0
data_measurements/dataset_statistics.py +1 -1
run_data_measurements.py +28 -32

cache_dir/wikitext_wikitext-103-v1_train_text/fig_tok_length.png ADDED Viewed

Git LFS Details

SHA256: 7c77df667d1ea9a678fc9c3f67ea6e066774949926e2e71198ff8f5b062bb885
Pointer size: 130 Bytes
Size of remote file: 37 kB

cache_dir/wikitext_wikitext-103-v1_validation_text/fig_tok_length.png ADDED Viewed

Git LFS Details

SHA256: 0f06c1bda3ad86339023fc1d6f95ac1145c629e0f1c003df59949986e40dc897
Pointer size: 130 Bytes
Size of remote file: 44.8 kB

cache_dir/wikitext_wikitext-2-v1_train_text/fig_tok_length.png ADDED Viewed

Git LFS Details

SHA256: 2a08287707e4b9430a9f0e6678269212b854ab21b3a34614ec7f15158ba22e93
Pointer size: 130 Bytes
Size of remote file: 39.6 kB

cache_dir/wikitext_wikitext-2-v1_validation_text/fig_tok_length.png ADDED Viewed

Git LFS Details

SHA256: 08721bbfc146992d3954e4c4dd54cd12da4441782a2be879e68f129bd5cb6044
Pointer size: 130 Bytes
Size of remote file: 44.8 kB

data_measurements/dataset_statistics.py CHANGED Viewed

@@ -455,7 +455,7 @@ class DatasetStatisticsCacheClass:
             self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
         else:
             logs.info("Calculating vocab afresh")
-            if len(self.tokenized_df) == 0:
                 self.tokenized_df = self.do_tokenization()
                 if save:
                     logs.info("Writing out.")

             self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
         else:
             logs.info("Calculating vocab afresh")
+            if self.tokenized_df is None:
                 self.tokenized_df = self.do_tokenization()
                 if save:
                     logs.info("Writing out.")

run_data_measurements.py CHANGED Viewed

@@ -12,13 +12,14 @@ from data_measurements import dataset_utils
 def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     """
     Loader specifically for the widgets used in the app.
     Args:
-        ds_args:
-        show_embeddings:
-        use_cache:
     Returns:
     """
     if not isdir(ds_args["cache_dir"]):
@@ -30,13 +31,15 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
                                                             use_cache=use_cache)
     # Header widget
     dstats.load_or_prepare_dset_peek()
     # General stats widget
     dstats.load_or_prepare_general_stats()
     # Labels widget
     try:
-        dstats.set_label_field("label")
         dstats.load_or_prepare_labels()
     except:
         pass
@@ -56,7 +59,16 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     dstats.load_or_prepare_zipf()
-def load_or_prepare(dataset_args, do_html=False, use_cache=False):
     all = False
     dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
     print("Loading dataset.")
@@ -79,22 +91,15 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
     if all or dataset_args["calculation"] == "lengths":
         print("\n* Calculating text lengths.")
-        fig_tok_length_fid = pjoin(dstats.cache_path, "lengths_fig.html")
-        tok_length_json_fid = pjoin(dstats.cache_path, "lengths.json")
         dstats.load_or_prepare_text_lengths()
-        with open(tok_length_json_fid, "w+") as f:
-            json.dump(dstats.fig_tok_length.to_json(), f)
-            print("Token lengths now available at %s." % tok_length_json_fid)
-        if do_html:
-            dstats.fig_tok_length.write_html(fig_tok_length_fid)
-            print("Figure saved to %s." % fig_tok_length_fid)
         print("Done!")
     if all or dataset_args["calculation"] == "labels":
         if not dstats.label_field:
-            print("Warning: You asked for label calculation, but didn't provide "
-                  "the labels field name.  Assuming it is 'label'...")
             dstats.set_label_field("label")
             print("\n* Calculating label distribution.")
             dstats.load_or_prepare_labels()
             fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
@@ -111,7 +116,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
         npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
             dstats, use_cache=use_cache
         )
-        do_npmi(npmi_stats, use_cache=use_cache)
         print("Done!")
         print(
             "nPMI results now available in %s for all identity terms that "
@@ -142,7 +147,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
         dstats.load_or_prepare_embeddings()
-def do_npmi(npmi_stats, use_cache=True):
     available_terms = npmi_stats.load_or_prepare_npmi_terms()
     completed_pairs = {}
     print("Iterating through terms for joint npmi.")
@@ -165,7 +170,6 @@ def get_text_label_df(
     label_field,
     calculation,
     out_dir,
-    do_html=False,
     use_cache=True,
 ):
     if not use_cache:
@@ -190,7 +194,7 @@ def get_text_label_df(
         "calculation": calculation,
         "cache_dir": out_dir,
     }
-    load_or_prepare_widgets(dataset_args, use_cache=use_cache)
 def main():
@@ -272,18 +276,10 @@ def main():
     args = parser.parse_args()
     print("Proceeding with the following arguments:")
     print(args)
-    # run_data_measurements.py -n hate_speech18 -c default -s train -f text -w npmi
-    get_text_label_df(
-        args.dataset,
-        args.config,
-        args.split,
-        args.feature,
-        args.label_field,
-        args.calculation,
-        args.out_dir,
-        do_html=args.do_html,
-        use_cache=args.cached,
-    )
     print()

 def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     """
     Loader specifically for the widgets used in the app.
+    Does not take specifications from user.
     Args:
+        ds_args: Dataset configuration settings (config name, split, etc)
+        show_embeddings: Whether to compute embeddings (slow)
+        use_cache: Whether to grab files that have already been computed
     Returns:
+        Saves files to disk in cache_dir, if user has not specified another dir.
     """
     if not isdir(ds_args["cache_dir"]):
     dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
                                                             use_cache=use_cache)
+    # Embeddings widget
+    dstats.load_or_prepare_dataset()
     # Header widget
     dstats.load_or_prepare_dset_peek()
     # General stats widget
     dstats.load_or_prepare_general_stats()
     # Labels widget
     try:
+        dstats.set_label_field(ds_args['label_field'])
         dstats.load_or_prepare_labels()
     except:
         pass
     dstats.load_or_prepare_zipf()
+def load_or_prepare(dataset_args, use_cache=False):
+    """
+    Users can specify which aspects of the dataset they would like to compute.
+    Args:
+        dataset_args: Dataset configuration settings (config name, split, etc)
+        use_cache: Whether to grab files that have already been computed
+    Returns:
+        Saves files to disk in cache_dir, if user has not specified another dir.
+    """
     all = False
     dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
     print("Loading dataset.")
     if all or dataset_args["calculation"] == "lengths":
         print("\n* Calculating text lengths.")
         dstats.load_or_prepare_text_lengths()
         print("Done!")
     if all or dataset_args["calculation"] == "labels":
         if not dstats.label_field:
+            print("Warning: You asked for label calculation, but didn't "
+                  "provide the labels field name.  Assuming it is 'label'...")
             dstats.set_label_field("label")
+        else:
             print("\n* Calculating label distribution.")
             dstats.load_or_prepare_labels()
             fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
         npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
             dstats, use_cache=use_cache
         )
+        do_npmi(npmi_stats)
         print("Done!")
         print(
             "nPMI results now available in %s for all identity terms that "
         dstats.load_or_prepare_embeddings()
+def do_npmi(npmi_stats):
     available_terms = npmi_stats.load_or_prepare_npmi_terms()
     completed_pairs = {}
     print("Iterating through terms for joint npmi.")
     label_field,
     calculation,
     out_dir,
     use_cache=True,
 ):
     if not use_cache:
         "calculation": calculation,
         "cache_dir": out_dir,
     }
+    load_or_prepare(dataset_args, use_cache=use_cache)
 def main():
     args = parser.parse_args()
     print("Proceeding with the following arguments:")
     print(args)
+    # run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
+    get_text_label_df(args.dataset, args.config, args.split, args.feature,
+                      args.label_field, args.calculation, args.out_dir,
+                      use_cache=args.cached)
     print()