meg-huggingface committed on
Commit
d508e46
1 Parent(s): dffdb92

More doc stringing and printing stuff

Browse files
Files changed (1) hide show
  1. run_data_measurements.py +14 -5
run_data_measurements.py CHANGED
@@ -11,8 +11,10 @@ from data_measurements import dataset_utils
11
 
12
  def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
13
  """
14
- Loader specifically for the widgets used in the app.
15
- Does not take specifications from user.
 
 
16
  Args:
17
  ds_args: Dataset configuration settings (config name, split, etc)
18
  show_embeddings: Whether to compute embeddings (slow)
@@ -62,6 +64,10 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
62
  def load_or_prepare(dataset_args, use_cache=False):
63
  """
64
  Users can specify which aspects of the dataset they would like to compute.
 
 
 
 
65
  Args:
66
  dataset_args: Dataset configuration settings (config name, split, etc)
67
  use_cache: Whether to grab files that have already been computed
@@ -70,7 +76,8 @@ def load_or_prepare(dataset_args, use_cache=False):
70
  Saves files to disk in cache_dir, if user has not specified another dir.
71
  """
72
  all = False
73
- dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
 
74
  print("Loading dataset.")
75
  dstats.load_or_prepare_dataset()
76
  print("Dataset loaded. Preparing vocab.")
@@ -84,7 +91,8 @@ def load_or_prepare(dataset_args, use_cache=False):
84
  print("\n* Calculating general statistics.")
85
  dstats.load_or_prepare_general_stats()
86
  print("Done!")
87
- print("Basic text statistics now available at %s." % dstats.general_stats_json_fid)
 
88
  print(
89
  "Text duplicates now available at %s." % dstats.dup_counts_df_fid
90
  )
@@ -108,7 +116,8 @@ def load_or_prepare(dataset_args, use_cache=False):
108
  with open(fig_label_json, "w+") as f:
109
  json.dump(dstats.fig_labels.to_json(), f)
110
  print("Done!")
111
- print("Label distribution now available at %s." % dstats.label_dset_fid)
 
112
  print("Figure saved to %s." % fig_label_html)
113
 
114
  if all or dataset_args["calculation"] == "npmi":
 
11
 
12
  def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
13
  """
14
+ Loader specifically for the widgets used in the app -- does not compute
15
+ intermediate files, unless they are not there and are needed for a file
16
+ used in the UI.
17
+ Does not take specifications from user; does all widgets.
18
  Args:
19
  ds_args: Dataset configuration settings (config name, split, etc)
20
  show_embeddings: Whether to compute embeddings (slow)
 
64
  def load_or_prepare(dataset_args, use_cache=False):
65
  """
66
  Users can specify which aspects of the dataset they would like to compute.
67
+ This additionally computes intermediate files not used in the UI.
68
+ If the calculation flag is not specified by the user (-w), calculates all
69
+ except for embeddings, as those are quite time consuming so should be
70
+ specified separately.
71
  Args:
72
  dataset_args: Dataset configuration settings (config name, split, etc)
73
  use_cache: Whether to grab files that have already been computed
 
76
  Saves files to disk in cache_dir, if user has not specified another dir.
77
  """
78
  all = False
79
+ dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args,
80
+ use_cache=use_cache)
81
  print("Loading dataset.")
82
  dstats.load_or_prepare_dataset()
83
  print("Dataset loaded. Preparing vocab.")
 
91
  print("\n* Calculating general statistics.")
92
  dstats.load_or_prepare_general_stats()
93
  print("Done!")
94
+ print("Basic text statistics now available at %s." %
95
+ dstats.general_stats_json_fid)
96
  print(
97
  "Text duplicates now available at %s." % dstats.dup_counts_df_fid
98
  )
 
116
  with open(fig_label_json, "w+") as f:
117
  json.dump(dstats.fig_labels.to_json(), f)
118
  print("Done!")
119
+ print("Label distribution now available at %s." %
120
+ dstats.label_dset_fid)
121
  print("Figure saved to %s." % fig_label_html)
122
 
123
  if all or dataset_args["calculation"] == "npmi":