Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
·
85cf91c
1
Parent(s):
6a9c993
Modularization and caching of text length widget
Browse files- app.py +3 -12
- data_measurements/dataset_statistics.py +59 -14
- data_measurements/streamlit_utils.py +8 -17
app.py
CHANGED
|
@@ -177,15 +177,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
|
|
| 177 |
logs.info("showing general stats")
|
| 178 |
st_utils.expander_general_stats(dstats, column_id)
|
| 179 |
st_utils.expander_label_distribution(dstats.fig_labels, column_id)
|
| 180 |
-
st_utils.expander_text_lengths(
|
| 181 |
-
dstats.tokenized_df,
|
| 182 |
-
dstats.fig_tok_length,
|
| 183 |
-
dstats.avg_length,
|
| 184 |
-
dstats.std_length,
|
| 185 |
-
OUR_TEXT_FIELD,
|
| 186 |
-
LENGTH_FIELD,
|
| 187 |
-
column_id,
|
| 188 |
-
)
|
| 189 |
st_utils.expander_text_duplicates(dstats, column_id)
|
| 190 |
|
| 191 |
# We do the loading of these after the others in order to have some time
|
|
@@ -197,8 +189,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
|
|
| 197 |
)
|
| 198 |
available_terms = npmi_stats.get_available_terms()
|
| 199 |
st_utils.npmi_widget(
|
| 200 |
-
column_id, available_terms, npmi_stats, _MIN_VOCAB_COUNT
|
| 201 |
-
)
|
| 202 |
logs.info("showing zipf")
|
| 203 |
st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
|
| 204 |
if show_embeddings:
|
|
@@ -222,7 +213,7 @@ def main():
|
|
| 222 |
compare_mode = st.sidebar.checkbox("Comparison mode")
|
| 223 |
|
| 224 |
# When not doing new development, use the cache.
|
| 225 |
-
use_cache =
|
| 226 |
show_embeddings = st.sidebar.checkbox("Show embeddings")
|
| 227 |
# List of datasets for which embeddings are hard to compute:
|
| 228 |
|
|
|
|
| 177 |
logs.info("showing general stats")
|
| 178 |
st_utils.expander_general_stats(dstats, column_id)
|
| 179 |
st_utils.expander_label_distribution(dstats.fig_labels, column_id)
|
| 180 |
+
st_utils.expander_text_lengths(dstats, column_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
st_utils.expander_text_duplicates(dstats, column_id)
|
| 182 |
|
| 183 |
# We do the loading of these after the others in order to have some time
|
|
|
|
| 189 |
)
|
| 190 |
available_terms = npmi_stats.get_available_terms()
|
| 191 |
st_utils.npmi_widget(
|
| 192 |
+
column_id, available_terms, npmi_stats, _MIN_VOCAB_COUNT)
|
|
|
|
| 193 |
logs.info("showing zipf")
|
| 194 |
st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
|
| 195 |
if show_embeddings:
|
|
|
|
| 213 |
compare_mode = st.sidebar.checkbox("Comparison mode")
|
| 214 |
|
| 215 |
# When not doing new development, use the cache.
|
| 216 |
+
use_cache = True
|
| 217 |
show_embeddings = st.sidebar.checkbox("Show embeddings")
|
| 218 |
# List of datasets for which embeddings are hard to compute:
|
| 219 |
|
data_measurements/dataset_statistics.py
CHANGED
|
@@ -197,6 +197,7 @@ class DatasetStatisticsCacheClass:
|
|
| 197 |
# Tokenized text
|
| 198 |
self.tokenized_df = None
|
| 199 |
# save sentence length histogram in the class so it doesn't ge re-computed
|
|
|
|
| 200 |
self.fig_tok_length = None
|
| 201 |
# Data Frame version of self.label_dset
|
| 202 |
self.label_df = None
|
|
@@ -262,6 +263,8 @@ class DatasetStatisticsCacheClass:
|
|
| 262 |
self.text_dset_fid = pjoin(self.cache_path, "text_dset")
|
| 263 |
self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
|
| 264 |
self.label_dset_fid = pjoin(self.cache_path, "label_dset")
|
|
|
|
|
|
|
| 265 |
self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
|
| 266 |
self.general_stats_fid = pjoin(self.cache_path, "general_stats_dict.json")
|
| 267 |
self.dup_counts_df_fid = pjoin(
|
|
@@ -317,24 +320,66 @@ class DatasetStatisticsCacheClass:
|
|
| 317 |
|
| 318 |
|
| 319 |
def load_or_prepare_text_lengths(self, save=True):
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
if self.tokenized_df is None:
|
| 325 |
self.tokenized_df = self.do_tokenization()
|
| 326 |
-
self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
)
|
| 332 |
-
self.std_length = round(
|
| 333 |
-
statistics.stdev(self.tokenized_df[self.our_length_field]), 1
|
| 334 |
)
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
def load_or_prepare_embeddings(self, save=True):
|
| 340 |
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_fid):
|
|
|
|
| 197 |
# Tokenized text
|
| 198 |
self.tokenized_df = None
|
| 199 |
# save sentence length histogram in the class so it doesn't ge re-computed
|
| 200 |
+
self.length_df = None
|
| 201 |
self.fig_tok_length = None
|
| 202 |
# Data Frame version of self.label_dset
|
| 203 |
self.label_df = None
|
|
|
|
| 263 |
self.text_dset_fid = pjoin(self.cache_path, "text_dset")
|
| 264 |
self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
|
| 265 |
self.label_dset_fid = pjoin(self.cache_path, "label_dset")
|
| 266 |
+
self.length_df_fid = pjoin(self.cache_path, "length_df.feather")
|
| 267 |
+
self.length_stats_fid = pjoin(self.cache_path, "length_stats.json")
|
| 268 |
self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
|
| 269 |
self.general_stats_fid = pjoin(self.cache_path, "general_stats_dict.json")
|
| 270 |
self.dup_counts_df_fid = pjoin(
|
|
|
|
| 320 |
|
| 321 |
|
| 322 |
def load_or_prepare_text_lengths(self, save=True):
|
| 323 |
+
"""
|
| 324 |
+
The text length widget relies on this function, which provides
|
| 325 |
+
a figure of the text lengths, some text length statistics, and
|
| 326 |
+
a text length dataframe to peruse.
|
| 327 |
+
Args:
|
| 328 |
+
save:
|
| 329 |
+
Returns:
|
| 330 |
+
|
| 331 |
+
"""
|
| 332 |
+
# Text length figure
|
| 333 |
+
if (self.use_cache and exists(self.fig_tok_length_fid)):
|
| 334 |
self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
|
| 335 |
+
else:
|
| 336 |
+
self.prepare_fig_text_lengths()
|
| 337 |
+
if save:
|
| 338 |
+
write_plotly(self.fig_tok_length, self.fig_tok_length_fid)
|
| 339 |
+
|
| 340 |
+
# Text length dataframe
|
| 341 |
+
if self.use_cache and exists(self.length_df_fid):
|
| 342 |
+
self.length_df = feather.read_feather(self.length_df_fid)
|
| 343 |
+
else:
|
| 344 |
+
self.prepare_length_df()
|
| 345 |
+
if save:
|
| 346 |
+
write_df(self.length_df, self.length_df_fid)
|
| 347 |
+
|
| 348 |
+
# Text length stats.
|
| 349 |
+
if self.use_cache and exists(self.length_stats_fid):
|
| 350 |
+
with open(self.length_stats_fid, "r") as f:
|
| 351 |
+
self.length_stats_dict = json.load(f)
|
| 352 |
+
self.avg_length = self.length_stats_dict["avg length"]
|
| 353 |
+
self.std_length = self.length_stats_dict["std length"]
|
| 354 |
+
else:
|
| 355 |
+
self.prepare_text_length_stats()
|
| 356 |
+
if save:
|
| 357 |
+
write_json(self.length_stats_dict, self.length_stats_fid)
|
| 358 |
+
|
| 359 |
+
def prepare_length_df(self):
|
| 360 |
if self.tokenized_df is None:
|
| 361 |
self.tokenized_df = self.do_tokenization()
|
| 362 |
+
self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
|
| 363 |
+
TOKENIZED_FIELD].apply(len)
|
| 364 |
+
self.length_df = self.tokenized_df[
|
| 365 |
+
[LENGTH_FIELD, OUR_TEXT_FIELD]].sort_values(
|
| 366 |
+
by=[LENGTH_FIELD], ascending=True
|
|
|
|
|
|
|
|
|
|
| 367 |
)
|
| 368 |
+
|
| 369 |
+
def prepare_text_length_stats(self):
|
| 370 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
| 371 |
+
self.prepare_length_df()
|
| 372 |
+
avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
|
| 373 |
+
self.avg_length = round(avg_length, 1)
|
| 374 |
+
std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
|
| 375 |
+
self.std_length = round(std_length, 1)
|
| 376 |
+
self.length_stats_dict = {"avg length": self.avg_length,
|
| 377 |
+
"std length": self.std_length}
|
| 378 |
+
|
| 379 |
+
def prepare_fig_text_lengths(self):
|
| 380 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
| 381 |
+
self.prepare_length_df()
|
| 382 |
+
self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
|
| 383 |
|
| 384 |
def load_or_prepare_embeddings(self, save=True):
|
| 385 |
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_fid):
|
data_measurements/streamlit_utils.py
CHANGED
|
@@ -147,13 +147,7 @@ def expander_label_distribution(fig_labels, column_id):
|
|
| 147 |
st.markdown("No labels were found in the dataset")
|
| 148 |
|
| 149 |
|
| 150 |
-
def expander_text_lengths(
|
| 151 |
-
tokenized_df,
|
| 152 |
-
fig_tok_length,
|
| 153 |
-
avg_length,
|
| 154 |
-
std_length,
|
| 155 |
-
text_field_name,
|
| 156 |
-
length_field_name,
|
| 157 |
column_id,
|
| 158 |
):
|
| 159 |
_TEXT_LENGTH_CAPTION = (
|
|
@@ -165,31 +159,28 @@ def expander_text_lengths(
|
|
| 165 |
"Below, you can see how the lengths of the text instances in your dataset are distributed."
|
| 166 |
)
|
| 167 |
st.markdown(
|
| 168 |
-
"Any unexpected peaks or valleys in the distribution may help to identify
|
| 169 |
)
|
| 170 |
st.markdown(
|
| 171 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
| 172 |
)
|
| 173 |
-
st.plotly_chart(fig_tok_length, use_container_width=True)
|
| 174 |
-
data = tokenized_df[[length_field_name, text_field_name]].sort_values(
|
| 175 |
-
by=["length"], ascending=True
|
| 176 |
-
)
|
| 177 |
st.markdown(
|
| 178 |
"The average length of text instances is **"
|
| 179 |
-
+ str(avg_length)
|
| 180 |
+ " words**, with a standard deviation of **"
|
| 181 |
-
+ str(std_length)
|
| 182 |
+ "**."
|
| 183 |
)
|
| 184 |
|
| 185 |
start_id_show_lengths = st.slider(
|
| 186 |
f"Show the shortest sentences{column_id} starting at:",
|
| 187 |
0,
|
| 188 |
-
len(
|
| 189 |
value=0,
|
| 190 |
step=1,
|
| 191 |
)
|
| 192 |
-
st.dataframe(
|
| 193 |
|
| 194 |
|
| 195 |
### Third, use a sentence embedding model
|
|
@@ -404,7 +395,7 @@ with an ideal α value of 1."""
|
|
| 404 |
|
| 405 |
|
| 406 |
### Finally finally finally, show nPMI stuff.
|
| 407 |
-
def npmi_widget(column_id, available_terms, npmi_stats, min_vocab
|
| 408 |
"""
|
| 409 |
Part of the main app, but uses a user interaction so pulled out as its own f'n.
|
| 410 |
:param use_cache:
|
|
|
|
| 147 |
st.markdown("No labels were found in the dataset")
|
| 148 |
|
| 149 |
|
| 150 |
+
def expander_text_lengths(dstats,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
column_id,
|
| 152 |
):
|
| 153 |
_TEXT_LENGTH_CAPTION = (
|
|
|
|
| 159 |
"Below, you can see how the lengths of the text instances in your dataset are distributed."
|
| 160 |
)
|
| 161 |
st.markdown(
|
| 162 |
+
"Any unexpected peaks or valleys in the distribution may help to identify instances you want to remove or augment."
|
| 163 |
)
|
| 164 |
st.markdown(
|
| 165 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
| 166 |
)
|
| 167 |
+
st.plotly_chart(dstats.fig_tok_length, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
| 168 |
st.markdown(
|
| 169 |
"The average length of text instances is **"
|
| 170 |
+
+ str(dstats.avg_length)
|
| 171 |
+ " words**, with a standard deviation of **"
|
| 172 |
+
+ str(dstats.std_length)
|
| 173 |
+ "**."
|
| 174 |
)
|
| 175 |
|
| 176 |
start_id_show_lengths = st.slider(
|
| 177 |
f"Show the shortest sentences{column_id} starting at:",
|
| 178 |
0,
|
| 179 |
+
len(dstats.length_df["length"].unique()),
|
| 180 |
value=0,
|
| 181 |
step=1,
|
| 182 |
)
|
| 183 |
+
st.dataframe(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
|
| 184 |
|
| 185 |
|
| 186 |
### Third, use a sentence embedding model
|
|
|
|
| 395 |
|
| 396 |
|
| 397 |
### Finally finally finally, show nPMI stuff.
|
| 398 |
+
def npmi_widget(column_id, available_terms, npmi_stats, min_vocab):
|
| 399 |
"""
|
| 400 |
Part of the main app, but uses a user interaction so pulled out as its own f'n.
|
| 401 |
:param use_cache:
|