Spaces:
Build error
Build error
Yacine Jernite
committed on
Commit
·
f4b8e6e
1
Parent(s):
c500e3c
can only select available splits
Browse files
cache_dir/has_cache.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e7d89146f736ca9852dd82abaa7d29225499d53ca16f7714cfa576915e0a7d7
|
| 3 |
+
size 3584
|
data_measurements/streamlit_utils.py
CHANGED
|
@@ -14,6 +14,7 @@
|
|
| 14 |
|
| 15 |
import statistics
|
| 16 |
|
|
|
|
| 17 |
import pandas as pd
|
| 18 |
import seaborn as sns
|
| 19 |
import streamlit as st
|
|
@@ -22,6 +23,8 @@ from st_aggrid import AgGrid, GridOptionsBuilder
|
|
| 22 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
| 23 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 24 |
|
|
|
|
|
|
|
| 25 |
def sidebar_header():
|
| 26 |
st.sidebar.markdown(
|
| 27 |
"""
|
|
@@ -29,16 +32,17 @@ def sidebar_header():
|
|
| 29 |
Right now this has a few pre-loaded datasets for which you can:
|
| 30 |
- view some general statistics about the text vocabulary, lengths, labels
|
| 31 |
- explore some distributional statistics to assess properties of the language
|
| 32 |
-
- view some comparison statistics and overview of the text distribution
|
| 33 |
-
|
| 34 |
-
The tool is in development, and will keep growing in utility and functionality 🤗🚧
|
| 35 |
""",
|
| 36 |
unsafe_allow_html=True,
|
| 37 |
)
|
| 38 |
|
| 39 |
|
| 40 |
def sidebar_selection(ds_name_to_dict, column_id):
|
| 41 |
-
ds_names = list(ds_name_to_dict.keys())
|
|
|
|
| 42 |
with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
|
| 43 |
# choose a dataset to analyze
|
| 44 |
ds_name = st.selectbox(
|
|
@@ -52,6 +56,7 @@ def sidebar_selection(ds_name_to_dict, column_id):
|
|
| 52 |
config_names = ['en','en.noblocklist','realnewslike']
|
| 53 |
else:
|
| 54 |
config_names = list(ds_configs.keys())
|
|
|
|
| 55 |
config_name = st.selectbox(
|
| 56 |
f"Choose configuration{column_id}:",
|
| 57 |
config_names,
|
|
@@ -60,7 +65,8 @@ def sidebar_selection(ds_name_to_dict, column_id):
|
|
| 60 |
# choose a subset of num_examples
|
| 61 |
# TODO: Handling for multiple text features
|
| 62 |
ds_config = ds_configs[config_name]
|
| 63 |
-
text_features = ds_config[HF_FEATURE_FIELD]["string"]
|
|
|
|
| 64 |
# TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
|
| 65 |
text_field = st.selectbox(
|
| 66 |
f"Which text feature from the{column_id} dataset would you like to analyze?",
|
|
@@ -69,7 +75,8 @@ def sidebar_selection(ds_name_to_dict, column_id):
|
|
| 69 |
else [tp for tp in text_features if tp[0] != "id"],
|
| 70 |
)
|
| 71 |
# Choose a split and dataset size
|
| 72 |
-
avail_splits = list(ds_config["splits"].keys())
|
|
|
|
| 73 |
# 12.Nov note: Removing "test" because those should not be examined
|
| 74 |
# without discussion of pros and cons, which we haven't done yet.
|
| 75 |
if "test" in avail_splits:
|
|
|
|
| 14 |
|
| 15 |
import statistics
|
| 16 |
|
| 17 |
+
import json
|
| 18 |
import pandas as pd
|
| 19 |
import seaborn as sns
|
| 20 |
import streamlit as st
|
|
|
|
| 23 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
| 24 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 25 |
|
| 26 |
+
_HAS_CACHE = json.load(open("cache_dir/has_cache.json"))
|
| 27 |
+
|
| 28 |
def sidebar_header():
|
| 29 |
st.sidebar.markdown(
|
| 30 |
"""
|
|
|
|
| 32 |
Right now this has a few pre-loaded datasets for which you can:
|
| 33 |
- view some general statistics about the text vocabulary, lengths, labels
|
| 34 |
- explore some distributional statistics to assess properties of the language
|
| 35 |
+
- view some comparison statistics and overview of the text distribution
|
| 36 |
+
|
| 37 |
+
The tool is in development, and will keep growing in utility and functionality 🤗🚧
|
| 38 |
""",
|
| 39 |
unsafe_allow_html=True,
|
| 40 |
)
|
| 41 |
|
| 42 |
|
| 43 |
def sidebar_selection(ds_name_to_dict, column_id):
|
| 44 |
+
# ds_names = list(ds_name_to_dict.keys())
|
| 45 |
+
ds_names = list(_HAS_CACHE.keys())
|
| 46 |
with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
|
| 47 |
# choose a dataset to analyze
|
| 48 |
ds_name = st.selectbox(
|
|
|
|
| 56 |
config_names = ['en','en.noblocklist','realnewslike']
|
| 57 |
else:
|
| 58 |
config_names = list(ds_configs.keys())
|
| 59 |
+
config_names = list(_HAS_CACHE[ds_name].keys())
|
| 60 |
config_name = st.selectbox(
|
| 61 |
f"Choose configuration{column_id}:",
|
| 62 |
config_names,
|
|
|
|
| 65 |
# choose a subset of num_examples
|
| 66 |
# TODO: Handling for multiple text features
|
| 67 |
ds_config = ds_configs[config_name]
|
| 68 |
+
# text_features = ds_config[HF_FEATURE_FIELD]["string"]
|
| 69 |
+
text_features = [tuple(text_field.split('-')) for text_field in _HAS_CACHE[ds_name][config_name]]
|
| 70 |
# TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
|
| 71 |
text_field = st.selectbox(
|
| 72 |
f"Which text feature from the{column_id} dataset would you like to analyze?",
|
|
|
|
| 75 |
else [tp for tp in text_features if tp[0] != "id"],
|
| 76 |
)
|
| 77 |
# Choose a split and dataset size
|
| 78 |
+
# avail_splits = list(ds_config["splits"].keys())
|
| 79 |
+
avail_splits = list(_HAS_CACHE[ds_name][config_name]['-'.join(text_field)].keys())
|
| 80 |
# 12.Nov note: Removing "test" because those should not be examined
|
| 81 |
# without discussion of pros and cons, which we haven't done yet.
|
| 82 |
if "test" in avail_splits:
|