Spaces:

bunkalab
/

wikipedia-en

Sleeping

App Files Files Community

charlesdedampierre committed on Dec 18, 2023

Commit

d934fdc

1 Parent(s): 313787c

manual commit

Browse files

Files changed (8) hide show

.gitattributes +2 -0
README.md +1 -1
app.py +57 -0
data/data_sample_wikipedia.csv +3 -0
data/topics_info.csv +3 -0
images/logo.png +0 -0
images/map.png +0 -0
images/pipeline.png +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/data_sample_wikipedia.csv filter=lfs diff=lfs merge=lfs -text
+data/topics_info.csv filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -9,4 +9,4 @@ app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>

app.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import streamlit as st
+import pandas as pd
+st.sidebar.image("images/logo.png", use_column_width=True)
+st.sidebar.write("Bunka Summarizes & Visualizes Information as Maps using LLMs.")
+st.sidebar.title("Github Page")
+st.sidebar.write(
+    "Have a look at the following package on GitHub: https://github.com/charlesdedampierre/BunkaTopics"
+)
+st.sidebar.title("Dataset")
+st.sidebar.write(
+    "We used a subset of Wikipedia dataset: https://huggingface.co/datasets/wikimedia/wikipedia"
+)
+st.title("How to understand large textual datasets?")
+st.info(
+    "We randomly sampled 40,000 articles from the English subset 20231101.en of the Wikipedia dataset. We then took the first 500 words of each articles in order to generate an abstract that will be used for topic modeling."
+)
+df = pd.read_csv("data/data_sample_wikipedia.csv", index_col=[0])
+df = df[["text", "url"]]
+st.dataframe(df, use_container_width=True)
+st.title("Inside the Wikipedia dataset")
+st.image(
+    "images/map.png",
+    use_column_width=True,
+    caption="This mapping can be done for each subset of the Wikipedia dataset, and the articles can be selected on a topic basis through the python package, allowing to filter and curate the data.",
+)
+st.markdown(
+    '<div align="center"><a href="https://charlesdedampierre.github.io/wikipedia-bunka-map"><h2 style="color: #0066ff;">Full Interactive Map</h2></a></div>',
+    unsafe_allow_html=True,
+)
+st.info(
+    "This interactive map explores each datapoint to get a more precise overview of the contents (it takes 10 seconds to load)"
+)
+st.title("Some insights by territory")
+df_info = pd.read_csv("data/topics_info.csv", index_col=[0])
+df_info = df_info[["name", "size", "percent"]]
+df_info["percent"] = round(df_info["percent"] * 100, 3)
+df_info["percent"] = df_info["percent"].apply(lambda x: str(int(x)) + "%")
+st.dataframe(df_info, use_container_width=True)
+st.title("Bunka Exploration Engine")
+st.image(
+    "images/pipeline.png",
+    use_column_width=True,
+)

data/data_sample_wikipedia.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a176b6210f012dda75179d92af0a3471558db0f9a1569d7198338cc6915d6aaf
+size 148888677

data/topics_info.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ac108f2e0ec494838a5fbb5b57cb36ded798f5bdd17fb9f64996bd5fdae230a
+size 36270189

images/logo.png ADDED Viewed

images/map.png ADDED Viewed

images/pipeline.png ADDED Viewed