inspect_web_clusters

Running

App Files Files Community

inspect_web_clusters / app.py

loubnabnl HF Staff

Update app.py

6881bc0 verified almost 2 years ago

raw

history blame

2.5 kB

	import streamlit as st
	from datasets import load_dataset
	import os

	# Token read from the environment (None when unset); it is passed as
	# `token` when the dataset is loaded further down in this script.
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# Page-level configuration and heading.
	st.set_page_config(layout="wide", page_title="FW Clusters inspection")
	st.title("FW clusters inspection (free topics)")

	# Introductory text rendered as Markdown below the title.
	INTRO_TEXT = """
	We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).

	Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10.

	Additionally, the model was tasked with finding the topic of each cluster.
	"""
	st.markdown(INTRO_TEXT)


	@st.cache_data
	def load_data(min_score=1, max_score=10, show_special=False):
	    """Load the cluster dataset and keep only the rows selected by the filters.

	    Args:
	        min_score: Lowest educational score (inclusive) to keep.
	        max_score: Highest educational score (inclusive) to keep.
	        show_special: When true, keep only rows whose ``educational_score``
	            cannot be converted to an integer; rows with a valid score
	            are dropped regardless of the score range.

	    Returns:
	        The filtered ``train`` split of ``HuggingFaceTB/FW_clusters_free_topics``.
	    """

	    def keep_row(row):
	        # The whole decision stays inside the try so that a ValueError or
	        # TypeError from either the int() conversion or the range
	        # comparison falls through to the "undefined score" branch.
	        try:
	            parsed = int(row['educational_score'])
	            if show_special:
	                return False
	            return min_score <= parsed <= max_score
	        except (ValueError, TypeError):
	            # Unparseable score: keep the row only in "special" mode.
	            return show_special

	    full_split = load_dataset(
	        "HuggingFaceTB/FW_clusters_free_topics",
	        split="train",
	        token=HF_TOKEN,
	        num_proc=2,
	    )
	    return full_split.filter(keep_row)

	st.subheader("Cluster information")
	# Three widgets are laid out side by side, so three columns are needed;
	# unpacking three names from a two-column layout raises ValueError.
	col_1, col_2, col_3 = st.columns(3)
	with col_1:
	    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
	with col_2:
	    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')
	with col_3:
	    show_special = st.checkbox('Show only clusters with undefined educational score', False)

	# Load data based on slider values and checkbox status
	ds = load_data(min_value, max_value, show_special)

	# The option list must exist before the selectbox that consumes it, and the
	# selection must be bound to the name the filter lambda below reads.
	categories = list(set(ds["category"]))
	selected_category = st.selectbox("Select a topic", categories)
	selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

	# Select sample index
	n_samples = len(selected_cluster)
	if n_samples > 0:
	    col_1, col_2 = st.columns(2)
	    with col_1:
	        # Index of the cluster row within the selected category.
	        index_cluster = st.number_input(
	            f"Found {n_samples} clusters, choose one",
	            min_value=0,
	            max_value=n_samples - 1,
	            value=0,
	            step=1,
	        )

	    # Example texts stored on the chosen cluster row.
	    files = selected_cluster[index_cluster]["examples"]

	    with col_2:
	        # Index of the example within the chosen cluster.
	        index_example = st.number_input(
	            f"Found {len(files)} files in the cluster, choose one",
	            min_value=0,
	            max_value=len(files) - 1,
	            value=0,
	            step=1,
	        )

	    sample = files[index_example]
	    st.markdown(sample)
	else:
	    st.markdown("No files found, change the cluster.")