|  | import streamlit as st | 
					
						
						|  | from datasets import load_dataset | 
					
						
						|  | import os | 
					
						
						|  |  | 
					
						
						|  | HF_TOKEN = os.environ.get("HF_TOKEN", None) | 
					
						
						|  |  | 
					
						
						|  | st.set_page_config(page_title="FW Clusters inspection", layout="wide") | 
					
						
						|  | st.title("FW clusters inspection (free topics)") | 
					
						
						|  |  | 
					
						
						|  | st.markdown(""" | 
					
						
						|  | We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering). | 
					
						
						|  |  | 
					
						
						|  | Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. | 
					
						
						|  |  | 
					
						
						|  | Additionally, the model was tasked with finding the topic of each cluster. | 
					
						
						|  | """) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | @st.cache_data | 
					
						
						|  | def load_data(min_score=1, max_score=10, show_special=False): | 
					
						
						|  | ds = load_dataset("HuggingFaceTB/FW_clusters_free_topics", split="train", token=HF_TOKEN, num_proc=2) | 
					
						
						|  | def filter_func(x): | 
					
						
						|  | try: | 
					
						
						|  | score = int(x['educational_score']) | 
					
						
						|  | value = False if show_special else min_score <= score <= max_score | 
					
						
						|  | return value | 
					
						
						|  | except (ValueError, TypeError): | 
					
						
						|  |  | 
					
						
						|  | return show_special | 
					
						
						|  |  | 
					
						
						|  | ds = ds.filter(filter_func) | 
					
						
						|  | return ds | 
					
						
						|  |  | 
					
						
						|  | st.subheader("Cluster information") | 
					
						
						|  | col_1, col_2, col_3 = st.columns(2) | 
					
						
						|  | with col_1: | 
					
						
						|  | min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score') | 
					
						
						|  | with col_2: | 
					
						
						|  | max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score') | 
					
						
						|  | with col_3: | 
					
						
						|  | show_special = st.checkbox('Show only clusters with undefined educational score', False) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | ds = load_data(min_value, max_value, show_special) | 
					
						
						|  | selected_category_type = st.selectbox("Select a topic", categories) | 
					
						
						|  | categories = list(set(ds["category"])) | 
					
						
						|  | selected_cluster = ds.filter(lambda x: x['category'] == selected_category) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | n_samples = len(selected_cluster) | 
					
						
						|  | if n_samples > 0: | 
					
						
						|  | col_1, col_2 = st.columns(2) | 
					
						
						|  | with col_1: | 
					
						
						|  | index_cluster = st.number_input(f"Found {len(selected_cluster)} clusters, choose one",  min_value=0, max_value=len(selected_cluster)-1, value=0, step=1) | 
					
						
						|  |  | 
					
						
						|  | files = selected_cluster[index_cluster]["examples"] | 
					
						
						|  |  | 
					
						
						|  | with col_2: | 
					
						
						|  | index_example = st.number_input(f"Found {len(files)} files in the cluster, choose one",  min_value=0, max_value=len(files)-1, value=0, step=1) | 
					
						
						|  |  | 
					
						
						|  | sample = files[index_example] | 
					
						
						|  | st.markdown(sample) | 
					
						
						|  | else: | 
					
						
						|  | st.markdown("No files found, change the cluster.") |