import gradio as gr
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
)


def main(
    n_clusters: int = 2,
    n_samples: int = 500,
    n_features: int = 2,
    n_centers: int = 4,
    cluster_std: float = 1.0,
):
    # Generate the sample data from make_blobs.
    # This particular setting has one distinct cluster and 3 clusters placed
    # close together.
    X, y = make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=n_centers,
        cluster_std=cluster_std,
        center_box=(-10.0, 10.0),
        shuffle=True,
        random_state=1,  # For reproducibility
    )

    n_clusters = int(n_clusters)

    fig1, ax1 = plt.subplots()
    fig1.set_size_inches(9, 4)
    fig2, ax2 = plt.subplots()
    fig2.set_size_inches(9, 4)

    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1, but in this example
    # all values lie within [-0.1, 1].
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters + 1) * 10 inserts blank space between the silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with the n_clusters value and a random
    # generator seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, n_init="auto", random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value over all the samples.
    # This gives a perspective into the density and separation of the
    # formed clusters.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "the average silhouette_score is:",
        silhouette_avg,
    )

    # Compute the silhouette score for each sample.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for the next plot.
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line marks the average silhouette score over all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot shows the actual clusters formed.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at the cluster centers.
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    # Number each cluster center with its label.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    return fig1, fig2


title = """# Selecting the number of clusters with silhouette analysis on KMeans clustering 📊"""

description = """
This app demonstrates silhouette analysis for KMeans clustering on sample data.
A clustering algorithm finds groups of similar data points; silhouette analysis helps
determine the optimal number of clusters for a given clustering algorithm. It can be
applied to any clustering algorithm, but it is most commonly used with KMeans.
"""

with gr.Blocks(theme=theme) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("""### Dataset Generation Parameters""")
    with gr.Row():
        with gr.Column():
            n_samples = gr.Slider(
                minimum=100,
                maximum=1000,
                value=500,
                step=50,
                label="Number of Samples",
            )
            n_features = gr.Slider(
                minimum=2, maximum=5, value=2, step=1, label="Number of Features"
            )
            n_centers = gr.Slider(
                minimum=2, maximum=5, value=4, step=1, label="Number of Centers"
            )
            cluster_std = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.1,
                label="Cluster Standard Deviation",
            )
            n_clusters = gr.Slider(
                minimum=2, maximum=6, value=2, step=1, label="Number of Clusters"
            )
            run_button = gr.Button("Analyse Silhouette")

    with gr.Row():
        plot_silhouette = gr.Plot()
        plot_clusters = gr.Plot()

    outputs = [plot_silhouette, plot_clusters]
    inputs = [n_clusters, n_samples, n_features, n_centers, cluster_std]
    run_button.click(fn=main, inputs=inputs, outputs=outputs)

demo.launch()