mshamrai commited on
Commit
f9b063b
·
1 Parent(s): 983aedb

chore: init demo

Browse files
Files changed (5) hide show
  1. .gitignore +3 -0
  2. app.py +248 -0
  3. constants.py +217 -0
  4. requirements.txt +3 -0
  5. utils.py +270 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .gradio
2
+ __pycache__
3
+ plots
app.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import pickle
5
+ import os
6
+ from sklearn.manifold import TSNE
7
+ import matplotlib.pyplot as plt
8
+ from utils import (plot_distances_tsne,
9
+ plot_distances_umap,
10
+ cluster_languages_hdbscan,
11
+ cluster_languages_kmeans,
12
+ plot_mst,
13
+ cluster_languages_by_families,
14
+ cluster_languages_by_subfamilies,
15
+ filter_languages_by_families)
16
+ from functools import partial
17
+
18
+
19
# Ordered list of language names; every distance matrix below is indexed in
# this order (row/column i corresponds to languages[i]).
# NOTE(review): the "../../results" relative paths assume the app is launched
# from this file's directory -- confirm against the deployment setup.
with open("../../results/languages_list.pkl", "rb") as f:
    languages = pickle.load(f)

# Corpora and models for which pairwise language-distance matrices were
# precomputed offline.
DATASETS = ["wikimedia/wikipedia", "uonlp/CulturaX", "HuggingFaceFW/fineweb-2"]
MODELS = ["mistralai/Mistral-7B-v0.1", "google/gemma-3-4b-pt", "meta-llama/Llama-3.2-1B"]

# distance_matrices[dataset][model] is a square NumPy array of pairwise
# language distances; languages missing from a dataset/model pair are
# presumably NaN rows/columns (see filter_languages_nan below).
distance_matrices = {
    dataset: {
        model: np.load(os.path.join("../../results", dataset, model, "distances_matrix.npy"))
        for model in MODELS
    }
    for dataset in DATASETS
}

# Distances averaged over all dataset/model combinations.
average_distances_matrix = np.load("../../results/average_distances_matrix.npy")
34
+
35
+
36
def filter_languages_nan(model, dataset, use_average):
    """
    Select the distance matrix for (model, dataset) -- or the averaged
    matrix -- and drop every language whose distances are NaN.

    Returns:
    - (matrix, names): the filtered square matrix and the matching
      language names as a NumPy array.
    """
    source = average_distances_matrix if use_average else distance_matrices[dataset][model]
    # NaNs in the first row mark languages without data; assumes NaNs form
    # whole rows/columns -- TODO confirm against the matrix producer.
    keep = ~np.isnan(source[0])
    return source[np.ix_(keep, keep)], np.array(languages)[keep]
47
+
48
+
49
def get_similar_languages(model, dataset, selected_language, use_average, n):
    """
    Return the n languages closest to *selected_language*.

    Looks up the selected language's row in the chosen (or averaged)
    distance matrix, sorts ascending by distance, drops the language
    itself, and returns the head of the resulting DataFrame with a rank
    column ("index") and distances rounded to 4 decimals.
    """
    source = average_distances_matrix if use_average else distance_matrices[dataset][model]
    sel_idx = languages.index(selected_language)

    table = (
        pd.DataFrame({"Language": languages, "Distance": source[sel_idx]})
        .sort_values(by="Distance")
        .drop(index=sel_idx)      # labels survive sort_values, so this removes the language itself
        .reset_index(drop=True)
        .reset_index()            # expose the rank as an "index" column
    )
    table["Distance"] = table["Distance"].round(4)
    return table.head(n)
67
+
68
def update_languages(model, dataset):
    """
    Return the list of languages that have valid (non-NaN) distances in
    the matrix for the given model and dataset.
    """
    first_row = distance_matrices[dataset][model][0]
    return [lang for lang, dist in zip(languages, first_row) if not np.isnan(dist)]
76
+
77
+
78
def update_language_options(model, dataset, language, use_average):
    """
    Rebuild the language dropdown for the current model/dataset selection,
    keeping the current language when it is still available and falling
    back to the first choice otherwise.
    """
    choices = languages if use_average else update_languages(model, dataset)
    selected = language if language in choices else choices[0]
    return gr.Dropdown(label="Language", choices=choices, value=selected)
86
+
87
+
88
def toggle_inputs(use_average):
    """
    Show the model and dataset selectors only when per-model distances are
    used; hide and disable both when the averaged matrix is selected.
    """
    enabled = not use_average
    return (
        gr.update(interactive=enabled, visible=enabled),
        gr.update(interactive=enabled, visible=enabled),
    )
93
+
94
# Monotonically increasing counter so each exported PDF gets a unique name.
i = 0

def plot_distances(model, dataset, use_average, cluster_method, cluster_method_param, plot_fn):
    """
    Build a 2-D visualization of the language distance matrix.

    Parameters:
    - model, dataset: which precomputed matrix to use (ignored when
      use_average is True).
    - use_average: use the matrix averaged over all model/dataset pairs.
    - cluster_method: "HDBSCAN", "KMeans", "Family" or "Subfamily".
    - cluster_method_param: minimum cluster size (HDBSCAN) or number of
      clusters (KMeans); unused for the family-based methods.
    - plot_fn: plotting backend (t-SNE / UMAP / MST) with the signature
      (model, dataset, use_average, matrix, languages, clusters, legends).

    Returns:
    - the matplotlib Figure produced by plot_fn (also saved as a PDF
      under plots/).

    Raises:
    - ValueError: if cluster_method is not one of the four known methods.
    """
    global i

    updated_matrix, updated_languages = filter_languages_nan(model, dataset, use_average)

    if cluster_method == "HDBSCAN":
        filtered_matrix, filtered_languages, clusters = cluster_languages_hdbscan(
            updated_matrix, updated_languages, min_cluster_size=cluster_method_param
        )
        legends = None  # numeric cluster ids serve as legend labels
    elif cluster_method == "KMeans":
        filtered_matrix, filtered_languages, clusters = cluster_languages_kmeans(
            updated_matrix, updated_languages, n_clusters=cluster_method_param
        )
        legends = None
    elif cluster_method == "Family":
        clusters, legends = cluster_languages_by_families(updated_languages)
        filtered_matrix = updated_matrix
        filtered_languages = updated_languages
    elif cluster_method == "Subfamily":
        clusters, legends = cluster_languages_by_subfamilies(updated_languages)
        filtered_matrix = updated_matrix
        filtered_languages = updated_languages
    else:
        raise ValueError("Invalid cluster method")

    fig = plot_fn(model, dataset, use_average, filtered_matrix, filtered_languages, clusters, legends)
    fig.tight_layout()
    # Fix: "plots" is gitignored and may not exist on a fresh checkout,
    # which would make savefig raise FileNotFoundError. Create it first.
    os.makedirs("plots", exist_ok=True)
    fig.savefig(f"plots/plot_{i}.pdf", format="pdf")
    i += 1
    return fig
131
+
132
+
133
# ---------------------------------------------------------------------------
# Gradio UI: three tabs sharing the model / dataset / "use average" controls.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Language Distance Explorer")
    # When checked, the averaged matrix is used and the model/dataset
    # selectors are hidden (see toggle_inputs).
    average_checkbox = gr.Checkbox(label="Use Average Distances", value=False)
    with gr.Row():
        model_input = gr.Dropdown(label="Model", choices=MODELS, value=MODELS[0])
        dataset_input = gr.Dropdown(
            label="Dataset",
            choices=DATASETS,
            value=DATASETS[0]
        )

    # --- Tab 1: table of the N languages closest to a selected language ---
    with gr.Tab(label="Closest Languages Table"):
        with gr.Row():
            language_input = gr.Dropdown(label="Language", choices=languages, value=languages[0])
            top_n_input = gr.Slider(label="Top N", minimum=1, maximum=30, step=1, value=10)

        output_table = gr.Dataframe(label="Similar Languages")

        # Changing the model/dataset may invalidate the selected language, so
        # the dropdown choices are refreshed; every input change also
        # recomputes the similarity table.
        model_input.change(fn=update_language_options, inputs=[model_input, dataset_input, language_input, average_checkbox], outputs=language_input)
        dataset_input.change(fn=update_language_options, inputs=[model_input, dataset_input, language_input, average_checkbox], outputs=language_input)
        language_input.change(fn=get_similar_languages, inputs=[model_input, dataset_input, language_input, average_checkbox, top_n_input], outputs=output_table)
        model_input.change(fn=get_similar_languages, inputs=[model_input, dataset_input, language_input, average_checkbox, top_n_input], outputs=output_table)
        dataset_input.change(fn=get_similar_languages, inputs=[model_input, dataset_input, language_input, average_checkbox, top_n_input], outputs=output_table)
        top_n_input.change(fn=get_similar_languages, inputs=[model_input, dataset_input, language_input, average_checkbox, top_n_input], outputs=output_table)

        # Toggling the average hides/shows the selectors, then refreshes the
        # language list and the table.
        average_checkbox.change(
            fn=toggle_inputs,
            inputs=[average_checkbox],
            outputs=[model_input, dataset_input]
        )

        average_checkbox.change(fn=update_language_options, inputs=[model_input, dataset_input, language_input, average_checkbox], outputs=language_input)
        average_checkbox.change(fn=get_similar_languages, inputs=[model_input, dataset_input, language_input, average_checkbox, top_n_input], outputs=output_table)


    # --- Tab 2: 2-D projections (t-SNE / UMAP / MST) with clustering ---
    with gr.Tab(label="Distance Plot"):
        with gr.Row():
            cluster_method_input = gr.Dropdown(label="Cluster Method", choices=["HDBSCAN", "KMeans", "Family", "Subfamily"], value="HDBSCAN")
            clusters_input = gr.Slider(label="Minimum Elements in a Cluster", minimum=2, maximum=10, step=1, value=2)

        def update_clusters_input_option(cluster_method):
            # The slider's meaning depends on the method: minimum cluster
            # size for HDBSCAN, cluster count for KMeans, hidden for the
            # family-based methods (they take no parameter).
            if cluster_method == "HDBSCAN":
                return gr.Slider(label="Minimum Elements in a Cluster", minimum=2, maximum=10, step=1, value=2, visible=True, interactive=True)
            elif cluster_method == "KMeans":
                return gr.Slider(label="Number of Clusters", minimum=2, maximum=20, step=1, value=2, visible=True, interactive=True)
            else:
                return gr.update(interactive=False, visible=False)

        cluster_method_input.change(fn=update_clusters_input_option, inputs=[cluster_method_input], outputs=clusters_input)

        with gr.Row():
            plot_tsne_button = gr.Button("Plot t-SNE")
            plot_umap_button = gr.Button("Plot UMAP")
            plot_mst_button = gr.Button("Plot MST")

        with gr.Row():
            plot_output = gr.Plot(label="Distance Plot")

        # One shared handler; only the plotting backend bound via
        # functools.partial differs between the three buttons.
        plot_tsne_button.click(fn=partial(plot_distances, plot_fn=plot_distances_tsne),
                               inputs=[model_input, dataset_input, average_checkbox, cluster_method_input, clusters_input],
                               outputs=plot_output)
        plot_umap_button.click(fn=partial(plot_distances, plot_fn=plot_distances_umap),
                               inputs=[model_input, dataset_input, average_checkbox, cluster_method_input, clusters_input],
                               outputs=plot_output)
        plot_mst_button.click(fn=partial(plot_distances, plot_fn=plot_mst),
                              inputs=[model_input, dataset_input, average_checkbox, cluster_method_input, clusters_input],
                              outputs=plot_output)

    # --- Tab 3: MST restricted to chosen families, colored by subfamily ---
    with gr.Tab(label="Language Families Subplot"):

        checked_families_input = gr.CheckboxGroup(label="Language Families",
                                                  choices=[
                                                      'Afroasiatic',
                                                      'Austroasiatic',
                                                      'Austronesian',
                                                      'Constructed',
                                                      'Creole',
                                                      'Dravidian',
                                                      'Germanic',
                                                      'Indo-European',
                                                      'Japonic',
                                                      'Kartvelian',
                                                      'Koreanic',
                                                      'Language Isolate',
                                                      'Niger-Congo',
                                                      'Northeast Caucasian',
                                                      'Romance',
                                                      'Sino-Tibetan',
                                                      'Turkic',
                                                      'Uralic'
                                                  ],
                                                  value=["Indo-European"])
        with gr.Row():
            plot_family_button = gr.Button("Plot Families")
            plot_figsize_h_input = gr.Slider(label="Figure Height", minimum=5, maximum=30, step=1, value=15)
            plot_figsize_w_input = gr.Slider(label="Figure Width", minimum=5, maximum=30, step=1, value=15)
        plot_family_output = gr.Plot(label="Families Plot")

        def plot_families_subfamilies(families, model, dataset, use_average, figsize_h, figsize_w):
            """Draw an MST limited to *families*, colored by subfamily."""
            global i

            updated_matrix, updated_languages = filter_languages_nan(model, dataset, use_average)
            updated_matrix, updated_languages = filter_languages_by_families(updated_matrix, updated_languages, families)

            clusters, legends = cluster_languages_by_subfamilies(updated_languages)
            fig = plot_mst(model, dataset, use_average, updated_matrix, updated_languages, clusters, legends, fig_size=(figsize_w, figsize_h))
            fig.tight_layout()
            # NOTE(review): assumes the plots/ directory already exists; it
            # is gitignored, so this can fail on a fresh checkout -- verify.
            fig.savefig(f"plots/plot_{i}.pdf", format="pdf")
            i += 1
            return fig

        plot_family_button.click(fn=plot_families_subfamilies,
                                 inputs=[checked_families_input, model_input, dataset_input, average_checkbox, plot_figsize_h_input, plot_figsize_w_input],
                                 outputs=plot_family_output)


demo.launch(share=True)
constants.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Language name -> subfamily label, used by utils.cluster_languages_by_subfamilies
# to build plot legends of the form "Family (Subfamily)".
language_subfamilies = {
    "Afrikaans": "West Germanic",
    "Albanian": "Albanian",
    "Arabic": "Semitic",
    "Egyptian Arabic": "Semitic",
    "Aragonese": "Romance",
    "Armenian": "Armenian",
    "Asturian": "Romance",
    "Azerbaijani": "Oghuz",
    "Bashkir": "Kypchak",
    "Basque": "Language Isolate",
    "Bavarian": "Austro-Bavarian",
    "Belarusian": "East Slavic",
    "Bengali": "Eastern Indo-Aryan",
    "Bishnupriya Manipuri": "Eastern Indo-Aryan",
    "Bosnian": "South Slavic",
    "Breton": "Brythonic",
    "Bulgarian": "South Slavic",
    "Burmese": "Burmish",
    "Catalan": "Romance",
    "Cebuano": "Central Philippine",
    "Chechen": "Nakh-Daghestanian",
    "Chinese (Simplified)": "Sinitic",
    "Chinese (Traditional)": "Sinitic",
    "Min Nan Chinese": "Sinitic",
    "Chuvash": "Oghur",
    "Croatian": "South Slavic",
    "Czech": "West Slavic",
    "Danish": "North Germanic",
    "Dutch": "West Germanic",
    "English": "West Germanic",
    "Estonian": "Finnic",
    "Finnish": "Finnic",
    "French": "Gallo-Romance",
    "Galician": "Gallo-Romance",
    "Georgian": "Kartvelian",
    "German": "West Germanic",
    "Greek": "Hellenic",
    "Gujarati": "Gujarati",
    "Haitian": "French-based Creole",
    "Hebrew": "Semitic",
    "Hindi": "Central Indo-Aryan",
    "Hungarian": "Ugric",
    "Icelandic": "North Germanic",
    "Ido": "Constructed",
    "Indonesian": "Malayic",
    "Irish": "Goidelic",
    "Italian": "Italo-Dalmatian",
    "Japanese": "Japonic",
    "Javanese": "Javanic",
    "Kannada": "Southern Dravidian",
    "Kazakh": "Kypchak",
    "Kirghiz": "Kypchak",
    "Korean": "Koreanic",
    "Latin": "Italic",
    "Latvian": "Baltic",
    "Lithuanian": "Baltic",
    "Lombard": "Gallo-Italic",
    "Low Saxon": "West Germanic",
    "Luxembourgish": "West Germanic",
    "Macedonian": "South Slavic",
    "Malagasy": "Malayic",
    "Malay": "Malayic",
    "Malayalam": "Southern Dravidian",
    "Marathi": "Central Indo-Aryan",
    "Minangkabau": "Malayic",
    "Nepali": "Eastern Indo-Aryan",
    "Newar": "Newaric",
    "Norwegian (Bokmal)": "North Germanic",
    "Norwegian (Nynorsk)": "North Germanic",
    "Occitan": "Gallo-Romance",
    "Persian (Farsi)": "Iranian",
    "Piedmontese": "Gallo-Italic",
    "Polish": "West Slavic",
    "Portuguese": "Iberian Romance",
    "Punjabi": "Punjabi",
    "Romanian": "Eastern Romance",
    "Russian": "East Slavic",
    "Scots": "West Germanic",
    "Serbian": "South Slavic",
    "Serbo-Croatian": "South Slavic",
    "Sicilian": "Italo-Dalmatian",
    "Slovak": "West Slavic",
    "Slovenian": "South Slavic",
    "South Azerbaijani": "Oghuz",
    "Spanish": "Iberian Romance",
    "Sundanese": "Sundic",
    "Swahili": "Bantu",
    "Swedish": "North Germanic",
    "Tagalog": "Central Philippine",
    "Tajik": "Iranian",
    "Tamil": "Southern Dravidian",
    "Tatar": "Kypchak",
    "Telugu": "Southern Dravidian",
    "Turkish": "Oghuz",
    "Ukrainian": "East Slavic",
    "Urdu": "Central Indo-Aryan",
    "Uzbek": "Karluk",
    "Vietnamese": "Vietic",
    "Volapük": "Constructed",
    "Waray-Waray": "Central Philippine",
    "Welsh": "Brythonic",
    "West Frisian": "West Germanic",
    "Western Punjabi": "Punjabi",
    "Yoruba": "Yoruboid",
    "Esperanto": "Constructed",
    "Crimean Tatar": "Kypchak"
}

# Language name -> family label, used by utils for filtering and coloring.
# NOTE(review): "Germanic" and "Romance" are kept as separate top-level
# entries alongside "Indo-European" -- presumably intentional for
# finer-grained grouping in the UI; confirm before "fixing".
language_families = {
    "Afrikaans": "Germanic",
    "Albanian": "Indo-European",
    "Arabic": "Afroasiatic",
    "Egyptian Arabic": "Afroasiatic",
    "Aragonese": "Romance",
    "Armenian": "Indo-European",
    "Asturian": "Romance",
    "Azerbaijani": "Turkic",
    "Bashkir": "Turkic",
    "Basque": "Language Isolate",
    "Bavarian": "Germanic",
    "Belarusian": "Indo-European",
    "Bengali": "Indo-European",
    "Bishnupriya Manipuri": "Indo-European",
    "Bosnian": "Indo-European",
    "Breton": "Indo-European",
    "Bulgarian": "Indo-European",
    "Burmese": "Sino-Tibetan",
    "Catalan": "Romance",
    "Cebuano": "Austronesian",
    "Chechen": "Northeast Caucasian",
    "Chinese (Simplified)": "Sino-Tibetan",
    "Chinese (Traditional)": "Sino-Tibetan",
    "Min Nan Chinese": "Sino-Tibetan",
    "Chuvash": "Turkic",
    "Croatian": "Indo-European",
    "Czech": "Indo-European",
    "Danish": "Germanic",
    "Dutch": "Germanic",
    "English": "Germanic",
    "Estonian": "Uralic",
    "Finnish": "Uralic",
    "French": "Romance",
    "Galician": "Romance",
    "Georgian": "Kartvelian",
    "German": "Germanic",
    "Greek": "Indo-European",
    "Gujarati": "Indo-European",
    "Haitian": "Creole",
    "Hebrew": "Afroasiatic",
    "Hindi": "Indo-European",
    "Hungarian": "Uralic",
    "Icelandic": "Germanic",
    "Ido": "Constructed",
    "Indonesian": "Austronesian",
    "Irish": "Indo-European",
    "Italian": "Romance",
    "Japanese": "Japonic",
    "Javanese": "Austronesian",
    "Kannada": "Dravidian",
    "Kazakh": "Turkic",
    "Kirghiz": "Turkic",
    "Korean": "Koreanic",
    "Latin": "Indo-European",
    "Latvian": "Indo-European",
    "Lithuanian": "Indo-European",
    "Lombard": "Romance",
    "Low Saxon": "Germanic",
    "Luxembourgish": "Germanic",
    "Macedonian": "Indo-European",
    "Malagasy": "Austronesian",
    "Malay": "Austronesian",
    "Malayalam": "Dravidian",
    "Marathi": "Indo-European",
    "Minangkabau": "Austronesian",
    "Nepali": "Indo-European",
    "Newar": "Sino-Tibetan",
    "Norwegian (Bokmal)": "Germanic",
    "Norwegian (Nynorsk)": "Germanic",
    "Occitan": "Romance",
    "Persian (Farsi)": "Indo-European",
    "Piedmontese": "Romance",
    "Polish": "Indo-European",
    "Portuguese": "Romance",
    "Punjabi": "Indo-European",
    "Romanian": "Romance",
    "Russian": "Indo-European",
    "Scots": "Germanic",
    "Serbian": "Indo-European",
    "Serbo-Croatian": "Indo-European",
    "Sicilian": "Romance",
    "Slovak": "Indo-European",
    "Slovenian": "Indo-European",
    "South Azerbaijani": "Turkic",
    "Spanish": "Romance",
    "Sundanese": "Austronesian",
    "Swahili": "Niger-Congo",
    "Swedish": "Germanic",
    "Tagalog": "Austronesian",
    "Tajik": "Indo-European",
    "Tamil": "Dravidian",
    "Tatar": "Turkic",
    "Telugu": "Dravidian",
    "Turkish": "Turkic",
    "Ukrainian": "Indo-European",
    "Urdu": "Indo-European",
    "Uzbek": "Turkic",
    "Vietnamese": "Austroasiatic",
    "Volapük": "Constructed",
    "Waray-Waray": "Austronesian",
    "Welsh": "Indo-European",
    "West Frisian": "Germanic",
    "Western Punjabi": "Indo-European",
    "Yoruba": "Niger-Congo",
    "Esperanto": "Constructed",
    "Crimean Tatar": "Turkic"
}
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==5.23.3
2
+ networkx==3.4.2
3
+ umap-learn==0.5.7
utils.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import networkx as nx
2
+ from sklearn.cluster import HDBSCAN
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ from sklearn.manifold import TSNE
6
+ import umap
7
+ from sklearn.cluster import KMeans
8
+ from scipy.spatial import KDTree
9
+ from adjustText import adjust_text
10
+ from constants import language_families, language_subfamilies
11
+
12
+
13
def filter_languages_by_families(matrix, languages, families):
    """
    Restrict the distance matrix and language list to the given families.

    Parameters:
    - matrix: square distance matrix aligned with *languages*.
    - languages: list of language names.
    - families: collection of family names to keep.

    Returns:
    - (filtered_matrix, filtered_languages): the sub-matrix and the names
      of the languages whose family is in *families*.
    """
    keep = [idx for idx, lang in enumerate(languages) if language_families[lang] in families]
    kept_names = [languages[idx] for idx in keep]
    return matrix[np.ix_(keep, keep)], kept_names
29
+
30
+
31
def get_dynamic_color_map(n_colors):
    """
    Build a list of *n_colors* distinct RGBA colors.

    Parameters:
    - n_colors: int, the number of distinct colors required.

    Returns:
    - list of RGBA tuples, sampled from the qualitative "tab20" palette
      for up to 20 colors, otherwise from the continuous "hsv" palette.
    """
    palette = plt.get_cmap("tab20" if n_colors <= 20 else "hsv")
    return [palette(idx / n_colors) for idx in range(n_colors)]
44
+
45
+
46
def cluster_languages_by_families(languages):
    """
    Assign each language an integer cluster id based on its family.

    Parameters:
    - languages: list of language names (keys of language_families).

    Returns:
    - clusters: list of ints, one cluster id per input language.
    - legend: sorted list of the distinct family names; cluster ids
      index into it.
    """
    lang_families = [language_families[lang] for lang in languages]
    legend = sorted(set(lang_families))
    # Build the name -> position map once instead of calling list.index
    # for every language (O(n^2) -> O(n)); same resulting ids.
    position = {family: idx for idx, family in enumerate(legend)}
    clusters = [position[family] for family in lang_families]
    return clusters, legend
51
+
52
+
53
def cluster_languages_by_subfamilies(languages):
    """
    Assign each language an integer cluster id based on the combined
    "Family (Subfamily)" label.

    Parameters:
    - languages: list of language names (keys of both constants dicts).

    Returns:
    - clusters: list of ints, one cluster id per input language.
    - legend: sorted list of the distinct "Family (Subfamily)" labels;
      cluster ids index into it.
    """
    labels = [language_families[lang] + f" ({language_subfamilies[lang]})" for lang in languages]
    legend = sorted(set(labels))
    # Build the label -> position map once instead of calling list.index
    # for every language (O(n^2) -> O(n)); same resulting ids.
    position = {label: idx for idx, label in enumerate(legend)}
    clusters = [position[label] for label in labels]
    return clusters, legend
58
+
59
+
60
def plot_mst(model, dataset, use_average, matrix, languages, clusters, legend=None, fig_size=(20,20)):
    """
    Plot the Minimum Spanning Tree (MST) of the language distance matrix.

    Parameters:
    - model, dataset, use_average: identify the source of *matrix*; only
      used for the title, which is currently commented out.
    - matrix: 2D NumPy array (N x N) of pairwise distances between languages.
    - languages: list of length N containing the node labels.
    - clusters: list of length N with a cluster id per node; ids select
      node colors and legend entries.
    - legend: optional sequence/mapping indexed by cluster id giving the
      legend label for each cluster; numeric ids are used when omitted.
    - fig_size: (width, height) of the figure in inches.

    Returns:
    - the matplotlib Figure containing the drawn MST.
    """
    # Build a complete weighted graph from the distance matrix, adding each
    # undirected edge once (upper triangle only, i < j).
    G = nx.Graph()

    N = len(languages)

    for i in range(N):
        for j in range(i + 1, N):
            G.add_edge(i, j, weight=matrix[i, j])

    # Compute the Minimum Spanning Tree using NetworkX's built-in function.
    mst = nx.minimum_spanning_tree(G)

    # Kamada-Kawai layout takes edge weights into account, so languages
    # joined by short edges end up near each other.
    pos = nx.kamada_kawai_layout(mst, weight='weight')

    # Assign one color per distinct cluster id.
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}

    node_colors = [cluster_colors.get(cluster) for cluster in clusters]

    fig, ax = plt.subplots(figsize=fig_size)

    # Edges first, then nodes on top.
    nx.draw_networkx_edges(mst, pos, edge_color='gray', ax=ax)

    nx.draw_networkx_nodes(mst, pos, node_color=node_colors, node_size=100, ax=ax, alpha=0.7)

    # Create text objects (instead of networkx labels) so adjust_text can
    # reposition them afterwards.
    texts = []
    for i, label in enumerate(languages):
        x, y = pos[i]
        texts.append(ax.text(x, y, label, fontsize=10))

    # Nudge the labels to minimize overlap.
    adjust_text(texts, expand_text=(1.05, 1.2))

    # Fall back to numeric cluster ids when no legend labels are given;
    # legend[cluster] works for both dict and list legends because cluster
    # ids are small ints.
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cluster_colors[cluster], markersize=10, alpha=0.7, label=legend[cluster])
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")

    # Hide the axes; only the tree itself is informative.
    ax.axis('off')
    # ax.set_title(f"Minimum Spanning Tree of Languages ({'Average' if use_average else f'{model}, {dataset}'})")

    return fig
127
+
128
def cluster_languages_kmeans(dist_matrix, languages, n_clusters=5):
    """
    Cluster languages with KMeans and drop singleton clusters.

    Each row of the distance matrix is treated as that language's feature
    vector; the fixed random_state keeps the assignment reproducible.

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) of pairwise language distances.
    - languages: list of length N with the language names.
    - n_clusters: int, the number of clusters to form.

    Returns:
    - filtered_matrix: sub-matrix for the languages that were kept.
    - filtered_languages: NumPy array of the kept language names.
    - filtered_clusters: NumPy array of their cluster labels.
    """
    labels = KMeans(n_clusters=n_clusters, random_state=23).fit_predict(dist_matrix)

    # Keep only languages whose cluster contains more than one member.
    multi_member = np.where(np.bincount(labels) > 1)[0]
    keep = np.isin(labels, multi_member)

    return (
        dist_matrix[np.ix_(keep, keep)],
        np.array(languages)[keep],
        np.array(labels)[keep],
    )
159
+
160
+
161
def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):
    """
    Cluster languages with HDBSCAN on the precomputed distance matrix and
    drop the noise points (label -1).

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) of pairwise language distances.
    - languages: list of length N with the language names.
    - min_cluster_size: int, the minimum size of clusters.

    Returns:
    - filtered_matrix: sub-matrix for the languages that were kept.
    - filtered_languages: NumPy array of the kept language names.
    - filtered_clusters: NumPy array of their cluster labels.
    """
    model = HDBSCAN(
        metric='precomputed', min_cluster_size=min_cluster_size
    )
    labels = model.fit_predict(dist_matrix)

    # HDBSCAN marks noise as -1; keep everything else.
    keep = np.flatnonzero(labels != -1)

    return (
        dist_matrix[np.ix_(keep, keep)],
        np.array(languages)[keep],
        np.array(labels)[keep],
    )
184
+
185
+
186
def plot_distances_tsne(model, dataset, use_average, matrix, languages, clusters, legend=None):
    """
    Project the language distance matrix to 2-D with t-SNE and scatter-plot
    it, coloring each language by its cluster id.

    Parameters:
    - model, dataset, use_average: identify the source of *matrix*; only
      used in the plot title.
    - matrix: 2D NumPy array (N x N) of precomputed pairwise distances.
    - languages: list of length N with the point labels.
    - clusters: list of length N with a cluster id per language.
    - legend: optional sequence/mapping indexed by cluster id giving the
      legend label; numeric ids are used when omitted.

    Returns:
    - the matplotlib Figure with the scatter plot.
    """
    # init="random" because the default PCA init is unavailable with
    # metric="precomputed"; the fixed random_state keeps layouts stable.
    tsne = TSNE(n_components=2, random_state=23, metric="precomputed", init="random")
    tsne_results = tsne.fit_transform(matrix)

    # Assign one color per distinct cluster id.
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}

    fig, ax = plt.subplots(figsize=(16, 12))
    scatter = ax.scatter(tsne_results[:, 0], tsne_results[:, 1], c=[cluster_colors[cluster] for cluster in clusters], alpha=0.7)

    # Create text objects (instead of annotating directly) so adjust_text
    # can reposition them afterwards.
    texts = []
    for i, label in enumerate(languages):
        x, y = tsne_results[i, 0], tsne_results[i, 1]
        texts.append(ax.text(x, y, label, fontsize=10))

    # Nudge the labels to minimize overlap.
    adjust_text(texts, expand_text=(1.05, 1.2))

    # Fall back to numeric cluster ids when no legend labels are given.
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cluster_colors[cluster], markersize=10, label=legend[cluster])
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")

    ax.set_title(f"t-SNE Visualization of Language Distances ({'Average' if use_average else f'{model}, {dataset}'})")
    ax.set_xlabel("t-SNE Dimension 1")
    ax.set_ylabel("t-SNE Dimension 2")
    return fig
227
+
228
+
229
def plot_distances_umap(model, dataset, use_average, matrix, languages, clusters, legend=None):
    """
    Project the language distance matrix to 2-D with UMAP and scatter-plot
    it, coloring each language by its cluster id.

    Parameters:
    - model, dataset, use_average: identify the source of *matrix*; only
      used in the plot title.
    - matrix: 2D NumPy array (N x N) of precomputed pairwise distances.
    - languages: list of length N with the point labels.
    - clusters: list of length N with a cluster id per language.
    - legend: optional sequence/mapping indexed by cluster id giving the
      legend label; numeric ids are used when omitted.

    Returns:
    - the matplotlib Figure with the scatter plot.
    """

    # metric="precomputed" makes UMAP consume the distance matrix directly;
    # the fixed random_state keeps layouts stable between calls.
    umap_model = umap.UMAP(metric="precomputed", random_state=23)
    umap_results = umap_model.fit_transform(matrix)

    # Assign one color per distinct cluster id.
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}

    fig, ax = plt.subplots(figsize=(16, 12))
    scatter = ax.scatter(umap_results[:, 0], umap_results[:, 1], c=[cluster_colors[cluster] for cluster in clusters], alpha=0.7)

    # Create text objects (instead of annotating directly) so adjust_text
    # can reposition them afterwards.
    texts = []
    for i, label in enumerate(languages):
        x, y = umap_results[i, 0], umap_results[i, 1]
        texts.append(ax.text(x, y, label, fontsize=10))

    # Nudge the labels to minimize overlap.
    adjust_text(texts, expand_text=(1.05, 1.2))

    # Fall back to numeric cluster ids when no legend labels are given.
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cluster_colors[cluster], markersize=10, label=legend[cluster])
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")

    ax.set_title(f"UMAP Visualization of Language Distances ({'Average' if use_average else f'{model}, {dataset}'})")
    ax.set_xlabel("UMAP Dimension 1")
    ax.set_ylabel("UMAP Dimension 2")
    return fig