Spaces:
Runtime error
Runtime error
import networkx as nx | |
from sklearn.cluster import HDBSCAN | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from sklearn.manifold import TSNE | |
import umap | |
from sklearn.cluster import KMeans | |
from scipy.spatial import KDTree | |
from adjustText import adjust_text | |
from constants import language_families, language_subfamilies | |
def filter_languages_by_families(matrix, languages, families):
    """
    Restricts a matrix and its language labels to the requested families.

    Parameters:
    - matrix: 2D NumPy array indexed on both axes in the same order as `languages`
      (presumably a square N x N distance matrix -- confirm against callers).
    - languages: list of languages to filter; each is looked up in `language_families`.
    - families: collection of families to include.

    Returns:
    - filtered_matrix: sub-matrix containing only the rows/columns of the kept languages.
    - filtered_languages: list of languages that belong to the specified families,
      in their original order.
    """
    kept_positions = []
    kept_languages = []
    for position, language in enumerate(languages):
        if language_families[language] in families:
            kept_positions.append(position)
            kept_languages.append(language)
    # np.ix_ applies the same index selection to rows and columns.
    filtered_matrix = matrix[np.ix_(kept_positions, kept_positions)]
    return filtered_matrix, kept_languages
def get_dynamic_color_map(n_colors):
    """
    Builds a list of `n_colors` colors sampled from a matplotlib colormap.

    Parameters:
    - n_colors: int, the number of distinct colors required.

    Returns:
    - color_map: list with one color per index (RGBA tuples as produced by
      calling a matplotlib colormap).
    """
    # "tab20" is used for up to 20 colors, "hsv" for anything larger.
    if n_colors <= 20:
        palette = plt.get_cmap("tab20")
    else:
        palette = plt.get_cmap("hsv")
    color_map = []
    for index in range(n_colors):
        # Sample the colormap at evenly spaced fractions in [0, 1).
        color_map.append(palette(index / n_colors))
    return color_map
def cluster_languages_by_families(languages):
    """
    Assigns each language a cluster ID derived from its language family.

    Parameters:
    - languages: list of languages; each is looked up in `language_families`.

    Returns:
    - clusters: list of ints, the position of each language's family in `legend`.
    - legend: sorted list of the distinct families.
    """
    families = [language_families[language] for language in languages]
    legend = sorted(set(families))
    # Entries of `legend` are unique, so a position table gives the same IDs as list.index.
    family_ids = {family: position for position, family in enumerate(legend)}
    clusters = [family_ids[family] for family in families]
    return clusters, legend
def cluster_languages_by_subfamilies(languages):
    """
    Assigns each language a cluster ID derived from its "family (subfamily)" label.

    Parameters:
    - languages: list of languages; each is looked up in `language_families`
      and `language_subfamilies`.

    Returns:
    - clusters: list of ints, the position of each language's label in `legend`.
    - legend: sorted list of the distinct "family (subfamily)" labels.
    """
    labels = []
    for language in languages:
        labels.append(language_families[language] + f" ({language_subfamilies[language]})")
    legend = sorted(set(labels))
    # Entries of `legend` are unique, so a position table gives the same IDs as list.index.
    label_ids = {label: position for position, label in enumerate(legend)}
    clusters = [label_ids[label] for label in labels]
    return clusters, legend
def plot_mst(model, dataset, use_average, matrix, languages, clusters, legend=None, fig_size=(20, 20)):
    """
    Plots a Minimum Spanning Tree (MST) built from a pairwise distance matrix,
    with nodes labelled by language and colored by cluster.

    Parameters:
    - model, dataset, use_average: accepted for signature parity with the other
      plotting functions; currently unused because the figure has no title.
    - matrix: 2D NumPy array (N x N) of pairwise distances; only entries with
      i < j are read.
    - languages: list of length N containing the label for each node.
    - clusters: list of length N containing the cluster assignment (or ID) for each node.
    - legend: optional object indexable by cluster ID that gives the legend text
      for each cluster; defaults to the string form of the cluster ID.
    - fig_size: size passed to the matplotlib figure.

    Returns:
    - fig: the matplotlib figure containing the plot.
    """
    node_count = len(languages)

    graph = nx.Graph()
    # Register every node explicitly: a single language produces no edges, and
    # without this the graph would be empty and the label loop below would fail
    # when looking up its position. It also fixes the node order to 0..N-1,
    # which is the order `node_colors` is built in.
    graph.add_nodes_from(range(node_count))
    # Only the upper triangle of the matrix (i < j) is used.
    for i in range(node_count):
        for j in range(i + 1, node_count):
            graph.add_edge(i, j, weight=matrix[i, j])

    mst = nx.minimum_spanning_tree(graph)

    # Kamada-Kawai layout takes the edge weights into account.
    pos = nx.kamada_kawai_layout(mst, weight='weight')

    # Map each cluster to a color.
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}
    node_colors = [cluster_colors.get(cluster) for cluster in clusters]

    fig, ax = plt.subplots(figsize=fig_size)
    nx.draw_networkx_edges(mst, pos, edge_color='gray', ax=ax)
    nx.draw_networkx_nodes(mst, pos, node_color=node_colors, node_size=100, ax=ax, alpha=0.7)

    # Labels are created as text objects so adjust_text can move them apart afterwards.
    texts = []
    for i, label in enumerate(languages):
        x, y = pos[i]
        texts.append(ax.text(x, y, label, fontsize=10))
    # Pass the axes explicitly instead of relying on pyplot's current axes.
    adjust_text(texts, expand_text=(1.05, 1.2), ax=ax)

    # Add a legend for clusters.
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cluster_colors[cluster], markersize=10, alpha=0.7, label=legend[cluster])
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")

    # Remove axis for clarity.
    ax.axis('off')
    return fig
def cluster_languages_kmeans(dist_matrix, languages, n_clusters=5):
    """
    Clusters languages with KMeans and drops every language that ends up alone
    in its cluster.

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) representing the pairwise distances
      between languages; it is passed to KMeans as the sample matrix.
    - languages: list of length N with the language for each row.
    - n_clusters: int, the number of clusters to form.

    Returns:
    - filtered_matrix: 2D NumPy array of the filtered distance matrix.
    - filtered_languages: NumPy array of the retained languages.
    - filtered_clusters: NumPy array of the retained cluster assignments.
    """
    labels = KMeans(n_clusters=n_clusters, random_state=23).fit_predict(dist_matrix)

    # Size of every cluster, indexed by cluster ID.
    cluster_sizes = np.bincount(labels)
    # True for points whose cluster has more than one member.
    keep_mask = cluster_sizes[labels] > 1

    filtered_matrix = dist_matrix[np.ix_(keep_mask, keep_mask)]
    filtered_languages = np.array(languages)[keep_mask]
    filtered_clusters = labels[keep_mask]
    return filtered_matrix, filtered_languages, filtered_clusters
def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):
    """
    Clusters languages with HDBSCAN on a precomputed distance matrix and drops
    every language that was labelled -1.

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) representing the pairwise distances between languages.
    - languages: list of length N with the language for each row.
    - min_cluster_size: int, the minimum size of clusters.

    Returns:
    - filtered_matrix: 2D NumPy array of distances between the retained languages.
    - filtered_languages: NumPy array of the retained languages.
    - filtered_clusters: NumPy array of the retained cluster assignments.
    """
    model = HDBSCAN(min_cluster_size=min_cluster_size, metric='precomputed')
    labels = model.fit_predict(dist_matrix)

    # Positions of the points that were assigned to a cluster (label other than -1).
    kept = np.flatnonzero(labels != -1)

    filtered_matrix = dist_matrix[np.ix_(kept, kept)]
    filtered_languages = np.array(languages)[kept]
    filtered_clusters = labels[kept]
    return filtered_matrix, filtered_languages, filtered_clusters
def plot_distances_tsne(model, dataset, use_average, matrix, languages, clusters, legend=None):
    """
    Plots all languages from the distances matrix using t-SNE and colors them by clusters.

    Parameters:
    - model, dataset: names shown in the title when `use_average` is falsy.
    - use_average: if truthy, the title reads "Average" instead of model/dataset.
    - matrix: 2D NumPy array (N x N) of precomputed pairwise distances.
    - languages: list of length N containing the label for each point.
    - clusters: list of length N containing the cluster assignment (or ID) for each point.
    - legend: optional object indexable by cluster ID that gives the legend text
      for each cluster; defaults to the string form of the cluster ID.

    Returns:
    - fig: the matplotlib figure containing the plot.
    """
    n_samples = np.shape(matrix)[0]
    # scikit-learn requires perplexity < n_samples, so the default of 30 rejects
    # small language sets. Clamp it for those; larger sets keep the default.
    # Inputs with fewer than 2 samples are still rejected by TSNE itself.
    perplexity = float(min(30, max(n_samples - 1, 1)))
    tsne = TSNE(
        n_components=2,
        random_state=23,
        metric="precomputed",
        init="random",
        perplexity=perplexity,
    )
    tsne_results = tsne.fit_transform(matrix)

    # Map each cluster to a color.
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}

    fig, ax = plt.subplots(figsize=(16, 12))
    ax.scatter(tsne_results[:, 0], tsne_results[:, 1], c=[cluster_colors[cluster] for cluster in clusters], alpha=0.7)

    # Labels are created as text objects so adjust_text can move them apart afterwards.
    texts = []
    for i, label in enumerate(languages):
        x, y = tsne_results[i, 0], tsne_results[i, 1]
        texts.append(ax.text(x, y, label, fontsize=10))
    # Pass the axes explicitly instead of relying on pyplot's current axes.
    adjust_text(texts, expand_text=(1.05, 1.2), ax=ax)

    # Add a legend for clusters.
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cluster_colors[cluster], markersize=10, label=legend[cluster])
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")

    ax.set_title(f"t-SNE Visualization of Language Distances ({'Average' if use_average else f'{model}, {dataset}'})")
    ax.set_xlabel("t-SNE Dimension 1")
    ax.set_ylabel("t-SNE Dimension 2")
    return fig
def plot_distances_umap(model, dataset, use_average, matrix, languages, clusters, legend=None):
    """
    Plots all languages from the distances matrix using UMAP and colors them by clusters.

    Parameters:
    - model, dataset: names shown in the title when `use_average` is falsy.
    - use_average: if truthy, the title reads "Average" instead of model/dataset.
    - matrix: 2D NumPy array (N x N) of precomputed pairwise distances.
    - languages: list of length N containing the label for each point.
    - clusters: list of length N containing the cluster assignment (or ID) for each point.
    - legend: optional object indexable by cluster ID that gives the legend text
      for each cluster; defaults to the string form of the cluster ID.

    Returns:
    - fig: the matplotlib figure containing the plot.
    """
    umap_model = umap.UMAP(metric="precomputed", random_state=23)
    umap_results = umap_model.fit_transform(matrix)

    # Map each cluster to a color.
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}

    fig, ax = plt.subplots(figsize=(16, 12))
    ax.scatter(umap_results[:, 0], umap_results[:, 1], c=[cluster_colors[cluster] for cluster in clusters], alpha=0.7)

    # Labels are created as text objects so adjust_text can move them apart afterwards.
    texts = []
    for i, label in enumerate(languages):
        x, y = umap_results[i, 0], umap_results[i, 1]
        texts.append(ax.text(x, y, label, fontsize=10))
    # Pass the axes explicitly instead of relying on pyplot's current axes.
    adjust_text(texts, expand_text=(1.05, 1.2), ax=ax)

    # Add a legend for clusters.
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cluster_colors[cluster], markersize=10, label=legend[cluster])
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")

    ax.set_title(f"UMAP Visualization of Language Distances ({'Average' if use_average else f'{model}, {dataset}'})")
    ax.set_xlabel("UMAP Dimension 1")
    ax.set_ylabel("UMAP Dimension 2")
    return fig