# PhyloLM / tools.py
# Daetheys's picture
# First version gradio
# 3d6ba31
import numpy as np
from scipy.spatial.distance import squareform
from fastcluster import linkage
import umap
# ------------------------------------------------------------------------------------------------
#
# Sim Matrix Ordering
#
# ------------------------------------------------------------------------------------------------
def seriation(Z,N,cur_index):
    '''
    input:
        - Z is a hierarchical tree (dendrogram) in linkage-matrix form:
          row i stores, in columns 0 and 1, the ids of the two nodes
          merged to create node N+i
        - N is the number of points given to the clustering process
        - cur_index is the id of the node whose subtree is traversed
          (ids below N are leaves; 2*N-2 is the root of a full tree)
    output:
        - order implied by the hierarchical tree Z, as a list of leaf ids
          read left to right under cur_index

    seriation computes the order implied by a hierarchical tree (dendrogram).
    The traversal is iterative (explicit stack) so that deep, unbalanced
    trees do not hit Python's recursion limit.
    '''
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            # Leaf: an original observation.
            order.append(node)
            continue
        row = node - N
        # Push the right child first so the left child is popped (visited) first.
        pending.append(int(Z[row, 1]))
        pending.append(int(Z[row, 0]))
    return order
def compute_serial_matrix(dist_mat,method="ward"):
    '''
    input:
        - dist_mat is a square distance matrix (indexed with integer
          arrays, so a numpy array is expected)
        - method = ["ward","single","average","complete"], forwarded to
          the linkage routine
    output:
        - seriated_dist is the input dist_mat, but with re-ordered rows
          and columns according to the seriation, i.e. the order implied
          by the hierarchical tree; its diagonal is zero
        - res_order is the order implied by the hierarchical tree
        - res_linkage is the hierarchical tree (dendrogram)

    compute_serial_matrix transforms a distance matrix into a sorted
    distance matrix according to the order implied by the hierarchical
    tree (dendrogram).
    '''
    n_points = len(dist_mat)

    # Cluster on the condensed (flattened upper-triangle) form of the matrix.
    condensed = squareform(dist_mat)
    res_linkage = linkage(condensed, method=method, preserve_input=True)

    # The root of the dendrogram is the last merged node: id 2*N - 2.
    res_order = seriation(res_linkage, n_points, n_points + n_points - 2)

    # Fill the strict upper triangle from the permuted input, then mirror it.
    upper_rows, upper_cols = np.triu_indices(n_points, k=1)
    permutation = np.asarray(res_order, dtype=int)
    seriated_dist = np.zeros((n_points, n_points))
    seriated_dist[upper_rows, upper_cols] = dist_mat[
        permutation[upper_rows], permutation[upper_cols]
    ]
    seriated_dist[upper_cols, upper_rows] = seriated_dist[upper_rows, upper_cols]

    return seriated_dist, res_order, res_linkage
def compute_ordered_matrix(sim_matrix,dist_matrix, model_names):
    '''
    input:
        - sim_matrix is the similarity matrix to re-order
        - dist_matrix is the distance matrix used for the clustering
        - model_names is the list of names, one per row of sim_matrix
    output:
        - sim_matrix with rows and columns permuted by the clustering order
        - model_names permuted the same way

    With fewer than two models there is nothing to cluster, so both
    inputs are returned unchanged.
    '''
    if len(sim_matrix) < 2:
        return sim_matrix, model_names

    # Only the leaf order of the hierarchical clustering is needed here.
    _, order, _ = compute_serial_matrix(dist_matrix)

    rows_permuted = sim_matrix[order]
    ordered_sim_matrix = rows_permuted[:, order]
    ordered_model_names = [model_names[idx] for idx in order]
    return ordered_sim_matrix, ordered_model_names
# ------------------------------------------------------------------------------------------------
#
# UMAP computation
#
# ------------------------------------------------------------------------------------------------
def compute_umap(dist_matrix,d=2):
    '''
    input:
        - dist_matrix is a precomputed pairwise distance matrix
        - d is the number of dimensions of the embedding
    output:
        - embedding returned by UMAP's fit_transform on dist_matrix

    compute_umap embeds the points described by dist_matrix with UMAP
    (densMAP variant enabled, fixed random_state=42).
    '''
    reducer = umap.UMAP(
        n_components=d,
        metric='precomputed',
        densmap=True,
        random_state=42,
    )
    return reducer.fit_transform(dist_matrix)