Spaces:
Runtime error
Runtime error
Update clustering.py
Browse files- clustering.py +14 -18
clustering.py
CHANGED
@@ -625,10 +625,9 @@ def perform_clustering(
|
|
625 |
|
626 |
embeddings_matrix = np.array(data[embeddings_col].tolist())
|
627 |
|
628 |
-
if
|
629 |
threshold_values = np.round(np.linspace(min_eps, max_eps, n), 6).astype(float)
|
630 |
log_message(f"perform_clustering {threshold_values}")
|
631 |
-
# threshold_values = np.linspace(min_eps, max_eps, n)
|
632 |
|
633 |
cluster_assignments = {}
|
634 |
cluster_counts = {}
|
@@ -638,6 +637,7 @@ def perform_clustering(
|
|
638 |
|
639 |
for distance_threshold in threshold_values:
|
640 |
log_message(distance_threshold)
|
|
|
641 |
clustering = AgglomerativeClustering(
|
642 |
n_clusters=None,
|
643 |
distance_threshold=distance_threshold,
|
@@ -645,21 +645,19 @@ def perform_clustering(
|
|
645 |
metric="cosine",
|
646 |
)
|
647 |
|
648 |
-
|
649 |
-
|
650 |
-
)
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
].value_counts()
|
655 |
-
labels = data[
|
656 |
|
657 |
# Calcular Calinski-Harabasz Score
|
658 |
if len(np.unique(labels)) > 1:
|
659 |
# Recalcular matriz de distancias con base en los clusters
|
660 |
-
euclidean_distances = pairwise_distances(
|
661 |
-
embeddings_matrix, metric="euclidean"
|
662 |
-
)
|
663 |
ch_score = calinski_harabasz_score(euclidean_distances, labels)
|
664 |
ch_score = round(ch_score, 2)
|
665 |
else:
|
@@ -677,12 +675,10 @@ def perform_clustering(
|
|
677 |
# Placeholder for finding the most similar comment function
|
678 |
most_similar_comments[distance_threshold] = {}
|
679 |
for cluster_id in np.unique(labels):
|
680 |
-
cluster_data = data[data[
|
681 |
avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
|
682 |
-
#
|
683 |
-
most_similar_comment = find_most_similar_comment(
|
684 |
-
cluster_data, avg_embedding
|
685 |
-
)
|
686 |
most_similar_comments[distance_threshold][cluster_id] = most_similar_comment
|
687 |
|
688 |
return (
|
|
|
625 |
|
626 |
embeddings_matrix = np.array(data[embeddings_col].tolist())
|
627 |
|
628 |
+
if threshold_values is None:
|
629 |
threshold_values = np.round(np.linspace(min_eps, max_eps, n), 6).astype(float)
|
630 |
log_message(f"perform_clustering {threshold_values}")
|
|
|
631 |
|
632 |
cluster_assignments = {}
|
633 |
cluster_counts = {}
|
|
|
637 |
|
638 |
for distance_threshold in threshold_values:
|
639 |
log_message(distance_threshold)
|
640 |
+
|
641 |
clustering = AgglomerativeClustering(
|
642 |
n_clusters=None,
|
643 |
distance_threshold=distance_threshold,
|
|
|
645 |
metric="cosine",
|
646 |
)
|
647 |
|
648 |
+
# Formatear el nombre de la columna para incluir solo 6 decimales
|
649 |
+
cluster_name = f"cluster_{distance_threshold:.6f}"
|
650 |
+
data[cluster_name] = clustering.fit_predict(embeddings_matrix)
|
651 |
+
|
652 |
+
# Almacenar los resultados en las estructuras correspondientes
|
653 |
+
cluster_assignments[distance_threshold] = data[cluster_name]
|
654 |
+
cluster_counts[distance_threshold] = data[cluster_name].value_counts()
|
655 |
+
labels = data[cluster_name]
|
656 |
|
657 |
# Calcular Calinski-Harabasz Score
|
658 |
if len(np.unique(labels)) > 1:
|
659 |
# Recalcular matriz de distancias con base en los clusters
|
660 |
+
euclidean_distances = pairwise_distances(embeddings_matrix, metric="euclidean")
|
|
|
|
|
661 |
ch_score = calinski_harabasz_score(euclidean_distances, labels)
|
662 |
ch_score = round(ch_score, 2)
|
663 |
else:
|
|
|
675 |
# Placeholder for finding the most similar comment function
|
676 |
most_similar_comments[distance_threshold] = {}
|
677 |
for cluster_id in np.unique(labels):
|
678 |
+
cluster_data = data[data[cluster_name] == cluster_id]
|
679 |
avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
|
680 |
+
# Reemplazar con tu implementación real
|
681 |
+
most_similar_comment = find_most_similar_comment(cluster_data, avg_embedding)
|
|
|
|
|
682 |
most_similar_comments[distance_threshold][cluster_id] = most_similar_comment
|
683 |
|
684 |
return (
|