Spaces:
Runtime error
Runtime error
Update clustering.py
Browse files- clustering.py +9 -9
clustering.py
CHANGED
@@ -637,21 +637,21 @@ def perform_clustering(
|
|
637 |
|
638 |
for distance_threshold in threshold_values:
|
639 |
log_message(distance_threshold)
|
640 |
-
|
641 |
clustering = AgglomerativeClustering(
|
642 |
n_clusters=None,
|
643 |
-
|
644 |
linkage="complete",
|
645 |
metric="cosine",
|
646 |
)
|
647 |
|
648 |
# Formatear el nombre de la columna para incluir solo 6 decimales
|
649 |
-
cluster_name = f"cluster_{
|
650 |
data[cluster_name] = clustering.fit_predict(embeddings_matrix)
|
651 |
|
652 |
# Almacenar los resultados en las estructuras correspondientes
|
653 |
-
cluster_assignments[
|
654 |
-
cluster_counts[
|
655 |
labels = data[cluster_name]
|
656 |
|
657 |
# Calcular Calinski-Harabasz Score
|
@@ -662,7 +662,7 @@ def perform_clustering(
|
|
662 |
ch_score = round(ch_score, 2)
|
663 |
else:
|
664 |
ch_score = -1 # Valor predeterminado si solo hay un clúster
|
665 |
-
calinski_harabasz_scores[
|
666 |
|
667 |
# Calcular Silhouette Score
|
668 |
if len(np.unique(labels)) > 1:
|
@@ -670,16 +670,16 @@ def perform_clustering(
|
|
670 |
sil_score = round(sil_score, 2)
|
671 |
else:
|
672 |
sil_score = -1 # Valor predeterminado si solo hay un clúster
|
673 |
-
silhouette_scores[
|
674 |
|
675 |
# Placeholder for finding the most similar comment function
|
676 |
-
most_similar_comments[
|
677 |
for cluster_id in np.unique(labels):
|
678 |
cluster_data = data[data[cluster_name] == cluster_id]
|
679 |
avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
|
680 |
# Reemplazar con tu implementación real
|
681 |
most_similar_comment = find_most_similar_comment(cluster_data, avg_embedding)
|
682 |
-
most_similar_comments[
|
683 |
|
684 |
return (
|
685 |
cluster_assignments,
|
|
|
637 |
|
638 |
for distance_threshold in threshold_values:
|
639 |
log_message(distance_threshold)
|
640 |
+
rounded_distance_threshold = round(rounded_distance_threshold, 6)
|
641 |
clustering = AgglomerativeClustering(
|
642 |
n_clusters=None,
|
643 |
+
rounded_distance_threshold=rounded_distance_threshold,
|
644 |
linkage="complete",
|
645 |
metric="cosine",
|
646 |
)
|
647 |
|
648 |
# Formatear el nombre de la columna para incluir solo 6 decimales
|
649 |
+
cluster_name = f"cluster_{rounded_distance_threshold:.6f}"
|
650 |
data[cluster_name] = clustering.fit_predict(embeddings_matrix)
|
651 |
|
652 |
# Almacenar los resultados en las estructuras correspondientes
|
653 |
+
cluster_assignments[rounded_distance_threshold] = data[cluster_name]
|
654 |
+
cluster_counts[rounded_distance_threshold] = data[cluster_name].value_counts()
|
655 |
labels = data[cluster_name]
|
656 |
|
657 |
# Calcular Calinski-Harabasz Score
|
|
|
662 |
ch_score = round(ch_score, 2)
|
663 |
else:
|
664 |
ch_score = -1 # Valor predeterminado si solo hay un clúster
|
665 |
+
calinski_harabasz_scores[rounded_distance_threshold] = ch_score
|
666 |
|
667 |
# Calcular Silhouette Score
|
668 |
if len(np.unique(labels)) > 1:
|
|
|
670 |
sil_score = round(sil_score, 2)
|
671 |
else:
|
672 |
sil_score = -1 # Valor predeterminado si solo hay un clúster
|
673 |
+
silhouette_scores[rounded_distance_threshold] = sil_score
|
674 |
|
675 |
# Placeholder for finding the most similar comment function
|
676 |
+
most_similar_comments[rounded_distance_threshold] = {}
|
677 |
for cluster_id in np.unique(labels):
|
678 |
cluster_data = data[data[cluster_name] == cluster_id]
|
679 |
avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
|
680 |
# Reemplazar con tu implementación real
|
681 |
most_similar_comment = find_most_similar_comment(cluster_data, avg_embedding)
|
682 |
+
most_similar_comments[rounded_distance_threshold][cluster_id] = most_similar_comment
|
683 |
|
684 |
return (
|
685 |
cluster_assignments,
|