Spaces:

Kuautli
/

ProyectoDS-AnalizaTube

Runtime error

App Files Files Community

Kuautli committed 25 days ago

Commit

e7d710a

verified ·

1 Parent(s): d1607d8

Update clustering.py

Browse files

Files changed (1) hide show

clustering.py +14 -18

clustering.py CHANGED Viewed

@@ -625,10 +625,9 @@ def perform_clustering(
     embeddings_matrix = np.array(data[embeddings_col].tolist())
-    if not threshold_values:
         threshold_values = np.round(np.linspace(min_eps, max_eps, n), 6).astype(float)
         log_message(f"perform_clustering {threshold_values}")
-        # threshold_values = np.linspace(min_eps, max_eps, n)
     cluster_assignments = {}
     cluster_counts = {}
@@ -638,6 +637,7 @@ def perform_clustering(
     for distance_threshold in threshold_values:
         log_message(distance_threshold)
         clustering = AgglomerativeClustering(
             n_clusters=None,
             distance_threshold=distance_threshold,
@@ -645,21 +645,19 @@ def perform_clustering(
             metric="cosine",
         )
-        data[f"cluster_{distance_threshold}"] = clustering.fit_predict(
-            embeddings_matrix
-        )
-        cluster_assignments[distance_threshold] = data[f"cluster_{distance_threshold}"]
-        cluster_counts[distance_threshold] = data[
-            f"cluster_{distance_threshold}"
-        ].value_counts()
-        labels = data[f"cluster_{distance_threshold}"]
         # Calcular Calinski-Harabasz Score
         if len(np.unique(labels)) > 1:
             # Recalcular matriz de distancias con base en los clusters
-            euclidean_distances = pairwise_distances(
-                embeddings_matrix, metric="euclidean"
-            )
             ch_score = calinski_harabasz_score(euclidean_distances, labels)
             ch_score = round(ch_score, 2)
         else:
@@ -677,12 +675,10 @@ def perform_clustering(
         # Placeholder for finding the most similar comment function
         most_similar_comments[distance_threshold] = {}
         for cluster_id in np.unique(labels):
-            cluster_data = data[data[f"cluster_{distance_threshold}"] == cluster_id]
             avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
-            # Replace with your actual implementation
-            most_similar_comment = find_most_similar_comment(
-                cluster_data, avg_embedding
-            )
             most_similar_comments[distance_threshold][cluster_id] = most_similar_comment
     return (

     embeddings_matrix = np.array(data[embeddings_col].tolist())
+    if threshold_values is None:
         threshold_values = np.round(np.linspace(min_eps, max_eps, n), 6).astype(float)
         log_message(f"perform_clustering {threshold_values}")
     cluster_assignments = {}
     cluster_counts = {}
     for distance_threshold in threshold_values:
         log_message(distance_threshold)
         clustering = AgglomerativeClustering(
             n_clusters=None,
             distance_threshold=distance_threshold,
             metric="cosine",
         )
+        # Formatear el nombre de la columna para incluir solo 6 decimales
+        cluster_name = f"cluster_{distance_threshold:.6f}"
+        data[cluster_name] = clustering.fit_predict(embeddings_matrix)
+        # Almacenar los resultados en las estructuras correspondientes
+        cluster_assignments[distance_threshold] = data[cluster_name]
+        cluster_counts[distance_threshold] = data[cluster_name].value_counts()
+        labels = data[cluster_name]
         # Calcular Calinski-Harabasz Score
         if len(np.unique(labels)) > 1:
             # Recalcular matriz de distancias con base en los clusters
+            euclidean_distances = pairwise_distances(embeddings_matrix, metric="euclidean")
             ch_score = calinski_harabasz_score(euclidean_distances, labels)
             ch_score = round(ch_score, 2)
         else:
         # Placeholder for finding the most similar comment function
         most_similar_comments[distance_threshold] = {}
         for cluster_id in np.unique(labels):
+            cluster_data = data[data[cluster_name] == cluster_id]
             avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
+            # Reemplazar con tu implementación real
+            most_similar_comment = find_most_similar_comment(cluster_data, avg_embedding)
             most_similar_comments[distance_threshold][cluster_id] = most_similar_comment
     return (