Kuautli committed on
Commit
e7d710a
verified
1 Parent(s): d1607d8

Update clustering.py

Browse files
Files changed (1) hide show
  1. clustering.py +14 -18
clustering.py CHANGED
@@ -625,10 +625,9 @@ def perform_clustering(
625
 
626
  embeddings_matrix = np.array(data[embeddings_col].tolist())
627
 
628
- if not threshold_values:
629
  threshold_values = np.round(np.linspace(min_eps, max_eps, n), 6).astype(float)
630
  log_message(f"perform_clustering {threshold_values}")
631
- # threshold_values = np.linspace(min_eps, max_eps, n)
632
 
633
  cluster_assignments = {}
634
  cluster_counts = {}
@@ -638,6 +637,7 @@ def perform_clustering(
638
 
639
  for distance_threshold in threshold_values:
640
  log_message(distance_threshold)
 
641
  clustering = AgglomerativeClustering(
642
  n_clusters=None,
643
  distance_threshold=distance_threshold,
@@ -645,21 +645,19 @@ def perform_clustering(
645
  metric="cosine",
646
  )
647
 
648
- data[f"cluster_{distance_threshold}"] = clustering.fit_predict(
649
- embeddings_matrix
650
- )
651
- cluster_assignments[distance_threshold] = data[f"cluster_{distance_threshold}"]
652
- cluster_counts[distance_threshold] = data[
653
- f"cluster_{distance_threshold}"
654
- ].value_counts()
655
- labels = data[f"cluster_{distance_threshold}"]
656
 
657
  # Calcular Calinski-Harabasz Score
658
  if len(np.unique(labels)) > 1:
659
  # Recalcular matriz de distancias con base en los clusters
660
- euclidean_distances = pairwise_distances(
661
- embeddings_matrix, metric="euclidean"
662
- )
663
  ch_score = calinski_harabasz_score(euclidean_distances, labels)
664
  ch_score = round(ch_score, 2)
665
  else:
@@ -677,12 +675,10 @@ def perform_clustering(
677
  # Placeholder for finding the most similar comment function
678
  most_similar_comments[distance_threshold] = {}
679
  for cluster_id in np.unique(labels):
680
- cluster_data = data[data[f"cluster_{distance_threshold}"] == cluster_id]
681
  avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
682
- # Replace with your actual implementation
683
- most_similar_comment = find_most_similar_comment(
684
- cluster_data, avg_embedding
685
- )
686
  most_similar_comments[distance_threshold][cluster_id] = most_similar_comment
687
 
688
  return (
 
625
 
626
  embeddings_matrix = np.array(data[embeddings_col].tolist())
627
 
628
+ if threshold_values is None:
629
  threshold_values = np.round(np.linspace(min_eps, max_eps, n), 6).astype(float)
630
  log_message(f"perform_clustering {threshold_values}")
 
631
 
632
  cluster_assignments = {}
633
  cluster_counts = {}
 
637
 
638
  for distance_threshold in threshold_values:
639
  log_message(distance_threshold)
640
+
641
  clustering = AgglomerativeClustering(
642
  n_clusters=None,
643
  distance_threshold=distance_threshold,
 
645
  metric="cosine",
646
  )
647
 
648
+ # Formatear el nombre de la columna para incluir solo 6 decimales
649
+ cluster_name = f"cluster_{distance_threshold:.6f}"
650
+ data[cluster_name] = clustering.fit_predict(embeddings_matrix)
651
+
652
+ # Almacenar los resultados en las estructuras correspondientes
653
+ cluster_assignments[distance_threshold] = data[cluster_name]
654
+ cluster_counts[distance_threshold] = data[cluster_name].value_counts()
655
+ labels = data[cluster_name]
656
 
657
  # Calcular Calinski-Harabasz Score
658
  if len(np.unique(labels)) > 1:
659
  # Recalcular matriz de distancias con base en los clusters
660
+ euclidean_distances = pairwise_distances(embeddings_matrix, metric="euclidean")
 
 
661
  ch_score = calinski_harabasz_score(euclidean_distances, labels)
662
  ch_score = round(ch_score, 2)
663
  else:
 
675
  # Placeholder for finding the most similar comment function
676
  most_similar_comments[distance_threshold] = {}
677
  for cluster_id in np.unique(labels):
678
+ cluster_data = data[data[cluster_name] == cluster_id]
679
  avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
680
+ # Reemplazar con tu implementación real
681
+ most_similar_comment = find_most_similar_comment(cluster_data, avg_embedding)
 
 
682
  most_similar_comments[distance_threshold][cluster_id] = most_similar_comment
683
 
684
  return (