Kuautli committed on
Commit
5c6716a
verified
1 Parent(s): 8ee8a76

Update clustering.py

Browse files
Files changed (1) hide show
  1. clustering.py +9 -9
clustering.py CHANGED
@@ -637,21 +637,21 @@ def perform_clustering(
637
 
638
  for distance_threshold in threshold_values:
639
  log_message(distance_threshold)
640
- distance_threshold = round(distance_threshold, 6)
641
  clustering = AgglomerativeClustering(
642
  n_clusters=None,
643
- distance_threshold=distance_threshold,
644
  linkage="complete",
645
  metric="cosine",
646
  )
647
 
648
  # Formatear el nombre de la columna para incluir solo 6 decimales
649
- cluster_name = f"cluster_{distance_threshold:.6f}"
650
  data[cluster_name] = clustering.fit_predict(embeddings_matrix)
651
 
652
  # Almacenar los resultados en las estructuras correspondientes
653
- cluster_assignments[distance_threshold] = data[cluster_name]
654
- cluster_counts[distance_threshold] = data[cluster_name].value_counts()
655
  labels = data[cluster_name]
656
 
657
  # Calcular Calinski-Harabasz Score
@@ -662,7 +662,7 @@ def perform_clustering(
662
  ch_score = round(ch_score, 2)
663
  else:
664
  ch_score = -1 # Valor predeterminado si solo hay un clúster
665
- calinski_harabasz_scores[distance_threshold] = ch_score
666
 
667
  # Calcular Silhouette Score
668
  if len(np.unique(labels)) > 1:
@@ -670,16 +670,16 @@ def perform_clustering(
670
  sil_score = round(sil_score, 2)
671
  else:
672
  sil_score = -1 # Valor predeterminado si solo hay un clúster
673
- silhouette_scores[distance_threshold] = sil_score
674
 
675
  # Placeholder for finding the most similar comment function
676
- most_similar_comments[distance_threshold] = {}
677
  for cluster_id in np.unique(labels):
678
  cluster_data = data[data[cluster_name] == cluster_id]
679
  avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
680
  # Reemplazar con tu implementación real
681
  most_similar_comment = find_most_similar_comment(cluster_data, avg_embedding)
682
- most_similar_comments[distance_threshold][cluster_id] = most_similar_comment
683
 
684
  return (
685
  cluster_assignments,
 
637
 
638
  for distance_threshold in threshold_values:
639
  log_message(distance_threshold)
640
+ rounded_distance_threshold = round(rounded_distance_threshold, 6)
641
  clustering = AgglomerativeClustering(
642
  n_clusters=None,
643
+ rounded_distance_threshold=rounded_distance_threshold,
644
  linkage="complete",
645
  metric="cosine",
646
  )
647
 
648
  # Formatear el nombre de la columna para incluir solo 6 decimales
649
+ cluster_name = f"cluster_{rounded_distance_threshold:.6f}"
650
  data[cluster_name] = clustering.fit_predict(embeddings_matrix)
651
 
652
  # Almacenar los resultados en las estructuras correspondientes
653
+ cluster_assignments[rounded_distance_threshold] = data[cluster_name]
654
+ cluster_counts[rounded_distance_threshold] = data[cluster_name].value_counts()
655
  labels = data[cluster_name]
656
 
657
  # Calcular Calinski-Harabasz Score
 
662
  ch_score = round(ch_score, 2)
663
  else:
664
  ch_score = -1 # Valor predeterminado si solo hay un clúster
665
+ calinski_harabasz_scores[rounded_distance_threshold] = ch_score
666
 
667
  # Calcular Silhouette Score
668
  if len(np.unique(labels)) > 1:
 
670
  sil_score = round(sil_score, 2)
671
  else:
672
  sil_score = -1 # Valor predeterminado si solo hay un clúster
673
+ silhouette_scores[rounded_distance_threshold] = sil_score
674
 
675
  # Placeholder for finding the most similar comment function
676
+ most_similar_comments[rounded_distance_threshold] = {}
677
  for cluster_id in np.unique(labels):
678
  cluster_data = data[data[cluster_name] == cluster_id]
679
  avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
680
  # Reemplazar con tu implementación real
681
  most_similar_comment = find_most_similar_comment(cluster_data, avg_embedding)
682
+ most_similar_comments[rounded_distance_threshold][cluster_id] = most_similar_comment
683
 
684
  return (
685
  cluster_assignments,