PhyllisPeh committed
Commit 3230189 · 1 Parent(s): ac25e1c

fixed cluster sizes

Files changed (1)
  1. app.py +102 -21
app.py CHANGED
@@ -257,15 +257,15 @@ def get_max_clusters(num_patents):
  return min(50, num_patents // 80) # Very large datasets: 80-100 patents per cluster (increased from 30 max)

  def get_optimal_cluster_size(num_patents):
- """Calculate optimal target cluster size range - REDUCED MAX SIZES to prevent mega-clusters"""
  if num_patents < 500:
- return 25, 60 # min=25, max=60 (reduced from 80)
  elif num_patents < 1000:
- return 40, 80 # min=40, max=80 (reduced from 120)
  elif num_patents < 2000:
- return 50, 100 # min=50, max=100 (reduced from 150)
  else:
- return 60, 120 # min=60, max=120 (reduced from 200)

  if not SERPAPI_API_KEY:
  raise ValueError("SERPAPI_API_KEY environment variable is not set")
@@ -735,10 +735,10 @@ def create_3d_visualization(patents):
  cluster_indices = np.where(cluster_mask)[0]

  # Calculate how many subclusters we need - MORE AGGRESSIVE subdivision
- target_size = max_cluster_size * 0.75 # Target 75% of max size for better separation
  n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
  # Cap at reasonable maximum but allow more splits if needed
- n_subclusters = min(10, n_subclusters) # Increased from 6 to 10
  print(f" Splitting into {n_subclusters} subclusters (target size: {target_size:.0f})...")

  # Use KMeans for controlled subdivision
@@ -779,9 +779,9 @@ def create_3d_visualization(patents):
  cluster_indices = np.where(cluster_mask)[0]

  # Force more aggressive subdivision
- target_size = max_cluster_size * 0.6 # Even more aggressive - 60% of max
  n_subclusters = max(3, int(np.ceil(cluster_size / target_size)))
- n_subclusters = min(15, n_subclusters) # Allow up to 15 splits if needed
  print(f" FORCING split into {n_subclusters} subclusters...")

  # Use KMeans for forced subdivision
@@ -805,42 +805,120 @@ def create_3d_visualization(patents):
  else:
  print("No additional subdivisions needed - all clusters within size limits")

- # Stage 3: Handle noise points more intelligently
  noise_mask = final_clusters == -1
  noise_count = sum(noise_mask)

  if noise_count > 0:
- print(f"Stage 3 - Reassigning {noise_count} noise points...")

- # Get cluster centers (excluding noise)
  cluster_centers = []
  cluster_labels = []
  for label in set(final_clusters):
  if label != -1:
  cluster_mask = final_clusters == label
  center = np.mean(scaled_embeddings[cluster_mask], axis=0)
  cluster_centers.append(center)
  cluster_labels.append(label)

  if cluster_centers:
  cluster_centers = np.array(cluster_centers)
  noise_points = scaled_embeddings[noise_mask]

- # Find nearest cluster for each noise point
- nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
  distances, nearest_indices = nbrs.kneighbors(noise_points)

- # Only assign noise points that are reasonably close to a cluster
- max_distance = np.percentile(distances, 75) # Use 75th percentile as threshold

  noise_indices = np.where(noise_mask)[0]
  reassigned_count = 0
- for i, (distance, nearest_idx) in enumerate(zip(distances.flatten(), nearest_indices.flatten())):
- if distance <= max_distance:
- final_clusters[noise_indices[i]] = cluster_labels[nearest_idx]
- reassigned_count += 1

  print(f" Reassigned {reassigned_count}/{noise_count} noise points to nearby clusters")

  clusters = final_clusters
 
@@ -944,11 +1022,14 @@ def create_3d_visualization(patents):
  for i, (label, size, _) in enumerate(final_cluster_info):
  if size > max_cluster_size:
  status = "❌ OVERSIZED"
  elif min_cluster_size <= size <= max_cluster_size:
  status = "✅ OPTIMAL"
  else:
  status = "⚠️ SMALL"
- print(f" {status} Cluster {i + 1}: {size} patents")

  cluster_info = final_cluster_info
 
  return min(50, num_patents // 80) # Very large datasets: 80-100 patents per cluster (increased from 30 max)

  def get_optimal_cluster_size(num_patents):
+ """Calculate optimal target cluster size range - ADJUSTED to account for noise point reassignment"""
  if num_patents < 500:
+ return 25, 90 # min=25, max=90 (increased from 60 to allow room for noise points)
  elif num_patents < 1000:
+ return 40, 100 # min=40, max=100 (increased from 80)
  elif num_patents < 2000:
+ return 50, 130 # min=50, max=130 (increased from 100)
  else:
+ return 60, 150 # min=60, max=150 (increased from 120)

  if not SERPAPI_API_KEY:
  raise ValueError("SERPAPI_API_KEY environment variable is not set")
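
As a quick sanity check of the new bounds, here is a standalone copy of the committed helper with a few illustrative dataset sizes (the loop and the sample values below are examples for this commit note, not part of app.py):

def get_optimal_cluster_size(num_patents):
    """Target (min, max) cluster size, with headroom for noise-point reassignment."""
    if num_patents < 500:
        return 25, 90
    elif num_patents < 1000:
        return 40, 100
    elif num_patents < 2000:
        return 50, 130
    else:
        return 60, 150

for n in (300, 800, 1500, 5000):
    lo, hi = get_optimal_cluster_size(n)
    print(f"{n} patents -> min {lo}, max {hi}")
# 300 patents -> min 25, max 90
# 800 patents -> min 40, max 100
# 1500 patents -> min 50, max 130
# 5000 patents -> min 60, max 150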
 
  cluster_indices = np.where(cluster_mask)[0]

  # Calculate how many subclusters we need - MORE AGGRESSIVE subdivision
+ target_size = max_cluster_size * 0.6 # Target 60% of max size for better buffer
  n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
  # Cap at reasonable maximum but allow more splits if needed
+ n_subclusters = min(12, n_subclusters) # Increased from 10 to 12
  print(f" Splitting into {n_subclusters} subclusters (target size: {target_size:.0f})...")

  # Use KMeans for controlled subdivision
 
  cluster_indices = np.where(cluster_mask)[0]

  # Force more aggressive subdivision
+ target_size = max_cluster_size * 0.5 # Even more aggressive - 50% of max
  n_subclusters = max(3, int(np.ceil(cluster_size / target_size)))
+ n_subclusters = min(20, n_subclusters) # Allow up to 20 splits if needed
  print(f" FORCING split into {n_subclusters} subclusters...")

  # Use KMeans for forced subdivision
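
Both subdivision passes follow the same pattern; only the target fraction and the split cap differ (0.6 and 12 in the first pass, 0.5 and 20 in the forced pass). The sketch below re-creates that step outside app.py; the function name and the random test data are illustrative assumptions, not code from the commit:

import numpy as np
from sklearn.cluster import KMeans

def split_oversized_cluster(cluster_data, cluster_size, max_cluster_size,
                            target_fraction=0.6, max_splits=12):
    # Aim each subcluster at a fraction of the allowed maximum, then cap the split count.
    target_size = max_cluster_size * target_fraction
    n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
    n_subclusters = min(max_splits, n_subclusters)
    kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
    return kmeans.fit_predict(cluster_data)  # sub-labels 0..n_subclusters-1

# Example: a 300-point cluster with max_cluster_size=130 splits into 4 subclusters.
rng = np.random.default_rng(0)
sub_labels = split_oversized_cluster(rng.normal(size=(300, 8)), 300, 130)
print(len(set(sub_labels)))  # 4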
 
  else:
  print("No additional subdivisions needed - all clusters within size limits")

+ # Stage 3: Handle noise points more intelligently with size constraints
  noise_mask = final_clusters == -1
  noise_count = sum(noise_mask)

  if noise_count > 0:
+ print(f"Stage 3 - Reassigning {noise_count} noise points with size constraints...")

+ # Get cluster centers and current sizes (excluding noise)
  cluster_centers = []
  cluster_labels = []
+ cluster_sizes = {}
  for label in set(final_clusters):
  if label != -1:
  cluster_mask = final_clusters == label
  center = np.mean(scaled_embeddings[cluster_mask], axis=0)
  cluster_centers.append(center)
  cluster_labels.append(label)
+ cluster_sizes[label] = sum(cluster_mask)

  if cluster_centers:
  cluster_centers = np.array(cluster_centers)
  noise_points = scaled_embeddings[noise_mask]

+ # Find nearest clusters for each noise point
+ nbrs = NearestNeighbors(n_neighbors=min(3, len(cluster_centers))).fit(cluster_centers)
  distances, nearest_indices = nbrs.kneighbors(noise_points)

+ # Use a tighter distance threshold for reassignment
+ max_distance = np.percentile(distances[:, 0], 60) # Use 60th percentile instead of 75th

  noise_indices = np.where(noise_mask)[0]
  reassigned_count = 0
+ rejected_too_far = 0
+ rejected_too_large = 0
+
+ # Calculate size buffer - leave room for some noise points
+ size_buffer = max_cluster_size * 0.85 # Only allow clusters to grow to 85% of max
+
+ for i, (row_distances, row_nearest_indices) in enumerate(zip(distances, nearest_indices)):
+ assigned = False
+
+ # Try each of the nearest clusters in order
+ for dist, nearest_idx in zip(row_distances, row_nearest_indices):
+ if dist > max_distance:
+ break # All remaining will be too far
+
+ target_label = cluster_labels[nearest_idx]
+ current_size = cluster_sizes[target_label]
+
+ # Only assign if cluster has room to grow
+ if current_size < size_buffer:
+ final_clusters[noise_indices[i]] = target_label
+ cluster_sizes[target_label] += 1 # Update size tracker
+ reassigned_count += 1
+ assigned = True
+ break
+ else:
+ rejected_too_large += 1
+
+ if not assigned and row_distances[0] <= max_distance:
+ rejected_too_far += 1

  print(f" Reassigned {reassigned_count}/{noise_count} noise points to nearby clusters")
+ print(f" Rejected {rejected_too_large} points (target clusters too large)")
+ print(f" Rejected {rejected_too_far} points (too far from suitable clusters)")
+ remaining_noise = noise_count - reassigned_count
+ if remaining_noise > 0:
+ print(f" {remaining_noise} points remain as noise to prevent oversized clusters")
+
+ # Stage 4: Final post-noise cleanup - subdivide any clusters that grew too large
+ print("Stage 4 - Post-noise subdivision check...")
+ final_subdivisions = 0
+ for cluster_id in set(final_clusters):
+ if cluster_id == -1: # Skip noise
+ continue
+
+ cluster_mask = final_clusters == cluster_id
+ cluster_size = sum(cluster_mask)
+
+ # If cluster grew too large after noise reassignment, subdivide again
+ if cluster_size > max_cluster_size:
+ print(f" Post-noise subdivision of cluster {cluster_id} ({cluster_size} patents)")
+ final_subdivisions += 1
+
+ # Extract data for this oversized cluster
+ cluster_data = scaled_embeddings[cluster_mask]
+ cluster_indices = np.where(cluster_mask)[0]
+
+ # Very aggressive subdivision for final cleanup
+ target_size = max_cluster_size * 0.7 # Target 70% of max size
+ n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
+ n_subclusters = min(8, n_subclusters) # Reasonable cap
+ print(f" Final split into {n_subclusters} subclusters...")
+
+ # Use KMeans for final subdivision
+ kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+ subclusters = kmeans.fit_predict(cluster_data)
+
+ # Assign new cluster IDs
+ for i, subcluster_id in enumerate(subclusters):
+ original_idx = cluster_indices[i]
+ if subcluster_id == 0:
+ # Keep first subcluster with original ID
+ final_clusters[original_idx] = cluster_id
+ else:
+ # Assign new IDs to other subclusters
+ final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+ next_cluster_id += n_subclusters - 1
+
+ if final_subdivisions > 0:
+ print(f"Performed {final_subdivisions} final post-noise subdivisions")
+ else:
+ print("No post-noise subdivisions needed")

  clusters = final_clusters
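
Stage 3 above is the core of the fix: noise points are still folded back into clusters, but only while the receiving cluster stays under 85% of the size cap, and only within a tighter (60th-percentile) distance cutoff. Below is a compact, self-contained re-creation of that idea for experimentation; the function name, argument names, and the way centers are computed are illustrative assumptions, not the exact app.py code:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def reassign_noise_with_size_cap(X, labels, max_cluster_size):
    # X: (n_points, n_dims) embeddings; labels: int array with -1 marking noise.
    labels = labels.copy()
    cluster_ids = sorted(set(labels.tolist()) - {-1})
    noise_idx = np.where(labels == -1)[0]
    if not cluster_ids or noise_idx.size == 0:
        return labels

    centers = np.array([X[labels == c].mean(axis=0) for c in cluster_ids])
    sizes = {c: int((labels == c).sum()) for c in cluster_ids}

    # Up to 3 candidate clusters per noise point, nearest first.
    nbrs = NearestNeighbors(n_neighbors=min(3, len(centers))).fit(centers)
    dists, nearest = nbrs.kneighbors(X[noise_idx])

    max_distance = np.percentile(dists[:, 0], 60)  # tighter 60th-percentile cutoff
    size_buffer = max_cluster_size * 0.85          # leave headroom in each cluster

    for i, (row_d, row_n) in enumerate(zip(dists, nearest)):
        for d, j in zip(row_d, row_n):
            if d > max_distance:
                break  # remaining candidates are even farther away
            target = cluster_ids[j]
            if sizes[target] < size_buffer:
                labels[noise_idx[i]] = target
                sizes[target] += 1
                break
    return labels

Points that cannot be placed this way simply stay labelled -1, which is what motivates the Stage 4 pass above: any cluster that still exceeds max_cluster_size afterwards is split again with KMeans.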
 
 
  for i, (label, size, _) in enumerate(final_cluster_info):
  if size > max_cluster_size:
  status = "❌ OVERSIZED"
+ severity = f"(+{size - max_cluster_size} over limit)"
  elif min_cluster_size <= size <= max_cluster_size:
  status = "✅ OPTIMAL"
+ severity = ""
  else:
  status = "⚠️ SMALL"
+ severity = f"({min_cluster_size - size} under target)"
+ print(f" {status} Cluster {i + 1}: {size} patents {severity}")

  cluster_info = final_cluster_info