PhyllisPeh committed
Commit ac25e1c · 1 Parent(s): 4e29396

reduced cluster size

Files changed (1)
  1. app.py +87 -26
app.py CHANGED
@@ -243,29 +243,29 @@ MIN_PATENTS_FOR_GAPS = 3000 # Minimum patents needed for reliable gap detection
 def get_max_clusters(num_patents):
     """
     Calculate optimal maximum clusters based on dataset size.
-    Aims for clusters of 50-200 patents for meaningful technological analysis.
+    REVISED: More clusters for larger datasets to keep individual cluster sizes smaller.
     """
     if num_patents < 200:
-        return min(6, num_patents // 25)    # Very small: 25-35 patents per cluster
+        return min(8, num_patents // 20)    # Very small: 20-25 patents per cluster
     elif num_patents < 500:
-        return min(10, num_patents // 40)   # Small datasets: 40-50 patents per cluster
+        return min(12, num_patents // 30)   # Small datasets: 30-40 patents per cluster
     elif num_patents < 1000:
-        return min(15, num_patents // 60)   # Medium datasets: 60-70 patents per cluster
+        return min(20, num_patents // 40)   # Medium datasets: 40-50 patents per cluster
     elif num_patents < 2000:
-        return min(20, num_patents // 80)   # Large datasets: 80-100 patents per cluster
+        return min(30, num_patents // 60)   # Large datasets: 60-70 patents per cluster
     else:
-        return min(30, num_patents // 100)  # Very large datasets: 100-150 patents per cluster
+        return min(50, num_patents // 80)   # Very large datasets: 80-100 patents per cluster (increased from 30 max)
 
 def get_optimal_cluster_size(num_patents):
-    """Calculate optimal target cluster size range"""
+    """Calculate optimal target cluster size range - REDUCED MAX SIZES to prevent mega-clusters"""
     if num_patents < 500:
-        return 25, 80   # min=25, max=80
+        return 25, 60   # min=25, max=60 (reduced from 80)
     elif num_patents < 1000:
-        return 40, 120  # min=40, max=120
+        return 40, 80   # min=40, max=80 (reduced from 120)
     elif num_patents < 2000:
-        return 60, 150  # min=60, max=150
+        return 50, 100  # min=50, max=100 (reduced from 150)
     else:
-        return 80, 200  # min=80, max=200
+        return 60, 120  # min=60, max=120 (reduced from 200)
 
 if not SERPAPI_API_KEY:
     raise ValueError("SERPAPI_API_KEY environment variable is not set")
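
For a concrete sense of the change, here is a minimal sketch (not part of the commit; the 3,500-patent figure is hypothetical) of what the revised heuristics return for a large dataset, computed directly from the branches above:

# Hypothetical example: a 3,500-patent dataset under the revised heuristics.
num_patents = 3500

max_clusters = min(50, num_patents // 80)     # new ">= 2000" branch -> 43 (old code: min(30, 3500 // 100) = 30)
min_size, max_size = 60, 120                  # new ">= 2000" range (old code: 80, 200)

print(max_clusters, (min_size, max_size))     # 43 (60, 120)
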
@@ -664,10 +664,11 @@ def create_3d_visualization(patents):
     update_progress('clustering', 'processing', 'Applying optimized UMAP dimensionality reduction...')
     reducer = umap.UMAP(
         n_components=3,
-        n_neighbors=30,  # Increased for better global structure
-        min_dist=0.1,    # Reduced for tighter clusters
-        spread=1.0,      # Better cluster separation
-        random_state=42
+        n_neighbors=20,  # Reduced from 30 for more local structure
+        min_dist=0.05,   # Reduced from 0.1 for even tighter clusters
+        spread=0.8,      # Reduced from 1.0 for better cluster separation
+        random_state=42,
+        metric='cosine'  # Added cosine metric for better semantic clustering
     )
     embedding_3d = reducer.fit_transform(embeddings_array)
 
@@ -696,18 +697,18 @@ def create_3d_visualization(patents):
     print(f"Processing {n_points} patents with improved clustering algorithm...")
 
     # Stage 1: Initial HDBSCAN with stricter parameters
-    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.015))  # More aggressive minimum
-    initial_min_samples = max(5, int(initial_min_cluster_size * 0.5))  # Stricter density requirement
+    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.020))  # Increased from 0.015 to 0.020 for stricter minimum
+    initial_min_samples = max(8, int(initial_min_cluster_size * 0.6))  # Increased from 0.5 to 0.6 for stricter density
 
     print(f"Stage 1 - Initial clustering: min_cluster_size={initial_min_cluster_size}, min_samples={initial_min_samples}")
 
     hdb = hdbscan.HDBSCAN(
         min_cluster_size=initial_min_cluster_size,
         min_samples=initial_min_samples,
-        cluster_selection_epsilon=0.05,  # Reduced for better separation
+        cluster_selection_epsilon=0.03,  # Reduced from 0.05 for tighter clusters
         cluster_selection_method='eom',
         metric='euclidean',
-        alpha=1.0  # More conservative clustering
+        alpha=1.2  # Increased from 1.0 for even more conservative clustering
     )
     initial_clusters = hdb.fit_predict(scaled_embeddings)
 
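
Read together with the UMAP hunk above, Stage 1 amounts to a reduce-then-cluster step. A rough, self-contained sketch follows (random stand-in embeddings; the StandardScaler step and the 60-patent min_cluster_size floor are assumptions, since neither appears in this diff):

import numpy as np
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler

# Random stand-in for the real patent embeddings (dimensions are arbitrary here).
embeddings_array = np.random.default_rng(0).normal(size=(3000, 384))

# UMAP settings as revised above: cosine metric, tighter local structure.
reducer = umap.UMAP(n_components=3, n_neighbors=20, min_dist=0.05,
                    spread=0.8, random_state=42, metric='cosine')
embedding_3d = reducer.fit_transform(embeddings_array)

# Assumed scaling step, inferred only from the name `scaled_embeddings`.
scaled_embeddings = StandardScaler().fit_transform(embedding_3d)

# Stage 1 parameters as revised above; 60 is an assumed min_cluster_size floor.
n_points = len(scaled_embeddings)
initial_min_cluster_size = max(60, int(n_points * 0.020))
initial_min_samples = max(8, int(initial_min_cluster_size * 0.6))

hdb = hdbscan.HDBSCAN(min_cluster_size=initial_min_cluster_size,
                      min_samples=initial_min_samples,
                      cluster_selection_epsilon=0.03,
                      cluster_selection_method='eom',
                      metric='euclidean',
                      alpha=1.2)
initial_clusters = hdb.fit_predict(scaled_embeddings)
print("clusters found:", len(set(initial_clusters)) - (1 if -1 in initial_clusters else 0))
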
@@ -724,7 +725,7 @@ def create_3d_visualization(patents):
         cluster_mask = initial_clusters == cluster_id
         cluster_size = sum(cluster_mask)
 
-        # If cluster is too large, subdivide it
+        # If cluster is too large, subdivide it more aggressively
         if cluster_size > max_cluster_size:
             print(f"  Subdividing cluster {cluster_id} ({cluster_size} patents) - TOO LARGE")
             cluster_subdivisions += 1
@@ -733,9 +734,12 @@ def create_3d_visualization(patents):
             cluster_data = scaled_embeddings[cluster_mask]
             cluster_indices = np.where(cluster_mask)[0]
 
-            # Calculate how many subclusters we need
-            n_subclusters = min(6, max(2, cluster_size // max_cluster_size + 1))
-            print(f"  Splitting into {n_subclusters} subclusters...")
+            # Calculate how many subclusters we need - MORE AGGRESSIVE subdivision
+            target_size = max_cluster_size * 0.75  # Target 75% of max size for better separation
+            n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
+            # Cap at reasonable maximum but allow more splits if needed
+            n_subclusters = min(10, n_subclusters)  # Increased from 6 to 10
+            print(f"  Splitting into {n_subclusters} subclusters (target size: {target_size:.0f})...")
 
             # Use KMeans for controlled subdivision
             kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
@@ -755,6 +759,52 @@ def create_3d_visualization(patents):
 
     print(f"Subdivided {cluster_subdivisions} oversized clusters")
 
+    # Stage 2.5: Additional validation and forced subdivision for any remaining oversized clusters
+    print("Stage 2.5 - Final oversized cluster validation...")
+    additional_subdivisions = 0
+    for cluster_id in set(final_clusters):
+        if cluster_id == -1:  # Skip noise
+            continue
+
+        cluster_mask = final_clusters == cluster_id
+        cluster_size = sum(cluster_mask)
+
+        # Force subdivision of any clusters still over the limit
+        if cluster_size > max_cluster_size:
+            print(f"  FORCING additional subdivision of cluster {cluster_id} ({cluster_size} patents)")
+            additional_subdivisions += 1
+
+            # Extract data for this still-oversized cluster
+            cluster_data = scaled_embeddings[cluster_mask]
+            cluster_indices = np.where(cluster_mask)[0]
+
+            # Force more aggressive subdivision
+            target_size = max_cluster_size * 0.6  # Even more aggressive - 60% of max
+            n_subclusters = max(3, int(np.ceil(cluster_size / target_size)))
+            n_subclusters = min(15, n_subclusters)  # Allow up to 15 splits if needed
+            print(f"  FORCING split into {n_subclusters} subclusters...")
+
+            # Use KMeans for forced subdivision
+            kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+            subclusters = kmeans.fit_predict(cluster_data)
+
+            # Assign new cluster IDs
+            for i, subcluster_id in enumerate(subclusters):
+                original_idx = cluster_indices[i]
+                if subcluster_id == 0:
+                    # Keep first subcluster with original ID
+                    final_clusters[original_idx] = cluster_id
+                else:
+                    # Assign new IDs to other subclusters
+                    final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+            next_cluster_id += n_subclusters - 1
+
+    if additional_subdivisions > 0:
+        print(f"Performed {additional_subdivisions} additional forced subdivisions")
+    else:
+        print("No additional subdivisions needed - all clusters within size limits")
+
     # Stage 3: Handle noise points more intelligently
     noise_mask = final_clusters == -1
     noise_count = sum(noise_mask)
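
For illustration only (not from app.py; the synthetic data, size cap, and constants below are assumptions), the size-capped KMeans re-splitting that Stages 2 and 2.5 both apply can be sketched in isolation like this:

import numpy as np
from sklearn.cluster import KMeans

# Synthetic stand-in: one mega-cluster of 500 points in a 3-D embedding space.
rng = np.random.default_rng(42)
scaled_embeddings = rng.normal(size=(500, 3))
final_clusters = np.zeros(500, dtype=int)
max_cluster_size = 120
next_cluster_id = final_clusters.max() + 1

for cluster_id in np.unique(final_clusters):
    mask = final_clusters == cluster_id
    size = mask.sum()
    if size <= max_cluster_size:
        continue
    # Aim below the cap (75% of max, as in Stage 2) and re-split with KMeans.
    target_size = max_cluster_size * 0.75
    n_subclusters = min(10, max(2, int(np.ceil(size / target_size))))
    kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
    subclusters = kmeans.fit_predict(scaled_embeddings[mask])
    indices = np.where(mask)[0]
    # Subcluster 0 keeps the original ID; the rest get fresh IDs.
    final_clusters[indices[subclusters != 0]] = next_cluster_id + subclusters[subclusters != 0] - 1
    next_cluster_id += n_subclusters - 1

print(np.bincount(final_clusters))  # sizes of the subclusters, typically well below the 120 cap
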
@@ -880,13 +930,24 @@ def create_3d_visualization(patents):
     # Check if we successfully avoided mega-clusters
     oversized_clusters = [size for size in sizes if size > max_cluster_size]
     if oversized_clusters:
-        print(f"⚠️ Warning: {len(oversized_clusters)} clusters still oversized: {oversized_clusters}")
+        print(f"⚠️ WARNING: {len(oversized_clusters)} clusters STILL oversized: {oversized_clusters}")
+        print(f"❌ FAILED to contain all clusters within target range!")
+
+        # Log the oversized clusters for debugging
+        for i, (label, size, _) in enumerate(final_cluster_info):
+            if size > max_cluster_size:
+                print(f"  Oversized Cluster {i + 1}: {size} patents (EXCEEDS LIMIT of {max_cluster_size})")
     else:
-        print(f"✅ Success: All clusters within target size range!")
+        print(f"✅ SUCCESS: All clusters within target size range!")
 
     print("\nCluster Size Distribution:")
     for i, (label, size, _) in enumerate(final_cluster_info):
-        status = "✅" if min_cluster_size <= size <= max_cluster_size else "⚠️"
+        if size > max_cluster_size:
+            status = "❌ OVERSIZED"
+        elif min_cluster_size <= size <= max_cluster_size:
+            status = "✅ OPTIMAL"
+        else:
+            status = "⚠️ SMALL"
         print(f"  {status} Cluster {i + 1}: {size} patents")
 
     cluster_info = final_cluster_info
 