PhyllisPeh committed
Commit 4e29396 · 1 Parent(s): d8b8d6f

fixed clustering algorithm

Files changed (1):
  1. app.py (+155, -60)
app.py CHANGED
```diff
@@ -18,6 +18,7 @@ import umap
 import openai
 from sklearn.neighbors import NearestNeighbors
 from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import KMeans
 import hdbscan
 import plotly.graph_objects as go
 import requests
```
```diff
@@ -242,16 +243,29 @@ MIN_PATENTS_FOR_GAPS = 3000 # Minimum patents needed for reliable gap detection
 def get_max_clusters(num_patents):
     """
     Calculate optimal maximum clusters based on dataset size.
-    Aims for clusters of 75-150 patents for meaningful technological analysis.
+    Aims for clusters of 50-200 patents for meaningful technological analysis.
     """
+    if num_patents < 200:
+        return min(6, num_patents // 25)    # Very small: 25-35 patents per cluster
+    elif num_patents < 500:
+        return min(10, num_patents // 40)   # Small datasets: 40-50 patents per cluster
+    elif num_patents < 1000:
+        return min(15, num_patents // 60)   # Medium datasets: 60-70 patents per cluster
+    elif num_patents < 2000:
+        return min(20, num_patents // 80)   # Large datasets: 80-100 patents per cluster
+    else:
+        return min(30, num_patents // 100)  # Very large datasets: 100-150 patents per cluster
+
+def get_optimal_cluster_size(num_patents):
+    """Calculate optimal target cluster size range"""
     if num_patents < 500:
-        return min(8, num_patents // 30)    # Smaller datasets: 30-60 patents per cluster
+        return 25, 80   # min=25, max=80
     elif num_patents < 1000:
-        return min(12, num_patents // 75)   # Medium datasets: 75-85 patents per cluster
+        return 40, 120  # min=40, max=120
     elif num_patents < 2000:
-        return min(16, num_patents // 100)  # Large datasets: 100-125 patents per cluster
+        return 60, 150  # min=60, max=150
     else:
-        return min(24, num_patents // 125)  # Very large datasets: 125-150 patents per cluster
+        return 80, 200  # min=80, max=200
 
 if not SERPAPI_API_KEY:
     raise ValueError("SERPAPI_API_KEY environment variable is not set")
```
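A quick check of what the new sizing helpers return at a few dataset sizes (an illustrative harness, not part of the commit; it assumes `get_max_clusters` and `get_optimal_cluster_size` above are in scope):

```python
# Sanity check of the sizing helpers defined above (illustrative only).
for n in (150, 300, 800, 1500, 5000):
    lo, hi = get_optimal_cluster_size(n)
    print(f"{n:>5} patents -> max {get_max_clusters(n)} clusters, target size {lo}-{hi}")

# Expected output:
#   150 patents -> max 6 clusters, target size 25-80
#   300 patents -> max 7 clusters, target size 25-80
#   800 patents -> max 13 clusters, target size 40-120
#  1500 patents -> max 18 clusters, target size 60-150
#  5000 patents -> max 30 clusters, target size 80-200
```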
```diff
@@ -646,16 +660,27 @@ def create_3d_visualization(patents):
 
     update_progress('clustering', 'processing', 'Applying UMAP dimensionality reduction...')
 
-    # Apply UMAP dimensionality reduction
-    reducer = umap.UMAP(n_components=3, random_state=42)
+    # Apply UMAP dimensionality reduction with better parameters for technology separation
+    update_progress('clustering', 'processing', 'Applying optimized UMAP dimensionality reduction...')
+    reducer = umap.UMAP(
+        n_components=3,
+        n_neighbors=30,  # Increased for better global structure
+        min_dist=0.1,    # Reduced for tighter clusters
+        spread=1.0,      # Better cluster separation
+        random_state=42
+    )
     embedding_3d = reducer.fit_transform(embeddings_array)
 
-    # Calculate optimal cluster limit for this dataset
+    # Calculate optimal cluster parameters
     max_clusters = get_max_clusters(len(embeddings))
-    print(f"\nDataset size: {len(embeddings)} patents")
-    print(f"Optimal cluster limit: {max_clusters} clusters (targeting 75-150 patents per cluster)")
+    min_cluster_size, max_cluster_size = get_optimal_cluster_size(len(embeddings))
 
-    update_progress('clustering', 'processing', f'Performing HDBSCAN clustering (max {max_clusters} clusters)...')
+    print(f"\n🎯 IMPROVED CLUSTERING STRATEGY:")
+    print(f"Dataset size: {len(embeddings)} patents")
+    print(f"Target cluster range: {min_cluster_size}-{max_cluster_size} patents per cluster")
+    print(f"Maximum clusters allowed: {max_clusters}")
+
+    update_progress('clustering', 'processing', f'Performing advanced multi-stage clustering...')
 
     # Create DataFrame for plotting
     df = pd.DataFrame(metadata)
```
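The reducer now sets `n_neighbors`, `min_dist`, and `spread` explicitly instead of relying on UMAP defaults, trading some local detail for global structure. A minimal standalone sketch of the same settings on synthetic data (shapes and sizes are made up for illustration):

```python
# Standalone sketch of the UMAP settings used above, on random data.
import numpy as np
import umap

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 768))  # stand-in for 500 patent embeddings

reducer = umap.UMAP(
    n_components=3,   # 3-D output for the Plotly scatter
    n_neighbors=30,   # larger neighborhoods favor global structure
    min_dist=0.1,     # small min_dist packs related points tightly
    spread=1.0,
    random_state=42,
)
X3 = reducer.fit_transform(X)
print(X3.shape)  # (500, 3)
```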
```diff
@@ -663,47 +688,86 @@ def create_3d_visualization(patents):
     df['y'] = embedding_3d[:, 1]
     df['z'] = embedding_3d[:, 2]
 
-    # --- Simplified HDBSCAN clustering for technological clusters ---
+    # --- IMPROVED MULTI-STAGE CLUSTERING ALGORITHM ---
     scaler = StandardScaler()
     scaled_embeddings = scaler.fit_transform(embedding_3d)
 
     n_points = len(scaled_embeddings)
-    update_progress('clustering', 'processing', f'Analyzing {n_points} patents for clustering...')
-
-    # Dynamically set clustering parameters based on dataset size
-    if n_points < 100:
-        min_cluster_size = max(5, int(n_points * 0.08))
-    elif n_points < 500:
-        min_cluster_size = max(8, int(n_points * 0.05))
-    elif n_points < 1000:
-        min_cluster_size = max(15, int(n_points * 0.03))
-    else:
-        min_cluster_size = max(20, int(n_points * 0.02))
-
-    min_samples = max(3, int(min_cluster_size * 0.7))
+    print(f"Processing {n_points} patents with improved clustering algorithm...")
+
+    # Stage 1: Initial HDBSCAN with stricter parameters
+    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.015))  # More aggressive minimum
+    initial_min_samples = max(5, int(initial_min_cluster_size * 0.5))  # Stricter density requirement
 
-    print(f"HDBSCAN clustering: min_cluster_size={min_cluster_size}, min_samples={min_samples}")
+    print(f"Stage 1 - Initial clustering: min_cluster_size={initial_min_cluster_size}, min_samples={initial_min_samples}")
 
-    # Apply HDBSCAN clustering
     hdb = hdbscan.HDBSCAN(
-        min_cluster_size=min_cluster_size,
-        min_samples=min_samples,
-        cluster_selection_epsilon=0.1,
+        min_cluster_size=initial_min_cluster_size,
+        min_samples=initial_min_samples,
+        cluster_selection_epsilon=0.05,  # Reduced for better separation
         cluster_selection_method='eom',
-        metric='euclidean'
+        metric='euclidean',
+        alpha=1.0  # More conservative clustering
     )
-    clusters = hdb.fit_predict(scaled_embeddings)
+    initial_clusters = hdb.fit_predict(scaled_embeddings)
 
-    # Assign noise points to nearest cluster
-    noise_mask = clusters == -1
-    if any(noise_mask) and len(set(clusters)) > 1:
-        print(f"Assigning {sum(noise_mask)} noise points to nearest clusters...")
-        # Get cluster centers
+    # Stage 2: Subdivide oversized clusters
+    print("Stage 2 - Subdividing oversized clusters...")
+    final_clusters = initial_clusters.copy()
+    next_cluster_id = max(initial_clusters) + 1 if len(set(initial_clusters)) > 1 else 0
+
+    cluster_subdivisions = 0
+    for cluster_id in set(initial_clusters):
+        if cluster_id == -1:  # Skip noise
+            continue
+
+        cluster_mask = initial_clusters == cluster_id
+        cluster_size = sum(cluster_mask)
+
+        # If cluster is too large, subdivide it
+        if cluster_size > max_cluster_size:
+            print(f"  Subdividing cluster {cluster_id} ({cluster_size} patents) - TOO LARGE")
+            cluster_subdivisions += 1
+
+            # Extract data for this oversized cluster
+            cluster_data = scaled_embeddings[cluster_mask]
+            cluster_indices = np.where(cluster_mask)[0]
+
+            # Calculate how many subclusters we need
+            n_subclusters = min(6, max(2, cluster_size // max_cluster_size + 1))
+            print(f"    Splitting into {n_subclusters} subclusters...")
+
+            # Use KMeans for controlled subdivision
+            kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+            subclusters = kmeans.fit_predict(cluster_data)
+
+            # Assign new cluster IDs
+            for i, subcluster_id in enumerate(subclusters):
+                original_idx = cluster_indices[i]
+                if subcluster_id == 0:
+                    # Keep first subcluster with original ID
+                    final_clusters[original_idx] = cluster_id
+                else:
+                    # Assign new IDs to other subclusters
+                    final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+            next_cluster_id += n_subclusters - 1
+
+    print(f"Subdivided {cluster_subdivisions} oversized clusters")
+
+    # Stage 3: Handle noise points more intelligently
+    noise_mask = final_clusters == -1
+    noise_count = sum(noise_mask)
+
+    if noise_count > 0:
+        print(f"Stage 3 - Reassigning {noise_count} noise points...")
+
+        # Get cluster centers (excluding noise)
         cluster_centers = []
         cluster_labels = []
-        for label in set(clusters):
+        for label in set(final_clusters):
             if label != -1:
-                cluster_mask = clusters == label
+                cluster_mask = final_clusters == label
                 center = np.mean(scaled_embeddings[cluster_mask], axis=0)
                 cluster_centers.append(center)
                 cluster_labels.append(label)
```
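Stage 2 is the core of the fix: HDBSCAN on its own tends to leave one oversized mega-cluster, so any cluster above `max_cluster_size` is re-split with KMeans. A self-contained sketch of that subdivision rule on synthetic blobs (the cap of 80 is hypothetical; app.py derives it from dataset size):

```python
# Sketch of the Stage-2 subdivision rule on synthetic data.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, labels = make_blobs(n_samples=300, centers=2, random_state=42)
max_cluster_size = 80  # hypothetical cap for the demo

next_id = labels.max() + 1
for cid in np.unique(labels):
    mask = labels == cid
    size = int(mask.sum())
    if size > max_cluster_size:
        # same sizing rule as the diff: between 2 and 6 subclusters
        n_sub = min(6, max(2, size // max_cluster_size + 1))
        sub = KMeans(n_clusters=n_sub, random_state=42, n_init=10).fit_predict(X[mask])
        idx = np.where(mask)[0]
        # subcluster 0 keeps the original id; the rest get fresh ids
        labels[idx[sub != 0]] = next_id + sub[sub != 0] - 1
        next_id += n_sub - 1

print(np.bincount(labels))  # each of the 4 resulting clusters is under the cap
```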
```diff
@@ -714,12 +778,21 @@ def create_3d_visualization(patents):
 
         # Find nearest cluster for each noise point
         nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
-        _, nearest_indices = nbrs.kneighbors(noise_points)
+        distances, nearest_indices = nbrs.kneighbors(noise_points)
+
+        # Only assign noise points that are reasonably close to a cluster
+        max_distance = np.percentile(distances, 75)  # Use 75th percentile as threshold
 
-        # Assign noise points to nearest clusters
         noise_indices = np.where(noise_mask)[0]
-        for i, nearest_idx in enumerate(nearest_indices.flatten()):
-            clusters[noise_indices[i]] = cluster_labels[nearest_idx]
+        reassigned_count = 0
+        for i, (distance, nearest_idx) in enumerate(zip(distances.flatten(), nearest_indices.flatten())):
+            if distance <= max_distance:
+                final_clusters[noise_indices[i]] = cluster_labels[nearest_idx]
+                reassigned_count += 1
+
+        print(f"  Reassigned {reassigned_count}/{noise_count} noise points to nearby clusters")
+
+    clusters = final_clusters
 
     df['cluster'] = clusters
 
```
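Stage 3 no longer dumps every noise point into the nearest cluster; only points within the 75th percentile of nearest-center distances are reassigned. A standalone sketch with synthetic data (names mirror the diff; values are illustrative):

```python
# Sketch of the Stage-3 distance-gated noise reassignment.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
cluster_centers = rng.normal(size=(5, 3))    # 5 cluster centers in 3-D
noise_points = rng.normal(size=(20, 3)) * 3  # 20 unassigned points
cluster_labels = list(range(5))

nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
distances, nearest = nbrs.kneighbors(noise_points)
max_distance = np.percentile(distances, 75)  # adaptive cutoff

reassigned = {
    i: cluster_labels[int(j)]
    for i, (d, j) in enumerate(zip(distances.flatten(), nearest.flatten()))
    if d <= max_distance
}
print(f"reassigned {len(reassigned)}/20 noise points; far outliers stay noise")
```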
```diff
@@ -776,25 +849,47 @@ def create_3d_visualization(patents):
     # Update cluster_info to only include main clusters
     cluster_info = main_clusters
 
-    # Recalculate cluster sizes after reassignment
-    updated_cluster_info = []
-    for old_label, _, _ in main_clusters:
-        cluster_mask = clusters == old_label
+    # Final cluster validation and reporting
+    final_cluster_info = []
+    noise_count = sum(1 for c in clusters if c == -1)
+
+    for label in set(clusters):
+        if label != -1:  # Skip noise
+            cluster_mask = clusters == label
             cluster_patents = df[cluster_mask]
-        updated_cluster_info.append((old_label, len(cluster_patents), cluster_patents))
+            if len(cluster_patents) > 0:
+                final_cluster_info.append((label, len(cluster_patents), cluster_patents))
 
-    # Sort again by new sizes
-    updated_cluster_info.sort(key=lambda x: x[1], reverse=True)
-    cluster_info = updated_cluster_info
-
-    print(f"\nFinal Clustering Results:")
-    print(f"Number of technological clusters: {len(cluster_info)} (limited to max {max_clusters})")
-    print(f"Total patents clustered: {len(df)}")
-    avg_cluster_size = len(df) / len(cluster_info) if cluster_info else 0
-    print(f"Average cluster size: {avg_cluster_size:.1f} patents")
-    print("\nCluster Size Distribution:")
-    for i, (label, size, _) in enumerate(cluster_info):
-        print(f"Cluster {i + 1}: {size} patents")
+    # Sort clusters by size in descending order
+    final_cluster_info.sort(key=lambda x: x[1], reverse=True)
+
+    print(f"\n✅ FINAL CLUSTERING RESULTS:")
+    print(f"Total patents processed: {len(df)}")
+    print(f"Number of technology clusters: {len(final_cluster_info)}")
+    print(f"Noise points (unassigned): {noise_count}")
+
+    if final_cluster_info:
+        sizes = [size for _, size, _ in final_cluster_info]
+        avg_size = np.mean(sizes)
+        min_size = min(sizes)
+        max_size = max(sizes)
 
+        print(f"Cluster size stats: min={min_size}, avg={avg_size:.1f}, max={max_size}")
+        print(f"Target range was: {min_cluster_size}-{max_cluster_size} patents per cluster")
+
+        # Check if we successfully avoided mega-clusters
+        oversized_clusters = [size for size in sizes if size > max_cluster_size]
+        if oversized_clusters:
+            print(f"⚠️ Warning: {len(oversized_clusters)} clusters still oversized: {oversized_clusters}")
+        else:
+            print(f"  Success: All clusters within target size range!")
+
+    print("\nCluster Size Distribution:")
+    for i, (label, size, _) in enumerate(final_cluster_info):
+        status = "✅" if min_cluster_size <= size <= max_cluster_size else "⚠️"
+        print(f"  {status} Cluster {i + 1}: {size} patents")
+
+    cluster_info = final_cluster_info
 
     # Create mapping for new cluster IDs (1-based)
     cluster_id_map = {old_label: i + 1 for i, (old_label, _, _) in enumerate(cluster_info)}
```
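Downstream, the surviving clusters are renumbered 1..N in descending size order. A tiny illustration of the `cluster_id_map` construction with hypothetical labels:

```python
# cluster_info entries are (old_label, size, patents), already sorted by size.
cluster_info = [(7, 120, None), (3, 95, None), (12, 60, None)]
cluster_id_map = {old: i + 1 for i, (old, _, _) in enumerate(cluster_info)}
print(cluster_id_map)  # {7: 1, 3: 2, 12: 3}
```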
 