PhyllisPeh committed
Commit 4e29396 · 1 Parent(s): d8b8d6f

fixed clustering algorithm

Files changed (1):
  1. app.py (+155, -60)
app.py CHANGED
```diff
@@ -18,6 +18,7 @@ import umap
 import openai
 from sklearn.neighbors import NearestNeighbors
 from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import KMeans
 import hdbscan
 import plotly.graph_objects as go
 import requests
```
```diff
@@ -242,16 +243,29 @@ MIN_PATENTS_FOR_GAPS = 3000 # Minimum patents needed for reliable gap detection
 def get_max_clusters(num_patents):
     """
     Calculate optimal maximum clusters based on dataset size.
-    Aims for clusters of 75-150 patents for meaningful technological analysis.
+    Aims for clusters of 50-200 patents for meaningful technological analysis.
     """
+    if num_patents < 200:
+        return min(6, num_patents // 25)    # Very small: 25-35 patents per cluster
+    elif num_patents < 500:
+        return min(10, num_patents // 40)   # Small datasets: 40-50 patents per cluster
+    elif num_patents < 1000:
+        return min(15, num_patents // 60)   # Medium datasets: 60-70 patents per cluster
+    elif num_patents < 2000:
+        return min(20, num_patents // 80)   # Large datasets: 80-100 patents per cluster
+    else:
+        return min(30, num_patents // 100)  # Very large datasets: 100-150 patents per cluster
+
+def get_optimal_cluster_size(num_patents):
+    """Calculate optimal target cluster size range"""
     if num_patents < 500:
-        return min(8, num_patents // 30)    # Smaller datasets: 30-60 patents per cluster
+        return 25, 80   # min=25, max=80
     elif num_patents < 1000:
-        return min(12, num_patents // 75)   # Medium datasets: 75-85 patents per cluster
+        return 40, 120  # min=40, max=120
     elif num_patents < 2000:
-        return min(16, num_patents // 100)  # Large datasets: 100-125 patents per cluster
+        return 60, 150  # min=60, max=150
     else:
-        return min(24, num_patents // 125)  # Very large datasets: 125-150 patents per cluster
+        return 80, 200  # min=80, max=200
 
 if not SERPAPI_API_KEY:
     raise ValueError("SERPAPI_API_KEY environment variable is not set")
```
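A quick check of what the new sizing helpers return at a few dataset sizes (an illustrative harness, not part of the commit; it assumes `get_max_clusters` and `get_optimal_cluster_size` above are in scope):

```python
# Sanity check of the sizing helpers defined above (illustrative only).
for n in (150, 300, 800, 1500, 5000):
    lo, hi = get_optimal_cluster_size(n)
    print(f"{n:>5} patents -> max {get_max_clusters(n)} clusters, target size {lo}-{hi}")

# Expected output:
#   150 patents -> max 6 clusters, target size 25-80
#   300 patents -> max 7 clusters, target size 25-80
#   800 patents -> max 13 clusters, target size 40-120
#  1500 patents -> max 18 clusters, target size 60-150
#  5000 patents -> max 30 clusters, target size 80-200
```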
```diff
@@ -646,16 +660,27 @@ def create_3d_visualization(patents):
 
     update_progress('clustering', 'processing', 'Applying UMAP dimensionality reduction...')
 
-    # Apply UMAP dimensionality reduction
-    reducer = umap.UMAP(n_components=3, random_state=42)
+    # Apply UMAP dimensionality reduction with better parameters for technology separation
+    update_progress('clustering', 'processing', 'Applying optimized UMAP dimensionality reduction...')
+    reducer = umap.UMAP(
+        n_components=3,
+        n_neighbors=30,  # Increased for better global structure
+        min_dist=0.1,    # Reduced for tighter clusters
+        spread=1.0,      # Better cluster separation
+        random_state=42
+    )
     embedding_3d = reducer.fit_transform(embeddings_array)
 
-    # Calculate optimal cluster limit for this dataset
+    # Calculate optimal cluster parameters
     max_clusters = get_max_clusters(len(embeddings))
-    print(f"\nDataset size: {len(embeddings)} patents")
-    print(f"Optimal cluster limit: {max_clusters} clusters (targeting 75-150 patents per cluster)")
+    min_cluster_size, max_cluster_size = get_optimal_cluster_size(len(embeddings))
 
-    update_progress('clustering', 'processing', f'Performing HDBSCAN clustering (max {max_clusters} clusters)...')
+    print(f"\n🎯 IMPROVED CLUSTERING STRATEGY:")
+    print(f"Dataset size: {len(embeddings)} patents")
+    print(f"Target cluster range: {min_cluster_size}-{max_cluster_size} patents per cluster")
+    print(f"Maximum clusters allowed: {max_clusters}")
+
+    update_progress('clustering', 'processing', f'Performing advanced multi-stage clustering...')
 
     # Create DataFrame for plotting
     df = pd.DataFrame(metadata)
```
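The reducer now sets `n_neighbors`, `min_dist`, and `spread` explicitly instead of relying on UMAP defaults, trading some local detail for global structure. A minimal standalone sketch of the same settings on synthetic data (shapes and sizes are made up for illustration):

```python
# Standalone sketch of the UMAP settings used above, on random data.
import numpy as np
import umap

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 768))  # stand-in for 500 patent embeddings

reducer = umap.UMAP(
    n_components=3,   # 3-D output for the Plotly scatter
    n_neighbors=30,   # larger neighborhoods favor global structure
    min_dist=0.1,     # small min_dist packs related points tightly
    spread=1.0,
    random_state=42,
)
X3 = reducer.fit_transform(X)
print(X3.shape)  # (500, 3)
```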
```diff
@@ -663,47 +688,86 @@ def create_3d_visualization(patents):
     df['y'] = embedding_3d[:, 1]
     df['z'] = embedding_3d[:, 2]
 
-    # --- Simplified HDBSCAN clustering for technological clusters ---
+    # --- IMPROVED MULTI-STAGE CLUSTERING ALGORITHM ---
     scaler = StandardScaler()
     scaled_embeddings = scaler.fit_transform(embedding_3d)
 
     n_points = len(scaled_embeddings)
-    update_progress('clustering', 'processing', f'Analyzing {n_points} patents for clustering...')
-
-    # Dynamically set clustering parameters based on dataset size
-    if n_points < 100:
-        min_cluster_size = max(5, int(n_points * 0.08))
-    elif n_points < 500:
-        min_cluster_size = max(8, int(n_points * 0.05))
-    elif n_points < 1000:
-        min_cluster_size = max(15, int(n_points * 0.03))
-    else:
-        min_cluster_size = max(20, int(n_points * 0.02))
-
-    min_samples = max(3, int(min_cluster_size * 0.7))
+    print(f"Processing {n_points} patents with improved clustering algorithm...")
+
+    # Stage 1: Initial HDBSCAN with stricter parameters
+    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.015))  # More aggressive minimum
+    initial_min_samples = max(5, int(initial_min_cluster_size * 0.5))  # Stricter density requirement
 
-    print(f"HDBSCAN clustering: min_cluster_size={min_cluster_size}, min_samples={min_samples}")
+    print(f"Stage 1 - Initial clustering: min_cluster_size={initial_min_cluster_size}, min_samples={initial_min_samples}")
 
-    # Apply HDBSCAN clustering
     hdb = hdbscan.HDBSCAN(
-        min_cluster_size=min_cluster_size,
-        min_samples=min_samples,
-        cluster_selection_epsilon=0.1,
+        min_cluster_size=initial_min_cluster_size,
+        min_samples=initial_min_samples,
+        cluster_selection_epsilon=0.05,  # Reduced for better separation
         cluster_selection_method='eom',
-        metric='euclidean'
+        metric='euclidean',
+        alpha=1.0  # More conservative clustering
     )
-    clusters = hdb.fit_predict(scaled_embeddings)
+    initial_clusters = hdb.fit_predict(scaled_embeddings)
 
-    # Assign noise points to nearest cluster
-    noise_mask = clusters == -1
-    if any(noise_mask) and len(set(clusters)) > 1:
-        print(f"Assigning {sum(noise_mask)} noise points to nearest clusters...")
-        # Get cluster centers
+    # Stage 2: Subdivide oversized clusters
+    print("Stage 2 - Subdividing oversized clusters...")
+    final_clusters = initial_clusters.copy()
+    next_cluster_id = max(initial_clusters) + 1 if len(set(initial_clusters)) > 1 else 0
+
+    cluster_subdivisions = 0
+    for cluster_id in set(initial_clusters):
+        if cluster_id == -1:  # Skip noise
+            continue
+
+        cluster_mask = initial_clusters == cluster_id
+        cluster_size = sum(cluster_mask)
+
+        # If cluster is too large, subdivide it
+        if cluster_size > max_cluster_size:
+            print(f"  Subdividing cluster {cluster_id} ({cluster_size} patents) - TOO LARGE")
+            cluster_subdivisions += 1
+
+            # Extract data for this oversized cluster
+            cluster_data = scaled_embeddings[cluster_mask]
+            cluster_indices = np.where(cluster_mask)[0]
+
+            # Calculate how many subclusters we need
+            n_subclusters = min(6, max(2, cluster_size // max_cluster_size + 1))
+            print(f"    Splitting into {n_subclusters} subclusters...")
+
+            # Use KMeans for controlled subdivision
+            kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+            subclusters = kmeans.fit_predict(cluster_data)
+
+            # Assign new cluster IDs
+            for i, subcluster_id in enumerate(subclusters):
+                original_idx = cluster_indices[i]
+                if subcluster_id == 0:
+                    # Keep first subcluster with original ID
+                    final_clusters[original_idx] = cluster_id
+                else:
+                    # Assign new IDs to other subclusters
+                    final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+            next_cluster_id += n_subclusters - 1
+
+    print(f"Subdivided {cluster_subdivisions} oversized clusters")
+
+    # Stage 3: Handle noise points more intelligently
+    noise_mask = final_clusters == -1
+    noise_count = sum(noise_mask)
+
+    if noise_count > 0:
+        print(f"Stage 3 - Reassigning {noise_count} noise points...")
+
+        # Get cluster centers (excluding noise)
         cluster_centers = []
         cluster_labels = []
-        for label in set(clusters):
+        for label in set(final_clusters):
             if label != -1:
-                cluster_mask = clusters == label
+                cluster_mask = final_clusters == label
                 center = np.mean(scaled_embeddings[cluster_mask], axis=0)
                 cluster_centers.append(center)
                 cluster_labels.append(label)
```
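Stage 2 is the core of the fix: HDBSCAN on its own tends to leave one oversized mega-cluster, so any cluster above `max_cluster_size` is re-split with KMeans. A self-contained sketch of that subdivision rule on synthetic blobs (the cap of 80 is hypothetical; app.py derives it from dataset size):

```python
# Sketch of the Stage-2 subdivision rule on synthetic data.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, labels = make_blobs(n_samples=300, centers=2, random_state=42)
max_cluster_size = 80  # hypothetical cap for the demo

next_id = labels.max() + 1
for cid in np.unique(labels):
    mask = labels == cid
    size = int(mask.sum())
    if size > max_cluster_size:
        # same sizing rule as the diff: between 2 and 6 subclusters
        n_sub = min(6, max(2, size // max_cluster_size + 1))
        sub = KMeans(n_clusters=n_sub, random_state=42, n_init=10).fit_predict(X[mask])
        idx = np.where(mask)[0]
        # subcluster 0 keeps the original id; the rest get fresh ids
        labels[idx[sub != 0]] = next_id + sub[sub != 0] - 1
        next_id += n_sub - 1

print(np.bincount(labels))  # each of the 4 resulting clusters is under the cap
```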
```diff
@@ -714,12 +778,21 @@ def create_3d_visualization(patents):
 
         # Find nearest cluster for each noise point
         nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
-        _, nearest_indices = nbrs.kneighbors(noise_points)
+        distances, nearest_indices = nbrs.kneighbors(noise_points)
+
+        # Only assign noise points that are reasonably close to a cluster
+        max_distance = np.percentile(distances, 75)  # Use 75th percentile as threshold
 
-        # Assign noise points to nearest clusters
         noise_indices = np.where(noise_mask)[0]
-        for i, nearest_idx in enumerate(nearest_indices.flatten()):
-            clusters[noise_indices[i]] = cluster_labels[nearest_idx]
+        reassigned_count = 0
+        for i, (distance, nearest_idx) in enumerate(zip(distances.flatten(), nearest_indices.flatten())):
+            if distance <= max_distance:
+                final_clusters[noise_indices[i]] = cluster_labels[nearest_idx]
+                reassigned_count += 1
+
+        print(f"  Reassigned {reassigned_count}/{noise_count} noise points to nearby clusters")
+
+    clusters = final_clusters
 
     df['cluster'] = clusters
 
```
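Stage 3 no longer dumps every noise point into the nearest cluster; only points within the 75th percentile of nearest-center distances are reassigned. A standalone sketch with synthetic data (names mirror the diff; values are illustrative):

```python
# Sketch of the Stage-3 distance-gated noise reassignment.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
cluster_centers = rng.normal(size=(5, 3))    # 5 cluster centers in 3-D
noise_points = rng.normal(size=(20, 3)) * 3  # 20 unassigned points
cluster_labels = list(range(5))

nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
distances, nearest = nbrs.kneighbors(noise_points)
max_distance = np.percentile(distances, 75)  # adaptive cutoff

reassigned = {
    i: cluster_labels[int(j)]
    for i, (d, j) in enumerate(zip(distances.flatten(), nearest.flatten()))
    if d <= max_distance
}
print(f"reassigned {len(reassigned)}/20 noise points; far outliers stay noise")
```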
```diff
@@ -776,25 +849,47 @@ def create_3d_visualization(patents):
     # Update cluster_info to only include main clusters
     cluster_info = main_clusters
 
-    # Recalculate cluster sizes after reassignment
-    updated_cluster_info = []
-    for old_label, _, _ in main_clusters:
-        cluster_mask = clusters == old_label
+    # Final cluster validation and reporting
+    final_cluster_info = []
+    noise_count = sum(1 for c in clusters if c == -1)
+
+    for label in set(clusters):
+        if label != -1:  # Skip noise
+            cluster_mask = clusters == label
             cluster_patents = df[cluster_mask]
-        updated_cluster_info.append((old_label, len(cluster_patents), cluster_patents))
+            if len(cluster_patents) > 0:
+                final_cluster_info.append((label, len(cluster_patents), cluster_patents))
 
-    # Sort again by new sizes
-    updated_cluster_info.sort(key=lambda x: x[1], reverse=True)
-    cluster_info = updated_cluster_info
-
-    print(f"\nFinal Clustering Results:")
-    print(f"Number of technological clusters: {len(cluster_info)} (limited to max {max_clusters})")
-    print(f"Total patents clustered: {len(df)}")
-    avg_cluster_size = len(df) / len(cluster_info) if cluster_info else 0
-    print(f"Average cluster size: {avg_cluster_size:.1f} patents")
-    print("\nCluster Size Distribution:")
-    for i, (label, size, _) in enumerate(cluster_info):
-        print(f"Cluster {i + 1}: {size} patents")
+    # Sort clusters by size in descending order
+    final_cluster_info.sort(key=lambda x: x[1], reverse=True)
+
+    print(f"\n✅ FINAL CLUSTERING RESULTS:")
+    print(f"Total patents processed: {len(df)}")
+    print(f"Number of technology clusters: {len(final_cluster_info)}")
+    print(f"Noise points (unassigned): {noise_count}")
+
+    if final_cluster_info:
+        sizes = [size for _, size, _ in final_cluster_info]
+        avg_size = np.mean(sizes)
+        min_size = min(sizes)
+        max_size = max(sizes)
 
+        print(f"Cluster size stats: min={min_size}, avg={avg_size:.1f}, max={max_size}")
+        print(f"Target range was: {min_cluster_size}-{max_cluster_size} patents per cluster")
+
+        # Check if we successfully avoided mega-clusters
+        oversized_clusters = [size for size in sizes if size > max_cluster_size]
+        if oversized_clusters:
+            print(f"⚠️ Warning: {len(oversized_clusters)} clusters still oversized: {oversized_clusters}")
+        else:
+            print(f"  Success: All clusters within target size range!")
+
+    print("\nCluster Size Distribution:")
+    for i, (label, size, _) in enumerate(final_cluster_info):
+        status = "✅" if min_cluster_size <= size <= max_cluster_size else "⚠️"
+        print(f"  {status} Cluster {i + 1}: {size} patents")
+
+    cluster_info = final_cluster_info
 
     # Create mapping for new cluster IDs (1-based)
     cluster_id_map = {old_label: i + 1 for i, (old_label, _, _) in enumerate(cluster_info)}
```
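Downstream, the surviving clusters are renumbered 1..N in descending size order. A tiny illustration of the `cluster_id_map` construction with hypothetical labels:

```python
# cluster_info entries are (old_label, size, patents), already sorted by size.
cluster_info = [(7, 120, None), (3, 95, None), (12, 60, None)]
cluster_id_map = {old: i + 1 for i, (old, _, _) in enumerate(cluster_info)}
print(cluster_id_map)  # {7: 1, 3: 2, 12: 3}
```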
 