Commit · ac25e1c
1 Parent(s): 4e29396
reduced cluster size
app.py
CHANGED

@@ -243,29 +243,29 @@ MIN_PATENTS_FOR_GAPS = 3000 # Minimum patents needed for reliable gap detection
 def get_max_clusters(num_patents):
     """
     Calculate optimal maximum clusters based on dataset size.
+    REVISED: More clusters for larger datasets to keep individual cluster sizes smaller.
     """
     if num_patents < 200:
+        return min(8, num_patents // 20) # Very small: 20-25 patents per cluster
     elif num_patents < 500:
+        return min(12, num_patents // 30) # Small datasets: 30-40 patents per cluster
     elif num_patents < 1000:
+        return min(20, num_patents // 40) # Medium datasets: 40-50 patents per cluster
     elif num_patents < 2000:
+        return min(30, num_patents // 60) # Large datasets: 60-70 patents per cluster
     else:
+        return min(50, num_patents // 80) # Very large datasets: 80-100 patents per cluster (increased from 30 max)

 def get_optimal_cluster_size(num_patents):
+    """Calculate optimal target cluster size range - REDUCED MAX SIZES to prevent mega-clusters"""
     if num_patents < 500:
+        return 25, 60 # min=25, max=60 (reduced from 80)
     elif num_patents < 1000:
+        return 40, 80 # min=40, max=80 (reduced from 120)
     elif num_patents < 2000:
+        return 50, 100 # min=50, max=100 (reduced from 150)
     else:
+        return 60, 120 # min=60, max=120 (reduced from 200)

 if not SERPAPI_API_KEY:
     raise ValueError("SERPAPI_API_KEY environment variable is not set")
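
As a quick sanity check of the revised budgets, a minimal sketch (it assumes app.py is importable as a module; the dataset sizes below are arbitrary examples):

from app import get_max_clusters, get_optimal_cluster_size

for n in (150, 400, 800, 1500, 3000):
    max_clusters = get_max_clusters(n)
    min_size, max_size = get_optimal_cluster_size(n)
    print(f"{n:>5} patents -> up to {max_clusters} clusters, target size {min_size}-{max_size}")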

@@ -664,10 +664,11 @@ def create_3d_visualization(patents):
     update_progress('clustering', 'processing', 'Applying optimized UMAP dimensionality reduction...')
     reducer = umap.UMAP(
         n_components=3,
+        n_neighbors=20, # Reduced from 30 for more local structure
+        min_dist=0.05, # Reduced from 0.1 for even tighter clusters
+        spread=0.8, # Reduced from 1.0 for better cluster separation
+        random_state=42,
+        metric='cosine' # Added cosine metric for better semantic clustering
     )
     embedding_3d = reducer.fit_transform(embeddings_array)
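
For context, the same reducer configuration run end to end on synthetic data (a sketch; the random 384-dimensional matrix stands in for the real patent embeddings):

import numpy as np
import umap

embeddings_array = np.random.rand(500, 384).astype(np.float32)  # stand-in for the patent embeddings

reducer = umap.UMAP(
    n_components=3,
    n_neighbors=20,   # smaller neighborhood, more local structure
    min_dist=0.05,    # tighter packing inside clusters (must stay <= spread)
    spread=0.8,
    random_state=42,
    metric='cosine',  # angular distance suits normalized text embeddings
)
embedding_3d = reducer.fit_transform(embeddings_array)
print(embedding_3d.shape)  # (500, 3)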
|

@@ -696,18 +697,18 @@ def create_3d_visualization(patents):
     print(f"Processing {n_points} patents with improved clustering algorithm...")

     # Stage 1: Initial HDBSCAN with stricter parameters
+    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.020)) # Increased from 0.015 to 0.020 for stricter minimum
+    initial_min_samples = max(8, int(initial_min_cluster_size * 0.6)) # Increased from 0.5 to 0.6 for stricter density

     print(f"Stage 1 - Initial clustering: min_cluster_size={initial_min_cluster_size}, min_samples={initial_min_samples}")

     hdb = hdbscan.HDBSCAN(
         min_cluster_size=initial_min_cluster_size,
         min_samples=initial_min_samples,
+        cluster_selection_epsilon=0.03, # Reduced from 0.05 for tighter clusters
         cluster_selection_method='eom',
         metric='euclidean',
+        alpha=1.2 # Increased from 1.0 for even more conservative clustering
     )
     initial_clusters = hdb.fit_predict(scaled_embeddings)
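
The two Stage 1 knobs are derived from the dataset size; a small sketch of that arithmetic (the floor of 25 for min_cluster_size is an assumed example value, in app.py it comes from the sizing helpers above):

min_cluster_size = 25  # assumed example floor
for n_points in (500, 1000, 2000, 5000):
    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.020))
    initial_min_samples = max(8, int(initial_min_cluster_size * 0.6))
    print(f"n_points={n_points}: min_cluster_size={initial_min_cluster_size}, min_samples={initial_min_samples}")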
|

@@ -724,7 +725,7 @@ def create_3d_visualization(patents):
         cluster_mask = initial_clusters == cluster_id
         cluster_size = sum(cluster_mask)

+        # If cluster is too large, subdivide it more aggressively
         if cluster_size > max_cluster_size:
             print(f" Subdividing cluster {cluster_id} ({cluster_size} patents) - TOO LARGE")
             cluster_subdivisions += 1

@@ -733,9 +734,12 @@ def create_3d_visualization(patents):
             cluster_data = scaled_embeddings[cluster_mask]
             cluster_indices = np.where(cluster_mask)[0]

+            # Calculate how many subclusters we need - MORE AGGRESSIVE subdivision
+            target_size = max_cluster_size * 0.75 # Target 75% of max size for better separation
+            n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
+            # Cap at reasonable maximum but allow more splits if needed
+            n_subclusters = min(10, n_subclusters) # Increased from 6 to 10
+            print(f" Splitting into {n_subclusters} subclusters (target size: {target_size:.0f})...")

             # Use KMeans for controlled subdivision
             kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
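
The subdivision step is self-contained enough to sketch as a standalone helper (subdivide_oversized is a hypothetical name, not a function in app.py; same 75% target and cap of 10):

import numpy as np
from sklearn.cluster import KMeans

def subdivide_oversized(cluster_data, max_cluster_size):
    # Hypothetical helper mirroring the Stage 2 subdivision above.
    cluster_size = len(cluster_data)
    target_size = max_cluster_size * 0.75
    n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
    n_subclusters = min(10, n_subclusters)
    kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
    return kmeans.fit_predict(cluster_data)

# 400 points with a limit of 120 -> ceil(400 / 90) = 5 subclusters
labels = subdivide_oversized(np.random.rand(400, 3), max_cluster_size=120)
print(len(set(labels)))  # 5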

@@ -755,6 +759,52 @@ def create_3d_visualization(patents):

     print(f"Subdivided {cluster_subdivisions} oversized clusters")

+    # Stage 2.5: Additional validation and forced subdivision for any remaining oversized clusters
+    print("Stage 2.5 - Final oversized cluster validation...")
+    additional_subdivisions = 0
+    for cluster_id in set(final_clusters):
+        if cluster_id == -1: # Skip noise
+            continue
+
+        cluster_mask = final_clusters == cluster_id
+        cluster_size = sum(cluster_mask)
+
+        # Force subdivision of any clusters still over the limit
+        if cluster_size > max_cluster_size:
+            print(f" FORCING additional subdivision of cluster {cluster_id} ({cluster_size} patents)")
+            additional_subdivisions += 1
+
+            # Extract data for this still-oversized cluster
+            cluster_data = scaled_embeddings[cluster_mask]
+            cluster_indices = np.where(cluster_mask)[0]
+
+            # Force more aggressive subdivision
+            target_size = max_cluster_size * 0.6 # Even more aggressive - 60% of max
+            n_subclusters = max(3, int(np.ceil(cluster_size / target_size)))
+            n_subclusters = min(15, n_subclusters) # Allow up to 15 splits if needed
+            print(f" FORCING split into {n_subclusters} subclusters...")
+
+            # Use KMeans for forced subdivision
+            kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+            subclusters = kmeans.fit_predict(cluster_data)
+
+            # Assign new cluster IDs
+            for i, subcluster_id in enumerate(subclusters):
+                original_idx = cluster_indices[i]
+                if subcluster_id == 0:
+                    # Keep first subcluster with original ID
+                    final_clusters[original_idx] = cluster_id
+                else:
+                    # Assign new IDs to other subclusters
+                    final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+            next_cluster_id += n_subclusters - 1
+
+    if additional_subdivisions > 0:
+        print(f"Performed {additional_subdivisions} additional forced subdivisions")
+    else:
+        print("No additional subdivisions needed - all clusters within size limits")
+
     # Stage 3: Handle noise points more intelligently
     noise_mask = final_clusters == -1
     noise_count = sum(noise_mask)
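
The subtle part of Stage 2.5 is the label bookkeeping: subcluster 0 keeps the original cluster ID and every other subcluster takes a fresh ID starting at next_cluster_id. A toy sketch of just that step (all values are made up):

import numpy as np

final_clusters = np.array([2, 2, 2, 2, 2, 0, 1])  # cluster 2 is oversized
cluster_indices = np.array([0, 1, 2, 3, 4])       # positions of cluster 2
subclusters = np.array([0, 1, 1, 2, 0])           # KMeans labels for those points
cluster_id, next_cluster_id, n_subclusters = 2, 7, 3

for i, subcluster_id in enumerate(subclusters):
    original_idx = cluster_indices[i]
    if subcluster_id == 0:
        final_clusters[original_idx] = cluster_id  # keep the original ID
    else:
        final_clusters[original_idx] = next_cluster_id + subcluster_id - 1

next_cluster_id += n_subclusters - 1
print(final_clusters)   # [2 7 7 8 2 0 1]
print(next_cluster_id)  # 9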

@@ -880,13 +930,24 @@ def create_3d_visualization(patents):
     # Check if we successfully avoided mega-clusters
     oversized_clusters = [size for size in sizes if size > max_cluster_size]
     if oversized_clusters:
+        print(f"⚠️ WARNING: {len(oversized_clusters)} clusters STILL oversized: {oversized_clusters}")
+        print(f"❌ FAILED to contain all clusters within target range!")
+
+        # Log the oversized clusters for debugging
+        for i, (label, size, _) in enumerate(final_cluster_info):
+            if size > max_cluster_size:
+                print(f" Oversized Cluster {i + 1}: {size} patents (EXCEEDS LIMIT of {max_cluster_size})")
     else:
+        print(f"✅ SUCCESS: All clusters within target size range!")

     print("\nCluster Size Distribution:")
     for i, (label, size, _) in enumerate(final_cluster_info):
+        if size > max_cluster_size:
+            status = "❌ OVERSIZED"
+        elif min_cluster_size <= size <= max_cluster_size:
+            status = "✅ OPTIMAL"
+        else:
+            status = "⚠️ SMALL"
         print(f" {status} Cluster {i + 1}: {size} patents")

     cluster_info = final_cluster_info
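
The size report reduces to a three-way bucket per cluster; a compact sketch (the 50/100 limits are example values in the spirit of get_optimal_cluster_size above):

min_cluster_size, max_cluster_size = 50, 100  # example limits

def size_status(size):
    # Same three buckets as the distribution report above.
    if size > max_cluster_size:
        return "❌ OVERSIZED"
    elif min_cluster_size <= size <= max_cluster_size:
        return "✅ OPTIMAL"
    return "⚠️ SMALL"

for size in (30, 75, 140):
    print(f" {size_status(size)} {size} patents")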