PhyllisPeh committed
Commit 3230189 · 1 Parent(s): ac25e1c

fixed cluster sizes

Files changed (1)
  1. app.py +102 -21
app.py CHANGED
@@ -257,15 +257,15 @@ def get_max_clusters(num_patents):
  return min(50, num_patents // 80) # Very large datasets: 80-100 patents per cluster (increased from 30 max)

  def get_optimal_cluster_size(num_patents):
- """Calculate optimal target cluster size range - REDUCED MAX SIZES to prevent mega-clusters"""
  if num_patents < 500:
- return 25, 60 # min=25, max=60 (reduced from 80)
  elif num_patents < 1000:
- return 40, 80 # min=40, max=80 (reduced from 120)
  elif num_patents < 2000:
- return 50, 100 # min=50, max=100 (reduced from 150)
  else:
- return 60, 120 # min=60, max=120 (reduced from 200)

  if not SERPAPI_API_KEY:
  raise ValueError("SERPAPI_API_KEY environment variable is not set")
@@ -735,10 +735,10 @@ def create_3d_visualization(patents):
  cluster_indices = np.where(cluster_mask)[0]

  # Calculate how many subclusters we need - MORE AGGRESSIVE subdivision
- target_size = max_cluster_size * 0.75 # Target 75% of max size for better separation
  n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
  # Cap at reasonable maximum but allow more splits if needed
- n_subclusters = min(10, n_subclusters) # Increased from 6 to 10
  print(f" Splitting into {n_subclusters} subclusters (target size: {target_size:.0f})...")

  # Use KMeans for controlled subdivision
@@ -779,9 +779,9 @@ def create_3d_visualization(patents):
  cluster_indices = np.where(cluster_mask)[0]

  # Force more aggressive subdivision
- target_size = max_cluster_size * 0.6 # Even more aggressive - 60% of max
  n_subclusters = max(3, int(np.ceil(cluster_size / target_size)))
- n_subclusters = min(15, n_subclusters) # Allow up to 15 splits if needed
  print(f" FORCING split into {n_subclusters} subclusters...")

  # Use KMeans for forced subdivision
@@ -805,42 +805,120 @@ def create_3d_visualization(patents):
  else:
  print("No additional subdivisions needed - all clusters within size limits")

- # Stage 3: Handle noise points more intelligently
  noise_mask = final_clusters == -1
  noise_count = sum(noise_mask)

  if noise_count > 0:
- print(f"Stage 3 - Reassigning {noise_count} noise points...")

- # Get cluster centers (excluding noise)
  cluster_centers = []
  cluster_labels = []
  for label in set(final_clusters):
  if label != -1:
  cluster_mask = final_clusters == label
  center = np.mean(scaled_embeddings[cluster_mask], axis=0)
  cluster_centers.append(center)
  cluster_labels.append(label)

  if cluster_centers:
  cluster_centers = np.array(cluster_centers)
  noise_points = scaled_embeddings[noise_mask]

- # Find nearest cluster for each noise point
- nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
  distances, nearest_indices = nbrs.kneighbors(noise_points)

- # Only assign noise points that are reasonably close to a cluster
- max_distance = np.percentile(distances, 75) # Use 75th percentile as threshold

  noise_indices = np.where(noise_mask)[0]
  reassigned_count = 0
- for i, (distance, nearest_idx) in enumerate(zip(distances.flatten(), nearest_indices.flatten())):
- if distance <= max_distance:
- final_clusters[noise_indices[i]] = cluster_labels[nearest_idx]
- reassigned_count += 1

  print(f" Reassigned {reassigned_count}/{noise_count} noise points to nearby clusters")

  clusters = final_clusters
 
@@ -944,11 +1022,14 @@ def create_3d_visualization(patents):
  for i, (label, size, _) in enumerate(final_cluster_info):
  if size > max_cluster_size:
  status = "❌ OVERSIZED"
  elif min_cluster_size <= size <= max_cluster_size:
  status = "✅ OPTIMAL"
  else:
  status = "⚠️ SMALL"
- print(f" {status} Cluster {i + 1}: {size} patents")

  cluster_info = final_cluster_info
 
  return min(50, num_patents // 80) # Very large datasets: 80-100 patents per cluster (increased from 30 max)

  def get_optimal_cluster_size(num_patents):
+ """Calculate optimal target cluster size range - ADJUSTED to account for noise point reassignment"""
  if num_patents < 500:
+ return 25, 90 # min=25, max=90 (increased from 60 to allow room for noise points)
  elif num_patents < 1000:
+ return 40, 100 # min=40, max=100 (increased from 80)
  elif num_patents < 2000:
+ return 50, 130 # min=50, max=130 (increased from 100)
  else:
+ return 60, 150 # min=60, max=150 (increased from 120)

  if not SERPAPI_API_KEY:
  raise ValueError("SERPAPI_API_KEY environment variable is not set")
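
As a quick sanity check of the new bounds, here is a standalone copy of the committed helper with a few illustrative dataset sizes (the loop and the sample values below are examples for this commit note, not part of app.py):

def get_optimal_cluster_size(num_patents):
    """Target (min, max) cluster size, with headroom for noise-point reassignment."""
    if num_patents < 500:
        return 25, 90
    elif num_patents < 1000:
        return 40, 100
    elif num_patents < 2000:
        return 50, 130
    else:
        return 60, 150

for n in (300, 800, 1500, 5000):
    lo, hi = get_optimal_cluster_size(n)
    print(f"{n} patents -> min {lo}, max {hi}")
# 300 patents -> min 25, max 90
# 800 patents -> min 40, max 100
# 1500 patents -> min 50, max 130
# 5000 patents -> min 60, max 150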
 
  cluster_indices = np.where(cluster_mask)[0]

  # Calculate how many subclusters we need - MORE AGGRESSIVE subdivision
+ target_size = max_cluster_size * 0.6 # Target 60% of max size for better buffer
  n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
  # Cap at reasonable maximum but allow more splits if needed
+ n_subclusters = min(12, n_subclusters) # Increased from 10 to 12
  print(f" Splitting into {n_subclusters} subclusters (target size: {target_size:.0f})...")

  # Use KMeans for controlled subdivision
 
  cluster_indices = np.where(cluster_mask)[0]

  # Force more aggressive subdivision
+ target_size = max_cluster_size * 0.5 # Even more aggressive - 50% of max
  n_subclusters = max(3, int(np.ceil(cluster_size / target_size)))
+ n_subclusters = min(20, n_subclusters) # Allow up to 20 splits if needed
  print(f" FORCING split into {n_subclusters} subclusters...")

  # Use KMeans for forced subdivision
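
Both subdivision passes follow the same pattern; only the target fraction and the split cap differ (0.6 and 12 in the first pass, 0.5 and 20 in the forced pass). The sketch below re-creates that step outside app.py; the function name and the random test data are illustrative assumptions, not code from the commit:

import numpy as np
from sklearn.cluster import KMeans

def split_oversized_cluster(cluster_data, cluster_size, max_cluster_size,
                            target_fraction=0.6, max_splits=12):
    # Aim each subcluster at a fraction of the allowed maximum, then cap the split count.
    target_size = max_cluster_size * target_fraction
    n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
    n_subclusters = min(max_splits, n_subclusters)
    kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
    return kmeans.fit_predict(cluster_data)  # sub-labels 0..n_subclusters-1

# Example: a 300-point cluster with max_cluster_size=130 splits into 4 subclusters.
rng = np.random.default_rng(0)
sub_labels = split_oversized_cluster(rng.normal(size=(300, 8)), 300, 130)
print(len(set(sub_labels)))  # 4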
 
  else:
  print("No additional subdivisions needed - all clusters within size limits")

+ # Stage 3: Handle noise points more intelligently with size constraints
  noise_mask = final_clusters == -1
  noise_count = sum(noise_mask)

  if noise_count > 0:
+ print(f"Stage 3 - Reassigning {noise_count} noise points with size constraints...")

+ # Get cluster centers and current sizes (excluding noise)
  cluster_centers = []
  cluster_labels = []
+ cluster_sizes = {}
  for label in set(final_clusters):
  if label != -1:
  cluster_mask = final_clusters == label
  center = np.mean(scaled_embeddings[cluster_mask], axis=0)
  cluster_centers.append(center)
  cluster_labels.append(label)
+ cluster_sizes[label] = sum(cluster_mask)

  if cluster_centers:
  cluster_centers = np.array(cluster_centers)
  noise_points = scaled_embeddings[noise_mask]

+ # Find nearest clusters for each noise point
+ nbrs = NearestNeighbors(n_neighbors=min(3, len(cluster_centers))).fit(cluster_centers)
  distances, nearest_indices = nbrs.kneighbors(noise_points)

+ # Use a tighter distance threshold for reassignment
+ max_distance = np.percentile(distances[:, 0], 60) # Use 60th percentile instead of 75th

  noise_indices = np.where(noise_mask)[0]
  reassigned_count = 0
+ rejected_too_far = 0
+ rejected_too_large = 0
+
+ # Calculate size buffer - leave room for some noise points
+ size_buffer = max_cluster_size * 0.85 # Only allow clusters to grow to 85% of max
+
+ for i, (row_distances, row_nearest_indices) in enumerate(zip(distances, nearest_indices)):
+ assigned = False
+
+ # Try each of the nearest clusters in order
+ for dist, nearest_idx in zip(row_distances, row_nearest_indices):
+ if dist > max_distance:
+ break # All remaining will be too far
+
+ target_label = cluster_labels[nearest_idx]
+ current_size = cluster_sizes[target_label]
+
+ # Only assign if cluster has room to grow
+ if current_size < size_buffer:
+ final_clusters[noise_indices[i]] = target_label
+ cluster_sizes[target_label] += 1 # Update size tracker
+ reassigned_count += 1
+ assigned = True
+ break
+ else:
+ rejected_too_large += 1
+
+ if not assigned and row_distances[0] <= max_distance:
+ rejected_too_far += 1

  print(f" Reassigned {reassigned_count}/{noise_count} noise points to nearby clusters")
+ print(f" Rejected {rejected_too_large} points (target clusters too large)")
+ print(f" Rejected {rejected_too_far} points (too far from suitable clusters)")
+ remaining_noise = noise_count - reassigned_count
+ if remaining_noise > 0:
+ print(f" {remaining_noise} points remain as noise to prevent oversized clusters")
+
+ # Stage 4: Final post-noise cleanup - subdivide any clusters that grew too large
+ print("Stage 4 - Post-noise subdivision check...")
+ final_subdivisions = 0
+ for cluster_id in set(final_clusters):
+ if cluster_id == -1: # Skip noise
+ continue
+
+ cluster_mask = final_clusters == cluster_id
+ cluster_size = sum(cluster_mask)
+
+ # If cluster grew too large after noise reassignment, subdivide again
+ if cluster_size > max_cluster_size:
+ print(f" Post-noise subdivision of cluster {cluster_id} ({cluster_size} patents)")
+ final_subdivisions += 1
+
+ # Extract data for this oversized cluster
+ cluster_data = scaled_embeddings[cluster_mask]
+ cluster_indices = np.where(cluster_mask)[0]
+
+ # Very aggressive subdivision for final cleanup
+ target_size = max_cluster_size * 0.7 # Target 70% of max size
+ n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
+ n_subclusters = min(8, n_subclusters) # Reasonable cap
+ print(f" Final split into {n_subclusters} subclusters...")
+
+ # Use KMeans for final subdivision
+ kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+ subclusters = kmeans.fit_predict(cluster_data)
+
+ # Assign new cluster IDs
+ for i, subcluster_id in enumerate(subclusters):
+ original_idx = cluster_indices[i]
+ if subcluster_id == 0:
+ # Keep first subcluster with original ID
+ final_clusters[original_idx] = cluster_id
+ else:
+ # Assign new IDs to other subclusters
+ final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+ next_cluster_id += n_subclusters - 1
+
+ if final_subdivisions > 0:
+ print(f"Performed {final_subdivisions} final post-noise subdivisions")
+ else:
+ print("No post-noise subdivisions needed")

  clusters = final_clusters
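
Stage 3 above is the core of the fix: noise points are still folded back into clusters, but only while the receiving cluster stays under 85% of the size cap, and only within a tighter (60th-percentile) distance cutoff. Below is a compact, self-contained re-creation of that idea for experimentation; the function name, argument names, and the way centers are computed are illustrative assumptions, not the exact app.py code:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def reassign_noise_with_size_cap(X, labels, max_cluster_size):
    # X: (n_points, n_dims) embeddings; labels: int array with -1 marking noise.
    labels = labels.copy()
    cluster_ids = sorted(set(labels.tolist()) - {-1})
    noise_idx = np.where(labels == -1)[0]
    if not cluster_ids or noise_idx.size == 0:
        return labels

    centers = np.array([X[labels == c].mean(axis=0) for c in cluster_ids])
    sizes = {c: int((labels == c).sum()) for c in cluster_ids}

    # Up to 3 candidate clusters per noise point, nearest first.
    nbrs = NearestNeighbors(n_neighbors=min(3, len(centers))).fit(centers)
    dists, nearest = nbrs.kneighbors(X[noise_idx])

    max_distance = np.percentile(dists[:, 0], 60)  # tighter 60th-percentile cutoff
    size_buffer = max_cluster_size * 0.85          # leave headroom in each cluster

    for i, (row_d, row_n) in enumerate(zip(dists, nearest)):
        for d, j in zip(row_d, row_n):
            if d > max_distance:
                break  # remaining candidates are even farther away
            target = cluster_ids[j]
            if sizes[target] < size_buffer:
                labels[noise_idx[i]] = target
                sizes[target] += 1
                break
    return labels

Points that cannot be placed this way simply stay labelled -1, which is what motivates the Stage 4 pass above: any cluster that still exceeds max_cluster_size afterwards is split again with KMeans.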
 
 
  for i, (label, size, _) in enumerate(final_cluster_info):
  if size > max_cluster_size:
  status = "❌ OVERSIZED"
+ severity = f"(+{size - max_cluster_size} over limit)"
  elif min_cluster_size <= size <= max_cluster_size:
  status = "✅ OPTIMAL"
+ severity = ""
  else:
  status = "⚠️ SMALL"
+ severity = f"({min_cluster_size - size} under target)"
+ print(f" {status} Cluster {i + 1}: {size} patents {severity}")

  cluster_info = final_cluster_info