Commit 3230189 · fixed cluster sizes
Parent(s): ac25e1c
app.py CHANGED
@@ -257,15 +257,15 @@ def get_max_clusters(num_patents):
     return min(50, num_patents // 80)  # Very large datasets: 80-100 patents per cluster (increased from 30 max)
 
 def get_optimal_cluster_size(num_patents):
-    """Calculate optimal target cluster size range -
+    """Calculate optimal target cluster size range - ADJUSTED to account for noise point reassignment"""
     if num_patents < 500:
-        return 25, 60
+        return 25, 90  # min=25, max=90 (increased from 60 to allow room for noise points)
     elif num_patents < 1000:
-        return 40, 80
+        return 40, 100  # min=40, max=100 (increased from 80)
     elif num_patents < 2000:
-        return 50, 100
+        return 50, 130  # min=50, max=130 (increased from 100)
     else:
-        return 60, 120
+        return 60, 150  # min=60, max=150 (increased from 120)
 
 if not SERPAPI_API_KEY:
     raise ValueError("SERPAPI_API_KEY environment variable is not set")
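For quick reference, a runnable sketch of the new tiers (function body taken from the + side of this hunk; the sample dataset sizes are illustrative):

def get_optimal_cluster_size(num_patents):
    """Minimal copy of the updated (min, max) cluster-size tiers."""
    if num_patents < 500:
        return 25, 90
    elif num_patents < 1000:
        return 40, 100
    elif num_patents < 2000:
        return 50, 130
    else:
        return 60, 150

for n in (300, 800, 1500, 5000):
    min_size, max_size = get_optimal_cluster_size(n)
    print(f"{n} patents -> clusters of {min_size}-{max_size}")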
@@ -735,10 +735,10 @@ def create_3d_visualization(patents):
             cluster_indices = np.where(cluster_mask)[0]
 
             # Calculate how many subclusters we need - MORE AGGRESSIVE subdivision
-            target_size = max_cluster_size * 0.
+            target_size = max_cluster_size * 0.6  # Target 60% of max size for better buffer
             n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
             # Cap at reasonable maximum but allow more splits if needed
-            n_subclusters = min(10, n_subclusters)
+            n_subclusters = min(12, n_subclusters)  # Increased from 10 to 12
             print(f"  Splitting into {n_subclusters} subclusters (target size: {target_size:.0f})...")
 
             # Use KMeans for controlled subdivision
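Worked example of the sizing arithmetic above, with assumed values (a 400-patent cluster under a 150-patent cap):

import numpy as np

max_cluster_size = 150  # illustrative cap
cluster_size = 400      # illustrative oversized cluster
target_size = max_cluster_size * 0.6  # 90.0
n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
n_subclusters = min(12, n_subclusters)
print(n_subclusters)    # 5 -> subclusters of ~80 patents, comfortably under the cap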
@@ -779,9 +779,9 @@ def create_3d_visualization(patents):
             cluster_indices = np.where(cluster_mask)[0]
 
             # Force more aggressive subdivision
-            target_size = max_cluster_size * 0.
+            target_size = max_cluster_size * 0.5  # Even more aggressive - 50% of max
             n_subclusters = max(3, int(np.ceil(cluster_size / target_size)))
-            n_subclusters = min(
+            n_subclusters = min(20, n_subclusters)  # Allow up to 20 splits if needed
             print(f"  FORCING split into {n_subclusters} subclusters...")
 
             # Use KMeans for forced subdivision
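Both the controlled and the forced paths feed the oversized cluster's embeddings to KMeans; a self-contained sketch with toy data (array shapes assumed):

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
cluster_data = rng.normal(size=(400, 3))  # stand-in for one cluster's scaled embeddings

n_subclusters = 5
kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
subclusters = kmeans.fit_predict(cluster_data)  # one label in 0..4 per row
print(np.bincount(subclusters))                 # resulting subcluster sizes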
@@ -805,42 +805,120 @@ def create_3d_visualization(patents):
     else:
         print("No additional subdivisions needed - all clusters within size limits")
 
-    # Stage 3: Handle noise points more intelligently
+    # Stage 3: Handle noise points more intelligently with size constraints
     noise_mask = final_clusters == -1
     noise_count = sum(noise_mask)
 
     if noise_count > 0:
-        print(f"Stage 3 - Reassigning {noise_count} noise points...")
+        print(f"Stage 3 - Reassigning {noise_count} noise points with size constraints...")
 
-        # Get cluster centers (excluding noise)
+        # Get cluster centers and current sizes (excluding noise)
         cluster_centers = []
         cluster_labels = []
+        cluster_sizes = {}
        for label in set(final_clusters):
             if label != -1:
                 cluster_mask = final_clusters == label
                 center = np.mean(scaled_embeddings[cluster_mask], axis=0)
                 cluster_centers.append(center)
                 cluster_labels.append(label)
+                cluster_sizes[label] = sum(cluster_mask)
 
         if cluster_centers:
             cluster_centers = np.array(cluster_centers)
             noise_points = scaled_embeddings[noise_mask]
 
-            # Find nearest
-            nbrs = NearestNeighbors(n_neighbors=
+            # Find nearest clusters for each noise point
+            nbrs = NearestNeighbors(n_neighbors=min(3, len(cluster_centers))).fit(cluster_centers)
             distances, nearest_indices = nbrs.kneighbors(noise_points)
 
-            #
-            max_distance = np.percentile(distances,
+            # Use a tighter distance threshold for reassignment
+            max_distance = np.percentile(distances[:, 0], 60)  # Use 60th percentile instead of 75th
 
             noise_indices = np.where(noise_mask)[0]
             reassigned_count = 0
-
-
-
-
+            rejected_too_far = 0
+            rejected_too_large = 0
+
+            # Calculate size buffer - leave room for some noise points
+            size_buffer = max_cluster_size * 0.85  # Only allow clusters to grow to 85% of max
+
+            for i, (row_distances, row_nearest_indices) in enumerate(zip(distances, nearest_indices)):
+                assigned = False
+
+                # Try each of the nearest clusters in order
+                for dist, nearest_idx in zip(row_distances, row_nearest_indices):
+                    if dist > max_distance:
+                        break  # All remaining will be too far
+
+                    target_label = cluster_labels[nearest_idx]
+                    current_size = cluster_sizes[target_label]
+
+                    # Only assign if cluster has room to grow
+                    if current_size < size_buffer:
+                        final_clusters[noise_indices[i]] = target_label
+                        cluster_sizes[target_label] += 1  # Update size tracker
+                        reassigned_count += 1
+                        assigned = True
+                        break
+                    else:
+                        rejected_too_large += 1
+
+                if not assigned and row_distances[0] <= max_distance:
+                    rejected_too_far += 1
 
             print(f"  Reassigned {reassigned_count}/{noise_count} noise points to nearby clusters")
+            print(f"  Rejected {rejected_too_large} points (target clusters too large)")
+            print(f"  Rejected {rejected_too_far} points (too far from suitable clusters)")
+            remaining_noise = noise_count - reassigned_count
+            if remaining_noise > 0:
+                print(f"  {remaining_noise} points remain as noise to prevent oversized clusters")
+
+    # Stage 4: Final post-noise cleanup - subdivide any clusters that grew too large
+    print("Stage 4 - Post-noise subdivision check...")
+    final_subdivisions = 0
+    for cluster_id in set(final_clusters):
+        if cluster_id == -1:  # Skip noise
+            continue
+
+        cluster_mask = final_clusters == cluster_id
+        cluster_size = sum(cluster_mask)
+
+        # If cluster grew too large after noise reassignment, subdivide again
+        if cluster_size > max_cluster_size:
+            print(f"  Post-noise subdivision of cluster {cluster_id} ({cluster_size} patents)")
+            final_subdivisions += 1
+
+            # Extract data for this oversized cluster
+            cluster_data = scaled_embeddings[cluster_mask]
+            cluster_indices = np.where(cluster_mask)[0]
+
+            # Very aggressive subdivision for final cleanup
+            target_size = max_cluster_size * 0.7  # Target 70% of max size
+            n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
+            n_subclusters = min(8, n_subclusters)  # Reasonable cap
+            print(f"    Final split into {n_subclusters} subclusters...")
+
+            # Use KMeans for final subdivision
+            kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+            subclusters = kmeans.fit_predict(cluster_data)
+
+            # Assign new cluster IDs
+            for i, subcluster_id in enumerate(subclusters):
+                original_idx = cluster_indices[i]
+                if subcluster_id == 0:
+                    # Keep first subcluster with original ID
+                    final_clusters[original_idx] = cluster_id
+                else:
+                    # Assign new IDs to other subclusters
+                    final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+            next_cluster_id += n_subclusters - 1
+
+    if final_subdivisions > 0:
+        print(f"Performed {final_subdivisions} final post-noise subdivisions")
+    else:
+        print("No post-noise subdivisions needed")
 
     clusters = final_clusters
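The size-capped reassignment loop in isolation, as a self-contained sketch: toy centers and counts, with max_distance hard-coded where the diff computes the 60th-percentile threshold:

import numpy as np
from sklearn.neighbors import NearestNeighbors

cluster_centers = np.array([[0.0, 0.0], [5.0, 5.0]])
cluster_labels = [0, 1]
cluster_sizes = {0: 84, 1: 20}          # cluster 0 is nearly full
max_cluster_size = 90
size_buffer = max_cluster_size * 0.85   # 76.5: cluster 0 may not grow

noise_points = np.array([[0.1, 0.1], [4.6, 5.1]])
nbrs = NearestNeighbors(n_neighbors=min(3, len(cluster_centers))).fit(cluster_centers)
distances, nearest_indices = nbrs.kneighbors(noise_points)
max_distance = 1.0                      # diff: np.percentile(distances[:, 0], 60)

for row_d, row_i in zip(distances, nearest_indices):
    for dist, idx in zip(row_d, row_i):
        if dist > max_distance:
            break                       # neighbors are sorted, the rest are farther
        label = cluster_labels[idx]
        if cluster_sizes[label] < size_buffer:
            cluster_sizes[label] += 1   # accept the nearest cluster with room
            break
print(cluster_sizes)                    # {0: 84, 1: 21}: the full cluster stays capped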
@@ -944,11 +1022,14 @@ def create_3d_visualization(patents):
     for i, (label, size, _) in enumerate(final_cluster_info):
         if size > max_cluster_size:
             status = "❌ OVERSIZED"
+            severity = f"(+{size - max_cluster_size} over limit)"
         elif min_cluster_size <= size <= max_cluster_size:
             status = "✅ OPTIMAL"
+            severity = ""
         else:
             status = "⚠️ SMALL"
+            severity = f"({min_cluster_size - size} under target)"
-        print(f"  {status} Cluster {i + 1}: {size} patents")
+        print(f"  {status} Cluster {i + 1}: {size} patents {severity}")
 
     cluster_info = final_cluster_info
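With the first tier's bounds, the new report line renders like this (toy sizes; thresholds from the updated get_optimal_cluster_size):

min_cluster_size, max_cluster_size = 25, 90
for i, size in enumerate([120, 60, 10]):  # illustrative cluster sizes
    if size > max_cluster_size:
        status, severity = "❌ OVERSIZED", f"(+{size - max_cluster_size} over limit)"
    elif min_cluster_size <= size <= max_cluster_size:
        status, severity = "✅ OPTIMAL", ""
    else:
        status, severity = "⚠️ SMALL", f"({min_cluster_size - size} under target)"
    print(f"  {status} Cluster {i + 1}: {size} patents {severity}")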