Commit · ac25e1c
1 Parent(s): 4e29396
reduced cluster size
app.py
CHANGED

@@ -243,29 +243,29 @@ MIN_PATENTS_FOR_GAPS = 3000 # Minimum patents needed for reliable gap detection
 def get_max_clusters(num_patents):
     """
     Calculate optimal maximum clusters based on dataset size.
+    REVISED: More clusters for larger datasets to keep individual cluster sizes smaller.
     """
     if num_patents < 200:
+        return min(8, num_patents // 20) # Very small: 20-25 patents per cluster
     elif num_patents < 500:
+        return min(12, num_patents // 30) # Small datasets: 30-40 patents per cluster
     elif num_patents < 1000:
+        return min(20, num_patents // 40) # Medium datasets: 40-50 patents per cluster
     elif num_patents < 2000:
+        return min(30, num_patents // 60) # Large datasets: 60-70 patents per cluster
     else:
+        return min(50, num_patents // 80) # Very large datasets: 80-100 patents per cluster (increased from 30 max)

 def get_optimal_cluster_size(num_patents):
+    """Calculate optimal target cluster size range - REDUCED MAX SIZES to prevent mega-clusters"""
     if num_patents < 500:
+        return 25, 60 # min=25, max=60 (reduced from 80)
     elif num_patents < 1000:
+        return 40, 80 # min=40, max=80 (reduced from 120)
     elif num_patents < 2000:
+        return 50, 100 # min=50, max=100 (reduced from 150)
     else:
+        return 60, 120 # min=60, max=120 (reduced from 200)

 if not SERPAPI_API_KEY:
     raise ValueError("SERPAPI_API_KEY environment variable is not set")
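
As a quick sanity check of the revised budgets, a minimal sketch (it assumes app.py is importable as a module; the dataset sizes below are arbitrary examples):

from app import get_max_clusters, get_optimal_cluster_size

for n in (150, 400, 800, 1500, 3000):
    max_clusters = get_max_clusters(n)
    min_size, max_size = get_optimal_cluster_size(n)
    print(f"{n:>5} patents -> up to {max_clusters} clusters, target size {min_size}-{max_size}")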

@@ -664,10 +664,11 @@ def create_3d_visualization(patents):
     update_progress('clustering', 'processing', 'Applying optimized UMAP dimensionality reduction...')
     reducer = umap.UMAP(
         n_components=3,
+        n_neighbors=20, # Reduced from 30 for more local structure
+        min_dist=0.05, # Reduced from 0.1 for even tighter clusters
+        spread=0.8, # Reduced from 1.0 for better cluster separation
+        random_state=42,
+        metric='cosine' # Added cosine metric for better semantic clustering
     )
     embedding_3d = reducer.fit_transform(embeddings_array)
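
For context, the same reducer configuration run end to end on synthetic data (a sketch; the random 384-dimensional matrix stands in for the real patent embeddings):

import numpy as np
import umap

embeddings_array = np.random.rand(500, 384).astype(np.float32)  # stand-in for the patent embeddings

reducer = umap.UMAP(
    n_components=3,
    n_neighbors=20,   # smaller neighborhood, more local structure
    min_dist=0.05,    # tighter packing inside clusters (must stay <= spread)
    spread=0.8,
    random_state=42,
    metric='cosine',  # angular distance suits normalized text embeddings
)
embedding_3d = reducer.fit_transform(embeddings_array)
print(embedding_3d.shape)  # (500, 3)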
|

@@ -696,18 +697,18 @@ def create_3d_visualization(patents):
     print(f"Processing {n_points} patents with improved clustering algorithm...")

     # Stage 1: Initial HDBSCAN with stricter parameters
+    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.020)) # Increased from 0.015 to 0.020 for stricter minimum
+    initial_min_samples = max(8, int(initial_min_cluster_size * 0.6)) # Increased from 0.5 to 0.6 for stricter density

     print(f"Stage 1 - Initial clustering: min_cluster_size={initial_min_cluster_size}, min_samples={initial_min_samples}")

     hdb = hdbscan.HDBSCAN(
         min_cluster_size=initial_min_cluster_size,
         min_samples=initial_min_samples,
+        cluster_selection_epsilon=0.03, # Reduced from 0.05 for tighter clusters
         cluster_selection_method='eom',
         metric='euclidean',
+        alpha=1.2 # Increased from 1.0 for even more conservative clustering
     )
     initial_clusters = hdb.fit_predict(scaled_embeddings)
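
The two Stage 1 knobs are derived from the dataset size; a small sketch of that arithmetic (the floor of 25 for min_cluster_size is an assumed example value, in app.py it comes from the sizing helpers above):

min_cluster_size = 25  # assumed example floor
for n_points in (500, 1000, 2000, 5000):
    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.020))
    initial_min_samples = max(8, int(initial_min_cluster_size * 0.6))
    print(f"n_points={n_points}: min_cluster_size={initial_min_cluster_size}, min_samples={initial_min_samples}")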
|

@@ -724,7 +725,7 @@ def create_3d_visualization(patents):
         cluster_mask = initial_clusters == cluster_id
         cluster_size = sum(cluster_mask)

+        # If cluster is too large, subdivide it more aggressively
         if cluster_size > max_cluster_size:
             print(f" Subdividing cluster {cluster_id} ({cluster_size} patents) - TOO LARGE")
             cluster_subdivisions += 1

@@ -733,9 +734,12 @@ def create_3d_visualization(patents):
             cluster_data = scaled_embeddings[cluster_mask]
             cluster_indices = np.where(cluster_mask)[0]

+            # Calculate how many subclusters we need - MORE AGGRESSIVE subdivision
+            target_size = max_cluster_size * 0.75 # Target 75% of max size for better separation
+            n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
+            # Cap at reasonable maximum but allow more splits if needed
+            n_subclusters = min(10, n_subclusters) # Increased from 6 to 10
+            print(f" Splitting into {n_subclusters} subclusters (target size: {target_size:.0f})...")

             # Use KMeans for controlled subdivision
             kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
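
The subdivision step is self-contained enough to sketch as a standalone helper (subdivide_oversized is a hypothetical name, not a function in app.py; same 75% target and cap of 10):

import numpy as np
from sklearn.cluster import KMeans

def subdivide_oversized(cluster_data, max_cluster_size):
    # Hypothetical helper mirroring the Stage 2 subdivision above.
    cluster_size = len(cluster_data)
    target_size = max_cluster_size * 0.75
    n_subclusters = max(2, int(np.ceil(cluster_size / target_size)))
    n_subclusters = min(10, n_subclusters)
    kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
    return kmeans.fit_predict(cluster_data)

# 400 points with a limit of 120 -> ceil(400 / 90) = 5 subclusters
labels = subdivide_oversized(np.random.rand(400, 3), max_cluster_size=120)
print(len(set(labels)))  # 5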

@@ -755,6 +759,52 @@ def create_3d_visualization(patents):

     print(f"Subdivided {cluster_subdivisions} oversized clusters")

+    # Stage 2.5: Additional validation and forced subdivision for any remaining oversized clusters
+    print("Stage 2.5 - Final oversized cluster validation...")
+    additional_subdivisions = 0
+    for cluster_id in set(final_clusters):
+        if cluster_id == -1: # Skip noise
+            continue
+
+        cluster_mask = final_clusters == cluster_id
+        cluster_size = sum(cluster_mask)
+
+        # Force subdivision of any clusters still over the limit
+        if cluster_size > max_cluster_size:
+            print(f" FORCING additional subdivision of cluster {cluster_id} ({cluster_size} patents)")
+            additional_subdivisions += 1
+
+            # Extract data for this still-oversized cluster
+            cluster_data = scaled_embeddings[cluster_mask]
+            cluster_indices = np.where(cluster_mask)[0]
+
+            # Force more aggressive subdivision
+            target_size = max_cluster_size * 0.6 # Even more aggressive - 60% of max
+            n_subclusters = max(3, int(np.ceil(cluster_size / target_size)))
+            n_subclusters = min(15, n_subclusters) # Allow up to 15 splits if needed
+            print(f" FORCING split into {n_subclusters} subclusters...")
+
+            # Use KMeans for forced subdivision
+            kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+            subclusters = kmeans.fit_predict(cluster_data)
+
+            # Assign new cluster IDs
+            for i, subcluster_id in enumerate(subclusters):
+                original_idx = cluster_indices[i]
+                if subcluster_id == 0:
+                    # Keep first subcluster with original ID
+                    final_clusters[original_idx] = cluster_id
+                else:
+                    # Assign new IDs to other subclusters
+                    final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+            next_cluster_id += n_subclusters - 1
+
+    if additional_subdivisions > 0:
+        print(f"Performed {additional_subdivisions} additional forced subdivisions")
+    else:
+        print("No additional subdivisions needed - all clusters within size limits")
+
     # Stage 3: Handle noise points more intelligently
     noise_mask = final_clusters == -1
     noise_count = sum(noise_mask)
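
The subtle part of Stage 2.5 is the label bookkeeping: subcluster 0 keeps the original cluster ID and every other subcluster takes a fresh ID starting at next_cluster_id. A toy sketch of just that step (all values are made up):

import numpy as np

final_clusters = np.array([2, 2, 2, 2, 2, 0, 1])  # cluster 2 is oversized
cluster_indices = np.array([0, 1, 2, 3, 4])       # positions of cluster 2
subclusters = np.array([0, 1, 1, 2, 0])           # KMeans labels for those points
cluster_id, next_cluster_id, n_subclusters = 2, 7, 3

for i, subcluster_id in enumerate(subclusters):
    original_idx = cluster_indices[i]
    if subcluster_id == 0:
        final_clusters[original_idx] = cluster_id  # keep the original ID
    else:
        final_clusters[original_idx] = next_cluster_id + subcluster_id - 1

next_cluster_id += n_subclusters - 1
print(final_clusters)   # [2 7 7 8 2 0 1]
print(next_cluster_id)  # 9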

@@ -880,13 +930,24 @@ def create_3d_visualization(patents):
     # Check if we successfully avoided mega-clusters
     oversized_clusters = [size for size in sizes if size > max_cluster_size]
     if oversized_clusters:
+        print(f"⚠️ WARNING: {len(oversized_clusters)} clusters STILL oversized: {oversized_clusters}")
+        print(f"❌ FAILED to contain all clusters within target range!")
+
+        # Log the oversized clusters for debugging
+        for i, (label, size, _) in enumerate(final_cluster_info):
+            if size > max_cluster_size:
+                print(f" Oversized Cluster {i + 1}: {size} patents (EXCEEDS LIMIT of {max_cluster_size})")
     else:
+        print(f"✅ SUCCESS: All clusters within target size range!")

     print("\nCluster Size Distribution:")
     for i, (label, size, _) in enumerate(final_cluster_info):
+        if size > max_cluster_size:
+            status = "❌ OVERSIZED"
+        elif min_cluster_size <= size <= max_cluster_size:
+            status = "✅ OPTIMAL"
+        else:
+            status = "⚠️ SMALL"
         print(f" {status} Cluster {i + 1}: {size} patents")

     cluster_info = final_cluster_info
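
The size report reduces to a three-way bucket per cluster; a compact sketch (the 50/100 limits are example values in the spirit of get_optimal_cluster_size above):

min_cluster_size, max_cluster_size = 50, 100  # example limits

def size_status(size):
    # Same three buckets as the distribution report above.
    if size > max_cluster_size:
        return "❌ OVERSIZED"
    elif min_cluster_size <= size <= max_cluster_size:
        return "✅ OPTIMAL"
    return "⚠️ SMALL"

for size in (30, 75, 140):
    print(f" {size_status(size)} {size} patents")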