Commit · 4e29396
1 Parent(s): d8b8d6f
fixed clustering algorithm
app.py CHANGED
@@ -18,6 +18,7 @@ import umap
 import openai
 from sklearn.neighbors import NearestNeighbors
 from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import KMeans
 import hdbscan
 import plotly.graph_objects as go
 import requests

@@ -242,16 +243,29 @@ MIN_PATENTS_FOR_GAPS = 3000 # Minimum patents needed for reliable gap detection
 def get_max_clusters(num_patents):
     """
     Calculate optimal maximum clusters based on dataset size.
-    Aims for clusters of
+    Aims for clusters of 50-200 patents for meaningful technological analysis.
     """
+    if num_patents < 200:
+        return min(6, num_patents // 25)    # Very small: 25-35 patents per cluster
+    elif num_patents < 500:
+        return min(10, num_patents // 40)   # Small datasets: 40-50 patents per cluster
+    elif num_patents < 1000:
+        return min(15, num_patents // 60)   # Medium datasets: 60-70 patents per cluster
+    elif num_patents < 2000:
+        return min(20, num_patents // 80)   # Large datasets: 80-100 patents per cluster
+    else:
+        return min(30, num_patents // 100)  # Very large datasets: 100-150 patents per cluster
+
+def get_optimal_cluster_size(num_patents):
+    """Calculate optimal target cluster size range"""
     if num_patents < 500:
-        return
+        return 25, 80    # min=25, max=80
     elif num_patents < 1000:
-        return
+        return 40, 120   # min=40, max=120
     elif num_patents < 2000:
-        return
+        return 60, 150   # min=60, max=150
     else:
-        return
+        return 80, 200   # min=80, max=200
 
 if not SERPAPI_API_KEY:
     raise ValueError("SERPAPI_API_KEY environment variable is not set")

@@ -646,16 +660,27 @@ def create_3d_visualization(patents):
 
     update_progress('clustering', 'processing', 'Applying UMAP dimensionality reduction...')
 
-    # Apply UMAP dimensionality reduction
-
+    # Apply UMAP dimensionality reduction with better parameters for technology separation
+    update_progress('clustering', 'processing', 'Applying optimized UMAP dimensionality reduction...')
+    reducer = umap.UMAP(
+        n_components=3,
+        n_neighbors=30,   # Increased for better global structure
+        min_dist=0.1,     # Reduced for tighter clusters
+        spread=1.0,       # Better cluster separation
+        random_state=42
+    )
     embedding_3d = reducer.fit_transform(embeddings_array)
 
-    # Calculate optimal cluster
+    # Calculate optimal cluster parameters
     max_clusters = get_max_clusters(len(embeddings))
-
-    print(f"Optimal cluster limit: {max_clusters} clusters (targeting 75-150 patents per cluster)")
+    min_cluster_size, max_cluster_size = get_optimal_cluster_size(len(embeddings))
 
-
+    print(f"\n🎯 IMPROVED CLUSTERING STRATEGY:")
+    print(f"Dataset size: {len(embeddings)} patents")
+    print(f"Target cluster range: {min_cluster_size}-{max_cluster_size} patents per cluster")
+    print(f"Maximum clusters allowed: {max_clusters}")
+
+    update_progress('clustering', 'processing', f'Performing advanced multi-stage clustering...')
 
     # Create DataFrame for plotting
     df = pd.DataFrame(metadata)

@@ -663,47 +688,86 @@ def create_3d_visualization(patents):
     df['y'] = embedding_3d[:, 1]
     df['z'] = embedding_3d[:, 2]
 
-    # ---
+    # --- IMPROVED MULTI-STAGE CLUSTERING ALGORITHM ---
     scaler = StandardScaler()
     scaled_embeddings = scaler.fit_transform(embedding_3d)
 
     n_points = len(scaled_embeddings)
-
-
-    #
-
-
-    elif n_points < 500:
-        min_cluster_size = max(8, int(n_points * 0.05))
-    elif n_points < 1000:
-        min_cluster_size = max(15, int(n_points * 0.03))
-    else:
-        min_cluster_size = max(20, int(n_points * 0.02))
-
-    min_samples = max(3, int(min_cluster_size * 0.7))
+    print(f"Processing {n_points} patents with improved clustering algorithm...")
+
+    # Stage 1: Initial HDBSCAN with stricter parameters
+    initial_min_cluster_size = max(min_cluster_size, int(n_points * 0.015))  # More aggressive minimum
+    initial_min_samples = max(5, int(initial_min_cluster_size * 0.5))        # Stricter density requirement
 
-    print(f"
+    print(f"Stage 1 - Initial clustering: min_cluster_size={initial_min_cluster_size}, min_samples={initial_min_samples}")
 
-    # Apply HDBSCAN clustering
     hdb = hdbscan.HDBSCAN(
-        min_cluster_size=
-        min_samples=
-        cluster_selection_epsilon=0.
+        min_cluster_size=initial_min_cluster_size,
+        min_samples=initial_min_samples,
+        cluster_selection_epsilon=0.05,  # Reduced for better separation
         cluster_selection_method='eom',
-        metric='euclidean'
+        metric='euclidean',
+        alpha=1.0  # More conservative clustering
     )
-
+    initial_clusters = hdb.fit_predict(scaled_embeddings)
 
-    #
-
-
-
-
+    # Stage 2: Subdivide oversized clusters
+    print("Stage 2 - Subdividing oversized clusters...")
+    final_clusters = initial_clusters.copy()
+    next_cluster_id = max(initial_clusters) + 1 if len(set(initial_clusters)) > 1 else 0
+
+    cluster_subdivisions = 0
+    for cluster_id in set(initial_clusters):
+        if cluster_id == -1:  # Skip noise
+            continue
+
+        cluster_mask = initial_clusters == cluster_id
+        cluster_size = sum(cluster_mask)
+
+        # If cluster is too large, subdivide it
+        if cluster_size > max_cluster_size:
+            print(f"  Subdividing cluster {cluster_id} ({cluster_size} patents) - TOO LARGE")
+            cluster_subdivisions += 1
+
+            # Extract data for this oversized cluster
+            cluster_data = scaled_embeddings[cluster_mask]
+            cluster_indices = np.where(cluster_mask)[0]
+
+            # Calculate how many subclusters we need
+            n_subclusters = min(6, max(2, cluster_size // max_cluster_size + 1))
+            print(f"  Splitting into {n_subclusters} subclusters...")
+
+            # Use KMeans for controlled subdivision
+            kmeans = KMeans(n_clusters=n_subclusters, random_state=42, n_init=10)
+            subclusters = kmeans.fit_predict(cluster_data)
+
+            # Assign new cluster IDs
+            for i, subcluster_id in enumerate(subclusters):
+                original_idx = cluster_indices[i]
+                if subcluster_id == 0:
+                    # Keep first subcluster with original ID
+                    final_clusters[original_idx] = cluster_id
+                else:
+                    # Assign new IDs to other subclusters
+                    final_clusters[original_idx] = next_cluster_id + subcluster_id - 1
+
+            next_cluster_id += n_subclusters - 1
+
+    print(f"Subdivided {cluster_subdivisions} oversized clusters")
+
+    # Stage 3: Handle noise points more intelligently
+    noise_mask = final_clusters == -1
+    noise_count = sum(noise_mask)
+
+    if noise_count > 0:
+        print(f"Stage 3 - Reassigning {noise_count} noise points...")
+
+        # Get cluster centers (excluding noise)
     cluster_centers = []
     cluster_labels = []
-    for label in set(
+    for label in set(final_clusters):
         if label != -1:
-            cluster_mask =
+            cluster_mask = final_clusters == label
             center = np.mean(scaled_embeddings[cluster_mask], axis=0)
             cluster_centers.append(center)
             cluster_labels.append(label)

@@ -714,12 +778,21 @@ def create_3d_visualization(patents):
 
     # Find nearest cluster for each noise point
     nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
-
+    distances, nearest_indices = nbrs.kneighbors(noise_points)
+
+    # Only assign noise points that are reasonably close to a cluster
+    max_distance = np.percentile(distances, 75)  # Use 75th percentile as threshold
 
-    # Assign noise points to nearest clusters
     noise_indices = np.where(noise_mask)[0]
-
-
+    reassigned_count = 0
+    for i, (distance, nearest_idx) in enumerate(zip(distances.flatten(), nearest_indices.flatten())):
+        if distance <= max_distance:
+            final_clusters[noise_indices[i]] = cluster_labels[nearest_idx]
+            reassigned_count += 1
+
+    print(f"  Reassigned {reassigned_count}/{noise_count} noise points to nearby clusters")
+
+    clusters = final_clusters
 
     df['cluster'] = clusters
 

@@ -776,25 +849,47 @@ def create_3d_visualization(patents):
     # Update cluster_info to only include main clusters
    cluster_info = main_clusters
 
-
-
-
-
+    # Final cluster validation and reporting
+    final_cluster_info = []
+    noise_count = sum(1 for c in clusters if c == -1)
+
+    for label in set(clusters):
+        if label != -1:  # Skip noise
+            cluster_mask = clusters == label
             cluster_patents = df[cluster_mask]
-
+            if len(cluster_patents) > 0:
+                final_cluster_info.append((label, len(cluster_patents), cluster_patents))
+
+    # Sort clusters by size in descending order
+    final_cluster_info.sort(key=lambda x: x[1], reverse=True)
+
+    print(f"\n✅ FINAL CLUSTERING RESULTS:")
+    print(f"Total patents processed: {len(df)}")
+    print(f"Number of technology clusters: {len(final_cluster_info)}")
+    print(f"Noise points (unassigned): {noise_count}")
+
+    if final_cluster_info:
+        sizes = [size for _, size, _ in final_cluster_info]
+        avg_size = np.mean(sizes)
+        min_size = min(sizes)
+        max_size = max(sizes)
 
-
-
-
-
-
-
-
-
-
-
-
+        print(f"Cluster size stats: min={min_size}, avg={avg_size:.1f}, max={max_size}")
+        print(f"Target range was: {min_cluster_size}-{max_cluster_size} patents per cluster")
+
+        # Check if we successfully avoided mega-clusters
+        oversized_clusters = [size for size in sizes if size > max_cluster_size]
+        if oversized_clusters:
+            print(f"⚠️ Warning: {len(oversized_clusters)} clusters still oversized: {oversized_clusters}")
+        else:
+            print(f"✅ Success: All clusters within target size range!")
+
+        print("\nCluster Size Distribution:")
+        for i, (label, size, _) in enumerate(final_cluster_info):
+            status = "✅" if min_cluster_size <= size <= max_cluster_size else "⚠️"
+            print(f"  {status} Cluster {i + 1}: {size} patents")
+
+    cluster_info = final_cluster_info
 
     # Create mapping for new cluster IDs (1-based)
     cluster_id_map = {old_label: i + 1 for i, (old_label, _, _) in enumerate(cluster_info)}
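The change above boils down to a two-stage split: HDBSCAN proposes density-based clusters, then KMeans subdivides any cluster that exceeds the target size, and near-enough noise points are later folded into the closest cluster. A minimal, self-contained sketch of the first two stages on synthetic data follows; the blob parameters, the 150-point cap, and the variable names are illustrative stand-ins, not values from app.py.

import numpy as np
import hdbscan
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
# Two well-separated blobs; the first is deliberately oversized.
points = np.vstack([
    rng.normal(loc=0.0, scale=0.5, size=(400, 3)),
    rng.normal(loc=5.0, scale=0.5, size=(80, 3)),
])
max_cluster_size = 150  # illustrative cap, playing the role of get_optimal_cluster_size()

# Stage 1: density-based clustering.
labels = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=10).fit_predict(points)
next_id = labels.max() + 1

# Stage 2: subdivide any cluster larger than the cap with KMeans.
for cluster_id in np.unique(labels):
    if cluster_id == -1:
        continue  # noise handling (Stage 3 in the commit) is omitted in this sketch
    mask = labels == cluster_id
    size = int(mask.sum())
    if size > max_cluster_size:
        n_sub = min(6, max(2, size // max_cluster_size + 1))
        sub = KMeans(n_clusters=n_sub, random_state=42, n_init=10).fit_predict(points[mask])
        idx = np.where(mask)[0]
        labels[idx[sub > 0]] = next_id + sub[sub > 0] - 1  # keep sub-cluster 0 on the original ID
        next_id += n_sub - 1

print({int(c): int((labels == c).sum()) for c in np.unique(labels)})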