PhyllisPeh committed on
Commit
0fbc86f
·
1 Parent(s): 398e8e4

removed transitional areas and underexplored areas

Files changed (2)
  1. app.py +124 -759
  2. templates/index.html +9 -49
app.py CHANGED
@@ -489,7 +489,7 @@ def search_patents(keywords, page_size=100):
489
  return all_patents
490
 
491
  def analyze_patent_group(patents, group_type, label, max_retries=3):
492
- """Analyze patent groups using ChatGPT with improved formatting and concise output"""
493
  # Extract key information from all patents in the group
494
  patent_count = len(patents)
495
  years_range = f"{patents['year'].min()}-{patents['year'].max()}"
@@ -535,11 +535,8 @@ def analyze_patent_group(patents, group_type, label, max_retries=3):
535
  else:
536
  top_assignees = ", ".join(patents['assignee'].unique())
537
 
538
- # Enhanced prompt templates for better analysis quality while maintaining conciseness
539
- # Improved structure and specific guidance for more actionable insights
540
- prompts = {
541
- 'cluster': (
542
- f"""Patent cluster analysis ({patent_count} patents, {years_range}):
543
  Key players: {top_assignees}
544
  Core technologies: {key_terms}
545
  Sample innovations: {example_titles}
@@ -547,40 +544,9 @@ Sample innovations: {example_titles}
547
  Provide concise analysis in exactly this format:
548
  **Technology Focus:** [What specific problem/need this cluster addresses]
549
  **Market Applications:** [Primary commercial uses and target industries]
550
- **Innovation Trajectory:** [How this technology is evolving and future direction]""",
551
 
552
- "You are a patent analyst providing strategic technology insights. Focus on commercial relevance and market opportunities."
553
- ),
554
- 'transitional': (
555
- f"""Transitional technology area ({patent_count} patents, {years_range}):
556
- Key players: {top_assignees}
557
- Bridge technologies: {key_terms}
558
- Sample innovations: {example_titles}
559
-
560
- Provide concise analysis in exactly this format:
561
- **Technology Bridge:** [Which established fields this area connects]
562
- **Integration Value:** [Why combining these technologies creates value]
563
- **Market Potential:** [Commercial opportunities from this convergence]""",
564
-
565
- "You are a patent analyst identifying technology convergence opportunities. Focus on cross-domain innovation potential."
566
- ),
567
- 'innovation_subcluster': (
568
- f"""Underexplored technology area ({patent_count} patents, {years_range}):
569
- Current players: {top_assignees}
570
- Emerging concepts: {key_terms}
571
- Early innovations: {example_titles}
572
-
573
- Provide concise analysis in exactly this format:
574
- **Market Gap:** [Unmet need or problem this area could solve]
575
- **Technical Approach:** [Current methods and their limitations]
576
- **Innovation Opportunity:** [Specific R&D directions with commercial potential]""",
577
-
578
- "You are a patent analyst identifying innovation opportunities. Focus on market gaps and commercial potential for R&D investment."
579
- )
580
- }
581
-
582
- base_prompt = prompts[group_type][0]
583
- system_prompt = prompts[group_type][1]
584
 
585
  retry_count = 0
586
  while retry_count < max_retries:
@@ -603,12 +569,6 @@ Provide concise analysis in exactly this format:
603
  analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
604
  analysis = re.sub(r'(?i)market applications:', '**Market Applications:**', analysis)
605
  analysis = re.sub(r'(?i)innovation trajectory:', '**Innovation Trajectory:**', analysis)
606
- analysis = re.sub(r'(?i)technology bridge:', '**Technology Bridge:**', analysis)
607
- analysis = re.sub(r'(?i)integration value:', '**Integration Value:**', analysis)
608
- analysis = re.sub(r'(?i)market potential:', '**Market Potential:**', analysis)
609
- analysis = re.sub(r'(?i)market gap:', '**Market Gap:**', analysis)
610
- analysis = re.sub(r'(?i)technical approach:', '**Technical Approach:**', analysis)
611
- analysis = re.sub(r'(?i)innovation opportunity:', '**Innovation Opportunity:**', analysis)
612
 
613
  # Clean up whitespace and formatting
614
  analysis = re.sub(r'\n\s*\n', '\n', analysis) # Remove multiple blank lines
@@ -632,10 +592,8 @@ def create_3d_visualization(patents):
632
  """
633
  Create a 3D visualization of patent embeddings using UMAP and Plotly
634
  """
635
- # Initialize variables for tracking different point types
636
  df = pd.DataFrame(patents)
637
- df['point_type'] = 'cluster' # Default type for all points
638
- transitional_areas = [] # Initialize empty list for transitional areas
639
 
640
  if not patents:
641
  return None
@@ -690,698 +648,165 @@ def create_3d_visualization(patents):
690
  df['y'] = embedding_3d[:, 1]
691
  df['z'] = embedding_3d[:, 2]
692
 
693
- # --- Improved HDBSCAN clustering logic for sparse region detection ---
694
  scaler = StandardScaler()
695
  scaled_embeddings = scaler.fit_transform(embedding_3d)
696
 
697
  n_points = len(scaled_embeddings)
698
  update_progress('clustering', 'processing', f'Analyzing {n_points} patents for clustering...')
699
 
700
- # Dynamically set max_clusters and target_noise based on number of patents
701
  if n_points < 100:
702
- max_clusters = 4
703
- max_retries = 2
704
- target_noise_ratio = 0.08
705
  elif n_points < 500:
706
- max_clusters = 6
707
- max_retries = 3
708
- target_noise_ratio = 0.06
709
  elif n_points < 1000:
710
- max_clusters = 8
711
- max_retries = 4
712
- target_noise_ratio = 0.05
713
- else:
714
- max_clusters = 15 # Increased from 12 to force more granular clustering
715
- max_retries = 8 # More retries to find optimal clustering
716
- target_noise_ratio = 0.03 # Keep low noise ratio
717
-
718
- # Even more aggressive cluster parameters for large datasets
719
- if n_points >= 1000:
720
- min_cluster_size = max(5, int(n_points * 0.015)) # Further reduced to 1.5% for large datasets
721
- min_samples = max(3, int(min_cluster_size * 0.95)) # Increased to 0.95 for even stricter formation
722
  else:
723
- min_cluster_size = max(5, int(n_points * 0.02)) # 2% for smaller datasets
724
- min_samples = max(3, int(min_cluster_size * 0.9)) # 0.9 ratio for smaller datasets
725
 
726
- target_noise = int(n_points * target_noise_ratio)
727
- print(f"Initial HDBSCAN: min_cluster_size={min_cluster_size}, min_samples={min_samples}, max_clusters={max_clusters}, max_retries={max_retries}, target_noise={target_noise}")
728
- retry = 0
729
- clusters = None
730
- n_clusters = 0
731
- n_noise = 0
732
- best_result = None
733
- best_score = float('-inf')
734
-
735
- while retry < max_retries:
736
- hdb = hdbscan.HDBSCAN(
737
- min_cluster_size=min_cluster_size,
738
- min_samples=min_samples,
739
- cluster_selection_epsilon=0.03, # Reduced further to force even tighter clusters
740
- cluster_selection_method='eom',
741
- metric='euclidean',
742
- prediction_data=True
743
- )
744
- clusters = hdb.fit_predict(scaled_embeddings)
745
- n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
746
- n_noise = list(clusters).count(-1)
747
- noise_ratio = n_noise / len(clusters)
748
- avg_cluster_size = (len(clusters) - n_noise) / n_clusters if n_clusters > 0 else float('inf')
749
-
750
- print(f"\nClustering Statistics (try {retry+1}):")
751
- print(f"Number of clusters: {n_clusters}")
752
- print(f"Number of patents in sparse regions: {n_noise}")
753
- print(f"Total number of patents: {len(clusters)}")
754
- print(f"Noise ratio: {noise_ratio:.2%}")
755
- print(f"Average cluster size: {avg_cluster_size:.1f} patents")
756
-
757
- update_progress('clustering', 'processing',
758
- f'Optimizing clusters (attempt {retry + 1}/{max_retries}): ' +
759
- f'Found {n_clusters} clusters with avg size {avg_cluster_size:.1f} patents')
760
-
761
- # Calculate a score for this clustering result
762
- # Penalize both too many and too few clusters, and reward good noise ratio
763
- score = -abs(n_clusters - max_clusters) + \
764
- -abs(noise_ratio - target_noise_ratio) * 10 + \
765
- -abs(avg_cluster_size - (n_points / max_clusters)) / 10
766
-
767
- if score > best_score:
768
- best_score = score
769
- best_result = (clusters, n_clusters, n_noise, noise_ratio, avg_cluster_size)
770
-
771
- # Adjust parameters based on results
772
- if n_clusters > max_clusters:
773
- print("Too many clusters, increasing parameters more aggressively...")
774
- min_cluster_size = int(min_cluster_size * 1.5) # More aggressive increase
775
- min_samples = int(min_samples * 1.4)
776
- elif n_clusters == 1 and avg_cluster_size > len(clusters) * 0.8:
777
- print("Single dominant cluster detected, adjusting for better separation...")
778
- min_cluster_size = max(5, int(min_cluster_size * 0.6)) # More aggressive decrease
779
- min_samples = max(3, int(min_samples * 0.6))
780
- elif n_noise < target_noise * 0.5:
781
- print("Too few noise points, adjusting parameters...")
782
- min_cluster_size = int(min_cluster_size * 1.2)
783
- min_samples = max(3, int(min_samples * 0.8))
784
- elif n_clusters < max_clusters * 0.5:
785
- print("Too few clusters, decreasing parameters...")
786
- min_cluster_size = max(5, int(min_cluster_size * 0.8))
787
- min_samples = max(3, int(min_samples * 0.7))
788
- else:
789
- print("Acceptable clustering found.")
790
- break
791
 
792
- retry += 1
793
-
794
- # Use the best result if we didn't find an acceptable one
795
- if retry == max_retries and best_result is not None:
796
- print("Using best clustering result found...")
797
- clusters, n_clusters, n_noise, noise_ratio, avg_cluster_size = best_result
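For reference, the removed selection heuristic can be checked numerically. A minimal sketch using the >=1000-patent tier above (max_clusters=15, target_noise_ratio=0.03); the run statistics are hypothetical:

```python
# Hypothetical run: 12 clusters, 5% noise, mean cluster size 79.2 patents
n_points, max_clusters, target_noise_ratio = 1000, 15, 0.03
n_clusters, noise_ratio, avg_cluster_size = 12, 0.05, 79.2

score = (-abs(n_clusters - max_clusters)                          # cluster-count term
         - abs(noise_ratio - target_noise_ratio) * 10             # noise-ratio term
         - abs(avg_cluster_size - n_points / max_clusters) / 10)  # size term
print(f"{score:.2f}")  # -4.45; higher (closer to 0) is better on all three terms
```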
 
 
798
 
799
  df['cluster'] = clusters
800
 
801
- # --- First gather all existing clusters and their sizes ---
802
  cluster_info = []
 
 
803
  for label in set(clusters):
804
- if label != -1: # Skip noise points
805
- cluster_mask = clusters == label
806
- cluster_patents = df[cluster_mask]
807
- if len(cluster_patents) > 0:
808
- cluster_info.append((label, len(cluster_patents), cluster_patents))
809
 
810
  # Sort clusters by size in descending order
811
  cluster_info.sort(key=lambda x: x[1], reverse=True)
812
 
 
 
 
813
  print("\nCluster Size Distribution:")
814
  for i, (label, size, _) in enumerate(cluster_info):
815
- print(f"Cluster {i} (originally {label}): {size} patents")
816
 
817
- # Create mapping for new cluster IDs
818
- cluster_id_map = {old_label: i for i, (old_label, _, _) in enumerate(cluster_info)}
819
 
820
- # Update cluster IDs in DataFrame
821
  new_clusters = clusters.copy()
822
  for old_label, new_label in cluster_id_map.items():
823
  new_clusters[clusters == old_label] = new_label
824
  df['cluster'] = new_clusters
825
 
826
- update_progress('clustering', 'processing', 'Identifying technology clusters and underexplored areas...')
827
 
828
- # --- Initialize point types ---
829
- df['point_type'] = 'unassigned' # Start with all points unassigned
830
- cluster_insights = [] # Initialize insights list
831
-
832
- # First handle clustered points
833
  total_clusters = len(cluster_info)
834
- for new_id, (_, size, cluster_patents) in enumerate(cluster_info):
835
- update_progress('clustering', 'processing', f'Analyzing cluster {new_id + 1} of {total_clusters} ({size} patents)...')
836
- description = analyze_patent_group(cluster_patents, 'cluster', new_id)
837
- df.loc[cluster_patents.index, 'point_type'] = 'cluster' # Mark clustered points
838
  cluster_insights.append({
839
  'type': 'cluster',
840
- 'id': int(new_id) + 1, # Store as 1-based ID
841
  'size': size,
842
- 'label': f"Cluster {new_id + 1}",
843
  'description': description
844
  })
845
 
846
- # --- Improved two-stage density analysis for noise points ---
847
- noise_mask = df['cluster'] == -1
848
- noise_points = scaled_embeddings[noise_mask]
849
- noise_indices = df[noise_mask].index
850
- dense_noise_indices = [] # Initialize empty list for dense noise points
851
- true_sparse_indices = [] # Initialize empty list for sparse points
852
-
853
- if len(noise_points) >= 3:
854
- update_progress('clustering', 'processing', f'Analyzing {len(noise_points)} potential underexplored areas...')
855
- print(f"\nStructural Analysis for Underexplored Area Detection:")
856
-
857
- # Initialize sparse indices
858
- true_sparse_indices = []
859
-
860
- # Stage 1: Calculate local and global density metrics
861
- n_neighbors = min(max(5, int(len(noise_points) * 0.05)), 15)
862
- print(f"Using {n_neighbors} nearest neighbors for density calculation")
863
-
864
- # Calculate local density for noise points
865
- nbrs_local = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean').fit(noise_points)
866
- local_distances, local_indices = nbrs_local.kneighbors(noise_points)
867
- local_densities = 1 / (np.mean(local_distances, axis=1) + 1e-6) # Add small epsilon to avoid division by zero
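The local-density estimate above is the inverse of the mean distance to the k nearest neighbors. A standalone sketch of the same computation, assuming `points` is an (n, 3) array of scaled embeddings:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def local_density(points, n_neighbors=5):
    # Each point counts itself as its first neighbor (distance 0),
    # matching the kneighbors() call above.
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean').fit(points)
    distances, _ = nbrs.kneighbors(points)
    return 1.0 / (distances.mean(axis=1) + 1e-6)  # epsilon avoids division by zero
```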
868
-
869
- # Calculate distances to cluster centers and their densities
870
- cluster_centers = []
871
- cluster_densities = [] # Store density of each cluster
872
- for label in set(clusters) - {-1}:
873
- cluster_mask = clusters == label
874
- cluster_points = scaled_embeddings[cluster_mask]
875
- center = np.mean(cluster_points, axis=0)
876
- cluster_centers.append(center)
877
-
878
- # Calculate cluster density using its member points
879
- if len(cluster_points) > 1:
880
- nbrs_cluster = NearestNeighbors(n_neighbors=min(5, len(cluster_points))).fit(cluster_points)
881
- cluster_dists, _ = nbrs_cluster.kneighbors(cluster_points)
882
- cluster_density = 1 / (np.mean(cluster_dists) + 1e-6)
883
- else:
884
- cluster_density = 0
885
- cluster_densities.append(cluster_density)
886
-
887
- cluster_centers = np.array(cluster_centers)
888
- cluster_densities = np.array(cluster_densities)
889
-
890
- if len(cluster_centers) > 0:
891
- # Calculate distances and density ratios to nearest clusters
892
- nbrs_clusters = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(cluster_centers)
893
- cluster_distances, nearest_cluster_indices = nbrs_clusters.kneighbors(noise_points)
894
- cluster_distances = cluster_distances.flatten()
895
-
896
- # Get density of nearest cluster for each point
897
- nearest_cluster_densities = cluster_densities[nearest_cluster_indices.flatten()]
898
-
899
- # Calculate density ratios (local density / nearest cluster density)
900
- density_ratios = local_densities / (nearest_cluster_densities + 1e-6)
901
-
902
- print("\nDensity Analysis Statistics:")
903
- print(f"Mean local density: {np.mean(local_densities):.3f}")
904
- print(f"Mean cluster density: {np.mean(cluster_densities):.3f}")
905
- print(f"Mean density ratio: {np.mean(density_ratios):.3f}")
906
-
907
- # Identify structural gaps using multiple criteria with more sensitive thresholds
908
- # 1. Density Isolation: Points with very low density compared to clusters
909
- # 2. Spatial Isolation: Points far from both clusters and other noise points
910
- # 3. Structural Stability: Points whose local neighborhood is also sparse
911
-
912
- # Calculate isolation scores with more balanced thresholds
913
- density_isolation = density_ratios < np.percentile(density_ratios, 65) # More balanced threshold
914
- spatial_isolation = cluster_distances > np.percentile(cluster_distances, 50) # Median distance threshold
915
-
916
- # Calculate structural stability with more balanced criteria
917
- structural_stability = np.zeros(len(noise_points), dtype=bool)
918
- for i, neighbors in enumerate(local_indices):
919
- neighbor_densities = local_densities[neighbors]
920
- # Point is stable if its neighborhood is relatively sparse
921
- structural_stability[i] = np.mean(neighbor_densities) < np.percentile(local_densities, 50) # Use median
922
-
923
- # Use more balanced criteria - only need to meet any 1 of 3 criteria initially
924
- candidate_sparse_indices = [
925
- idx for i, idx in enumerate(noise_indices)
926
- if sum([density_isolation[i], spatial_isolation[i], structural_stability[i]]) >= 1 # Only need 1 out of 3 criteria
927
- ]
928
-
929
- # Start by assuming all non-candidate points are dense noise
930
- dense_noise_indices = [idx for idx in noise_indices if idx not in candidate_sparse_indices]
931
-
932
- # Now calculate distances between candidates and dense noise points with more sensitive threshold
933
- min_distance_threshold = np.percentile(cluster_distances, 40) # More sensitive threshold
934
- # Filter candidates based on distance from dense noise regions
935
- if len(candidate_sparse_indices) > 0 and len(dense_noise_indices) > 0:
936
- dense_noise_points = scaled_embeddings[dense_noise_indices]
937
- true_sparse_indices = []
938
-
939
- for idx in candidate_sparse_indices:
940
- point = scaled_embeddings[idx].reshape(1, -1)
941
- distances_to_dense = NearestNeighbors(n_neighbors=1).fit(dense_noise_points).kneighbors(point)[0][0]
942
- if distances_to_dense > min_distance_threshold:
943
- true_sparse_indices.append(idx)
944
-
945
- # Update dense_noise_indices to include rejected candidates
946
- rejected_indices = [idx for idx in candidate_sparse_indices if idx not in true_sparse_indices]
947
- dense_noise_indices.extend(rejected_indices)
948
- else:
949
- true_sparse_indices = candidate_sparse_indices
950
- else:
951
- # Fallback using only local density analysis
952
- density_threshold = np.percentile(local_densities, 25) # Bottom 25% sparsest points
953
- true_sparse_indices = [idx for i, idx in enumerate(noise_indices)
954
- if local_densities[i] < density_threshold]
955
- dense_noise_indices = [idx for idx in noise_indices if idx not in true_sparse_indices]
956
-
957
- print(f"\nFinal Classification:")
958
- print(f"True underexplored areas identified: {len(true_sparse_indices)}")
959
- print(f"Transitional areas identified: {len(dense_noise_indices)}")
960
- if len(true_sparse_indices) > 0:
961
- print(f"Underexplored area ratio: {len(true_sparse_indices)/len(noise_points):.2%}")
962
- print("\nUnderexplored Area Criteria Used:")
963
- print("1. Density Isolation: Significantly lower density than nearest cluster")
964
- print("2. Spatial Isolation: Far from both clusters and other points")
965
- print("3. Structural Stability: Forms stable sparse regions with neighbors")
966
-
967
- # Update point types in DataFrame for sparse points and dense noise
968
- for idx in true_sparse_indices:
969
- df.at[idx, 'point_type'] = 'sparse'
970
- for idx in dense_noise_indices:
971
- df.at[idx, 'point_type'] = 'dense_noise'
972
-
973
- # --- Handle dense noise points as transitional areas ---
974
- transitional_areas = [] # Store transitional areas for sorting
975
- if len(dense_noise_indices) >= 3:
976
- update_progress('clustering', 'processing', f'Analyzing {len(dense_noise_indices)} potential transitional areas...')
977
- print("\nAnalyzing dense noise points as transitional areas...")
978
- dense_noise_points = scaled_embeddings[dense_noise_indices]
979
-
980
- # Use HDBSCAN to find subgroups within transitional areas
981
- min_size = max(3, len(dense_noise_points) // 10)
982
- print(f"Attempting to identify transitional area subgroups with min_size={min_size}")
983
-
984
- hdb_dense = hdbscan.HDBSCAN(
985
- min_cluster_size=min_size,
986
- min_samples=max(2, min_size // 2),
987
- cluster_selection_epsilon=0.3,
988
- cluster_selection_method='leaf'
989
- )
990
- dense_labels = hdb_dense.fit_predict(dense_noise_points)
991
-
992
- # Count potential transitional areas
993
- unique_dense_labels = set(dense_labels) - {-1}
994
- n_transitional = len(unique_dense_labels)
995
- print(f"Found {n_transitional} distinct transitional areas")
996
-
997
- # First get all transitional points, including scattered ones
998
- all_transitional_points = {}
999
- # Count sizes first
1000
- label_sizes = {}
1001
- for label in dense_labels:
1002
- if label != -1:
1003
- label_sizes[label] = label_sizes.get(label, 0) + 1
1004
-
1005
- # Then collect points with their pre-calculated sizes
1006
- for i, label in enumerate(dense_labels):
1007
- idx = dense_noise_indices[i]
1008
- if label != -1: # Regular transitional area
1009
- if label not in all_transitional_points:
1010
- all_transitional_points[label] = {'indices': [], 'size': label_sizes[label]}
1011
- all_transitional_points[label]['indices'].append(idx)
1012
- else: # Scattered points
1013
- label_key = 'scattered'
1014
- if label_key not in all_transitional_points:
1015
- all_transitional_points[label_key] = {'indices': [], 'size': 0}
1016
- all_transitional_points[label_key]['indices'].append(idx)
1017
- all_transitional_points[label_key]['size'] += 1
1018
-
1019
- # Sort transitional areas by size and create insights
1020
- # Filter out areas that are too small and sort by size
1021
- min_area_size = 3 # Minimum size for a valid transitional area
1022
- valid_areas = [(k, v) for k, v in all_transitional_points.items()
1023
- if k != 'scattered' and v['size'] >= min_area_size]
1024
- sorted_areas = sorted(valid_areas, key=lambda x: x[1]['size'], reverse=True)
1025
-
1026
- # Add regular transitional areas to insights
1027
- total_areas = len(sorted_areas)
1028
- for area_idx, (label, area_info) in enumerate(sorted_areas):
1029
- update_progress('clustering', 'processing', f'Analyzing transitional area {area_idx + 1} of {total_areas} ({area_info["size"]} patents)...')
1030
- area_patents = df.iloc[area_info['indices']]
1031
- description = analyze_patent_group(area_patents, 'transitional', label)
1032
- area_number = area_idx + 1 # 1-based numbering for display
1033
-
1034
- # Create label without duplicate size info
1035
- area_label = f"Transitional Area {area_number}"
1036
- transitional_areas.append({
1037
- 'label': area_label,
1038
- 'indices': area_info['indices'],
1039
- 'size': area_info['size'],
1040
- 'patents': area_patents,
1041
- 'description': description
1042
- })
1043
- area_insight = {
1044
- 'type': 'transitional',
1045
- 'id': area_idx + 1, # Store as 1-based ID
1046
- 'size': area_info['size'],
1047
- 'label': f"{area_label} ({area_info['size']} patents)",
1048
- 'description': description
1049
- }
1050
- cluster_insights.append(area_insight)
1051
-
1052
- # Handle scattered points by analyzing them individually
1053
- if 'scattered' in all_transitional_points:
1054
- scattered_indices = all_transitional_points['scattered']['indices']
1055
- if len(scattered_indices) > 0:
1056
- print(f"\nAnalyzing {len(scattered_indices)} scattered points...")
1057
- scattered_points = scaled_embeddings[scattered_indices]
1058
-
1059
- # Calculate distances to nearest cluster and transitional area
1060
- distances_to_clusters = []
1061
- distances_to_transitional = []
1062
-
1063
- print("\nDistance analysis for each scattered point:")
1064
- point_counter = 0
1065
-
1066
- # First calculate all distances
1067
- for point in scattered_points:
1068
- point = point.reshape(1, -1)
1069
- # Distance to nearest cluster
1070
- if len(cluster_centers) > 0:
1071
- dist_cluster = NearestNeighbors(n_neighbors=1).fit(cluster_centers).kneighbors(point)[0][0][0]
1072
- else:
1073
- dist_cluster = float('inf')
1074
-
1075
- # Distance to nearest transitional area (excluding scattered points)
1076
- if len(dense_noise_points) > 0:
1077
- # Get only the transitional area points (excluding scattered points)
1078
- transitional_points = []
1079
- for i, point_idx in enumerate(dense_noise_indices):
1080
- if point_idx not in scattered_indices:
1081
- transitional_points.append(dense_noise_points[i])
1082
-
1083
- if transitional_points:
1084
- transitional_points = np.array(transitional_points)
1085
- nbrs_trans = NearestNeighbors(n_neighbors=1).fit(transitional_points)
1086
- dist_trans = nbrs_trans.kneighbors(point.reshape(1, -1))[0][0][0]
1087
- else:
1088
- dist_trans = float('inf')
1089
- else:
1090
- dist_trans = float('inf')
1091
-
1092
- # Store distances for ratio calculation
1093
- distances_to_clusters.append(dist_cluster)
1094
- distances_to_transitional.append(dist_trans)
1095
-
1096
- total_classified_as_gaps = 0
1097
- total_classified_as_transitional = 0
1098
-
1099
- # Use more aggressive thresholds for scattered points
1100
- cluster_distance_threshold = np.percentile(distances_to_clusters, 35) # Even more lenient
1101
- transitional_distance_threshold = np.percentile(distances_to_transitional, 35) # Even more lenient
1102
-
1103
- print(f"\nClassification thresholds:")
1104
- print(f"- Cluster distance threshold: {cluster_distance_threshold:.3f}")
1105
- print(f"- Transitional distance threshold: {transitional_distance_threshold:.3f}")
1106
-
1107
- # Classify scattered points
1108
- for idx, (dist_c, dist_t) in zip(scattered_indices, zip(distances_to_clusters, distances_to_transitional)):
1109
- # 1. Check absolute distances with more lenient thresholds
1110
- cluster_dist_threshold = np.percentile(distances_to_clusters, 60) # Use 60th percentile
1111
- trans_dist_threshold = np.percentile(distances_to_transitional, 60) # Use 60th percentile
1112
-
1113
- # Point is isolated if it's farther than median distance from both clusters and transitional areas
1114
- is_isolated = (dist_c > cluster_dist_threshold or dist_t > trans_dist_threshold)
1115
-
1116
- # 2. Calculate isolation based on absolute difference rather than ratio
1117
- isolation_diff = dist_t - dist_c # Positive means farther from transitional areas
1118
- is_relatively_isolated = isolation_diff > 0 # Any positive difference counts
1119
-
1120
- # 3. Simplified region formation check
1121
- nearby_transitional = sum(1 for d in distances_to_transitional if d < trans_dist_threshold)
1122
- nearby_clusters = sum(1 for d in distances_to_clusters if d < cluster_dist_threshold)
1123
-
1124
- # Point forms new region if it has any cluster neighbors
1125
- forms_new_region = nearby_clusters > 0
1126
-
1127
- # Classification decision and immediate DataFrame update
1128
- # More lenient classification - if the point is isolated OR relatively isolated, mark as gap
1129
- if is_isolated or is_relatively_isolated:
1130
- true_sparse_indices.append(idx)
1131
- df.at[idx, 'point_type'] = 'sparse' # Immediately update DataFrame
1132
- total_classified_as_gaps += 1
1133
- else:
1134
- dense_noise_indices.append(idx)
1135
- df.at[idx, 'point_type'] = 'dense_noise' # Immediately update DataFrame
1136
- total_classified_as_transitional += 1
1137
-
1138
- print(f"\nFinal classification summary for scattered points:")
1139
- print(f"- Total scattered points: {len(scattered_indices)}")
1140
- print(f"- Classified as underexplored areas: {total_classified_as_gaps}")
1141
- print(f"- Classified as transitional: {total_classified_as_transitional}")
1142
- if total_classified_as_gaps == 0:
1143
- print("\nWarning: No scattered points were classified as underexplored areas!")
1144
- print("Possible reasons:")
1145
- print("1. Distance thresholds may be too high")
1146
- print("2. Relative distance ratio may be too strict")
1147
- print("3. Nearby points criterion may be too restrictive")
1148
-
1149
- if total_classified_as_transitional > 0:
1150
- # Create a transitional area for scattered points
1151
- scattered_transitional_patents = df.iloc[dense_noise_indices[-total_classified_as_transitional:]]
1152
- description = analyze_patent_group(scattered_transitional_patents, 'transitional', 'scattered')
1153
- area_number = len(transitional_areas) + 1 # 1-based numbering for display
1154
-
1155
- # Add to transitional areas
1156
- area_label = f"Transitional Area {area_number}"
1157
- transitional_areas.append({
1158
- 'label': area_label,
1159
- 'indices': dense_noise_indices[-total_classified_as_transitional:],
1160
- 'size': total_classified_as_transitional,
1161
- 'patents': scattered_transitional_patents,
1162
- 'description': description
1163
- })
1164
-
1165
- # Add to insights
1166
- area_insight = {
1167
- 'type': 'transitional',
1168
- 'id': -1, # Special ID for scattered points
1169
- 'size': total_classified_as_transitional,
1170
- 'label': f"{area_label} ({total_classified_as_transitional} patents)",
1171
- 'description': description
1172
- }
1173
- cluster_insights.append(area_insight)
1174
-
1175
- # --- Analyze underexplored areas ---
1176
- if len(true_sparse_indices) > 0:
1177
- update_progress('clustering', 'processing', f'Analyzing {len(true_sparse_indices)} potential underexplored areas...')
1178
- print(f"\nProcessing {len(true_sparse_indices)} underexplored areas...")
1179
- sparse_patents = df.iloc[true_sparse_indices]
1180
- sparse_points = scaled_embeddings[true_sparse_indices]
1181
-
1182
- # Ensure points are marked as sparse in the DataFrame
1183
- df.loc[true_sparse_indices, 'point_type'] = 'sparse'
1184
-
1185
- # More lenient subclustering parameters for underexplored areas
1186
- min_subcluster_size = max(2, min(5, len(true_sparse_indices) // 10)) # More lenient minimum size
1187
- sparse_clusterer = hdbscan.HDBSCAN(
1188
- min_cluster_size=min_subcluster_size,
1189
- min_samples=1, # Most lenient possible
1190
- cluster_selection_epsilon=0.8, # Even more lenient
1191
- cluster_selection_method='leaf', # Changed to leaf for finer subcluster detection
1192
- metric='euclidean'
1193
- )
1194
- sparse_labels = sparse_clusterer.fit_predict(sparse_points)
1195
-
1196
- # Collect innovation subclusters for sorting
1197
- innovation_subclusters = []
1198
- for label in set(sparse_labels):
1199
- subcluster_mask = sparse_labels == label
1200
- subcluster_patents = sparse_patents[subcluster_mask]
1201
- subcluster_size = len(subcluster_patents)
1202
-
1203
- # Accept all subclusters, even single points
1204
- description = analyze_patent_group(subcluster_patents, 'innovation_subcluster', label)
1205
- innovation_subclusters.append({
1206
- 'label': label,
1207
- 'size': subcluster_size,
1208
- 'patents': subcluster_patents,
1209
- 'description': description
1210
- })
1211
-
1212
- # Sort innovation subclusters by size in descending order
1213
- innovation_subclusters.sort(key=lambda x: x['size'], reverse=True)
1214
-
1215
- # Add sorted innovation subclusters to insights
1216
- total_subclusters = len(innovation_subclusters)
1217
- for idx, subcluster in enumerate(innovation_subclusters):
1218
- update_progress('clustering', 'processing', f'Analyzing underexplored area opportunity {idx + 1} of {total_subclusters} ({subcluster["size"]} patents)...')
1219
- cluster_insights.append({
1220
- 'type': 'innovation_subcluster',
1221
- 'id': idx + 1, # Store as 1-based ID
1222
- 'size': subcluster['size'],
1223
- 'label': f"Underexplored Area {idx + 1}",
1224
- 'description': subcluster['description']
1225
- })
1226
- else:
1227
- cluster_insights.append({
1228
- 'type': 'innovation_subcluster',
1229
- 'id': -1,
1230
- 'size': 0,
1231
- 'label': 'No Underexplored Areas',
1232
- 'description': 'No significant underexplored areas were detected in this technology space.'
1233
- })
1234
-
1235
  update_progress('visualization', 'processing', 'Creating interactive plot...')
1236
 
1237
- # Create Plotly figure with clusters
1238
- # Ensure all points are properly categorized
1239
- unassigned_mask = df['point_type'] == 'unassigned'
1240
- if any(unassigned_mask):
1241
- print(f"Warning: {sum(unassigned_mask)} points remain unassigned")
1242
- df.loc[unassigned_mask, 'point_type'] = 'cluster' # Default unassigned to clusters
1243
-
1244
- # Separate points into three categories: clusters, underexplored areas, and dense noise
1245
- cluster_mask = df['point_type'] == 'cluster'
1246
- innovation_gaps_mask = df['point_type'] == 'sparse'
1247
- dense_noise_mask = df['point_type'] == 'dense_noise'
1248
 
 
1249
  # Create hover text for all points
1250
  hover_text = []
1251
- # Create mapping for underexplored area points to their numbers
1252
- innovation_gap_map = {}
1253
-
1254
- # Map underexplored areas using the analyzed subclusters to ensure consistent numbering
1255
- if len(true_sparse_indices) > 0:
1256
- for idx, subcluster in enumerate(innovation_subclusters, 1):
1257
- for patent in subcluster['patents'].index:
1258
- innovation_gap_map[patent] = idx
1259
-
1260
- # Create mapping for transitional areas
1261
- transitional_area_map = {}
1262
- for area_idx, area in enumerate(transitional_areas):
1263
- for idx in area['indices']:
1264
- transitional_area_map[idx] = {'number': area_idx + 1}
1265
-
1266
- # Generate hover text for each point
1267
  for idx, row in df.iterrows():
1268
- point_info = ""
1269
- if row['point_type'] == 'sparse':
1270
- gap_number = innovation_gap_map.get(idx)
1271
- if gap_number:
1272
- point_info = f"<br><b>Region:</b> Underexplored Area {gap_number}"
1273
- else:
1274
- point_info = "<br><b>Region:</b> Potential Innovation Area"
1275
- elif row['point_type'] == 'dense_noise':
1276
- area_info = transitional_area_map.get(idx)
1277
- if area_info:
1278
- point_info = f"<br><b>Region:</b> Transitional Area {area_info['number']}"
1279
- else:
1280
- # This is a scattered transitional point
1281
- point_info = f"<br><b>Region:</b> Transitional Area {len(transitional_areas)} (Scattered)"
1282
- else:
1283
- point_info = f"<br><b>Cluster:</b> {int(row['cluster']) + 1}" # Cluster IDs are still 0-based in the DataFrame
1284
-
1285
  text = (
1286
  f"<b>{row['title']}</b><br><br>"
1287
  f"<b>By:</b> {row['assignee']} ({row['year']})<br>"
1288
- f"{point_info}<br><br>"
1289
  f"<b>Abstract:</b><br>{row['abstract']}"
1290
  )
1291
  hover_text.append(text)
1292
 
1293
- # Create three separate traces: clusters, underexplored areas, and dense noise points
1294
  cluster_trace = go.Scatter3d(
1295
- x=df[cluster_mask]['x'],
1296
- y=df[cluster_mask]['y'],
1297
- z=df[cluster_mask]['z'],
1298
  mode='markers',
1299
  marker=dict(
1300
  size=6,
1301
- color=clusters[cluster_mask] + 1, # Add 1 to shift cluster numbers from 0-based to 1-based
1302
  colorscale='Viridis',
1303
- opacity=0.5,
1304
  showscale=True,
1305
  colorbar=dict(
1306
- title="Clusters",
1307
- ticktext=[f"Cluster {i+1}" for i in range(n_clusters)], # Custom tick labels
1308
- tickvals=list(range(1, n_clusters + 1)), # Values to match the 1-based cluster numbers
1309
- tickmode="array",
1310
  tickfont=dict(size=10),
1311
- titlefont=dict(size=10)
1312
  )
1313
  ),
1314
- text=[hover_text[i] for i in range(len(hover_text)) if cluster_mask[i]],
1315
  hoverinfo='text',
1316
- name='Clusters',
1317
  hoverlabel=dict(
1318
  bgcolor="white",
1319
  font_size=12,
1320
  font_family="Arial",
1321
  align="left"
1322
  ),
1323
- customdata=[df['link'].tolist()[i] for i in range(len(df)) if cluster_mask[i]]
1324
  )
1325
 
1326
- innovation_gaps_trace = go.Scatter3d(
1327
- x=df[innovation_gaps_mask]['x'],
1328
- y=df[innovation_gaps_mask]['y'],
1329
- z=df[innovation_gaps_mask]['z'],
1330
- mode='markers',
1331
- marker=dict(
1332
- size=6, # Same size as other points
1333
- color='rgb(255, 0, 0)', # Pure bright red
1334
- symbol='diamond',
1335
- opacity=1.0, # Full opacity for visibility
1336
- line=dict(
1337
- color='white',
1338
- width=1 # Thinner border to match other points
1339
- )
1340
- ),
1341
- text=[hover_text[i] for i in range(len(hover_text)) if innovation_gaps_mask[i]],
1342
- hoverinfo='text',
1343
- name='Underexplored Areas',
1344
- hoverlabel=dict(
1345
- bgcolor="white",
1346
- font_size=12,
1347
- font_family="Arial",
1348
- align="left"
1349
- ),
1350
- customdata=[df['link'].tolist()[i] for i in range(len(df)) if innovation_gaps_mask[i]]
1351
- )
1352
-
1353
- dense_noise_trace = go.Scatter3d(
1354
- x=df[dense_noise_mask]['x'],
1355
- y=df[dense_noise_mask]['y'],
1356
- z=df[dense_noise_mask]['z'],
1357
- mode='markers',
1358
- marker=dict(
1359
- size=6, # Same size as other points
1360
- color='rgb(255, 165, 0)', # Orange for transitional areas
1361
- symbol='circle',
1362
- opacity=0.7, # Less opacity to make gaps more visible
1363
- line=dict(
1364
- color='white',
1365
- width=1 # Thin border
1366
- )
1367
- ),
1368
- text=[hover_text[i] for i in range(len(hover_text)) if dense_noise_mask[i]],
1369
- hoverinfo='text',
1370
- name='Transitional Areas',
1371
- hoverlabel=dict(
1372
- bgcolor="white",
1373
- font_size=12,
1374
- font_family="Arial",
1375
- align="left"
1376
- ),
1377
- customdata=[df['link'].tolist()[i] for i in range(len(df)) if dense_noise_mask[i]]
1378
- )
1379
-
1380
- fig = go.Figure(data=[cluster_trace, innovation_gaps_trace, dense_noise_trace])
1381
 
1382
  # Update layout
1383
  fig.update_layout(
1384
- title="Patent Technology Landscape",
1385
  scene=dict(
1386
  xaxis_title="UMAP 1",
1387
  yaxis_title="UMAP 2",
@@ -1389,28 +814,16 @@ def create_3d_visualization(patents):
1389
  camera=dict(
1390
  up=dict(x=0, y=0, z=1),
1391
  center=dict(x=0, y=0, z=0),
1392
- eye=dict(x=1.8, y=1.8, z=1.8) # Slightly further out for better overview
1393
  ),
1394
- aspectmode='cube' # Force equal scaling
1395
  ),
1396
  margin=dict(l=0, r=0, b=0, t=30),
1397
- showlegend=True,
1398
  template="plotly_dark",
1399
  hoverlabel_align='left',
1400
  hoverdistance=100,
1401
- hovermode='closest',
1402
- legend=dict(
1403
- yanchor="top",
1404
- y=0.99,
1405
- xanchor="left",
1406
- x=0.01,
1407
- bgcolor="rgba(0,0,0,0.7)", # Darker background for better contrast
1408
- font=dict(
1409
- color="white",
1410
- size=12
1411
- ),
1412
- itemsizing='constant' # Keep legend marker sizes consistent
1413
- )
1414
  )
1415
 
1416
  # Configure hover behavior
@@ -1455,7 +868,7 @@ def generate_analysis(prompt, cluster_insights):
1455
  analysis = response.choices[0].message['content']
1456
 
1457
  # Validate that analysis references valid areas
1458
- area_pattern = r'(?:Cluster|Transitional Area|Underexplored Area)\s+(\d+)'
1459
  referenced_areas = set(int(num) for num in re.findall(area_pattern, analysis))
1460
 
1461
  # Extract valid area numbers from insights
@@ -1478,92 +891,70 @@ def generate_analysis(prompt, cluster_insights):
1478
 
1479
  def analyze_innovation_opportunities(cluster_insights):
1480
  """
1481
- Analyze relationships between different areas to identify potential innovation opportunities.
1482
- Returns focused analysis of high-value innovation opportunities between existing technology areas.
1483
  """
1484
  # Extract cluster numbers and validate
1485
  cluster_nums = set()
1486
- transitional_nums = set()
1487
- underexplored_nums = set()
1488
 
1489
  # Parse and validate cluster numbers with explicit error checking
1490
  for insight in cluster_insights:
1491
  area_type = insight.get('type', '')
1492
  area_id = insight.get('id', -1)
1493
 
1494
- if area_id < 0 and area_type != 'cluster':
1495
- continue
1496
-
1497
- if area_type == 'cluster':
1498
  cluster_nums.add(area_id)
1499
- elif area_type == 'transitional':
1500
- transitional_nums.add(area_id)
1501
- elif area_type == 'innovation_subcluster':
1502
- if area_id >= 1: # Skip the "No underexplored areas" entry
1503
- underexplored_nums.add(area_id)
1504
-
1505
- # Format areas with validation
1506
- def format_area_list(area_nums):
1507
- return f"Areas {', '.join(str(n) for n in sorted(area_nums))}" if area_nums else "None identified"
1508
 
1509
- # Only generate analysis if we have areas to analyze
1510
- if not any([cluster_nums, transitional_nums, underexplored_nums]):
1511
- return "No distinct areas found. Try broadening search terms or increasing patent count."
1512
 
1513
- # Create descriptions list with more detailed information
1514
  descriptions = []
1515
  cluster_details = {}
1516
- transitional_details = {}
1517
- underexplored_details = {}
1518
 
1519
  for insight in cluster_insights:
1520
- if insight.get('description'):
1521
- area_type = insight.get('type', '')
1522
  area_id = int(insight.get('id', -1)) # 1-based IDs
1523
  area_size = insight.get('size', 0)
1524
 
1525
- if area_type == 'cluster':
1526
- desc = f"C{area_id}:{insight['description']}"
1527
- descriptions.append(desc)
1528
- cluster_details[area_id] = {'description': insight['description'], 'size': area_size}
1529
- elif area_type == 'transitional':
1530
- desc = f"T{area_id}:{insight['description']}"
1531
- descriptions.append(desc)
1532
- transitional_details[area_id] = {'description': insight['description'], 'size': area_size}
1533
- elif area_type == 'innovation_subcluster' and insight['id'] >= 1:
1534
- desc = f"U{area_id}:{insight['description']}"
1535
- descriptions.append(desc)
1536
- underexplored_details[area_id] = {'description': insight['description'], 'size': area_size}
1537
 
1538
  # Format descriptions as a string with newlines
1539
  descriptions_text = '\n'.join(descriptions)
1540
 
1541
- prompt = f"""Available Areas:
1542
- Clusters: {format_area_list(cluster_nums)}
1543
- Transitional Areas: {format_area_list(transitional_nums)}
1544
- Underexplored Areas: {format_area_list(underexplored_nums)}
1545
- Area Descriptions:
1546
  {descriptions_text}
1547
- I need you to identify 3-4 high-value innovation opportunities in this patent landscape. Focus on creating REAL business value through either:
1548
- A) Connecting complementary technologies from different areas, OR
1549
- B) Developing promising technologies within underexplored/transitional areas
 
 
1550
  For each opportunity:
1551
- 1. Select either ONE area with internal innovation potential OR two technologically adjacent areas that can be connected
1552
- 2. Identify a specific technical or market gap (either within the area or between areas)
1553
  3. Propose a concrete solution that addresses this gap
1554
  4. Quantify potential business impact and competitive advantage
 
1555
  Follow this precise format:
1556
  Opportunity N: [Title that describes the innovation]
1557
- Source: [Single area (e.g., "Underexplored Area 2") OR combination (e.g., "Cluster 1 + Transitional Area 3")]
1558
  - Gap: [Specific technical or market gap that represents an unmet need]
1559
  - Solution: [Practical, implementable technical approach]
1560
  - Impact: [Specific business value creation - market size, efficiency gains, cost reduction]
1561
  - Timeline: [Short-term (1-2 years) or medium-term (3-5 years)]
 
1562
  Prioritize opportunities based on:
1563
  1. Commercial potential (market size, growth potential)
1564
  2. Technical feasibility (can be implemented with current or near-term technology)
1565
  3. Competitive advantage (uniqueness, barriers to entry)
1566
  4. Alignment with industry trends (sustainability, automation, digitalization)
 
1567
  Focus on practical innovations that could realistically be implemented by a company rather than theoretical or speculative concepts."""
1568
 
1569
  # Get analysis from LLM
@@ -2000,9 +1391,7 @@ def download_plot():
2000
  <h1>Patent Technology Landscape</h1>
2001
  <p><strong>Instructions:</strong> Click on any point to open the corresponding Google Patents page in a new tab.</p>
2002
  <p><strong>Legend:</strong>
2003
- <span style="color: #636EFA;">● Clusters</span> |
2004
- <span style="color: #FF0000;">♦ Underexplored Areas</span> |
2005
- <span style="color: #FFA500;">● Transitional Areas</span>
2006
  </p>
2007
  </div>
2008
  <div id="plot"></div>
@@ -2244,30 +1633,6 @@ def download_insights():
2244
  cluster_count += 1
2245
  print(f"Added {cluster_count} clusters")
2246
 
2247
- # Add transitional areas
2248
- print("Adding transitional areas section...")
2249
- story.append(Paragraph("Transitional Areas", heading_style))
2250
- trans_count = 0
2251
- for insight in insights:
2252
- if insight['type'] == 'transitional':
2253
- text = f"<b>Transitional Area {insight['id']}:</b> {insight['description']}"
2254
- story.append(Paragraph(text, normal_style))
2255
- story.append(Spacer(1, 12))
2256
- trans_count += 1
2257
- print(f"Added {trans_count} transitional areas")
2258
-
2259
- # Add underexplored areas
2260
- print("Adding underexplored areas section...")
2261
- story.append(Paragraph("Underexplored Areas", heading_style))
2262
- underexplored_count = 0
2263
- for insight in insights:
2264
- if insight['type'] == 'innovation_subcluster':
2265
- text = f"<b>Underexplored Area {insight['id']}:</b> {insight['description']}"
2266
- story.append(Paragraph(text, normal_style))
2267
- story.append(Spacer(1, 12))
2268
- underexplored_count += 1
2269
- print(f"Added {underexplored_count} underexplored areas")
2270
-
2271
  # Build PDF
2272
  print("Building final PDF document...")
2273
  doc.build(story)
 
489
  return all_patents
490
 
491
  def analyze_patent_group(patents, group_type, label, max_retries=3):
492
+ """Analyze patent clusters using ChatGPT with improved formatting and concise output"""
493
  # Extract key information from all patents in the group
494
  patent_count = len(patents)
495
  years_range = f"{patents['year'].min()}-{patents['year'].max()}"
 
535
  else:
536
  top_assignees = ", ".join(patents['assignee'].unique())
537
 
538
+ # Enhanced prompt template for cluster analysis
539
+ base_prompt = f"""Patent cluster analysis ({patent_count} patents, {years_range}):
540
  Key players: {top_assignees}
541
  Core technologies: {key_terms}
542
  Sample innovations: {example_titles}
 
544
  Provide concise analysis in exactly this format:
545
  **Technology Focus:** [What specific problem/need this cluster addresses]
546
  **Market Applications:** [Primary commercial uses and target industries]
547
+ **Innovation Trajectory:** [How this technology is evolving and future direction]"""
548
 
549
+ system_prompt = "You are a patent analyst providing strategic technology insights. Focus on commercial relevance and market opportunities."
550
 
551
  retry_count = 0
552
  while retry_count < max_retries:
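The body of this retry loop is unchanged by the commit and not shown in the hunk. A minimal sketch of the pattern, assuming the legacy openai<1.0 client (consistent with `response.choices[0].message['content']` used later in app.py); the model name is an assumption:

```python
import time
import openai

def chat_with_retries(system_prompt, base_prompt, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",  # assumed; not shown in this diff
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": base_prompt},
                ],
            )
            return response.choices[0].message['content']
        except Exception:
            time.sleep(2 ** attempt)  # simple exponential backoff
    return None
```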
 
569
  analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
570
  analysis = re.sub(r'(?i)market applications:', '**Market Applications:**', analysis)
571
  analysis = re.sub(r'(?i)innovation trajectory:', '**Innovation Trajectory:**', analysis)
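For illustration, these substitutions rewrite unbolded section headers into the expected bold markers regardless of case:

```python
import re

analysis = "Technology focus: solid-state LiDAR for autonomous vehicles"
analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
print(analysis)  # **Technology Focus:** solid-state LiDAR for autonomous vehicles
```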
572
 
573
  # Clean up whitespace and formatting
574
  analysis = re.sub(r'\n\s*\n', '\n', analysis) # Remove multiple blank lines
 
592
  """
593
  Create a 3D visualization of patent embeddings using UMAP and Plotly
594
  """
595
+ # Initialize variables for tracking clusters
596
  df = pd.DataFrame(patents)
597
 
598
  if not patents:
599
  return None
 
648
  df['y'] = embedding_3d[:, 1]
649
  df['z'] = embedding_3d[:, 2]
650
 
651
+ # --- Simplified HDBSCAN clustering for technological clusters ---
652
  scaler = StandardScaler()
653
  scaled_embeddings = scaler.fit_transform(embedding_3d)
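The UMAP step that produces `embedding_3d` sits outside this hunk. A sketch of the typical call, assuming `embeddings` holds the patent text embeddings; the parameters here are illustrative, not the app's actual settings:

```python
import umap

reducer = umap.UMAP(n_components=3, random_state=42)  # assumed parameters
embedding_3d = reducer.fit_transform(embeddings)      # shape: (n_patents, 3)
```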
654
 
655
  n_points = len(scaled_embeddings)
656
  update_progress('clustering', 'processing', f'Analyzing {n_points} patents for clustering...')
657
 
658
+ # Dynamically set clustering parameters based on dataset size
659
  if n_points < 100:
660
+ min_cluster_size = max(5, int(n_points * 0.08))
661
  elif n_points < 500:
662
+ min_cluster_size = max(8, int(n_points * 0.05))
663
  elif n_points < 1000:
664
+ min_cluster_size = max(15, int(n_points * 0.03))
665
  else:
666
+ min_cluster_size = max(20, int(n_points * 0.02))
667
 
668
+ min_samples = max(3, int(min_cluster_size * 0.7))
669
+
670
+ print(f"HDBSCAN clustering: min_cluster_size={min_cluster_size}, min_samples={min_samples}")
671
+
672
+ # Apply HDBSCAN clustering
673
+ hdb = hdbscan.HDBSCAN(
674
+ min_cluster_size=min_cluster_size,
675
+ min_samples=min_samples,
676
+ cluster_selection_epsilon=0.1,
677
+ cluster_selection_method='eom',
678
+ metric='euclidean'
679
+ )
680
+ clusters = hdb.fit_predict(scaled_embeddings)
681
+
682
+ # Assign noise points to nearest cluster
683
+ noise_mask = clusters == -1
684
+ if any(noise_mask) and len(set(clusters)) > 1:
685
+ print(f"Assigning {sum(noise_mask)} noise points to nearest clusters...")
686
+ # Get cluster centers
687
+ cluster_centers = []
688
+ cluster_labels = []
689
+ for label in set(clusters):
690
+ if label != -1:
691
+ cluster_mask = clusters == label
692
+ center = np.mean(scaled_embeddings[cluster_mask], axis=0)
693
+ cluster_centers.append(center)
694
+ cluster_labels.append(label)
695
+
696
+ if cluster_centers:
697
+ cluster_centers = np.array(cluster_centers)
698
+ noise_points = scaled_embeddings[noise_mask]
699
 
700
+ # Find nearest cluster for each noise point
701
+ nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
702
+ _, nearest_indices = nbrs.kneighbors(noise_points)
703
+
704
+ # Assign noise points to nearest clusters
705
+ noise_indices = np.where(noise_mask)[0]
706
+ for i, nearest_idx in enumerate(nearest_indices.flatten()):
707
+ clusters[noise_indices[i]] = cluster_labels[nearest_idx]
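The nearest-centroid loop above guarantees every patent receives a cluster label. For comparison, a sketch of HDBSCAN's built-in soft assignment, which requires constructing the clusterer with `prediction_data=True` (not set above) and may still leave true outliers at -1:

```python
import hdbscan

hdb = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                      min_samples=min_samples,
                      prediction_data=True)  # assumption: differs from the code above
labels = hdb.fit_predict(scaled_embeddings)
noise = scaled_embeddings[labels == -1]
soft_labels, strengths = hdbscan.approximate_predict(hdb, noise)
```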
708
 
709
  df['cluster'] = clusters
710
 
711
+ # --- Gather clusters and analyze them ---
712
  cluster_info = []
713
+ n_clusters = len(set(clusters))
714
+
715
  for label in set(clusters):
716
+ cluster_mask = clusters == label
717
+ cluster_patents = df[cluster_mask]
718
+ if len(cluster_patents) > 0:
719
+ cluster_info.append((label, len(cluster_patents), cluster_patents))
 
720
 
721
  # Sort clusters by size in descending order
722
  cluster_info.sort(key=lambda x: x[1], reverse=True)
723
 
724
+ print(f"\nFinal Clustering Results:")
725
+ print(f"Number of technological clusters: {n_clusters}")
726
+ print(f"Total patents clustered: {len(df)}")
727
  print("\nCluster Size Distribution:")
728
  for i, (label, size, _) in enumerate(cluster_info):
729
+ print(f"Cluster {i + 1}: {size} patents")
730
 
731
+ # Create mapping for new cluster IDs (1-based)
732
+ cluster_id_map = {old_label: i + 1 for i, (old_label, _, _) in enumerate(cluster_info)}
733
 
734
+ # Update cluster IDs in DataFrame to be 1-based
735
  new_clusters = clusters.copy()
736
  for old_label, new_label in cluster_id_map.items():
737
  new_clusters[clusters == old_label] = new_label
738
  df['cluster'] = new_clusters
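The in-place remap is safe because each mask is computed against the original `clusters` array, not the partially rewritten copy. An equivalent one-liner, assuming noise points were already reassigned so every label appears in `cluster_id_map`:

```python
import numpy as np

new_clusters = np.array([cluster_id_map[c] for c in clusters])
```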
739
 
740
+ update_progress('clustering', 'processing', 'Analyzing technological clusters...')
741
 
742
+ # Analyze each cluster
743
+ cluster_insights = []
 
 
 
744
  total_clusters = len(cluster_info)
745
+ for i, (_, size, cluster_patents) in enumerate(cluster_info):
746
+ cluster_id = i + 1 # 1-based cluster ID
747
+ update_progress('clustering', 'processing', f'Analyzing cluster {cluster_id} of {total_clusters} ({size} patents)...')
748
+ description = analyze_patent_group(cluster_patents, 'cluster', cluster_id)
749
  cluster_insights.append({
750
  'type': 'cluster',
751
+ 'id': cluster_id,
752
  'size': size,
753
+ 'label': f"Cluster {cluster_id}",
754
  'description': description
755
  })
756

757
  update_progress('visualization', 'processing', 'Creating interactive plot...')
758

759
 
760
+ # Create Plotly figure with clusters only
761
  # Create hover text for all points
762
  hover_text = []
763
  for idx, row in df.iterrows():
764
  text = (
765
  f"<b>{row['title']}</b><br><br>"
766
  f"<b>By:</b> {row['assignee']} ({row['year']})<br>"
767
+ f"<b>Cluster:</b> {int(row['cluster'])}<br><br>"
768
  f"<b>Abstract:</b><br>{row['abstract']}"
769
  )
770
  hover_text.append(text)
771
 
772
+ # Create single trace for all clusters
773
  cluster_trace = go.Scatter3d(
774
+ x=df['x'],
775
+ y=df['y'],
776
+ z=df['z'],
777
  mode='markers',
778
  marker=dict(
779
  size=6,
780
+ color=df['cluster'],
781
  colorscale='Viridis',
782
+ opacity=0.7,
783
  showscale=True,
784
  colorbar=dict(
785
+ title="Technology Clusters",
786
+ tickmode="linear",
787
+ tick0=1,
788
+ dtick=1,
789
  tickfont=dict(size=10),
790
+ titlefont=dict(size=12)
791
  )
792
  ),
793
+ text=hover_text,
794
  hoverinfo='text',
795
+ name='Technology Clusters',
796
  hoverlabel=dict(
797
  bgcolor="white",
798
  font_size=12,
799
  font_family="Arial",
800
  align="left"
801
  ),
802
+ customdata=df['link'].tolist()
803
  )
804
 
805
+ fig = go.Figure(data=[cluster_trace])
806
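For context on how `customdata` is consumed: the click-to-open behavior lives in the exported HTML rather than in the trace itself. A sketch using plotly's `post_script` hook, where `{plot_id}` is substituted by plotly at write time and the file name is illustrative:

```python
import plotly.io as pio

post_js = (
    "document.getElementById('{plot_id}').on('plotly_click', function(d) {"
    " window.open(d.points[0].customdata, '_blank'); });"
)
pio.write_html(fig, "landscape.html", post_script=post_js)
```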
 
807
  # Update layout
808
  fig.update_layout(
809
+ title="Patent Technology Landscape - Cluster Analysis",
810
  scene=dict(
811
  xaxis_title="UMAP 1",
812
  yaxis_title="UMAP 2",
 
814
  camera=dict(
815
  up=dict(x=0, y=0, z=1),
816
  center=dict(x=0, y=0, z=0),
817
+ eye=dict(x=1.8, y=1.8, z=1.8)
818
  ),
819
+ aspectmode='cube'
820
  ),
821
  margin=dict(l=0, r=0, b=0, t=30),
822
+ showlegend=False, # Single trace doesn't need legend
823
  template="plotly_dark",
824
  hoverlabel_align='left',
825
  hoverdistance=100,
826
+ hovermode='closest'
827
  )
828
 
829
  # Configure hover behavior
 
868
  analysis = response.choices[0].message['content']
869
 
870
  # Validate that analysis references valid areas
871
+ area_pattern = r'(?:Cluster)\s+(\d+)'
872
  referenced_areas = set(int(num) for num in re.findall(area_pattern, analysis))
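Example of the validation regex; the non-capturing group is a remnant of the removed `Transitional Area`/`Underexplored Area` alternatives:

```python
import re

area_pattern = r'(?:Cluster)\s+(\d+)'
print({int(n) for n in re.findall(area_pattern, "Source: Cluster 1 + Cluster 3")})
# {1, 3}
```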
873
 
874
  # Extract valid area numbers from insights
 
891
 
892
  def analyze_innovation_opportunities(cluster_insights):
893
  """
894
+ Analyze technology clusters to identify potential innovation opportunities.
895
+ Returns focused analysis of high-value innovation opportunities within and between technology clusters.
896
  """
897
  # Extract cluster numbers and validate
898
  cluster_nums = set()
 
 
899
 
900
  # Parse and validate cluster numbers with explicit error checking
901
  for insight in cluster_insights:
902
  area_type = insight.get('type', '')
903
  area_id = insight.get('id', -1)
904
 
905
+ if area_type == 'cluster' and area_id > 0:
906
  cluster_nums.add(area_id)
907
 
908
+ # Only generate analysis if we have clusters to analyze
909
+ if not cluster_nums:
910
+ return "No technology clusters found. Try broadening search terms or increasing patent count."
911
 
912
+ # Create descriptions list with cluster information
913
  descriptions = []
914
  cluster_details = {}
 
 
915
 
916
  for insight in cluster_insights:
917
+ if insight.get('description') and insight.get('type') == 'cluster':
 
918
  area_id = int(insight.get('id', -1)) # 1-based IDs
919
  area_size = insight.get('size', 0)
920
 
921
+ desc = f"C{area_id}:{insight['description']}"
922
+ descriptions.append(desc)
923
+ cluster_details[area_id] = {'description': insight['description'], 'size': area_size}
924
 
925
  # Format descriptions as a string with newlines
926
  descriptions_text = '\n'.join(descriptions)
927
 
928
+ prompt = f"""Technology Clusters Available:
929
+ Clusters: {', '.join(f'Cluster {n}' for n in sorted(cluster_nums))}
930
+
931
+ Cluster Descriptions:
 
932
  {descriptions_text}
933
+
934
+ I need you to identify 3-4 high-value innovation opportunities in this patent technology landscape. Focus on creating REAL business value through either:
935
+ A) Cross-pollinating technologies between different clusters, OR
936
+ B) Identifying innovation gaps within individual clusters
937
+
938
  For each opportunity:
939
+ 1. Select either ONE cluster with internal innovation potential OR two complementary clusters that can be combined
940
+ 2. Identify a specific technical or market gap within or between the selected clusters
941
  3. Propose a concrete solution that addresses this gap
942
  4. Quantify potential business impact and competitive advantage
943
+
944
  Follow this precise format:
945
  Opportunity N: [Title that describes the innovation]
946
+ Source: [Single cluster (e.g., "Cluster 2") OR combination (e.g., "Cluster 1 + Cluster 3")]
947
  - Gap: [Specific technical or market gap that represents an unmet need]
948
  - Solution: [Practical, implementable technical approach]
949
  - Impact: [Specific business value creation - market size, efficiency gains, cost reduction]
950
  - Timeline: [Short-term (1-2 years) or medium-term (3-5 years)]
951
+
952
  Prioritize opportunities based on:
953
  1. Commercial potential (market size, growth potential)
954
  2. Technical feasibility (can be implemented with current or near-term technology)
955
  3. Competitive advantage (uniqueness, barriers to entry)
956
  4. Alignment with industry trends (sustainability, automation, digitalization)
957
+
958
  Focus on practical innovations that could realistically be implemented by a company rather than theoretical or speculative concepts."""
959
 
960
  # Get analysis from LLM
 
1391
  <h1>Patent Technology Landscape</h1>
1392
  <p><strong>Instructions:</strong> Click on any point to open the corresponding Google Patents page in a new tab.</p>
1393
  <p><strong>Legend:</strong>
1394
+ <span style="color: #636EFA;">● Technology Clusters</span>
1395
  </p>
1396
  </div>
1397
  <div id="plot"></div>
 
1633
  cluster_count += 1
1634
  print(f"Added {cluster_count} clusters")
1635

1636
  # Build PDF
1637
  print("Building final PDF document...")
1638
  doc.build(story)
templates/index.html CHANGED
@@ -4,7 +4,12 @@
4
  <title>Patent Explorer</title>
5
  <meta charset="utf-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1">
7
- <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/tailwind.min.css" rel="stylesheet">
8
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
9
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
10
  <style>
@@ -209,7 +214,7 @@
209
  <h3 class="text-lg font-semibold text-blue-300 mb-3">πŸ“ Interactive Visualization Guide</h3>
210
  <div class="text-gray-300 mb-3">
211
  <p class="mb-2"><strong>Click any point</strong> to open the corresponding Google Patents page in a new tab.</p>
212
- <p class="mb-3"><strong>Hover over points</strong> to see detailed patent information including title, assignee, year, and main technical ideas.</p>
213
  </div>
214
  <div class="flex flex-wrap items-center">
215
  <span class="text-sm font-medium text-gray-400 mr-3">Legend:</span>
@@ -471,11 +476,7 @@
471
  if (response.insights) {
472
  console.log('Displaying insights...');
473
  const clusters = response.insights.filter(i => i.type === 'cluster');
474
- const innovationSubclusters = response.insights.filter(i => i.type === 'innovation_subcluster');
475
- const transitionalAreas = response.insights.filter(i => i.type === 'transitional');
476
  console.log('Found clusters:', clusters.length);
477
- console.log('Found innovation subclusters:', innovationSubclusters.length);
478
- console.log('Found transitional areas:', transitionalAreas.length);
479
 
480
  // Start with Innovation Analysis
481
  let insightsHtml = '';
@@ -490,11 +491,8 @@
490
  `;
491
  }
492
 
493
- // Add the grid for clusters, transitional areas, and underexplored areas
494
- insightsHtml += '<div class="grid grid-cols-1 lg:grid-cols-3 gap-6 p-6">';
495
-
496
- // Left column: Technology Clusters
497
- insightsHtml += '<div class="col-span-1">';
498
  insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-blue-400">Technology Clusters</h3>';
499
 
500
  if (clusters.length > 0) {
@@ -512,45 +510,7 @@
512
  insightsHtml += '<p class="text-gray-400">No technology clusters identified.</p>';
513
  }
514
  insightsHtml += '</div>';
515
- // Middle column: Transitional Areas
516
- insightsHtml += '<div class="col-span-1">';
517
- insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-orange-400">Transitional Areas</h3>';
518
 
519
- if (transitionalAreas.length > 0) {
520
- insightsHtml += '<div class="space-y-4">';
521
- transitionalAreas.forEach(area => {
522
- insightsHtml += `
523
- <div class="transitional-card p-6 text-base" style="background-color: #4d3d2d;">
524
- <div class="text-orange-300 text-lg font-bold mb-3">${area.label}</div>
525
- <div class="text-gray-300 whitespace-pre-line leading-relaxed">${area.description}</div>
526
- </div>
527
- `;
528
- });
529
- insightsHtml += '</div>';
530
- } else {
531
- insightsHtml += '<p class="text-gray-400">No transitional areas identified.</p>';
532
- }
533
- insightsHtml += '</div>';
534
- // Right column: Underexplored Areas
535
- insightsHtml += '<div class="col-span-1">';
536
- insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-green-400">Underexplored Areas</h3>';
537
-
538
- if (innovationSubclusters.length > 0) {
539
- insightsHtml += '<div class="space-y-4">';
540
- innovationSubclusters.forEach(subcluster => {
541
- insightsHtml += `
542
- <div class="opportunity-card p-6 text-base">
543
- <div class="text-green-300 text-lg font-bold mb-3">${subcluster.label} (${subcluster.size} patents)</div>
544
- <div class="text-gray-300 whitespace-pre-line leading-relaxed">${subcluster.description}</div>
545
- </div>
546
- `;
547
- });
548
- insightsHtml += '</div>';
549
- } else {
550
- insightsHtml += '<p class="text-gray-400">No significant underexplored areas identified in this technology space.</p>';
551
- }
552
- insightsHtml += '</div>';
553
- insightsHtml += '</div>';
554
  $('#insights').html(insightsHtml);
555
  } else {
556
  console.warn('No insights data received');
 
4
  <title>Patent Explorer</title>
5
  <meta charset="utf-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1">
7
+ <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/tailwind.min.css" rel="stylesheet">
8
+ <div class="legend-item">
9
+ <div class="legend-dot" style="background-color: #636EFA;"></div>
10
+ <span class="text-sm">Technology Clusters</span>
11
+ </div>
12
+ </div>
13
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
14
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
15
  <style>
 
214
  <h3 class="text-lg font-semibold text-blue-300 mb-3">πŸ“ Interactive Visualization Guide</h3>
215
  <div class="text-gray-300 mb-3">
216
  <p class="mb-2"><strong>Click any point</strong> to open the corresponding Google Patents page in a new tab.</p>
217
+ <p class="mb-3"><strong>Hover over points</strong> to see detailed patent information including title, assignee, year, and abstract.</p>
218
  </div>
219
  <div class="flex flex-wrap items-center">
220
  <span class="text-sm font-medium text-gray-400 mr-3">Legend:</span>
 
476
  if (response.insights) {
477
  console.log('Displaying insights...');
478
  const clusters = response.insights.filter(i => i.type === 'cluster');
479
  console.log('Found clusters:', clusters.length);
480
 
481
  // Start with Innovation Analysis
482
  let insightsHtml = '';
 
491
  `;
492
  }
493
 
494
+ // Technology Clusters section
495
+ insightsHtml += '<div class="p-6">';
496
  insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-blue-400">Technology Clusters</h3>';
497
 
498
  if (clusters.length > 0) {
 
510
  insightsHtml += '<p class="text-gray-400">No technology clusters identified.</p>';
511
  }
512
  insightsHtml += '</div>';
513

514
  $('#insights').html(insightsHtml);
515
  } else {
516
  console.warn('No insights data received');