Commit 0fbc86f · 1 parent: 398e8e4
removed transitional areas and underexplored areas

Files changed:
- app.py (+124, -759)
- templates/index.html (+9, -49)
app.py
CHANGED
@@ -489,7 +489,7 @@ def search_patents(keywords, page_size=100):
     return all_patents

 def analyze_patent_group(patents, group_type, label, max_retries=3):
-    """Analyze patent …
+    """Analyze patent clusters using ChatGPT with improved formatting and concise output"""
     # Extract key information from all patents in the group
     patent_count = len(patents)
     years_range = f"{patents['year'].min()}-{patents['year'].max()}"
@@ -535,11 +535,8 @@ def analyze_patent_group(patents, group_type, label, max_retries=3):
     else:
         top_assignees = ", ".join(patents['assignee'].unique())

-    # Enhanced prompt
-
-    prompts = {
-        'cluster': (
-            f"""Patent cluster analysis ({patent_count} patents, {years_range}):
+    # Enhanced prompt template for cluster analysis
+    base_prompt = f"""Patent cluster analysis ({patent_count} patents, {years_range}):
 Key players: {top_assignees}
 Core technologies: {key_terms}
 Sample innovations: {example_titles}
@@ -547,40 +544,9 @@ Sample innovations: {example_titles}
 Provide concise analysis in exactly this format:
 **Technology Focus:** [What specific problem/need this cluster addresses]
 **Market Applications:** [Primary commercial uses and target industries]
-**Innovation Trajectory:** [How this technology is evolving and future direction]"""
+**Innovation Trajectory:** [How this technology is evolving and future direction]"""

-
-        ),
-        'transitional': (
-            f"""Transitional technology area ({patent_count} patents, {years_range}):
-Key players: {top_assignees}
-Bridge technologies: {key_terms}
-Sample innovations: {example_titles}
-
-Provide concise analysis in exactly this format:
-**Technology Bridge:** [Which established fields this area connects]
-**Integration Value:** [Why combining these technologies creates value]
-**Market Potential:** [Commercial opportunities from this convergence]""",
-
-            "You are a patent analyst identifying technology convergence opportunities. Focus on cross-domain innovation potential."
-        ),
-        'innovation_subcluster': (
-            f"""Underexplored technology area ({patent_count} patents, {years_range}):
-Current players: {top_assignees}
-Emerging concepts: {key_terms}
-Early innovations: {example_titles}
-
-Provide concise analysis in exactly this format:
-**Market Gap:** [Unmet need or problem this area could solve]
-**Technical Approach:** [Current methods and their limitations]
-**Innovation Opportunity:** [Specific R&D directions with commercial potential]""",
-
-            "You are a patent analyst identifying innovation opportunities. Focus on market gaps and commercial potential for R&D investment."
-        )
-    }
-
-    base_prompt = prompts[group_type][0]
-    system_prompt = prompts[group_type][1]
+    system_prompt = "You are a patent analyst providing strategic technology insights. Focus on commercial relevance and market opportunities."

     retry_count = 0
     while retry_count < max_retries:
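The collapse from a three-way `prompts` dict to a single `base_prompt`/`system_prompt` pair leaves the retry loop below untouched. As a point of reference, here is a minimal sketch of how that pair plausibly feeds the loop, assuming the legacy `openai.ChatCompletion` interface implied by `response.choices[0].message['content']` later in the file; the model name is an assumption, not shown in the diff:

```python
import re
import openai  # legacy 0.x-style interface, implied by response.choices[0].message['content']

def run_cluster_analysis(base_prompt, system_prompt, max_retries=3):
    """Call the chat model, retrying on failure, then normalize headings."""
    retry_count = 0
    while retry_count < max_retries:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",  # assumed; the diff does not name the model
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": base_prompt},
                ],
            )
            analysis = response.choices[0].message['content']
            # Normalize plain headings into the bold format the app expects
            analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
            analysis = re.sub(r'(?i)market applications:', '**Market Applications:**', analysis)
            analysis = re.sub(r'(?i)innovation trajectory:', '**Innovation Trajectory:**', analysis)
            return re.sub(r'\n\s*\n', '\n', analysis)  # collapse blank lines
        except Exception:
            retry_count += 1
    return "Analysis unavailable."
```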
@@ -603,12 +569,6 @@ Provide concise analysis in exactly this format:
             analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
             analysis = re.sub(r'(?i)market applications:', '**Market Applications:**', analysis)
             analysis = re.sub(r'(?i)innovation trajectory:', '**Innovation Trajectory:**', analysis)
-            analysis = re.sub(r'(?i)technology bridge:', '**Technology Bridge:**', analysis)
-            analysis = re.sub(r'(?i)integration value:', '**Integration Value:**', analysis)
-            analysis = re.sub(r'(?i)market potential:', '**Market Potential:**', analysis)
-            analysis = re.sub(r'(?i)market gap:', '**Market Gap:**', analysis)
-            analysis = re.sub(r'(?i)technical approach:', '**Technical Approach:**', analysis)
-            analysis = re.sub(r'(?i)innovation opportunity:', '**Innovation Opportunity:**', analysis)

             # Clean up whitespace and formatting
             analysis = re.sub(r'\n\s*\n', '\n', analysis)  # Remove multiple blank lines
@@ -632,10 +592,8 @@ def create_3d_visualization(patents):
     """
     Create a 3D visualization of patent embeddings using UMAP and Plotly
     """
-    # Initialize variables for tracking
+    # Initialize variables for tracking clusters
     df = pd.DataFrame(patents)
-    df['point_type'] = 'cluster'  # Default type for all points
-    transitional_areas = []  # Initialize empty list for transitional areas

     if not patents:
         return None
@@ -690,698 +648,165 @@ def create_3d_visualization(patents):
     df['y'] = embedding_3d[:, 1]
     df['z'] = embedding_3d[:, 2]

-    # --- …
+    # --- Simplified HDBSCAN clustering for technological clusters ---
     scaler = StandardScaler()
     scaled_embeddings = scaler.fit_transform(embedding_3d)

     n_points = len(scaled_embeddings)
     update_progress('clustering', 'processing', f'Analyzing {n_points} patents for clustering...')

-    # Dynamically set …
+    # Dynamically set clustering parameters based on dataset size
     if n_points < 100:
-        …
-        max_retries = 2
-        target_noise_ratio = 0.08
+        min_cluster_size = max(5, int(n_points * 0.08))
     elif n_points < 500:
-        …
-        max_retries = 3
-        target_noise_ratio = 0.06
+        min_cluster_size = max(8, int(n_points * 0.05))
     elif n_points < 1000:
-        …
-        max_retries = 4
-        target_noise_ratio = 0.05
+        min_cluster_size = max(15, int(n_points * 0.03))
-    else:
-        max_clusters = 15  # Increased from 12 to force more granular clustering
-        max_retries = 8  # More retries to find optimal clustering
-        target_noise_ratio = 0.03  # Keep low noise ratio
-
-    # Even more aggressive cluster parameters for large datasets
-    if n_points >= 1000:
-        min_cluster_size = max(5, int(n_points * 0.015))  # Further reduced to 1.5% for large datasets
-        min_samples = max(3, int(min_cluster_size * 0.95))  # Increased to 0.95 for even stricter formation
     else:
-        min_cluster_size = max(…
-        min_samples = max(3, int(min_cluster_size * 0.9))  # 0.9 ratio for smaller datasets
+        min_cluster_size = max(20, int(n_points * 0.02))

-    …
-    )
-    …
-            update_progress('clustering', 'processing',
-                            f'Optimizing clusters (attempt {retry + 1}/{max_retries}): ' +
-                            f'Found {n_clusters} clusters with avg size {avg_cluster_size:.1f} patents')
-
-            # Calculate a score for this clustering result
-            # Penalize both too many and too few clusters, and reward good noise ratio
-            score = -abs(n_clusters - max_clusters) + \
-                    -abs(noise_ratio - target_noise_ratio) * 10 + \
-                    -abs(avg_cluster_size - (n_points / max_clusters)) / 10
-
-            if score > best_score:
-                best_score = score
-                best_result = (clusters, n_clusters, n_noise, noise_ratio, avg_cluster_size)
-
-            # Adjust parameters based on results
-            if n_clusters > max_clusters:
-                print("Too many clusters, increasing parameters more aggressively...")
-                min_cluster_size = int(min_cluster_size * 1.5)  # More aggressive increase
-                min_samples = int(min_samples * 1.4)
-            elif n_clusters == 1 and avg_cluster_size > len(clusters) * 0.8:
-                print("Single dominant cluster detected, adjusting for better separation...")
-                min_cluster_size = max(5, int(min_cluster_size * 0.6))  # More aggressive decrease
-                min_samples = max(3, int(min_samples * 0.6))
-            elif n_noise < target_noise * 0.5:
-                print("Too few noise points, adjusting parameters...")
-                min_cluster_size = int(min_cluster_size * 1.2)
-                min_samples = max(3, int(min_samples * 0.8))
-            elif n_clusters < max_clusters * 0.5:
-                print("Too few clusters, decreasing parameters...")
-                min_cluster_size = max(5, int(min_cluster_size * 0.8))
-                min_samples = max(3, int(min_samples * 0.7))
-            else:
-                print("Acceptable clustering found.")
-                break
-
-    …
+    min_samples = max(3, int(min_cluster_size * 0.7))
+
+    print(f"HDBSCAN clustering: min_cluster_size={min_cluster_size}, min_samples={min_samples}")
+
+    # Apply HDBSCAN clustering
+    hdb = hdbscan.HDBSCAN(
+        min_cluster_size=min_cluster_size,
+        min_samples=min_samples,
+        cluster_selection_epsilon=0.1,
+        cluster_selection_method='eom',
+        metric='euclidean'
+    )
+    clusters = hdb.fit_predict(scaled_embeddings)
+
+    # Assign noise points to nearest cluster
+    noise_mask = clusters == -1
+    if any(noise_mask) and len(set(clusters)) > 1:
+        print(f"Assigning {sum(noise_mask)} noise points to nearest clusters...")
+        # Get cluster centers
+        cluster_centers = []
+        cluster_labels = []
+        for label in set(clusters):
+            if label != -1:
+                cluster_mask = clusters == label
+                center = np.mean(scaled_embeddings[cluster_mask], axis=0)
+                cluster_centers.append(center)
+                cluster_labels.append(label)
+
+        if cluster_centers:
+            cluster_centers = np.array(cluster_centers)
+            noise_points = scaled_embeddings[noise_mask]
+
+            # Find nearest cluster for each noise point
+            nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
+            _, nearest_indices = nbrs.kneighbors(noise_points)
+
+            # Assign noise points to nearest clusters
+            noise_indices = np.where(noise_mask)[0]
+            for i, nearest_idx in enumerate(nearest_indices.flatten()):
+                clusters[noise_indices[i]] = cluster_labels[nearest_idx]

     df['cluster'] = clusters

-    # --- …
+    # --- Gather clusters and analyze them ---
     cluster_info = []
+    n_clusters = len(set(clusters))
+
     for label in set(clusters):
-        …
-        cluster_info.append((label, len(cluster_patents), cluster_patents))
+        cluster_mask = clusters == label
+        cluster_patents = df[cluster_mask]
+        if len(cluster_patents) > 0:
+            cluster_info.append((label, len(cluster_patents), cluster_patents))

     # Sort clusters by size in descending order
     cluster_info.sort(key=lambda x: x[1], reverse=True)

+    print(f"\nFinal Clustering Results:")
+    print(f"Number of technological clusters: {n_clusters}")
+    print(f"Total patents clustered: {len(df)}")
     print("\nCluster Size Distribution:")
     for i, (label, size, _) in enumerate(cluster_info):
-        print(f"Cluster {i…
+        print(f"Cluster {i + 1}: {size} patents")

-    # Create mapping for new cluster IDs
-    cluster_id_map = {old_label: i for i, (old_label, _, _) in enumerate(cluster_info)}
+    # Create mapping for new cluster IDs (1-based)
+    cluster_id_map = {old_label: i + 1 for i, (old_label, _, _) in enumerate(cluster_info)}

-    # Update cluster IDs in DataFrame
+    # Update cluster IDs in DataFrame to be 1-based
     new_clusters = clusters.copy()
     for old_label, new_label in cluster_id_map.items():
         new_clusters[clusters == old_label] = new_label
     df['cluster'] = new_clusters

-    update_progress('clustering', 'processing', '…
+    update_progress('clustering', 'processing', 'Analyzing technological clusters...')

-    # …
-
-    cluster_insights = []  # Initialize insights list
-
-    # First handle clustered points
+    # Analyze each cluster
+    cluster_insights = []
     total_clusters = len(cluster_info)
-    for …
-        …
+    for i, (_, size, cluster_patents) in enumerate(cluster_info):
+        cluster_id = i + 1  # 1-based cluster ID
+        update_progress('clustering', 'processing', f'Analyzing cluster {cluster_id} of {total_clusters} ({size} patents)...')
+        description = analyze_patent_group(cluster_patents, 'cluster', cluster_id)
         cluster_insights.append({
             'type': 'cluster',
-            'id': …
+            'id': cluster_id,
             'size': size,
-            'label': f"Cluster {…
+            'label': f"Cluster {cluster_id}",
             'description': description
         })

-    # --- Improved two-stage density analysis for noise points ---
-    noise_mask = df['cluster'] == -1
-    noise_points = scaled_embeddings[noise_mask]
-    noise_indices = df[noise_mask].index
-    dense_noise_indices = []  # Initialize empty list for dense noise points
-    true_sparse_indices = []  # Initialize empty list for sparse points
-
-    if len(noise_points) >= 3:
-        update_progress('clustering', 'processing', f'Analyzing {len(noise_points)} potential underexplored areas...')
-        print(f"\nStructural Analysis for Underexplored Area Detection:")
-
-        # Initialize sparse indices
-        true_sparse_indices = []
-
-        # Stage 1: Calculate local and global density metrics
-        n_neighbors = min(max(5, int(len(noise_points) * 0.05)), 15)
-        print(f"Using {n_neighbors} nearest neighbors for density calculation")
-
-        # Calculate local density for noise points
-        nbrs_local = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean').fit(noise_points)
-        local_distances, local_indices = nbrs_local.kneighbors(noise_points)
-        local_densities = 1 / (np.mean(local_distances, axis=1) + 1e-6)  # Add small epsilon to avoid division by zero
-
-        # Calculate distances to cluster centers and their densities
-        cluster_centers = []
-        cluster_densities = []  # Store density of each cluster
-        for label in set(clusters) - {-1}:
-            cluster_mask = clusters == label
-            cluster_points = scaled_embeddings[cluster_mask]
-            center = np.mean(cluster_points, axis=0)
-            cluster_centers.append(center)
-
-            # Calculate cluster density using its member points
-            if len(cluster_points) > 1:
-                nbrs_cluster = NearestNeighbors(n_neighbors=min(5, len(cluster_points))).fit(cluster_points)
-                cluster_dists, _ = nbrs_cluster.kneighbors(cluster_points)
-                cluster_density = 1 / (np.mean(cluster_dists) + 1e-6)
-            else:
-                cluster_density = 0
-            cluster_densities.append(cluster_density)
-
-        cluster_centers = np.array(cluster_centers)
-        cluster_densities = np.array(cluster_densities)
-
-        if len(cluster_centers) > 0:
-            # Calculate distances and density ratios to nearest clusters
-            nbrs_clusters = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(cluster_centers)
-            cluster_distances, nearest_cluster_indices = nbrs_clusters.kneighbors(noise_points)
-            cluster_distances = cluster_distances.flatten()
-
-            # Get density of nearest cluster for each point
-            nearest_cluster_densities = cluster_densities[nearest_cluster_indices.flatten()]
-
-            # Calculate density ratios (local density / nearest cluster density)
-            density_ratios = local_densities / (nearest_cluster_densities + 1e-6)
-
-            print("\nDensity Analysis Statistics:")
-            print(f"Mean local density: {np.mean(local_densities):.3f}")
-            print(f"Mean cluster density: {np.mean(cluster_densities):.3f}")
-            print(f"Mean density ratio: {np.mean(density_ratios):.3f}")
-
-            # Identify structural gaps using multiple criteria with more sensitive thresholds
-            # 1. Density Isolation: Points with very low density compared to clusters
-            # 2. Spatial Isolation: Points far from both clusters and other noise points
-            # 3. Structural Stability: Points whose local neighborhood is also sparse
-
-            # Calculate isolation scores with more balanced thresholds
-            density_isolation = density_ratios < np.percentile(density_ratios, 65)  # More balanced threshold
-            spatial_isolation = cluster_distances > np.percentile(cluster_distances, 50)  # Median distance threshold
-
-            # Calculate structural stability with more balanced criteria
-            structural_stability = np.zeros(len(noise_points), dtype=bool)
-            for i, neighbors in enumerate(local_indices):
-                neighbor_densities = local_densities[neighbors]
-                # Point is stable if its neighborhood is relatively sparse
-                structural_stability[i] = np.mean(neighbor_densities) < np.percentile(local_densities, 50)  # Use median
-
-            # Use more balanced criteria - only need to meet any 1 of 3 criteria initially
-            candidate_sparse_indices = [
-                idx for i, idx in enumerate(noise_indices)
-                if sum([density_isolation[i], spatial_isolation[i], structural_stability[i]]) >= 1  # Only need 1 out of 3 criteria
-            ]
-
-            # Start by assuming all non-candidate points are dense noise
-            dense_noise_indices = [idx for idx in noise_indices if idx not in candidate_sparse_indices]
-
-            # Now calculate distances between candidates and dense noise points with more sensitive threshold
-            min_distance_threshold = np.percentile(cluster_distances, 40)  # More sensitive threshold
-            # Filter candidates based on distance from dense noise regions
-            if len(candidate_sparse_indices) > 0 and len(dense_noise_indices) > 0:
-                dense_noise_points = scaled_embeddings[dense_noise_indices]
-                true_sparse_indices = []
-
-                for idx in candidate_sparse_indices:
-                    point = scaled_embeddings[idx].reshape(1, -1)
-                    distances_to_dense = NearestNeighbors(n_neighbors=1).fit(dense_noise_points).kneighbors(point)[0][0]
-                    if distances_to_dense > min_distance_threshold:
-                        true_sparse_indices.append(idx)
-
-                # Update dense_noise_indices to include rejected candidates
-                rejected_indices = [idx for idx in candidate_sparse_indices if idx not in true_sparse_indices]
-                dense_noise_indices.extend(rejected_indices)
-            else:
-                true_sparse_indices = candidate_sparse_indices
-        else:
-            # Fallback using only local density analysis
-            density_threshold = np.percentile(local_densities, 25)  # Bottom 25% sparsest points
-            true_sparse_indices = [idx for i, idx in enumerate(noise_indices)
-                                   if local_densities[i] < density_threshold]
-            dense_noise_indices = [idx for idx in noise_indices if idx not in true_sparse_indices]
-
-        print(f"\nFinal Classification:")
-        print(f"True underexplored areas identified: {len(true_sparse_indices)}")
-        print(f"Transitional areas identified: {len(dense_noise_indices)}")
-        if len(true_sparse_indices) > 0:
-            print(f"Underexplored area ratio: {len(true_sparse_indices)/len(noise_points):.2%}")
-        print("\nUnderexplored Area Criteria Used:")
-        print("1. Density Isolation: Significantly lower density than nearest cluster")
-        print("2. Spatial Isolation: Far from both clusters and other points")
-        print("3. Structural Stability: Forms stable sparse regions with neighbors")
-
-        # Update point types in DataFrame for sparse points and dense noise
-        for idx in true_sparse_indices:
-            df.at[idx, 'point_type'] = 'sparse'
-        for idx in dense_noise_indices:
-            df.at[idx, 'point_type'] = 'dense_noise'
-
-    # --- Handle dense noise points as transitional areas ---
-    transitional_areas = []  # Store transitional areas for sorting
-    if len(dense_noise_indices) >= 3:
-        update_progress('clustering', 'processing', f'Analyzing {len(dense_noise_indices)} potential transitional areas...')
-        print("\nAnalyzing dense noise points as transitional areas...")
-        dense_noise_points = scaled_embeddings[dense_noise_indices]
-
-        # Use HDBSCAN to find subgroups within transitional areas
-        min_size = max(3, len(dense_noise_points) // 10)
-        print(f"Attempting to identify transitional area subgroups with min_size={min_size}")
-
-        hdb_dense = hdbscan.HDBSCAN(
-            min_cluster_size=min_size,
-            min_samples=max(2, min_size // 2),
-            cluster_selection_epsilon=0.3,
-            cluster_selection_method='leaf'
-        )
-        dense_labels = hdb_dense.fit_predict(dense_noise_points)
-
-        # Count potential transitional areas
-        unique_dense_labels = set(dense_labels) - {-1}
-        n_transitional = len(unique_dense_labels)
-        print(f"Found {n_transitional} distinct transitional areas")
-
-        # First get all transitional points, including scattered ones
-        all_transitional_points = {}
-        # Count sizes first
-        label_sizes = {}
-        for label in dense_labels:
-            if label != -1:
-                label_sizes[label] = label_sizes.get(label, 0) + 1
-
-        # Then collect points with their pre-calculated sizes
-        for i, label in enumerate(dense_labels):
-            idx = dense_noise_indices[i]
-            if label != -1:  # Regular transitional area
-                if label not in all_transitional_points:
-                    all_transitional_points[label] = {'indices': [], 'size': label_sizes[label]}
-                all_transitional_points[label]['indices'].append(idx)
-            else:  # Scattered points
-                label_key = 'scattered'
-                if label_key not in all_transitional_points:
-                    all_transitional_points[label_key] = {'indices': [], 'size': 0}
-                all_transitional_points[label_key]['indices'].append(idx)
-                all_transitional_points[label_key]['size'] += 1
-
-        # Sort transitional areas by size and create insights
-        # Filter out areas that are too small and sort by size
-        min_area_size = 3  # Minimum size for a valid transitional area
-        valid_areas = [(k, v) for k, v in all_transitional_points.items()
-                       if k != 'scattered' and v['size'] >= min_area_size]
-        sorted_areas = sorted(valid_areas, key=lambda x: x[1]['size'], reverse=True)
-
-        # Add regular transitional areas to insights
-        total_areas = len(sorted_areas)
-        for area_idx, (label, area_info) in enumerate(sorted_areas):
-            update_progress('clustering', 'processing', f'Analyzing transitional area {area_idx + 1} of {total_areas} ({area_info["size"]} patents)...')
-            area_patents = df.iloc[area_info['indices']]
-            description = analyze_patent_group(area_patents, 'transitional', label)
-            area_number = area_idx + 1  # 1-based numbering for display
-
-            # Create label without duplicate size info
-            area_label = f"Transitional Area {area_number}"
-            transitional_areas.append({
-                'label': area_label,
-                'indices': area_info['indices'],
-                'size': area_info['size'],
-                'patents': area_patents,
-                'description': description
-            })
-            area_insight = {
-                'type': 'transitional',
-                'id': area_idx + 1,  # Store as 1-based ID
-                'size': area_info['size'],
-                'label': f"{area_label} ({area_info['size']} patents)",
-                'description': description
-            }
-            cluster_insights.append(area_insight)
-
-        # Handle scattered points by analyzing them individually
-        if 'scattered' in all_transitional_points:
-            scattered_indices = all_transitional_points['scattered']['indices']
-            if len(scattered_indices) > 0:
-                print(f"\nAnalyzing {len(scattered_indices)} scattered points...")
-                scattered_points = scaled_embeddings[scattered_indices]
-
-                # Calculate distances to nearest cluster and transitional area
-                distances_to_clusters = []
-                distances_to_transitional = []
-
-                print("\nDistance analysis for each scattered point:")
-                point_counter = 0
-
-                # First calculate all distances
-                for point in scattered_points:
-                    point = point.reshape(1, -1)
-                    # Distance to nearest cluster
-                    if len(cluster_centers) > 0:
-                        dist_cluster = NearestNeighbors(n_neighbors=1).fit(cluster_centers).kneighbors(point)[0][0][0]
-                    else:
-                        dist_cluster = float('inf')
-
-                    # Distance to nearest transitional area (excluding scattered points)
-                    if len(dense_noise_points) > 0:
-                        # Get only the transitional area points (excluding scattered points)
-                        transitional_points = []
-                        for i, point_idx in enumerate(dense_noise_indices):
-                            if point_idx not in scattered_indices:
-                                transitional_points.append(dense_noise_points[i])
-
-                        if transitional_points:
-                            transitional_points = np.array(transitional_points)
-                            nbrs_trans = NearestNeighbors(n_neighbors=1).fit(transitional_points)
-                            dist_trans = nbrs_trans.kneighbors(point.reshape(1, -1))[0][0][0]
-                        else:
-                            dist_trans = float('inf')
-                    else:
-                        dist_trans = float('inf')
-
-                    # Store distances for ratio calculation
-                    distances_to_clusters.append(dist_cluster)
-                    distances_to_transitional.append(dist_trans)
-
-                total_classified_as_gaps = 0
-                total_classified_as_transitional = 0
-
-                # Use more aggressive thresholds for scattered points
-                cluster_distance_threshold = np.percentile(distances_to_clusters, 35)  # Even more lenient
-                transitional_distance_threshold = np.percentile(distances_to_transitional, 35)  # Even more lenient
-
-                print(f"\nClassification thresholds:")
-                print(f"- Cluster distance threshold: {cluster_distance_threshold:.3f}")
-                print(f"- Transitional distance threshold: {transitional_distance_threshold:.3f}")
-
-                # Classify scattered points
-                for idx, (dist_c, dist_t) in zip(scattered_indices, zip(distances_to_clusters, distances_to_transitional)):
-                    # 1. Check absolute distances with more lenient thresholds
-                    cluster_dist_threshold = np.percentile(distances_to_clusters, 60)  # Use 60th percentile
-                    trans_dist_threshold = np.percentile(distances_to_transitional, 60)  # Use 60th percentile
-
-                    # Point is isolated if it's farther than median distance from both clusters and transitional areas
-                    is_isolated = (dist_c > cluster_dist_threshold or dist_t > trans_dist_threshold)
-
-                    # 2. Calculate isolation based on absolute difference rather than ratio
-                    isolation_diff = dist_t - dist_c  # Positive means farther from transitional areas
-                    is_relatively_isolated = isolation_diff > 0  # Any positive difference counts
-
-                    # 3. Simplified region formation check
-                    nearby_transitional = sum(1 for d in distances_to_transitional if d < trans_dist_threshold)
-                    nearby_clusters = sum(1 for d in distances_to_clusters if d < cluster_dist_threshold)
-
-                    # Point forms new region if it has any cluster neighbors
-                    forms_new_region = nearby_clusters > 0
-
-                    # Classification decision and immediate DataFrame update
-                    # More lenient classification - if the point is isolated OR relatively isolated, mark as gap
-                    if is_isolated or is_relatively_isolated:
-                        true_sparse_indices.append(idx)
-                        df.at[idx, 'point_type'] = 'sparse'  # Immediately update DataFrame
-                        total_classified_as_gaps += 1
-                    else:
-                        dense_noise_indices.append(idx)
-                        df.at[idx, 'point_type'] = 'dense_noise'  # Immediately update DataFrame
-                        total_classified_as_transitional += 1
-
-                print(f"\nFinal classification summary for scattered points:")
-                print(f"- Total scattered points: {len(scattered_indices)}")
-                print(f"- Classified as underexplored areas: {total_classified_as_gaps}")
-                print(f"- Classified as transitional: {total_classified_as_transitional}")
-                if total_classified_as_gaps == 0:
-                    print("\nWarning: No scattered points were classified as underexplored areas!")
-                    print("Possible reasons:")
-                    print("1. Distance thresholds may be too high")
-                    print("2. Relative distance ratio may be too strict")
-                    print("3. Nearby points criterion may be too restrictive")
-
-                if total_classified_as_transitional > 0:
-                    # Create a transitional area for scattered points
-                    scattered_transitional_patents = df.iloc[dense_noise_indices[-total_classified_as_transitional:]]
-                    description = analyze_patent_group(scattered_transitional_patents, 'transitional', 'scattered')
-                    area_number = len(transitional_areas) + 1  # 1-based numbering for display
-
-                    # Add to transitional areas
-                    area_label = f"Transitional Area {area_number}"
-                    transitional_areas.append({
-                        'label': area_label,
-                        'indices': dense_noise_indices[-total_classified_as_transitional:],
-                        'size': total_classified_as_transitional,
-                        'patents': scattered_transitional_patents,
-                        'description': description
-                    })
-
-                    # Add to insights
-                    area_insight = {
-                        'type': 'transitional',
-                        'id': -1,  # Special ID for scattered points
-                        'size': total_classified_as_transitional,
-                        'label': f"{area_label} ({total_classified_as_transitional} patents)",
-                        'description': description
-                    }
-                    cluster_insights.append(area_insight)
-
-    # --- Analyze underexplored areas ---
-    if len(true_sparse_indices) > 0:
-        update_progress('clustering', 'processing', f'Analyzing {len(true_sparse_indices)} potential underexplored areas...')
-        print(f"\nProcessing {len(true_sparse_indices)} underexplored areas...")
-        sparse_patents = df.iloc[true_sparse_indices]
-        sparse_points = scaled_embeddings[true_sparse_indices]
-
-        # Ensure points are marked as sparse in the DataFrame
-        df.loc[true_sparse_indices, 'point_type'] = 'sparse'
-
-        # More lenient subclustering parameters for underexplored areas
-        min_subcluster_size = max(2, min(5, len(true_sparse_indices) // 10))  # More lenient minimum size
-        sparse_clusterer = hdbscan.HDBSCAN(
-            min_cluster_size=min_subcluster_size,
-            min_samples=1,  # Most lenient possible
-            cluster_selection_epsilon=0.8,  # Even more lenient
-            cluster_selection_method='leaf',  # Changed to leaf for finer subcluster detection
-            metric='euclidean'
-        )
-        sparse_labels = sparse_clusterer.fit_predict(sparse_points)
-
-        # Collect innovation subclusters for sorting
-        innovation_subclusters = []
-        for label in set(sparse_labels):
-            subcluster_mask = sparse_labels == label
-            subcluster_patents = sparse_patents[subcluster_mask]
-            subcluster_size = len(subcluster_patents)
-
-            # Accept all subclusters, even single points
-            description = analyze_patent_group(subcluster_patents, 'innovation_subcluster', label)
-            innovation_subclusters.append({
-                'label': label,
-                'size': subcluster_size,
-                'patents': subcluster_patents,
-                'description': description
-            })
-
-        # Sort innovation subclusters by size in descending order
-        innovation_subclusters.sort(key=lambda x: x['size'], reverse=True)
-
-        # Add sorted innovation subclusters to insights
-        total_subclusters = len(innovation_subclusters)
-        for idx, subcluster in enumerate(innovation_subclusters):
-            update_progress('clustering', 'processing', f'Analyzing underexplored area opportunity {idx + 1} of {total_subclusters} ({subcluster["size"]} patents)...')
-            cluster_insights.append({
-                'type': 'innovation_subcluster',
-                'id': idx + 1,  # Store as 1-based ID
-                'size': subcluster['size'],
-                'label': f"Underexplored Area {idx + 1}",
-                'description': subcluster['description']
-            })
-    else:
-        cluster_insights.append({
-            'type': 'innovation_subcluster',
-            'id': -1,
-            'size': 0,
-            'label': 'No Underexplored Areas',
-            'description': 'No significant underexplored areas were detected in this technology space.'
-        })
-
     update_progress('visualization', 'processing', 'Creating interactive plot...')

-    # Create Plotly figure with clusters
-    # Ensure all points are properly categorized
-    unassigned_mask = df['point_type'] == 'unassigned'
-    if any(unassigned_mask):
-        print(f"Warning: {sum(unassigned_mask)} points remain unassigned")
-        df.loc[unassigned_mask, 'point_type'] = 'cluster'  # Default unassigned to clusters
-
-    # Separate points into three categories: clusters, underexplored areas, and dense noise
-    cluster_mask = df['point_type'] == 'cluster'
-    innovation_gaps_mask = df['point_type'] == 'sparse'
-    dense_noise_mask = df['point_type'] == 'dense_noise'
+    # Create Plotly figure with clusters only

     # Create hover text for all points
     hover_text = []
-    # Create mapping for underexplored area points to their numbers
-    innovation_gap_map = {}
-
-    # Map underexplored areas using the analyzed subclusters to ensure consistent numbering
-    if len(true_sparse_indices) > 0:
-        for idx, subcluster in enumerate(innovation_subclusters, 1):
-            for patent in subcluster['patents'].index:
-                innovation_gap_map[patent] = idx
-
-    # Create mapping for transitional areas
-    transitional_area_map = {}
-    for area_idx, area in enumerate(transitional_areas):
-        for idx in area['indices']:
-            transitional_area_map[idx] = {'number': area_idx + 1}
-
-    # Generate hover text for each point
     for idx, row in df.iterrows():
-        point_info = ""
-        if row['point_type'] == 'sparse':
-            gap_number = innovation_gap_map.get(idx)
-            if gap_number:
-                point_info = f"<br><b>Region:</b> Underexplored Area {gap_number}"
-            else:
-                point_info = "<br><b>Region:</b> Potential Innovation Area"
-        elif row['point_type'] == 'dense_noise':
-            area_info = transitional_area_map.get(idx)
-            if area_info:
-                point_info = f"<br><b>Region:</b> Transitional Area {area_info['number']}"
-            else:
-                # This is a scattered transitional point
-                point_info = f"<br><b>Region:</b> Transitional Area {len(transitional_areas)} (Scattered)"
-        else:
-            point_info = f"<br><b>Cluster:</b> {int(row['cluster']) + 1}"  # Cluster IDs are still 0-based in the DataFrame
-
         text = (
             f"<b>{row['title']}</b><br><br>"
             f"<b>By:</b> {row['assignee']} ({row['year']})<br>"
-            f"{…
+            f"<b>Cluster:</b> {int(row['cluster'])}<br><br>"
             f"<b>Abstract:</b><br>{row['abstract']}"
         )
         hover_text.append(text)

-    # Create …
+    # Create single trace for all clusters
     cluster_trace = go.Scatter3d(
-        x=df[…
-        y=df[…
-        z=df[…
+        x=df['x'],
+        y=df['y'],
+        z=df['z'],
         mode='markers',
         marker=dict(
             size=6,
-            color=…
+            color=df['cluster'],
             colorscale='Viridis',
-            opacity=0.…
+            opacity=0.7,
             showscale=True,
             colorbar=dict(
-                title="Clusters",
-                …
+                title="Technology Clusters",
+                tickmode="linear",
+                tick0=1,
+                dtick=1,
                 tickfont=dict(size=10),
-                titlefont=dict(size=…
+                titlefont=dict(size=12)
             )
         ),
-        text=…
+        text=hover_text,
         hoverinfo='text',
-        name='Clusters',
+        name='Technology Clusters',
         hoverlabel=dict(
             bgcolor="white",
             font_size=12,
             font_family="Arial",
             align="left"
         ),
-        customdata=…
+        customdata=df['link'].tolist()
     )

-    innovation_gaps_trace = go.Scatter3d(
-        x=df[innovation_gaps_mask]['x'],
-        y=df[innovation_gaps_mask]['y'],
-        z=df[innovation_gaps_mask]['z'],
-        mode='markers',
-        marker=dict(
-            size=6,  # Same size as other points
-            color='rgb(255, 0, 0)',  # Pure bright red
-            symbol='diamond',
-            opacity=1.0,  # Full opacity for visibility
-            line=dict(
-                color='white',
-                width=1  # Thinner border to match other points
-            )
-        ),
-        text=[hover_text[i] for i in range(len(hover_text)) if innovation_gaps_mask[i]],
-        hoverinfo='text',
-        name='Underexplored Areas',
-        hoverlabel=dict(
-            bgcolor="white",
-            font_size=12,
-            font_family="Arial",
-            align="left"
-        ),
-        customdata=[df['link'].tolist()[i] for i in range(len(df)) if innovation_gaps_mask[i]]
-    )
-
-    dense_noise_trace = go.Scatter3d(
-        x=df[dense_noise_mask]['x'],
-        y=df[dense_noise_mask]['y'],
-        z=df[dense_noise_mask]['z'],
-        mode='markers',
-        marker=dict(
-            size=6,  # Same size as other points
-            color='rgb(255, 165, 0)',  # Orange for transitional areas
-            symbol='circle',
-            opacity=0.7,  # Less opacity to make gaps more visible
-            line=dict(
-                color='white',
-                width=1  # Thin border
-            )
-        ),
-        text=[hover_text[i] for i in range(len(hover_text)) if dense_noise_mask[i]],
-        hoverinfo='text',
-        name='Transitional Areas',
-        hoverlabel=dict(
-            bgcolor="white",
-            font_size=12,
-            font_family="Arial",
-            align="left"
-        ),
-        customdata=[df['link'].tolist()[i] for i in range(len(df)) if dense_noise_mask[i]]
-    )
-
-    fig = go.Figure(data=[cluster_trace, innovation_gaps_trace, dense_noise_trace])
+    fig = go.Figure(data=[cluster_trace])

     # Update layout
     fig.update_layout(
-        title="Patent Technology Landscape",
+        title="Patent Technology Landscape - Cluster Analysis",
         scene=dict(
             xaxis_title="UMAP 1",
             yaxis_title="UMAP 2",
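Taken together, the replacement strategy is: one HDBSCAN pass with size-scaled parameters, then fold any remaining noise points onto their nearest cluster centroid so that every patent ends up with a cluster id (the old code instead retried HDBSCAN and routed noise into transitional and underexplored buckets). A self-contained sketch of that flow, assuming `embeddings` is the (n, 3) UMAP output used above:

```python
import numpy as np
import hdbscan
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

def cluster_patents(embeddings: np.ndarray) -> np.ndarray:
    """Scale, cluster with size-dependent HDBSCAN parameters, then fold noise into clusters."""
    X = StandardScaler().fit_transform(embeddings)
    n = len(X)
    # Size-scaled minimum cluster size, mirroring the thresholds in the diff
    if n < 100:
        min_cluster_size = max(5, int(n * 0.08))
    elif n < 500:
        min_cluster_size = max(8, int(n * 0.05))
    elif n < 1000:
        min_cluster_size = max(15, int(n * 0.03))
    else:
        min_cluster_size = max(20, int(n * 0.02))

    labels = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=max(3, int(min_cluster_size * 0.7)),
        cluster_selection_epsilon=0.1,
        cluster_selection_method='eom',
    ).fit_predict(X)

    noise = labels == -1
    if noise.any() and len(set(labels)) > 1:
        # Reassign each noise point to the nearest cluster centroid
        kept = sorted(set(labels) - {-1})
        centers = np.array([X[labels == k].mean(axis=0) for k in kept])
        _, nearest = NearestNeighbors(n_neighbors=1).fit(centers).kneighbors(X[noise])
        labels[noise] = np.array(kept)[nearest.ravel()]
    return labels
```

Nearest-centroid reassignment guarantees full coverage, at the cost of stretching cluster boundaries: points HDBSCAN would have called noise simply inherit the label of whichever cluster center happens to be closest.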
@@ -1389,28 +814,16 @@ def create_3d_visualization(patents):
             camera=dict(
                 up=dict(x=0, y=0, z=1),
                 center=dict(x=0, y=0, z=0),
-                eye=dict(x=1.8, y=1.8, z=1.8)
+                eye=dict(x=1.8, y=1.8, z=1.8)
             ),
-            aspectmode='cube'
+            aspectmode='cube'
         ),
         margin=dict(l=0, r=0, b=0, t=30),
-        showlegend=…
+        showlegend=False,  # Single trace doesn't need legend
         template="plotly_dark",
         hoverlabel_align='left',
         hoverdistance=100,
-        hovermode='closest',
-        legend=dict(
-            yanchor="top",
-            y=0.99,
-            xanchor="left",
-            x=0.01,
-            bgcolor="rgba(0,0,0,0.7)",  # Darker background for better contrast
-            font=dict(
-                color="white",
-                size=12
-            ),
-            itemsizing='constant'  # Keep legend marker sizes consistent
-        )
+        hovermode='closest'
     )

     # Configure hover behavior
@@ -1455,7 +868,7 @@ def generate_analysis(prompt, cluster_insights):
         analysis = response.choices[0].message['content']

         # Validate that analysis references valid areas
-        area_pattern = r'(?:Cluster…
+        area_pattern = r'(?:Cluster)\s+(\d+)'
         referenced_areas = set(int(num) for num in re.findall(area_pattern, analysis))

         # Extract valid area numbers from insights
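The narrowed pattern only accepts cluster references now; the old pattern is truncated in the diff, so its exact alternatives are not recoverable, but the removed area types no longer validate. A quick illustration of what the new pattern matches:

```python
import re

area_pattern = r'(?:Cluster)\s+(\d+)'
analysis = "Cluster 2 overlaps with Cluster 5; Transitional Area 1 is gone."
referenced_areas = set(int(num) for num in re.findall(area_pattern, analysis))
print(referenced_areas)  # {2, 5} - "Transitional Area 1" no longer matches
```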
@@ -1478,92 +891,70 @@ def generate_analysis(prompt, cluster_insights):

 def analyze_innovation_opportunities(cluster_insights):
     """
-    Analyze …
-    Returns focused analysis of high-value innovation opportunities between …
+    Analyze technology clusters to identify potential innovation opportunities.
+    Returns focused analysis of high-value innovation opportunities within and between technology clusters.
     """
     # Extract cluster numbers and validate
     cluster_nums = set()
-    transitional_nums = set()
-    underexplored_nums = set()

     # Parse and validate cluster numbers with explicit error checking
     for insight in cluster_insights:
         area_type = insight.get('type', '')
         area_id = insight.get('id', -1)

-        if …
-            continue
-
-        if area_type == 'cluster':
+        if area_type == 'cluster' and area_id > 0:
             cluster_nums.add(area_id)
-        elif area_type == 'transitional':
-            transitional_nums.add(area_id)
-        elif area_type == 'innovation_subcluster':
-            if area_id >= 1:  # Skip the "No underexplored areas" entry
-                underexplored_nums.add(area_id)
-
-    # Format areas with validation
-    def format_area_list(area_nums):
-        return f"Areas {', '.join(str(n) for n in sorted(area_nums))}" if area_nums else "None identified"

-    # Only generate analysis if we have …
-    if not …
-        return "No …
+    # Only generate analysis if we have clusters to analyze
+    if not cluster_nums:
+        return "No technology clusters found. Try broadening search terms or increasing patent count."

-    # Create descriptions list with …
+    # Create descriptions list with cluster information
     descriptions = []
     cluster_details = {}
-    transitional_details = {}
-    underexplored_details = {}

     for insight in cluster_insights:
-        if insight.get('description'):
-            area_type = insight.get('type', '')
+        if insight.get('description') and insight.get('type') == 'cluster':
             area_id = int(insight.get('id', -1))  # 1-based IDs
             area_size = insight.get('size', 0)

-            …
-            cluster_details[area_id] = {'description': insight['description'], 'size': area_size}
-            elif area_type == 'transitional':
-                desc = f"T{area_id}:{insight['description']}"
-                descriptions.append(desc)
-                transitional_details[area_id] = {'description': insight['description'], 'size': area_size}
-            elif area_type == 'innovation_subcluster' and insight['id'] >= 1:
-                desc = f"U{area_id}:{insight['description']}"
-                descriptions.append(desc)
-                underexplored_details[area_id] = {'description': insight['description'], 'size': area_size}
+            desc = f"C{area_id}:{insight['description']}"
+            descriptions.append(desc)
+            cluster_details[area_id] = {'description': insight['description'], 'size': area_size}

     # Format descriptions as a string with newlines
     descriptions_text = '\n'.join(descriptions)

-    prompt = f"""Available …
-Clusters: {…
-
-
-Area Descriptions:
+    prompt = f"""Technology Clusters Available:
+Clusters: {', '.join(f'Cluster {n}' for n in sorted(cluster_nums))}
+
+Cluster Descriptions:
 {descriptions_text}
-
-
-…
+
+I need you to identify 3-4 high-value innovation opportunities in this patent technology landscape. Focus on creating REAL business value through either:
+A) Cross-pollinating technologies between different clusters, OR
+B) Identifying innovation gaps within individual clusters
+
 For each opportunity:
-1. Select either ONE …
-2. Identify a specific technical or market gap
+1. Select either ONE cluster with internal innovation potential OR two complementary clusters that can be combined
+2. Identify a specific technical or market gap within or between the selected clusters
 3. Propose a concrete solution that addresses this gap
 4. Quantify potential business impact and competitive advantage
+
 Follow this precise format:
 Opportunity N: [Title that describes the innovation]
-Source: [Single …
+Source: [Single cluster (e.g., "Cluster 2") OR combination (e.g., "Cluster 1 + Cluster 3")]
 - Gap: [Specific technical or market gap that represents an unmet need]
 - Solution: [Practical, implementable technical approach]
 - Impact: [Specific business value creation - market size, efficiency gains, cost reduction]
 - Timeline: [Short-term (1-2 years) or medium-term (3-5 years)]
+
 Prioritize opportunities based on:
 1. Commercial potential (market size, growth potential)
 2. Technical feasibility (can be implemented with current or near-term technology)
 3. Competitive advantage (uniqueness, barriers to entry)
 4. Alignment with industry trends (sustainability, automation, digitalization)
+
 Focus on practical innovations that could realistically be implemented by a company rather than theoretical or speculative concepts."""

     # Get analysis from LLM
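To see what the rewritten function now feeds the LLM, here is a small walk-through of the filtering above with invented sample insights (ids, sizes, and descriptions are illustrative only):

```python
cluster_insights = [
    {'type': 'cluster', 'id': 1, 'size': 42, 'description': 'Battery electrode coatings.'},
    {'type': 'cluster', 'id': 2, 'size': 17, 'description': 'Solid-state electrolytes.'},
    {'type': 'transitional', 'id': 1, 'size': 5, 'description': 'Legacy entry, now ignored.'},
]

cluster_nums = {i['id'] for i in cluster_insights
                if i.get('type') == 'cluster' and i.get('id', -1) > 0}
descriptions = [f"C{i['id']}:{i['description']}" for i in cluster_insights
                if i.get('description') and i.get('type') == 'cluster']

print(', '.join(f'Cluster {n}' for n in sorted(cluster_nums)))  # Cluster 1, Cluster 2
print('\n'.join(descriptions))  # C1:... and C2:...; the 'transitional' entry is dropped
```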
@@ -2000,9 +1391,7 @@ def download_plot():
                 <h1>Patent Technology Landscape</h1>
                 <p><strong>Instructions:</strong> Click on any point to open the corresponding Google Patents page in a new tab.</p>
                 <p><strong>Legend:</strong>
-                    <span style="color: #636EFA;">● Clusters</span> |
-                    <span style="color: #FF0000;">◆ Underexplored Areas</span> |
-                    <span style="color: #FFA500;">● Transitional Areas</span>
+                    <span style="color: #636EFA;">● Technology Clusters</span>
                 </p>
             </div>
             <div id="plot"></div>
@@ -2244,30 +1633,6 @@ def download_insights():
                 cluster_count += 1
         print(f"Added {cluster_count} clusters")

-        # Add transitional areas
-        print("Adding transitional areas section...")
-        story.append(Paragraph("Transitional Areas", heading_style))
-        trans_count = 0
-        for insight in insights:
-            if insight['type'] == 'transitional':
-                text = f"<b>Transitional Area {insight['id']}:</b> {insight['description']}"
-                story.append(Paragraph(text, normal_style))
-                story.append(Spacer(1, 12))
-                trans_count += 1
-        print(f"Added {trans_count} transitional areas")
-
-        # Add underexplored areas
-        print("Adding underexplored areas section...")
-        story.append(Paragraph("Underexplored Areas", heading_style))
-        underexplored_count = 0
-        for insight in insights:
-            if insight['type'] == 'innovation_subcluster':
-                text = f"<b>Underexplored Area {insight['id']}:</b> {insight['description']}"
-                story.append(Paragraph(text, normal_style))
-                story.append(Spacer(1, 12))
-                underexplored_count += 1
-        print(f"Added {underexplored_count} underexplored areas")
-
         # Build PDF
         print("Building final PDF document...")
         doc.build(story)
489 |
return all_patents
|
490 |
|
491 |
def analyze_patent_group(patents, group_type, label, max_retries=3):
|
492 |
+
"""Analyze patent clusters using ChatGPT with improved formatting and concise output"""
|
493 |
# Extract key information from all patents in the group
|
494 |
patent_count = len(patents)
|
495 |
years_range = f"{patents['year'].min()}-{patents['year'].max()}"
|
|
|
535 |
else:
|
536 |
top_assignees = ", ".join(patents['assignee'].unique())
|
537 |
|
538 |
+
# Enhanced prompt template for cluster analysis
|
539 |
+
base_prompt = f"""Patent cluster analysis ({patent_count} patents, {years_range}):
|
|
|
|
|
|
|
540 |
Key players: {top_assignees}
|
541 |
Core technologies: {key_terms}
|
542 |
Sample innovations: {example_titles}
|
|
|
544 |
Provide concise analysis in exactly this format:
|
545 |
**Technology Focus:** [What specific problem/need this cluster addresses]
|
546 |
**Market Applications:** [Primary commercial uses and target industries]
|
547 |
+
**Innovation Trajectory:** [How this technology is evolving and future direction]"""
|
548 |
|
549 |
+
system_prompt = "You are a patent analyst providing strategic technology insights. Focus on commercial relevance and market opportunities."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
550 |
|
551 |
retry_count = 0
|
552 |
while retry_count < max_retries:
|
|
|
569 |
analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
|
570 |
analysis = re.sub(r'(?i)market applications:', '**Market Applications:**', analysis)
|
571 |
analysis = re.sub(r'(?i)innovation trajectory:', '**Innovation Trajectory:**', analysis)
|
|
|
|
|
|
|
|
|
|
|
|
|
572 |
|
573 |
# Clean up whitespace and formatting
|
574 |
analysis = re.sub(r'\n\s*\n', '\n', analysis) # Remove multiple blank lines
|
|
|
592 |
"""
|
593 |
Create a 3D visualization of patent embeddings using UMAP and Plotly
|
594 |
"""
|
595 |
+
# Initialize variables for tracking clusters
|
596 |
df = pd.DataFrame(patents)
|
|
|
|
|
597 |
|
598 |
if not patents:
|
599 |
return None
|
|
|
648 |
df['y'] = embedding_3d[:, 1]
|
649 |
df['z'] = embedding_3d[:, 2]
|
650 |
|
651 |
+
# --- Simplified HDBSCAN clustering for technological clusters ---
|
652 |
scaler = StandardScaler()
|
653 |
scaled_embeddings = scaler.fit_transform(embedding_3d)
|
654 |
|
655 |
n_points = len(scaled_embeddings)
|
656 |
update_progress('clustering', 'processing', f'Analyzing {n_points} patents for clustering...')
|
657 |
|
658 |
+
# Dynamically set clustering parameters based on dataset size
|
659 |
if n_points < 100:
|
660 |
+
min_cluster_size = max(5, int(n_points * 0.08))
|
|
|
|
|
661 |
elif n_points < 500:
|
662 |
+
min_cluster_size = max(8, int(n_points * 0.05))
|
|
|
|
|
663 |
elif n_points < 1000:
|
664 |
+
min_cluster_size = max(15, int(n_points * 0.03))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
665 |
else:
|
666 |
+
min_cluster_size = max(20, int(n_points * 0.02))
|
|
|
667 |
|
668 |
+
min_samples = max(3, int(min_cluster_size * 0.7))
|
669 |
+
|
670 |
+
print(f"HDBSCAN clustering: min_cluster_size={min_cluster_size}, min_samples={min_samples}")
|
671 |
+
|
672 |
+
# Apply HDBSCAN clustering
|
673 |
+
hdb = hdbscan.HDBSCAN(
|
674 |
+
min_cluster_size=min_cluster_size,
|
675 |
+
min_samples=min_samples,
|
676 |
+
cluster_selection_epsilon=0.1,
|
677 |
+
cluster_selection_method='eom',
|
678 |
+
metric='euclidean'
|
679 |
+
)
|
680 |
+
clusters = hdb.fit_predict(scaled_embeddings)
|
681 |
+
|
682 |
+
# Assign noise points to nearest cluster
|
683 |
+
noise_mask = clusters == -1
|
684 |
+
if any(noise_mask) and len(set(clusters)) > 1:
|
685 |
+
print(f"Assigning {sum(noise_mask)} noise points to nearest clusters...")
|
686 |
+
# Get cluster centers
|
687 |
+
cluster_centers = []
|
688 |
+
cluster_labels = []
|
689 |
+
for label in set(clusters):
|
690 |
+
if label != -1:
|
691 |
+
cluster_mask = clusters == label
|
692 |
+
center = np.mean(scaled_embeddings[cluster_mask], axis=0)
|
693 |
+
cluster_centers.append(center)
|
694 |
+
cluster_labels.append(label)
|
695 |
+
|
696 |
+
if cluster_centers:
|
697 |
+
cluster_centers = np.array(cluster_centers)
|
698 |
+
noise_points = scaled_embeddings[noise_mask]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
699 |
|
700 |
+
# Find nearest cluster for each noise point
|
701 |
+
nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
|
702 |
+
_, nearest_indices = nbrs.kneighbors(noise_points)
|
703 |
+
|
704 |
+
# Assign noise points to nearest clusters
|
705 |
+
noise_indices = np.where(noise_mask)[0]
|
706 |
+
for i, nearest_idx in enumerate(nearest_indices.flatten()):
|
707 |
+
clusters[noise_indices[i]] = cluster_labels[nearest_idx]
|
708 |
|
709 |
df['cluster'] = clusters
|
710 |
|
711 |
+
# --- Gather clusters and analyze them ---
|
712 |
cluster_info = []
|
713 |
+
n_clusters = len(set(clusters))
|
714 |
+
|
715 |
for label in set(clusters):
|
716 |
+
cluster_mask = clusters == label
|
717 |
+
cluster_patents = df[cluster_mask]
|
718 |
+
if len(cluster_patents) > 0:
|
719 |
+
cluster_info.append((label, len(cluster_patents), cluster_patents))
|
|
|
720 |
|
721 |
# Sort clusters by size in descending order
|
722 |
cluster_info.sort(key=lambda x: x[1], reverse=True)
|
723 |
|
724 |
+
print(f"\nFinal Clustering Results:")
|
725 |
+
print(f"Number of technological clusters: {n_clusters}")
|
726 |
+
print(f"Total patents clustered: {len(df)}")
|
727 |
print("\nCluster Size Distribution:")
|
728 |
for i, (label, size, _) in enumerate(cluster_info):
|
729 |
+
print(f"Cluster {i + 1}: {size} patents")
|
730 |
|
731 |
+
# Create mapping for new cluster IDs (1-based)
|
732 |
+
cluster_id_map = {old_label: i + 1 for i, (old_label, _, _) in enumerate(cluster_info)}
|
733 |
|
734 |
+
# Update cluster IDs in DataFrame to be 1-based
|
735 |
new_clusters = clusters.copy()
|
736 |
for old_label, new_label in cluster_id_map.items():
|
737 |
new_clusters[clusters == old_label] = new_label
|
738 |
df['cluster'] = new_clusters
|
739 |
|
740 |
+
update_progress('clustering', 'processing', 'Analyzing technological clusters...')
|
741 |
|
742 |
+
# Analyze each cluster
|
743 |
+
cluster_insights = []
|
|
|
|
|
|
|
744 |
total_clusters = len(cluster_info)
|
745 |
+
for i, (_, size, cluster_patents) in enumerate(cluster_info):
|
746 |
+
cluster_id = i + 1 # 1-based cluster ID
|
747 |
+
update_progress('clustering', 'processing', f'Analyzing cluster {cluster_id} of {total_clusters} ({size} patents)...')
|
748 |
+
description = analyze_patent_group(cluster_patents, 'cluster', cluster_id)
|
749 |
cluster_insights.append({
|
750 |
'type': 'cluster',
|
751 |
+
'id': cluster_id,
|
752 |
'size': size,
|
753 |
+
'label': f"Cluster {cluster_id}",
|
754 |
'description': description
|
755 |
})
|
756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
757 |
update_progress('visualization', 'processing', 'Creating interactive plot...')
|
758 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
759 |
|
760 |
+
# Create Plotly figure with clusters only
|
761 |
# Create hover text for all points
|
762 |
hover_text = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
763 |
for idx, row in df.iterrows():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
764 |
text = (
|
765 |
f"<b>{row['title']}</b><br><br>"
|
766 |
f"<b>By:</b> {row['assignee']} ({row['year']})<br>"
|
767 |
+
f"<b>Cluster:</b> {int(row['cluster'])}<br><br>"
|
768 |
f"<b>Abstract:</b><br>{row['abstract']}"
|
769 |
)
|
770 |
hover_text.append(text)
|
771 |
|
772 |
+
# Create single trace for all clusters
|
773 |
cluster_trace = go.Scatter3d(
|
774 |
+
x=df['x'],
|
775 |
+
y=df['y'],
|
776 |
+
z=df['z'],
|
777 |
mode='markers',
|
778 |
marker=dict(
|
779 |
size=6,
|
780 |
+
color=df['cluster'],
|
781 |
colorscale='Viridis',
|
782 |
+
opacity=0.7,
|
783 |
showscale=True,
|
784 |
colorbar=dict(
|
785 |
+
title="Technology Clusters",
|
786 |
+
tickmode="linear",
|
787 |
+
tick0=1,
|
788 |
+
dtick=1,
|
789 |
tickfont=dict(size=10),
|
790 |
+
titlefont=dict(size=12)
|
791 |
)
|
792 |
),
|
793 |
+
text=hover_text,
|
794 |
hoverinfo='text',
|
795 |
+
name='Technology Clusters',
|
796 |
hoverlabel=dict(
|
797 |
bgcolor="white",
|
798 |
font_size=12,
|
799 |
font_family="Arial",
|
800 |
align="left"
|
801 |
),
|
802 |
+
customdata=df['link'].tolist()
|
803 |
)
|
804 |
|
805 |
+
fig = go.Figure(data=[cluster_trace])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
806 |
|
807 |
# Update layout
|
808 |
fig.update_layout(
|
809 |
+
title="Patent Technology Landscape - Cluster Analysis",
|
810 |
scene=dict(
|
811 |
xaxis_title="UMAP 1",
|
812 |
yaxis_title="UMAP 2",
|
|
|
814 |
camera=dict(
|
815 |
up=dict(x=0, y=0, z=1),
|
816 |
center=dict(x=0, y=0, z=0),
|
817 |
+
eye=dict(x=1.8, y=1.8, z=1.8)
|
818 |
),
|
819 |
+
aspectmode='cube'
|
820 |
),
|
821 |
margin=dict(l=0, r=0, b=0, t=30),
|
822 |
+
showlegend=False, # Single trace doesn't need legend
|
823 |
template="plotly_dark",
|
824 |
hoverlabel_align='left',
|
825 |
hoverdistance=100,
|
826 |
+
hovermode='closest'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
827 |
)
|
828 |
|
829 |
# Configure hover behavior
|
|
|
868 |
analysis = response.choices[0].message['content']
|
869 |
|
870 |
# Validate that analysis references valid areas
|
871 |
+
area_pattern = r'(?:Cluster)\s+(\d+)'
|
872 |
referenced_areas = set(int(num) for num in re.findall(area_pattern, analysis))
|
873 |
|
874 |
# Extract valid area numbers from insights
|
|
|
@@ +891 @@
 
 def analyze_innovation_opportunities(cluster_insights):
    """
+    Analyze technology clusters to identify potential innovation opportunities.
+    Returns focused analysis of high-value innovation opportunities within and between technology clusters.
    """
    # Extract cluster numbers and validate
    cluster_nums = set()
 
    # Parse and validate cluster numbers with explicit error checking
    for insight in cluster_insights:
        area_type = insight.get('type', '')
        area_id = insight.get('id', -1)
 
+        if area_type == 'cluster' and area_id > 0:
            cluster_nums.add(area_id)
 
+    # Only generate analysis if we have clusters to analyze
+    if not cluster_nums:
+        return "No technology clusters found. Try broadening search terms or increasing patent count."
 
+    # Create descriptions list with cluster information
    descriptions = []
    cluster_details = {}
 
    for insight in cluster_insights:
+        if insight.get('description') and insight.get('type') == 'cluster':
            area_id = int(insight.get('id', -1))  # 1-based IDs
            area_size = insight.get('size', 0)
 
+            desc = f"C{area_id}:{insight['description']}"
+            descriptions.append(desc)
+            cluster_details[area_id] = {'description': insight['description'], 'size': area_size}
 
    # Format descriptions as a string with newlines
    descriptions_text = '\n'.join(descriptions)
 
+    prompt = f"""Technology Clusters Available:
+Clusters: {', '.join(f'Cluster {n}' for n in sorted(cluster_nums))}
+
+Cluster Descriptions:
 {descriptions_text}
+
+I need you to identify 3-4 high-value innovation opportunities in this patent technology landscape. Focus on creating REAL business value through either:
+A) Cross-pollinating technologies between different clusters, OR
+B) Identifying innovation gaps within individual clusters
+
 For each opportunity:
+1. Select either ONE cluster with internal innovation potential OR two complementary clusters that can be combined
+2. Identify a specific technical or market gap within or between the selected clusters
 3. Propose a concrete solution that addresses this gap
 4. Quantify potential business impact and competitive advantage
+
 Follow this precise format:
 Opportunity N: [Title that describes the innovation]
+Source: [Single cluster (e.g., "Cluster 2") OR combination (e.g., "Cluster 1 + Cluster 3")]
 - Gap: [Specific technical or market gap that represents an unmet need]
 - Solution: [Practical, implementable technical approach]
 - Impact: [Specific business value creation - market size, efficiency gains, cost reduction]
 - Timeline: [Short-term (1-2 years) or medium-term (3-5 years)]
+
 Prioritize opportunities based on:
 1. Commercial potential (market size, growth potential)
 2. Technical feasibility (can be implemented with current or near-term technology)
 3. Competitive advantage (uniqueness, barriers to entry)
 4. Alignment with industry trends (sustainability, automation, digitalization)
+
 Focus on practical innovations that could realistically be implemented by a company rather than theoretical or speculative concepts."""
 
    # Get analysis from LLM
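To see what this assembly step produces, here is a small driver with made-up insights — the dict shape matches what the loop above consumes, but the IDs, sizes, and descriptions are invented for illustration:

```python
# Hypothetical cluster_insights, shaped like the dicts the loop above consumes.
cluster_insights = [
    {'type': 'cluster', 'id': 1, 'size': 42, 'description': 'Battery thermal management'},
    {'type': 'cluster', 'id': 2, 'size': 17, 'description': 'Solid-state electrolytes'},
    {'type': 'noise',   'id': -1, 'size': 3, 'description': 'Unclustered patents'},
]

cluster_nums = {i['id'] for i in cluster_insights
                if i.get('type') == 'cluster' and i.get('id', -1) > 0}
descriptions = [f"C{i['id']}:{i['description']}" for i in cluster_insights
                if i.get('description') and i.get('type') == 'cluster']

print(', '.join(f'Cluster {n}' for n in sorted(cluster_nums)))  # Cluster 1, Cluster 2
print('\n'.join(descriptions))
# C1:Battery thermal management
# C2:Solid-state electrolytes
```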
@@ +1391 @@
    <h1>Patent Technology Landscape</h1>
    <p><strong>Instructions:</strong> Click on any point to open the corresponding Google Patents page in a new tab.</p>
    <p><strong>Legend:</strong>
+        <span style="color: #636EFA;">● Technology Clusters</span>
    </p>
    </div>
    <div id="plot"></div>
@@ +1633 @@
        cluster_count += 1
    print(f"Added {cluster_count} clusters")
 
    # Build PDF
    print("Building final PDF document...")
    doc.build(story)
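`doc.build(story)` is ReportLab's platypus API: `story` is a list of flowables laid out into the PDF in order. A minimal sketch of that scaffolding — the filename, headings, and body text are placeholders, not the Space's actual report content:

```python
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

styles = getSampleStyleSheet()
story = [
    Paragraph("Patent Technology Landscape", styles["Title"]),
    Spacer(1, 12),
    # One block per cluster; the real code appends these in its cluster loop.
    Paragraph("Cluster 1: Battery thermal management (42 patents)", styles["Heading2"]),
    Paragraph("Technology Focus: ...", styles["BodyText"]),
]

doc = SimpleDocTemplate("patent_report.pdf", pagesize=letter)
doc.build(story)  # lays out the flowables and writes the PDF
```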
templates/index.html
CHANGED
@@ -4,7 +4,12 @@
 <title>Patent Explorer</title>
 <meta charset="utf-8">
 <meta name="viewport" content="width=device-width, initial-scale=1">
-<link href="https://cdn.jsdelivr.net/npm/
+<link href="https://cdn.jsdelivr.net/npm/tailw    <div class="legend-item">
+        <div class="legend-dot" style="background-color: #636EFA;"></div>
+        <span class="text-sm">Technology Clusters</span>
+    </div>
+    </div>
+    </div>dist/tailwind.min.css" rel="stylesheet">
 <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
 <style>
@@ -209,7 +214,7 @@
 <h3 class="text-lg font-semibold text-blue-300 mb-3">📊 Interactive Visualization Guide</h3>
 <div class="text-gray-300 mb-3">
 <p class="mb-2"><strong>Click any point</strong> to open the corresponding Google Patents page in a new tab.</p>
-<p class="mb-3"><strong>Hover over points</strong> to see detailed patent information including title, assignee, year, and
+<p class="mb-3"><strong>Hover over points</strong> to see detailed patent information including title, assignee, year, and abstract.</p>
 </div>
 <div class="flex flex-wrap items-center">
 <span class="text-sm font-medium text-gray-400 mr-3">Legend:</span>
@@ -471,11 +476,7 @@
 if (response.insights) {
 console.log('Displaying insights...');
 const clusters = response.insights.filter(i => i.type === 'cluster');
-const innovationSubclusters = response.insights.filter(i => i.type === 'innovation_subcluster');
-const transitionalAreas = response.insights.filter(i => i.type === 'transitional');
 console.log('Found clusters:', clusters.length);
-console.log('Found innovation subclusters:', innovationSubclusters.length);
-console.log('Found transitional areas:', transitionalAreas.length);
 
 // Start with Innovation Analysis
 let insightsHtml = '';
@@ -490,11 +491,8 @@
 `;
 }
 
-//
-insightsHtml += '<div class="
-
-// Left column: Technology Clusters
-insightsHtml += '<div class="col-span-1">';
+// Technology Clusters section
+insightsHtml += '<div class="p-6">';
 insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-blue-400">Technology Clusters</h3>';
 
 if (clusters.length > 0) {
@@ -512,45 +510,7 @@
 insightsHtml += '<p class="text-gray-400">No technology clusters identified.</p>';
 }
 insightsHtml += '</div>';
-// Middle column: Transitional Areas
-insightsHtml += '<div class="col-span-1">';
-insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-orange-400">Transitional Areas</h3>';
-
-if (transitionalAreas.length > 0) {
-insightsHtml += '<div class="space-y-4">';
-transitionalAreas.forEach(area => {
-insightsHtml += `
-<div class="transitional-card p-6 text-base" style="background-color: #4d3d2d;">
-<div class="text-orange-300 text-lg font-bold mb-3">${area.label}</div>
-<div class="text-gray-300 whitespace-pre-line leading-relaxed">${area.description}</div>
-</div>
-`;
-});
-insightsHtml += '</div>';
-} else {
-insightsHtml += '<p class="text-gray-400">No transitional areas identified.</p>';
-}
-insightsHtml += '</div>';
-// Right column: Underexplored Areas
-insightsHtml += '<div class="col-span-1">';
-insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-green-400">Underexplored Areas</h3>';
-
-if (innovationSubclusters.length > 0) {
-insightsHtml += '<div class="space-y-4">';
-innovationSubclusters.forEach(subcluster => {
-insightsHtml += `
-<div class="opportunity-card p-6 text-base">
-<div class="text-green-300 text-lg font-bold mb-3">${subcluster.label} (${subcluster.size} patents)</div>
-<div class="text-gray-300 whitespace-pre-line leading-relaxed">${subcluster.description}</div>
-</div>
-`;
-});
-insightsHtml += '</div>';
-} else {
-insightsHtml += '<p class="text-gray-400">No significant underexplored areas identified in this technology space.</p>';
-}
-insightsHtml += '</div>';
-insightsHtml += '</div>';
+
 $('#insights').html(insightsHtml);
 } else {
 console.warn('No insights data received');
|