PhyllisPeh committed on
Commit
0fbc86f
·
1 Parent(s): 398e8e4

removed transitional areas and underexplored areas

Files changed (2)
  1. app.py +124 -759
  2. templates/index.html +9 -49
app.py CHANGED
@@ -489,7 +489,7 @@ def search_patents(keywords, page_size=100):
489
  return all_patents
490
 
491
  def analyze_patent_group(patents, group_type, label, max_retries=3):
492
- """Analyze patent groups using ChatGPT with improved formatting and concise output"""
493
  # Extract key information from all patents in the group
494
  patent_count = len(patents)
495
  years_range = f"{patents['year'].min()}-{patents['year'].max()}"
@@ -535,11 +535,8 @@ def analyze_patent_group(patents, group_type, label, max_retries=3):
535
  else:
536
  top_assignees = ", ".join(patents['assignee'].unique())
537
 
538
- # Enhanced prompt templates for better analysis quality while maintaining conciseness
539
- # Improved structure and specific guidance for more actionable insights
540
- prompts = {
541
- 'cluster': (
542
- f"""Patent cluster analysis ({patent_count} patents, {years_range}):
543
  Key players: {top_assignees}
544
  Core technologies: {key_terms}
545
  Sample innovations: {example_titles}
@@ -547,40 +544,9 @@ Sample innovations: {example_titles}
547
  Provide concise analysis in exactly this format:
548
  **Technology Focus:** [What specific problem/need this cluster addresses]
549
  **Market Applications:** [Primary commercial uses and target industries]
550
- **Innovation Trajectory:** [How this technology is evolving and future direction]""",
551
 
552
- "You are a patent analyst providing strategic technology insights. Focus on commercial relevance and market opportunities."
553
- ),
554
- 'transitional': (
555
- f"""Transitional technology area ({patent_count} patents, {years_range}):
556
- Key players: {top_assignees}
557
- Bridge technologies: {key_terms}
558
- Sample innovations: {example_titles}
559
-
560
- Provide concise analysis in exactly this format:
561
- **Technology Bridge:** [Which established fields this area connects]
562
- **Integration Value:** [Why combining these technologies creates value]
563
- **Market Potential:** [Commercial opportunities from this convergence]""",
564
-
565
- "You are a patent analyst identifying technology convergence opportunities. Focus on cross-domain innovation potential."
566
- ),
567
- 'innovation_subcluster': (
568
- f"""Underexplored technology area ({patent_count} patents, {years_range}):
569
- Current players: {top_assignees}
570
- Emerging concepts: {key_terms}
571
- Early innovations: {example_titles}
572
-
573
- Provide concise analysis in exactly this format:
574
- **Market Gap:** [Unmet need or problem this area could solve]
575
- **Technical Approach:** [Current methods and their limitations]
576
- **Innovation Opportunity:** [Specific R&D directions with commercial potential]""",
577
-
578
- "You are a patent analyst identifying innovation opportunities. Focus on market gaps and commercial potential for R&D investment."
579
- )
580
- }
581
-
582
- base_prompt = prompts[group_type][0]
583
- system_prompt = prompts[group_type][1]
584
 
585
  retry_count = 0
586
  while retry_count < max_retries:
@@ -603,12 +569,6 @@ Provide concise analysis in exactly this format:
603
  analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
604
  analysis = re.sub(r'(?i)market applications:', '**Market Applications:**', analysis)
605
  analysis = re.sub(r'(?i)innovation trajectory:', '**Innovation Trajectory:**', analysis)
606
- analysis = re.sub(r'(?i)technology bridge:', '**Technology Bridge:**', analysis)
607
- analysis = re.sub(r'(?i)integration value:', '**Integration Value:**', analysis)
608
- analysis = re.sub(r'(?i)market potential:', '**Market Potential:**', analysis)
609
- analysis = re.sub(r'(?i)market gap:', '**Market Gap:**', analysis)
610
- analysis = re.sub(r'(?i)technical approach:', '**Technical Approach:**', analysis)
611
- analysis = re.sub(r'(?i)innovation opportunity:', '**Innovation Opportunity:**', analysis)
612
 
613
  # Clean up whitespace and formatting
614
  analysis = re.sub(r'\n\s*\n', '\n', analysis) # Remove multiple blank lines
@@ -632,10 +592,8 @@ def create_3d_visualization(patents):
632
  """
633
  Create a 3D visualization of patent embeddings using UMAP and Plotly
634
  """
635
- # Initialize variables for tracking different point types
636
  df = pd.DataFrame(patents)
637
- df['point_type'] = 'cluster' # Default type for all points
638
- transitional_areas = [] # Initialize empty list for transitional areas
639
 
640
  if not patents:
641
  return None
@@ -690,698 +648,165 @@ def create_3d_visualization(patents):
690
  df['y'] = embedding_3d[:, 1]
691
  df['z'] = embedding_3d[:, 2]
692
 
693
- # --- Improved HDBSCAN clustering logic for sparse region detection ---
694
  scaler = StandardScaler()
695
  scaled_embeddings = scaler.fit_transform(embedding_3d)
696
 
697
  n_points = len(scaled_embeddings)
698
  update_progress('clustering', 'processing', f'Analyzing {n_points} patents for clustering...')
699
 
700
- # Dynamically set max_clusters and target_noise based on number of patents
701
  if n_points < 100:
702
- max_clusters = 4
703
- max_retries = 2
704
- target_noise_ratio = 0.08
705
  elif n_points < 500:
706
- max_clusters = 6
707
- max_retries = 3
708
- target_noise_ratio = 0.06
709
  elif n_points < 1000:
710
- max_clusters = 8
711
- max_retries = 4
712
- target_noise_ratio = 0.05
713
- else:
714
- max_clusters = 15 # Increased from 12 to force more granular clustering
715
- max_retries = 8 # More retries to find optimal clustering
716
- target_noise_ratio = 0.03 # Keep low noise ratio
717
-
718
- # Even more aggressive cluster parameters for large datasets
719
- if n_points >= 1000:
720
- min_cluster_size = max(5, int(n_points * 0.015)) # Further reduced to 1.5% for large datasets
721
- min_samples = max(3, int(min_cluster_size * 0.95)) # Increased to 0.95 for even stricter formation
722
  else:
723
- min_cluster_size = max(5, int(n_points * 0.02)) # 2% for smaller datasets
724
- min_samples = max(3, int(min_cluster_size * 0.9)) # 0.9 ratio for smaller datasets
725
 
726
- target_noise = int(n_points * target_noise_ratio)
727
- print(f"Initial HDBSCAN: min_cluster_size={min_cluster_size}, min_samples={min_samples}, max_clusters={max_clusters}, max_retries={max_retries}, target_noise={target_noise}")
728
- retry = 0
729
- clusters = None
730
- n_clusters = 0
731
- n_noise = 0
732
- best_result = None
733
- best_score = float('-inf')
734
-
735
- while retry < max_retries:
736
- hdb = hdbscan.HDBSCAN(
737
- min_cluster_size=min_cluster_size,
738
- min_samples=min_samples,
739
- cluster_selection_epsilon=0.03, # Reduced further to force even tighter clusters
740
- cluster_selection_method='eom',
741
- metric='euclidean',
742
- prediction_data=True
743
- )
744
- clusters = hdb.fit_predict(scaled_embeddings)
745
- n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
746
- n_noise = list(clusters).count(-1)
747
- noise_ratio = n_noise / len(clusters)
748
- avg_cluster_size = (len(clusters) - n_noise) / n_clusters if n_clusters > 0 else float('inf')
749
-
750
- print(f"\nClustering Statistics (try {retry+1}):")
751
- print(f"Number of clusters: {n_clusters}")
752
- print(f"Number of patents in sparse regions: {n_noise}")
753
- print(f"Total number of patents: {len(clusters)}")
754
- print(f"Noise ratio: {noise_ratio:.2%}")
755
- print(f"Average cluster size: {avg_cluster_size:.1f} patents")
756
-
757
- update_progress('clustering', 'processing',
758
- f'Optimizing clusters (attempt {retry + 1}/{max_retries}): ' +
759
- f'Found {n_clusters} clusters with avg size {avg_cluster_size:.1f} patents')
760
-
761
- # Calculate a score for this clustering result
762
- # Penalize both too many and too few clusters, and reward good noise ratio
763
- score = -abs(n_clusters - max_clusters) + \
764
- -abs(noise_ratio - target_noise_ratio) * 10 + \
765
- -abs(avg_cluster_size - (n_points / max_clusters)) / 10
766
-
767
- if score > best_score:
768
- best_score = score
769
- best_result = (clusters, n_clusters, n_noise, noise_ratio, avg_cluster_size)
770
-
771
- # Adjust parameters based on results
772
- if n_clusters > max_clusters:
773
- print("Too many clusters, increasing parameters more aggressively...")
774
- min_cluster_size = int(min_cluster_size * 1.5) # More aggressive increase
775
- min_samples = int(min_samples * 1.4)
776
- elif n_clusters == 1 and avg_cluster_size > len(clusters) * 0.8:
777
- print("Single dominant cluster detected, adjusting for better separation...")
778
- min_cluster_size = max(5, int(min_cluster_size * 0.6)) # More aggressive decrease
779
- min_samples = max(3, int(min_samples * 0.6))
780
- elif n_noise < target_noise * 0.5:
781
- print("Too few noise points, adjusting parameters...")
782
- min_cluster_size = int(min_cluster_size * 1.2)
783
- min_samples = max(3, int(min_samples * 0.8))
784
- elif n_clusters < max_clusters * 0.5:
785
- print("Too few clusters, decreasing parameters...")
786
- min_cluster_size = max(5, int(min_cluster_size * 0.8))
787
- min_samples = max(3, int(min_samples * 0.7))
788
- else:
789
- print("Acceptable clustering found.")
790
- break
791
 
792
- retry += 1
793
-
794
- # Use the best result if we didn't find an acceptable one
795
- if retry == max_retries and best_result is not None:
796
- print("Using best clustering result found...")
797
- clusters, n_clusters, n_noise, noise_ratio, avg_cluster_size = best_result
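For reference, the removed selection heuristic can be checked numerically. A minimal sketch using the >=1000-patent tier above (max_clusters=15, target_noise_ratio=0.03); the run statistics are hypothetical:

```python
# Hypothetical run: 12 clusters, 5% noise, mean cluster size 79.2 patents
n_points, max_clusters, target_noise_ratio = 1000, 15, 0.03
n_clusters, noise_ratio, avg_cluster_size = 12, 0.05, 79.2

score = (-abs(n_clusters - max_clusters)                          # cluster-count term
         - abs(noise_ratio - target_noise_ratio) * 10             # noise-ratio term
         - abs(avg_cluster_size - n_points / max_clusters) / 10)  # size term
print(f"{score:.2f}")  # -4.45; higher (closer to 0) is better on all three terms
```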
 
 
798
 
799
  df['cluster'] = clusters
800
 
801
- # --- First gather all existing clusters and their sizes ---
802
  cluster_info = []
 
 
803
  for label in set(clusters):
804
- if label != -1: # Skip noise points
805
- cluster_mask = clusters == label
806
- cluster_patents = df[cluster_mask]
807
- if len(cluster_patents) > 0:
808
- cluster_info.append((label, len(cluster_patents), cluster_patents))
809
 
810
  # Sort clusters by size in descending order
811
  cluster_info.sort(key=lambda x: x[1], reverse=True)
812
 
 
 
 
813
  print("\nCluster Size Distribution:")
814
  for i, (label, size, _) in enumerate(cluster_info):
815
- print(f"Cluster {i} (originally {label}): {size} patents")
816
 
817
- # Create mapping for new cluster IDs
818
- cluster_id_map = {old_label: i for i, (old_label, _, _) in enumerate(cluster_info)}
819
 
820
- # Update cluster IDs in DataFrame
821
  new_clusters = clusters.copy()
822
  for old_label, new_label in cluster_id_map.items():
823
  new_clusters[clusters == old_label] = new_label
824
  df['cluster'] = new_clusters
825
 
826
- update_progress('clustering', 'processing', 'Identifying technology clusters and underexplored areas...')
827
 
828
- # --- Initialize point types ---
829
- df['point_type'] = 'unassigned' # Start with all points unassigned
830
- cluster_insights = [] # Initialize insights list
831
-
832
- # First handle clustered points
833
  total_clusters = len(cluster_info)
834
- for new_id, (_, size, cluster_patents) in enumerate(cluster_info):
835
- update_progress('clustering', 'processing', f'Analyzing cluster {new_id + 1} of {total_clusters} ({size} patents)...')
836
- description = analyze_patent_group(cluster_patents, 'cluster', new_id)
837
- df.loc[cluster_patents.index, 'point_type'] = 'cluster' # Mark clustered points
838
  cluster_insights.append({
839
  'type': 'cluster',
840
- 'id': int(new_id) + 1, # Store as 1-based ID
841
  'size': size,
842
- 'label': f"Cluster {new_id + 1}",
843
  'description': description
844
  })
845
 
846
- # --- Improved two-stage density analysis for noise points ---
847
- noise_mask = df['cluster'] == -1
848
- noise_points = scaled_embeddings[noise_mask]
849
- noise_indices = df[noise_mask].index
850
- dense_noise_indices = [] # Initialize empty list for dense noise points
851
- true_sparse_indices = [] # Initialize empty list for sparse points
852
-
853
- if len(noise_points) >= 3:
854
- update_progress('clustering', 'processing', f'Analyzing {len(noise_points)} potential underexplored areas...')
855
- print(f"\nStructural Analysis for Underexplored Area Detection:")
856
-
857
- # Initialize sparse indices
858
- true_sparse_indices = []
859
-
860
- # Stage 1: Calculate local and global density metrics
861
- n_neighbors = min(max(5, int(len(noise_points) * 0.05)), 15)
862
- print(f"Using {n_neighbors} nearest neighbors for density calculation")
863
-
864
- # Calculate local density for noise points
865
- nbrs_local = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean').fit(noise_points)
866
- local_distances, local_indices = nbrs_local.kneighbors(noise_points)
867
- local_densities = 1 / (np.mean(local_distances, axis=1) + 1e-6) # Add small epsilon to avoid division by zero
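The local-density estimate above is the inverse of the mean distance to the k nearest neighbors. A standalone sketch of the same computation, assuming `points` is an (n, 3) array of scaled embeddings:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def local_density(points, n_neighbors=5):
    # Each point counts itself as its first neighbor (distance 0),
    # matching the kneighbors() call above.
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean').fit(points)
    distances, _ = nbrs.kneighbors(points)
    return 1.0 / (distances.mean(axis=1) + 1e-6)  # epsilon avoids division by zero
```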
868
-
869
- # Calculate distances to cluster centers and their densities
870
- cluster_centers = []
871
- cluster_densities = [] # Store density of each cluster
872
- for label in set(clusters) - {-1}:
873
- cluster_mask = clusters == label
874
- cluster_points = scaled_embeddings[cluster_mask]
875
- center = np.mean(cluster_points, axis=0)
876
- cluster_centers.append(center)
877
-
878
- # Calculate cluster density using its member points
879
- if len(cluster_points) > 1:
880
- nbrs_cluster = NearestNeighbors(n_neighbors=min(5, len(cluster_points))).fit(cluster_points)
881
- cluster_dists, _ = nbrs_cluster.kneighbors(cluster_points)
882
- cluster_density = 1 / (np.mean(cluster_dists) + 1e-6)
883
- else:
884
- cluster_density = 0
885
- cluster_densities.append(cluster_density)
886
-
887
- cluster_centers = np.array(cluster_centers)
888
- cluster_densities = np.array(cluster_densities)
889
-
890
- if len(cluster_centers) > 0:
891
- # Calculate distances and density ratios to nearest clusters
892
- nbrs_clusters = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(cluster_centers)
893
- cluster_distances, nearest_cluster_indices = nbrs_clusters.kneighbors(noise_points)
894
- cluster_distances = cluster_distances.flatten()
895
-
896
- # Get density of nearest cluster for each point
897
- nearest_cluster_densities = cluster_densities[nearest_cluster_indices.flatten()]
898
-
899
- # Calculate density ratios (local density / nearest cluster density)
900
- density_ratios = local_densities / (nearest_cluster_densities + 1e-6)
901
-
902
- print("\nDensity Analysis Statistics:")
903
- print(f"Mean local density: {np.mean(local_densities):.3f}")
904
- print(f"Mean cluster density: {np.mean(cluster_densities):.3f}")
905
- print(f"Mean density ratio: {np.mean(density_ratios):.3f}")
906
-
907
- # Identify structural gaps using multiple criteria with more sensitive thresholds
908
- # 1. Density Isolation: Points with very low density compared to clusters
909
- # 2. Spatial Isolation: Points far from both clusters and other noise points
910
- # 3. Structural Stability: Points whose local neighborhood is also sparse
911
-
912
- # Calculate isolation scores with more balanced thresholds
913
- density_isolation = density_ratios < np.percentile(density_ratios, 65) # More balanced threshold
914
- spatial_isolation = cluster_distances > np.percentile(cluster_distances, 50) # Median distance threshold
915
-
916
- # Calculate structural stability with more balanced criteria
917
- structural_stability = np.zeros(len(noise_points), dtype=bool)
918
- for i, neighbors in enumerate(local_indices):
919
- neighbor_densities = local_densities[neighbors]
920
- # Point is stable if its neighborhood is relatively sparse
921
- structural_stability[i] = np.mean(neighbor_densities) < np.percentile(local_densities, 50) # Use median
922
-
923
- # Use more balanced criteria - only need to meet any 1 of 3 criteria initially
924
- candidate_sparse_indices = [
925
- idx for i, idx in enumerate(noise_indices)
926
- if sum([density_isolation[i], spatial_isolation[i], structural_stability[i]]) >= 1 # Only need 1 out of 3 criteria
927
- ]
928
-
929
- # Start by assuming all non-candidate points are dense noise
930
- dense_noise_indices = [idx for idx in noise_indices if idx not in candidate_sparse_indices]
931
-
932
- # Now calculate distances between candidates and dense noise points with more sensitive threshold
933
- min_distance_threshold = np.percentile(cluster_distances, 40) # More sensitive threshold
934
- # Filter candidates based on distance from dense noise regions
935
- if len(candidate_sparse_indices) > 0 and len(dense_noise_indices) > 0:
936
- dense_noise_points = scaled_embeddings[dense_noise_indices]
937
- true_sparse_indices = []
938
-
939
- for idx in candidate_sparse_indices:
940
- point = scaled_embeddings[idx].reshape(1, -1)
941
- distances_to_dense = NearestNeighbors(n_neighbors=1).fit(dense_noise_points).kneighbors(point)[0][0]
942
- if distances_to_dense > min_distance_threshold:
943
- true_sparse_indices.append(idx)
944
-
945
- # Update dense_noise_indices to include rejected candidates
946
- rejected_indices = [idx for idx in candidate_sparse_indices if idx not in true_sparse_indices]
947
- dense_noise_indices.extend(rejected_indices)
948
- else:
949
- true_sparse_indices = candidate_sparse_indices
950
- else:
951
- # Fallback using only local density analysis
952
- density_threshold = np.percentile(local_densities, 25) # Bottom 25% sparsest points
953
- true_sparse_indices = [idx for i, idx in enumerate(noise_indices)
954
- if local_densities[i] < density_threshold]
955
- dense_noise_indices = [idx for idx in noise_indices if idx not in true_sparse_indices]
956
-
957
- print(f"\nFinal Classification:")
958
- print(f"True underexplored areas identified: {len(true_sparse_indices)}")
959
- print(f"Transitional areas identified: {len(dense_noise_indices)}")
960
- if len(true_sparse_indices) > 0:
961
- print(f"Underexplored area ratio: {len(true_sparse_indices)/len(noise_points):.2%}")
962
- print("\nUnderexplored Area Criteria Used:")
963
- print("1. Density Isolation: Significantly lower density than nearest cluster")
964
- print("2. Spatial Isolation: Far from both clusters and other points")
965
- print("3. Structural Stability: Forms stable sparse regions with neighbors")
966
-
967
- # Update point types in DataFrame for sparse points and dense noise
968
- for idx in true_sparse_indices:
969
- df.at[idx, 'point_type'] = 'sparse'
970
- for idx in dense_noise_indices:
971
- df.at[idx, 'point_type'] = 'dense_noise'
972
-
973
- # --- Handle dense noise points as transitional areas ---
974
- transitional_areas = [] # Store transitional areas for sorting
975
- if len(dense_noise_indices) >= 3:
976
- update_progress('clustering', 'processing', f'Analyzing {len(dense_noise_indices)} potential transitional areas...')
977
- print("\nAnalyzing dense noise points as transitional areas...")
978
- dense_noise_points = scaled_embeddings[dense_noise_indices]
979
-
980
- # Use HDBSCAN to find subgroups within transitional areas
981
- min_size = max(3, len(dense_noise_points) // 10)
982
- print(f"Attempting to identify transitional area subgroups with min_size={min_size}")
983
-
984
- hdb_dense = hdbscan.HDBSCAN(
985
- min_cluster_size=min_size,
986
- min_samples=max(2, min_size // 2),
987
- cluster_selection_epsilon=0.3,
988
- cluster_selection_method='leaf'
989
- )
990
- dense_labels = hdb_dense.fit_predict(dense_noise_points)
991
-
992
- # Count potential transitional areas
993
- unique_dense_labels = set(dense_labels) - {-1}
994
- n_transitional = len(unique_dense_labels)
995
- print(f"Found {n_transitional} distinct transitional areas")
996
-
997
- # First get all transitional points, including scattered ones
998
- all_transitional_points = {}
999
- # Count sizes first
1000
- label_sizes = {}
1001
- for label in dense_labels:
1002
- if label != -1:
1003
- label_sizes[label] = label_sizes.get(label, 0) + 1
1004
-
1005
- # Then collect points with their pre-calculated sizes
1006
- for i, label in enumerate(dense_labels):
1007
- idx = dense_noise_indices[i]
1008
- if label != -1: # Regular transitional area
1009
- if label not in all_transitional_points:
1010
- all_transitional_points[label] = {'indices': [], 'size': label_sizes[label]}
1011
- all_transitional_points[label]['indices'].append(idx)
1012
- else: # Scattered points
1013
- label_key = 'scattered'
1014
- if label_key not in all_transitional_points:
1015
- all_transitional_points[label_key] = {'indices': [], 'size': 0}
1016
- all_transitional_points[label_key]['indices'].append(idx)
1017
- all_transitional_points[label_key]['size'] += 1
1018
-
1019
- # Sort transitional areas by size and create insights
1020
- # Filter out areas that are too small and sort by size
1021
- min_area_size = 3 # Minimum size for a valid transitional area
1022
- valid_areas = [(k, v) for k, v in all_transitional_points.items()
1023
- if k != 'scattered' and v['size'] >= min_area_size]
1024
- sorted_areas = sorted(valid_areas, key=lambda x: x[1]['size'], reverse=True)
1025
-
1026
- # Add regular transitional areas to insights
1027
- total_areas = len(sorted_areas)
1028
- for area_idx, (label, area_info) in enumerate(sorted_areas):
1029
- update_progress('clustering', 'processing', f'Analyzing transitional area {area_idx + 1} of {total_areas} ({area_info["size"]} patents)...')
1030
- area_patents = df.iloc[area_info['indices']]
1031
- description = analyze_patent_group(area_patents, 'transitional', label)
1032
- area_number = area_idx + 1 # 1-based numbering for display
1033
-
1034
- # Create label without duplicate size info
1035
- area_label = f"Transitional Area {area_number}"
1036
- transitional_areas.append({
1037
- 'label': area_label,
1038
- 'indices': area_info['indices'],
1039
- 'size': area_info['size'],
1040
- 'patents': area_patents,
1041
- 'description': description
1042
- })
1043
- area_insight = {
1044
- 'type': 'transitional',
1045
- 'id': area_idx + 1, # Store as 1-based ID
1046
- 'size': area_info['size'],
1047
- 'label': f"{area_label} ({area_info['size']} patents)",
1048
- 'description': description
1049
- }
1050
- cluster_insights.append(area_insight)
1051
-
1052
- # Handle scattered points by analyzing them individually
1053
- if 'scattered' in all_transitional_points:
1054
- scattered_indices = all_transitional_points['scattered']['indices']
1055
- if len(scattered_indices) > 0:
1056
- print(f"\nAnalyzing {len(scattered_indices)} scattered points...")
1057
- scattered_points = scaled_embeddings[scattered_indices]
1058
-
1059
- # Calculate distances to nearest cluster and transitional area
1060
- distances_to_clusters = []
1061
- distances_to_transitional = []
1062
-
1063
- print("\nDistance analysis for each scattered point:")
1064
- point_counter = 0
1065
-
1066
- # First calculate all distances
1067
- for point in scattered_points:
1068
- point = point.reshape(1, -1)
1069
- # Distance to nearest cluster
1070
- if len(cluster_centers) > 0:
1071
- dist_cluster = NearestNeighbors(n_neighbors=1).fit(cluster_centers).kneighbors(point)[0][0][0]
1072
- else:
1073
- dist_cluster = float('inf')
1074
-
1075
- # Distance to nearest transitional area (excluding scattered points)
1076
- if len(dense_noise_points) > 0:
1077
- # Get only the transitional area points (excluding scattered points)
1078
- transitional_points = []
1079
- for i, point_idx in enumerate(dense_noise_indices):
1080
- if point_idx not in scattered_indices:
1081
- transitional_points.append(dense_noise_points[i])
1082
-
1083
- if transitional_points:
1084
- transitional_points = np.array(transitional_points)
1085
- nbrs_trans = NearestNeighbors(n_neighbors=1).fit(transitional_points)
1086
- dist_trans = nbrs_trans.kneighbors(point.reshape(1, -1))[0][0][0]
1087
- else:
1088
- dist_trans = float('inf')
1089
- else:
1090
- dist_trans = float('inf')
1091
-
1092
- # Store distances for ratio calculation
1093
- distances_to_clusters.append(dist_cluster)
1094
- distances_to_transitional.append(dist_trans)
1095
-
1096
- total_classified_as_gaps = 0
1097
- total_classified_as_transitional = 0
1098
-
1099
- # Use more aggressive thresholds for scattered points
1100
- cluster_distance_threshold = np.percentile(distances_to_clusters, 35) # Even more lenient
1101
- transitional_distance_threshold = np.percentile(distances_to_transitional, 35) # Even more lenient
1102
-
1103
- print(f"\nClassification thresholds:")
1104
- print(f"- Cluster distance threshold: {cluster_distance_threshold:.3f}")
1105
- print(f"- Transitional distance threshold: {transitional_distance_threshold:.3f}")
1106
-
1107
- # Classify scattered points
1108
- for idx, (dist_c, dist_t) in zip(scattered_indices, zip(distances_to_clusters, distances_to_transitional)):
1109
- # 1. Check absolute distances with more lenient thresholds
1110
- cluster_dist_threshold = np.percentile(distances_to_clusters, 60) # Use 60th percentile
1111
- trans_dist_threshold = np.percentile(distances_to_transitional, 60) # Use 60th percentile
1112
-
1113
- # Point is isolated if it's farther than median distance from both clusters and transitional areas
1114
- is_isolated = (dist_c > cluster_dist_threshold or dist_t > trans_dist_threshold)
1115
-
1116
- # 2. Calculate isolation based on absolute difference rather than ratio
1117
- isolation_diff = dist_t - dist_c # Positive means farther from transitional areas
1118
- is_relatively_isolated = isolation_diff > 0 # Any positive difference counts
1119
-
1120
- # 3. Simplified region formation check
1121
- nearby_transitional = sum(1 for d in distances_to_transitional if d < trans_dist_threshold)
1122
- nearby_clusters = sum(1 for d in distances_to_clusters if d < cluster_dist_threshold)
1123
-
1124
- # Point forms new region if it has any cluster neighbors
1125
- forms_new_region = nearby_clusters > 0
1126
-
1127
- # Classification decision and immediate DataFrame update
1128
- # More lenient classification - if the point is isolated OR relatively isolated, mark as gap
1129
- if is_isolated or is_relatively_isolated:
1130
- true_sparse_indices.append(idx)
1131
- df.at[idx, 'point_type'] = 'sparse' # Immediately update DataFrame
1132
- total_classified_as_gaps += 1
1133
- else:
1134
- dense_noise_indices.append(idx)
1135
- df.at[idx, 'point_type'] = 'dense_noise' # Immediately update DataFrame
1136
- total_classified_as_transitional += 1
1137
-
1138
- print(f"\nFinal classification summary for scattered points:")
1139
- print(f"- Total scattered points: {len(scattered_indices)}")
1140
- print(f"- Classified as underexplored areas: {total_classified_as_gaps}")
1141
- print(f"- Classified as transitional: {total_classified_as_transitional}")
1142
- if total_classified_as_gaps == 0:
1143
- print("\nWarning: No scattered points were classified as underexplored areas!")
1144
- print("Possible reasons:")
1145
- print("1. Distance thresholds may be too high")
1146
- print("2. Relative distance ratio may be too strict")
1147
- print("3. Nearby points criterion may be too restrictive")
1148
-
1149
- if total_classified_as_transitional > 0:
1150
- # Create a transitional area for scattered points
1151
- scattered_transitional_patents = df.iloc[dense_noise_indices[-total_classified_as_transitional:]]
1152
- description = analyze_patent_group(scattered_transitional_patents, 'transitional', 'scattered')
1153
- area_number = len(transitional_areas) + 1 # 1-based numbering for display
1154
-
1155
- # Add to transitional areas
1156
- area_label = f"Transitional Area {area_number}"
1157
- transitional_areas.append({
1158
- 'label': area_label,
1159
- 'indices': dense_noise_indices[-total_classified_as_transitional:],
1160
- 'size': total_classified_as_transitional,
1161
- 'patents': scattered_transitional_patents,
1162
- 'description': description
1163
- })
1164
-
1165
- # Add to insights
1166
- area_insight = {
1167
- 'type': 'transitional',
1168
- 'id': -1, # Special ID for scattered points
1169
- 'size': total_classified_as_transitional,
1170
- 'label': f"{area_label} ({total_classified_as_transitional} patents)",
1171
- 'description': description
1172
- }
1173
- cluster_insights.append(area_insight)
1174
-
1175
- # --- Analyze underexplored areas ---
1176
- if len(true_sparse_indices) > 0:
1177
- update_progress('clustering', 'processing', f'Analyzing {len(true_sparse_indices)} potential underexplored areas...')
1178
- print(f"\nProcessing {len(true_sparse_indices)} underexplored areas...")
1179
- sparse_patents = df.iloc[true_sparse_indices]
1180
- sparse_points = scaled_embeddings[true_sparse_indices]
1181
-
1182
- # Ensure points are marked as sparse in the DataFrame
1183
- df.loc[true_sparse_indices, 'point_type'] = 'sparse'
1184
-
1185
- # More lenient subclustering parameters for underexplored areas
1186
- min_subcluster_size = max(2, min(5, len(true_sparse_indices) // 10)) # More lenient minimum size
1187
- sparse_clusterer = hdbscan.HDBSCAN(
1188
- min_cluster_size=min_subcluster_size,
1189
- min_samples=1, # Most lenient possible
1190
- cluster_selection_epsilon=0.8, # Even more lenient
1191
- cluster_selection_method='leaf', # Changed to leaf for finer subcluster detection
1192
- metric='euclidean'
1193
- )
1194
- sparse_labels = sparse_clusterer.fit_predict(sparse_points)
1195
-
1196
- # Collect innovation subclusters for sorting
1197
- innovation_subclusters = []
1198
- for label in set(sparse_labels):
1199
- subcluster_mask = sparse_labels == label
1200
- subcluster_patents = sparse_patents[subcluster_mask]
1201
- subcluster_size = len(subcluster_patents)
1202
-
1203
- # Accept all subclusters, even single points
1204
- description = analyze_patent_group(subcluster_patents, 'innovation_subcluster', label)
1205
- innovation_subclusters.append({
1206
- 'label': label,
1207
- 'size': subcluster_size,
1208
- 'patents': subcluster_patents,
1209
- 'description': description
1210
- })
1211
-
1212
- # Sort innovation subclusters by size in descending order
1213
- innovation_subclusters.sort(key=lambda x: x['size'], reverse=True)
1214
-
1215
- # Add sorted innovation subclusters to insights
1216
- total_subclusters = len(innovation_subclusters)
1217
- for idx, subcluster in enumerate(innovation_subclusters):
1218
- update_progress('clustering', 'processing', f'Analyzing underexplored area opportunity {idx + 1} of {total_subclusters} ({subcluster["size"]} patents)...')
1219
- cluster_insights.append({
1220
- 'type': 'innovation_subcluster',
1221
- 'id': idx + 1, # Store as 1-based ID
1222
- 'size': subcluster['size'],
1223
- 'label': f"Underexplored Area {idx + 1}",
1224
- 'description': subcluster['description']
1225
- })
1226
- else:
1227
- cluster_insights.append({
1228
- 'type': 'innovation_subcluster',
1229
- 'id': -1,
1230
- 'size': 0,
1231
- 'label': 'No Underexplored Areas',
1232
- 'description': 'No significant underexplored areas were detected in this technology space.'
1233
- })
1234
-
1235
  update_progress('visualization', 'processing', 'Creating interactive plot...')
1236
 
1237
- # Create Plotly figure with clusters
1238
- # Ensure all points are properly categorized
1239
- unassigned_mask = df['point_type'] == 'unassigned'
1240
- if any(unassigned_mask):
1241
- print(f"Warning: {sum(unassigned_mask)} points remain unassigned")
1242
- df.loc[unassigned_mask, 'point_type'] = 'cluster' # Default unassigned to clusters
1243
-
1244
- # Separate points into three categories: clusters, underexplored areas, and dense noise
1245
- cluster_mask = df['point_type'] == 'cluster'
1246
- innovation_gaps_mask = df['point_type'] == 'sparse'
1247
- dense_noise_mask = df['point_type'] == 'dense_noise'
1248
 
 
1249
  # Create hover text for all points
1250
  hover_text = []
1251
- # Create mapping for underexplored area points to their numbers
1252
- innovation_gap_map = {}
1253
-
1254
- # Map underexplored areas using the analyzed subclusters to ensure consistent numbering
1255
- if len(true_sparse_indices) > 0:
1256
- for idx, subcluster in enumerate(innovation_subclusters, 1):
1257
- for patent in subcluster['patents'].index:
1258
- innovation_gap_map[patent] = idx
1259
-
1260
- # Create mapping for transitional areas
1261
- transitional_area_map = {}
1262
- for area_idx, area in enumerate(transitional_areas):
1263
- for idx in area['indices']:
1264
- transitional_area_map[idx] = {'number': area_idx + 1}
1265
-
1266
- # Generate hover text for each point
1267
  for idx, row in df.iterrows():
1268
- point_info = ""
1269
- if row['point_type'] == 'sparse':
1270
- gap_number = innovation_gap_map.get(idx)
1271
- if gap_number:
1272
- point_info = f"<br><b>Region:</b> Underexplored Area {gap_number}"
1273
- else:
1274
- point_info = "<br><b>Region:</b> Potential Innovation Area"
1275
- elif row['point_type'] == 'dense_noise':
1276
- area_info = transitional_area_map.get(idx)
1277
- if area_info:
1278
- point_info = f"<br><b>Region:</b> Transitional Area {area_info['number']}"
1279
- else:
1280
- # This is a scattered transitional point
1281
- point_info = f"<br><b>Region:</b> Transitional Area {len(transitional_areas)} (Scattered)"
1282
- else:
1283
- point_info = f"<br><b>Cluster:</b> {int(row['cluster']) + 1}" # Cluster IDs are still 0-based in the DataFrame
1284
-
1285
  text = (
1286
  f"<b>{row['title']}</b><br><br>"
1287
  f"<b>By:</b> {row['assignee']} ({row['year']})<br>"
1288
- f"{point_info}<br><br>"
1289
  f"<b>Abstract:</b><br>{row['abstract']}"
1290
  )
1291
  hover_text.append(text)
1292
 
1293
- # Create three separate traces: clusters, underexplored areas, and dense noise points
1294
  cluster_trace = go.Scatter3d(
1295
- x=df[cluster_mask]['x'],
1296
- y=df[cluster_mask]['y'],
1297
- z=df[cluster_mask]['z'],
1298
  mode='markers',
1299
  marker=dict(
1300
  size=6,
1301
- color=clusters[cluster_mask] + 1, # Add 1 to shift cluster numbers from 0-based to 1-based
1302
  colorscale='Viridis',
1303
- opacity=0.5,
1304
  showscale=True,
1305
  colorbar=dict(
1306
- title="Clusters",
1307
- ticktext=[f"Cluster {i+1}" for i in range(n_clusters)], # Custom tick labels
1308
- tickvals=list(range(1, n_clusters + 1)), # Values to match the 1-based cluster numbers
1309
- tickmode="array",
1310
  tickfont=dict(size=10),
1311
- titlefont=dict(size=10)
1312
  )
1313
  ),
1314
- text=[hover_text[i] for i in range(len(hover_text)) if cluster_mask[i]],
1315
  hoverinfo='text',
1316
- name='Clusters',
1317
  hoverlabel=dict(
1318
  bgcolor="white",
1319
  font_size=12,
1320
  font_family="Arial",
1321
  align="left"
1322
  ),
1323
- customdata=[df['link'].tolist()[i] for i in range(len(df)) if cluster_mask[i]]
1324
  )
1325
 
1326
- innovation_gaps_trace = go.Scatter3d(
1327
- x=df[innovation_gaps_mask]['x'],
1328
- y=df[innovation_gaps_mask]['y'],
1329
- z=df[innovation_gaps_mask]['z'],
1330
- mode='markers',
1331
- marker=dict(
1332
- size=6, # Same size as other points
1333
- color='rgb(255, 0, 0)', # Pure bright red
1334
- symbol='diamond',
1335
- opacity=1.0, # Full opacity for visibility
1336
- line=dict(
1337
- color='white',
1338
- width=1 # Thinner border to match other points
1339
- )
1340
- ),
1341
- text=[hover_text[i] for i in range(len(hover_text)) if innovation_gaps_mask[i]],
1342
- hoverinfo='text',
1343
- name='Underexplored Areas',
1344
- hoverlabel=dict(
1345
- bgcolor="white",
1346
- font_size=12,
1347
- font_family="Arial",
1348
- align="left"
1349
- ),
1350
- customdata=[df['link'].tolist()[i] for i in range(len(df)) if innovation_gaps_mask[i]]
1351
- )
1352
-
1353
- dense_noise_trace = go.Scatter3d(
1354
- x=df[dense_noise_mask]['x'],
1355
- y=df[dense_noise_mask]['y'],
1356
- z=df[dense_noise_mask]['z'],
1357
- mode='markers',
1358
- marker=dict(
1359
- size=6, # Same size as other points
1360
- color='rgb(255, 165, 0)', # Orange for transitional areas
1361
- symbol='circle',
1362
- opacity=0.7, # Less opacity to make gaps more visible
1363
- line=dict(
1364
- color='white',
1365
- width=1 # Thin border
1366
- )
1367
- ),
1368
- text=[hover_text[i] for i in range(len(hover_text)) if dense_noise_mask[i]],
1369
- hoverinfo='text',
1370
- name='Transitional Areas',
1371
- hoverlabel=dict(
1372
- bgcolor="white",
1373
- font_size=12,
1374
- font_family="Arial",
1375
- align="left"
1376
- ),
1377
- customdata=[df['link'].tolist()[i] for i in range(len(df)) if dense_noise_mask[i]]
1378
- )
1379
-
1380
- fig = go.Figure(data=[cluster_trace, innovation_gaps_trace, dense_noise_trace])
1381
 
1382
  # Update layout
1383
  fig.update_layout(
1384
- title="Patent Technology Landscape",
1385
  scene=dict(
1386
  xaxis_title="UMAP 1",
1387
  yaxis_title="UMAP 2",
@@ -1389,28 +814,16 @@ def create_3d_visualization(patents):
1389
  camera=dict(
1390
  up=dict(x=0, y=0, z=1),
1391
  center=dict(x=0, y=0, z=0),
1392
- eye=dict(x=1.8, y=1.8, z=1.8) # Slightly further out for better overview
1393
  ),
1394
- aspectmode='cube' # Force equal scaling
1395
  ),
1396
  margin=dict(l=0, r=0, b=0, t=30),
1397
- showlegend=True,
1398
  template="plotly_dark",
1399
  hoverlabel_align='left',
1400
  hoverdistance=100,
1401
- hovermode='closest',
1402
- legend=dict(
1403
- yanchor="top",
1404
- y=0.99,
1405
- xanchor="left",
1406
- x=0.01,
1407
- bgcolor="rgba(0,0,0,0.7)", # Darker background for better contrast
1408
- font=dict(
1409
- color="white",
1410
- size=12
1411
- ),
1412
- itemsizing='constant' # Keep legend marker sizes consistent
1413
- )
1414
  )
1415
 
1416
  # Configure hover behavior
@@ -1455,7 +868,7 @@ def generate_analysis(prompt, cluster_insights):
1455
  analysis = response.choices[0].message['content']
1456
 
1457
  # Validate that analysis references valid areas
1458
- area_pattern = r'(?:Cluster|Transitional Area|Underexplored Area)\s+(\d+)'
1459
  referenced_areas = set(int(num) for num in re.findall(area_pattern, analysis))
1460
 
1461
  # Extract valid area numbers from insights
@@ -1478,92 +891,70 @@ def generate_analysis(prompt, cluster_insights):
1478
 
1479
  def analyze_innovation_opportunities(cluster_insights):
1480
  """
1481
- Analyze relationships between different areas to identify potential innovation opportunities.
1482
- Returns focused analysis of high-value innovation opportunities between existing technology areas.
1483
  """
1484
  # Extract cluster numbers and validate
1485
  cluster_nums = set()
1486
- transitional_nums = set()
1487
- underexplored_nums = set()
1488
 
1489
  # Parse and validate cluster numbers with explicit error checking
1490
  for insight in cluster_insights:
1491
  area_type = insight.get('type', '')
1492
  area_id = insight.get('id', -1)
1493
 
1494
- if area_id < 0 and area_type != 'cluster':
1495
- continue
1496
-
1497
- if area_type == 'cluster':
1498
  cluster_nums.add(area_id)
1499
- elif area_type == 'transitional':
1500
- transitional_nums.add(area_id)
1501
- elif area_type == 'innovation_subcluster':
1502
- if area_id >= 1: # Skip the "No underexplored areas" entry
1503
- underexplored_nums.add(area_id)
1504
-
1505
- # Format areas with validation
1506
- def format_area_list(area_nums):
1507
- return f"Areas {', '.join(str(n) for n in sorted(area_nums))}" if area_nums else "None identified"
1508
 
1509
- # Only generate analysis if we have areas to analyze
1510
- if not any([cluster_nums, transitional_nums, underexplored_nums]):
1511
- return "No distinct areas found. Try broadening search terms or increasing patent count."
1512
 
1513
- # Create descriptions list with more detailed information
1514
  descriptions = []
1515
  cluster_details = {}
1516
- transitional_details = {}
1517
- underexplored_details = {}
1518
 
1519
  for insight in cluster_insights:
1520
- if insight.get('description'):
1521
- area_type = insight.get('type', '')
1522
  area_id = int(insight.get('id', -1)) # 1-based IDs
1523
  area_size = insight.get('size', 0)
1524
 
1525
- if area_type == 'cluster':
1526
- desc = f"C{area_id}:{insight['description']}"
1527
- descriptions.append(desc)
1528
- cluster_details[area_id] = {'description': insight['description'], 'size': area_size}
1529
- elif area_type == 'transitional':
1530
- desc = f"T{area_id}:{insight['description']}"
1531
- descriptions.append(desc)
1532
- transitional_details[area_id] = {'description': insight['description'], 'size': area_size}
1533
- elif area_type == 'innovation_subcluster' and insight['id'] >= 1:
1534
- desc = f"U{area_id}:{insight['description']}"
1535
- descriptions.append(desc)
1536
- underexplored_details[area_id] = {'description': insight['description'], 'size': area_size}
1537
 
1538
  # Format descriptions as a string with newlines
1539
  descriptions_text = '\n'.join(descriptions)
1540
 
1541
- prompt = f"""Available Areas:
1542
- Clusters: {format_area_list(cluster_nums)}
1543
- Transitional Areas: {format_area_list(transitional_nums)}
1544
- Underexplored Areas: {format_area_list(underexplored_nums)}
1545
- Area Descriptions:
1546
  {descriptions_text}
1547
- I need you to identify 3-4 high-value innovation opportunities in this patent landscape. Focus on creating REAL business value through either:
1548
- A) Connecting complementary technologies from different areas, OR
1549
- B) Developing promising technologies within underexplored/transitional areas
 
 
1550
  For each opportunity:
1551
- 1. Select either ONE area with internal innovation potential OR two technologically adjacent areas that can be connected
1552
- 2. Identify a specific technical or market gap (either within the area or between areas)
1553
  3. Propose a concrete solution that addresses this gap
1554
  4. Quantify potential business impact and competitive advantage
 
1555
  Follow this precise format:
1556
  Opportunity N: [Title that describes the innovation]
1557
- Source: [Single area (e.g., "Underexplored Area 2") OR combination (e.g., "Cluster 1 + Transitional Area 3")]
1558
  - Gap: [Specific technical or market gap that represents an unmet need]
1559
  - Solution: [Practical, implementable technical approach]
1560
  - Impact: [Specific business value creation - market size, efficiency gains, cost reduction]
1561
  - Timeline: [Short-term (1-2 years) or medium-term (3-5 years)]
 
1562
  Prioritize opportunities based on:
1563
  1. Commercial potential (market size, growth potential)
1564
  2. Technical feasibility (can be implemented with current or near-term technology)
1565
  3. Competitive advantage (uniqueness, barriers to entry)
1566
  4. Alignment with industry trends (sustainability, automation, digitalization)
 
1567
  Focus on practical innovations that could realistically be implemented by a company rather than theoretical or speculative concepts."""
1568
 
1569
  # Get analysis from LLM
@@ -2000,9 +1391,7 @@ def download_plot():
2000
  <h1>Patent Technology Landscape</h1>
2001
  <p><strong>Instructions:</strong> Click on any point to open the corresponding Google Patents page in a new tab.</p>
2002
  <p><strong>Legend:</strong>
2003
- <span style="color: #636EFA;">● Clusters</span> |
2004
- <span style="color: #FF0000;">♦ Underexplored Areas</span> |
2005
- <span style="color: #FFA500;">● Transitional Areas</span>
2006
  </p>
2007
  </div>
2008
  <div id="plot"></div>
@@ -2244,30 +1633,6 @@ def download_insights():
2244
  cluster_count += 1
2245
  print(f"Added {cluster_count} clusters")
2246
 
2247
- # Add transitional areas
2248
- print("Adding transitional areas section...")
2249
- story.append(Paragraph("Transitional Areas", heading_style))
2250
- trans_count = 0
2251
- for insight in insights:
2252
- if insight['type'] == 'transitional':
2253
- text = f"<b>Transitional Area {insight['id']}:</b> {insight['description']}"
2254
- story.append(Paragraph(text, normal_style))
2255
- story.append(Spacer(1, 12))
2256
- trans_count += 1
2257
- print(f"Added {trans_count} transitional areas")
2258
-
2259
- # Add underexplored areas
2260
- print("Adding underexplored areas section...")
2261
- story.append(Paragraph("Underexplored Areas", heading_style))
2262
- underexplored_count = 0
2263
- for insight in insights:
2264
- if insight['type'] == 'innovation_subcluster':
2265
- text = f"<b>Underexplored Area {insight['id']}:</b> {insight['description']}"
2266
- story.append(Paragraph(text, normal_style))
2267
- story.append(Spacer(1, 12))
2268
- underexplored_count += 1
2269
- print(f"Added {underexplored_count} underexplored areas")
2270
-
2271
  # Build PDF
2272
  print("Building final PDF document...")
2273
  doc.build(story)
 
489
  return all_patents
490
 
491
  def analyze_patent_group(patents, group_type, label, max_retries=3):
492
+ """Analyze patent clusters using ChatGPT with improved formatting and concise output"""
493
  # Extract key information from all patents in the group
494
  patent_count = len(patents)
495
  years_range = f"{patents['year'].min()}-{patents['year'].max()}"
 
535
  else:
536
  top_assignees = ", ".join(patents['assignee'].unique())
537
 
538
+ # Enhanced prompt template for cluster analysis
539
+ base_prompt = f"""Patent cluster analysis ({patent_count} patents, {years_range}):
540
  Key players: {top_assignees}
541
  Core technologies: {key_terms}
542
  Sample innovations: {example_titles}
 
544
  Provide concise analysis in exactly this format:
545
  **Technology Focus:** [What specific problem/need this cluster addresses]
546
  **Market Applications:** [Primary commercial uses and target industries]
547
+ **Innovation Trajectory:** [How this technology is evolving and future direction]"""
548
 
549
+ system_prompt = "You are a patent analyst providing strategic technology insights. Focus on commercial relevance and market opportunities."
550
 
551
  retry_count = 0
552
  while retry_count < max_retries:
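The body of this retry loop is unchanged by the commit and not shown in the hunk. A minimal sketch of the pattern, assuming the legacy openai<1.0 client (consistent with `response.choices[0].message['content']` used later in app.py); the model name is an assumption:

```python
import time
import openai

def chat_with_retries(system_prompt, base_prompt, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",  # assumed; not shown in this diff
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": base_prompt},
                ],
            )
            return response.choices[0].message['content']
        except Exception:
            time.sleep(2 ** attempt)  # simple exponential backoff
    return None
```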
 
569
  analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
570
  analysis = re.sub(r'(?i)market applications:', '**Market Applications:**', analysis)
571
  analysis = re.sub(r'(?i)innovation trajectory:', '**Innovation Trajectory:**', analysis)
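For illustration, these substitutions rewrite unbolded section headers into the expected bold markers regardless of case:

```python
import re

analysis = "Technology focus: solid-state LiDAR for autonomous vehicles"
analysis = re.sub(r'(?i)technology focus:', '**Technology Focus:**', analysis)
print(analysis)  # **Technology Focus:** solid-state LiDAR for autonomous vehicles
```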
572
 
573
  # Clean up whitespace and formatting
574
  analysis = re.sub(r'\n\s*\n', '\n', analysis) # Remove multiple blank lines
 
592
  """
593
  Create a 3D visualization of patent embeddings using UMAP and Plotly
594
  """
595
+ # Initialize variables for tracking clusters
596
  df = pd.DataFrame(patents)
597
 
598
  if not patents:
599
  return None
 
648
  df['y'] = embedding_3d[:, 1]
649
  df['z'] = embedding_3d[:, 2]
650
 
651
+ # --- Simplified HDBSCAN clustering for technological clusters ---
652
  scaler = StandardScaler()
653
  scaled_embeddings = scaler.fit_transform(embedding_3d)
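The UMAP step that produces `embedding_3d` sits outside this hunk. A sketch of the typical call, assuming `embeddings` holds the patent text embeddings; the parameters here are illustrative, not the app's actual settings:

```python
import umap

reducer = umap.UMAP(n_components=3, random_state=42)  # assumed parameters
embedding_3d = reducer.fit_transform(embeddings)      # shape: (n_patents, 3)
```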
654
 
655
  n_points = len(scaled_embeddings)
656
  update_progress('clustering', 'processing', f'Analyzing {n_points} patents for clustering...')
657
 
658
+ # Dynamically set clustering parameters based on dataset size
659
  if n_points < 100:
660
+ min_cluster_size = max(5, int(n_points * 0.08))
661
  elif n_points < 500:
662
+ min_cluster_size = max(8, int(n_points * 0.05))
663
  elif n_points < 1000:
664
+ min_cluster_size = max(15, int(n_points * 0.03))
665
  else:
666
+ min_cluster_size = max(20, int(n_points * 0.02))
667
 
668
+ min_samples = max(3, int(min_cluster_size * 0.7))
669
+
670
+ print(f"HDBSCAN clustering: min_cluster_size={min_cluster_size}, min_samples={min_samples}")
671
+
672
+ # Apply HDBSCAN clustering
673
+ hdb = hdbscan.HDBSCAN(
674
+ min_cluster_size=min_cluster_size,
675
+ min_samples=min_samples,
676
+ cluster_selection_epsilon=0.1,
677
+ cluster_selection_method='eom',
678
+ metric='euclidean'
679
+ )
680
+ clusters = hdb.fit_predict(scaled_embeddings)
681
+
682
+ # Assign noise points to nearest cluster
683
+ noise_mask = clusters == -1
684
+ if any(noise_mask) and len(set(clusters)) > 1:
685
+ print(f"Assigning {sum(noise_mask)} noise points to nearest clusters...")
686
+ # Get cluster centers
687
+ cluster_centers = []
688
+ cluster_labels = []
689
+ for label in set(clusters):
690
+ if label != -1:
691
+ cluster_mask = clusters == label
692
+ center = np.mean(scaled_embeddings[cluster_mask], axis=0)
693
+ cluster_centers.append(center)
694
+ cluster_labels.append(label)
695
+
696
+ if cluster_centers:
697
+ cluster_centers = np.array(cluster_centers)
698
+ noise_points = scaled_embeddings[noise_mask]
699
 
700
+ # Find nearest cluster for each noise point
701
+ nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
702
+ _, nearest_indices = nbrs.kneighbors(noise_points)
703
+
704
+ # Assign noise points to nearest clusters
705
+ noise_indices = np.where(noise_mask)[0]
706
+ for i, nearest_idx in enumerate(nearest_indices.flatten()):
707
+ clusters[noise_indices[i]] = cluster_labels[nearest_idx]
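The nearest-centroid loop above guarantees every patent receives a cluster label. For comparison, a sketch of HDBSCAN's built-in soft assignment, which requires constructing the clusterer with `prediction_data=True` (not set above) and may still leave true outliers at -1:

```python
import hdbscan

hdb = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                      min_samples=min_samples,
                      prediction_data=True)  # assumption: differs from the code above
labels = hdb.fit_predict(scaled_embeddings)
noise = scaled_embeddings[labels == -1]
soft_labels, strengths = hdbscan.approximate_predict(hdb, noise)
```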
708
 
709
  df['cluster'] = clusters
710
 
711
+ # --- Gather clusters and analyze them ---
712
  cluster_info = []
713
+ n_clusters = len(set(clusters))
714
+
715
  for label in set(clusters):
716
+ cluster_mask = clusters == label
717
+ cluster_patents = df[cluster_mask]
718
+ if len(cluster_patents) > 0:
719
+ cluster_info.append((label, len(cluster_patents), cluster_patents))
 
720
 
721
  # Sort clusters by size in descending order
722
  cluster_info.sort(key=lambda x: x[1], reverse=True)
723
 
724
+ print(f"\nFinal Clustering Results:")
725
+ print(f"Number of technological clusters: {n_clusters}")
726
+ print(f"Total patents clustered: {len(df)}")
727
  print("\nCluster Size Distribution:")
728
  for i, (label, size, _) in enumerate(cluster_info):
729
+ print(f"Cluster {i + 1}: {size} patents")
730
 
731
+ # Create mapping for new cluster IDs (1-based)
732
+ cluster_id_map = {old_label: i + 1 for i, (old_label, _, _) in enumerate(cluster_info)}
733
 
734
+ # Update cluster IDs in DataFrame to be 1-based
735
  new_clusters = clusters.copy()
736
  for old_label, new_label in cluster_id_map.items():
737
  new_clusters[clusters == old_label] = new_label
738
  df['cluster'] = new_clusters
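The in-place remap is safe because each mask is computed against the original `clusters` array, not the partially rewritten copy. An equivalent one-liner, assuming noise points were already reassigned so every label appears in `cluster_id_map`:

```python
import numpy as np

new_clusters = np.array([cluster_id_map[c] for c in clusters])
```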
739
 
740
+ update_progress('clustering', 'processing', 'Analyzing technological clusters...')
741
 
742
+ # Analyze each cluster
743
+ cluster_insights = []
 
 
 
744
  total_clusters = len(cluster_info)
745
+ for i, (_, size, cluster_patents) in enumerate(cluster_info):
746
+ cluster_id = i + 1 # 1-based cluster ID
747
+ update_progress('clustering', 'processing', f'Analyzing cluster {cluster_id} of {total_clusters} ({size} patents)...')
748
+ description = analyze_patent_group(cluster_patents, 'cluster', cluster_id)
749
  cluster_insights.append({
750
  'type': 'cluster',
751
+ 'id': cluster_id,
752
  'size': size,
753
+ 'label': f"Cluster {cluster_id}",
754
  'description': description
755
  })
756

757
  update_progress('visualization', 'processing', 'Creating interactive plot...')
758

759
 
760
+ # Create Plotly figure with clusters only
761
  # Create hover text for all points
762
  hover_text = []
763
  for idx, row in df.iterrows():
764
  text = (
765
  f"<b>{row['title']}</b><br><br>"
766
  f"<b>By:</b> {row['assignee']} ({row['year']})<br>"
767
+ f"<b>Cluster:</b> {int(row['cluster'])}<br><br>"
768
  f"<b>Abstract:</b><br>{row['abstract']}"
769
  )
770
  hover_text.append(text)
771
 
772
+ # Create single trace for all clusters
773
  cluster_trace = go.Scatter3d(
774
+ x=df['x'],
775
+ y=df['y'],
776
+ z=df['z'],
777
  mode='markers',
778
  marker=dict(
779
  size=6,
780
+ color=df['cluster'],
781
  colorscale='Viridis',
782
+ opacity=0.7,
783
  showscale=True,
784
  colorbar=dict(
785
+ title="Technology Clusters",
786
+ tickmode="linear",
787
+ tick0=1,
788
+ dtick=1,
789
  tickfont=dict(size=10),
790
+ titlefont=dict(size=12)
791
  )
792
  ),
793
+ text=hover_text,
794
  hoverinfo='text',
795
+ name='Technology Clusters',
796
  hoverlabel=dict(
797
  bgcolor="white",
798
  font_size=12,
799
  font_family="Arial",
800
  align="left"
801
  ),
802
+ customdata=df['link'].tolist()
803
  )
804
 
805
+ fig = go.Figure(data=[cluster_trace])
806
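For context on how `customdata` is consumed: the click-to-open behavior lives in the exported HTML rather than in the trace itself. A sketch using plotly's `post_script` hook, where `{plot_id}` is substituted by plotly at write time and the file name is illustrative:

```python
import plotly.io as pio

post_js = (
    "document.getElementById('{plot_id}').on('plotly_click', function(d) {"
    " window.open(d.points[0].customdata, '_blank'); });"
)
pio.write_html(fig, "landscape.html", post_script=post_js)
```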
 
807
  # Update layout
808
  fig.update_layout(
809
+ title="Patent Technology Landscape - Cluster Analysis",
810
  scene=dict(
811
  xaxis_title="UMAP 1",
812
  yaxis_title="UMAP 2",
 
814
  camera=dict(
815
  up=dict(x=0, y=0, z=1),
816
  center=dict(x=0, y=0, z=0),
817
+ eye=dict(x=1.8, y=1.8, z=1.8)
818
  ),
819
+ aspectmode='cube'
820
  ),
821
  margin=dict(l=0, r=0, b=0, t=30),
822
+ showlegend=False, # Single trace doesn't need legend
823
  template="plotly_dark",
824
  hoverlabel_align='left',
825
  hoverdistance=100,
826
+ hovermode='closest'
827
  )
828
 
829
  # Configure hover behavior
 
868
  analysis = response.choices[0].message['content']
869
 
870
  # Validate that analysis references valid areas
871
+ area_pattern = r'(?:Cluster)\s+(\d+)'
872
  referenced_areas = set(int(num) for num in re.findall(area_pattern, analysis))
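Example of the validation regex; the non-capturing group is a remnant of the removed `Transitional Area`/`Underexplored Area` alternatives:

```python
import re

area_pattern = r'(?:Cluster)\s+(\d+)'
print({int(n) for n in re.findall(area_pattern, "Source: Cluster 1 + Cluster 3")})
# {1, 3}
```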
873
 
874
  # Extract valid area numbers from insights
 
891
 
892
  def analyze_innovation_opportunities(cluster_insights):
893
  """
894
+ Analyze technology clusters to identify potential innovation opportunities.
895
+ Returns focused analysis of high-value innovation opportunities within and between technology clusters.
896
  """
897
  # Extract cluster numbers and validate
898
  cluster_nums = set()
 
 
899
 
900
  # Parse and validate cluster numbers with explicit error checking
901
  for insight in cluster_insights:
902
  area_type = insight.get('type', '')
903
  area_id = insight.get('id', -1)
904
 
905
+ if area_type == 'cluster' and area_id > 0:
906
  cluster_nums.add(area_id)
907
 
908
+ # Only generate analysis if we have clusters to analyze
909
+ if not cluster_nums:
910
+ return "No technology clusters found. Try broadening search terms or increasing patent count."
911
 
912
+ # Create descriptions list with cluster information
913
  descriptions = []
914
  cluster_details = {}
 
 
915
 
916
  for insight in cluster_insights:
917
+ if insight.get('description') and insight.get('type') == 'cluster':
 
918
  area_id = int(insight.get('id', -1)) # 1-based IDs
919
  area_size = insight.get('size', 0)
920
 
921
+ desc = f"C{area_id}:{insight['description']}"
922
+ descriptions.append(desc)
923
+ cluster_details[area_id] = {'description': insight['description'], 'size': area_size}
924
 
925
  # Format descriptions as a string with newlines
926
  descriptions_text = '\n'.join(descriptions)
927
 
928
+ prompt = f"""Technology Clusters Available:
929
+ Clusters: {', '.join(f'Cluster {n}' for n in sorted(cluster_nums))}
930
+
931
+ Cluster Descriptions:
 
932
  {descriptions_text}
933
+
934
+ I need you to identify 3-4 high-value innovation opportunities in this patent technology landscape. Focus on creating REAL business value through either:
935
+ A) Cross-pollinating technologies between different clusters, OR
936
+ B) Identifying innovation gaps within individual clusters
937
+
938
  For each opportunity:
939
+ 1. Select either ONE cluster with internal innovation potential OR two complementary clusters that can be combined
940
+ 2. Identify a specific technical or market gap within or between the selected clusters
941
  3. Propose a concrete solution that addresses this gap
942
  4. Quantify potential business impact and competitive advantage
943
+
944
  Follow this precise format:
945
  Opportunity N: [Title that describes the innovation]
946
+ Source: [Single cluster (e.g., "Cluster 2") OR combination (e.g., "Cluster 1 + Cluster 3")]
947
  - Gap: [Specific technical or market gap that represents an unmet need]
948
  - Solution: [Practical, implementable technical approach]
949
  - Impact: [Specific business value creation - market size, efficiency gains, cost reduction]
950
  - Timeline: [Short-term (1-2 years) or medium-term (3-5 years)]
951
+
952
  Prioritize opportunities based on:
953
  1. Commercial potential (market size, growth potential)
954
  2. Technical feasibility (can be implemented with current or near-term technology)
955
  3. Competitive advantage (uniqueness, barriers to entry)
956
  4. Alignment with industry trends (sustainability, automation, digitalization)
957
+
958
  Focus on practical innovations that could realistically be implemented by a company rather than theoretical or speculative concepts."""
959
 
960
  # Get analysis from LLM
 
1391
  <h1>Patent Technology Landscape</h1>
1392
  <p><strong>Instructions:</strong> Click on any point to open the corresponding Google Patents page in a new tab.</p>
1393
  <p><strong>Legend:</strong>
1394
+ <span style="color: #636EFA;">● Technology Clusters</span>
1395
  </p>
1396
  </div>
1397
  <div id="plot"></div>
 
1633
  cluster_count += 1
1634
  print(f"Added {cluster_count} clusters")
1635

1636
  # Build PDF
1637
  print("Building final PDF document...")
1638
  doc.build(story)
templates/index.html CHANGED
@@ -4,7 +4,12 @@
4
  <title>Patent Explorer</title>
5
  <meta charset="utf-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1">
7
- <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/tailwind.min.css" rel="stylesheet">
8
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
9
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
10
  <style>
@@ -209,7 +214,7 @@
209
  <h3 class="text-lg font-semibold text-blue-300 mb-3">πŸ“ Interactive Visualization Guide</h3>
210
  <div class="text-gray-300 mb-3">
211
  <p class="mb-2"><strong>Click any point</strong> to open the corresponding Google Patents page in a new tab.</p>
212
- <p class="mb-3"><strong>Hover over points</strong> to see detailed patent information including title, assignee, year, and main technical ideas.</p>
213
  </div>
214
  <div class="flex flex-wrap items-center">
215
  <span class="text-sm font-medium text-gray-400 mr-3">Legend:</span>
@@ -471,11 +476,7 @@
471
  if (response.insights) {
472
  console.log('Displaying insights...');
473
  const clusters = response.insights.filter(i => i.type === 'cluster');
474
- const innovationSubclusters = response.insights.filter(i => i.type === 'innovation_subcluster');
475
- const transitionalAreas = response.insights.filter(i => i.type === 'transitional');
476
  console.log('Found clusters:', clusters.length);
477
- console.log('Found innovation subclusters:', innovationSubclusters.length);
478
- console.log('Found transitional areas:', transitionalAreas.length);
479
 
480
  // Start with Innovation Analysis
481
  let insightsHtml = '';
@@ -490,11 +491,8 @@
490
  `;
491
  }
492
 
493
- // Add the grid for clusters, transitional areas, and underexplored areas
494
- insightsHtml += '<div class="grid grid-cols-1 lg:grid-cols-3 gap-6 p-6">';
495
-
496
- // Left column: Technology Clusters
497
- insightsHtml += '<div class="col-span-1">';
498
  insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-blue-400">Technology Clusters</h3>';
499
 
500
  if (clusters.length > 0) {
@@ -512,45 +510,7 @@
512
  insightsHtml += '<p class="text-gray-400">No technology clusters identified.</p>';
513
  }
514
  insightsHtml += '</div>';
515
- // Middle column: Transitional Areas
516
- insightsHtml += '<div class="col-span-1">';
517
- insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-orange-400">Transitional Areas</h3>';
518
 
519
- if (transitionalAreas.length > 0) {
520
- insightsHtml += '<div class="space-y-4">';
521
- transitionalAreas.forEach(area => {
522
- insightsHtml += `
523
- <div class="transitional-card p-6 text-base" style="background-color: #4d3d2d;">
524
- <div class="text-orange-300 text-lg font-bold mb-3">${area.label}</div>
525
- <div class="text-gray-300 whitespace-pre-line leading-relaxed">${area.description}</div>
526
- </div>
527
- `;
528
- });
529
- insightsHtml += '</div>';
530
- } else {
531
- insightsHtml += '<p class="text-gray-400">No transitional areas identified.</p>';
532
- }
533
- insightsHtml += '</div>';
534
- // Right column: Underexplored Areas
535
- insightsHtml += '<div class="col-span-1">';
536
- insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-green-400">Underexplored Areas</h3>';
537
-
538
- if (innovationSubclusters.length > 0) {
539
- insightsHtml += '<div class="space-y-4">';
540
- innovationSubclusters.forEach(subcluster => {
541
- insightsHtml += `
542
- <div class="opportunity-card p-6 text-base">
543
- <div class="text-green-300 text-lg font-bold mb-3">${subcluster.label} (${subcluster.size} patents)</div>
544
- <div class="text-gray-300 whitespace-pre-line leading-relaxed">${subcluster.description}</div>
545
- </div>
546
- `;
547
- });
548
- insightsHtml += '</div>';
549
- } else {
550
- insightsHtml += '<p class="text-gray-400">No significant underexplored areas identified in this technology space.</p>';
551
- }
552
- insightsHtml += '</div>';
553
- insightsHtml += '</div>';
554
  $('#insights').html(insightsHtml);
555
  } else {
556
  console.warn('No insights data received');
 
4
  <title>Patent Explorer</title>
5
  <meta charset="utf-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1">
7
+ <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/tailwind.min.css" rel="stylesheet">
8
+ <div class="legend-item">
9
+ <div class="legend-dot" style="background-color: #636EFA;"></div>
10
+ <span class="text-sm">Technology Clusters</span>
11
+ </div>
12
+ </div>
13
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
14
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
15
  <style>
 
214
  <h3 class="text-lg font-semibold text-blue-300 mb-3">πŸ“ Interactive Visualization Guide</h3>
215
  <div class="text-gray-300 mb-3">
216
  <p class="mb-2"><strong>Click any point</strong> to open the corresponding Google Patents page in a new tab.</p>
217
+ <p class="mb-3"><strong>Hover over points</strong> to see detailed patent information including title, assignee, year, and abstract.</p>
218
  </div>
219
  <div class="flex flex-wrap items-center">
220
  <span class="text-sm font-medium text-gray-400 mr-3">Legend:</span>
 
476
  if (response.insights) {
477
  console.log('Displaying insights...');
478
  const clusters = response.insights.filter(i => i.type === 'cluster');
479
  console.log('Found clusters:', clusters.length);
480
 
481
  // Start with Innovation Analysis
482
  let insightsHtml = '';
 
491
  `;
492
  }
493
 
494
+ // Technology Clusters section
495
+ insightsHtml += '<div class="p-6">';
496
  insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-blue-400">Technology Clusters</h3>';
497
 
498
  if (clusters.length > 0) {
 
510
  insightsHtml += '<p class="text-gray-400">No technology clusters identified.</p>';
511
  }
512
  insightsHtml += '</div>';
513

514
  $('#insights').html(insightsHtml);
515
  } else {
516
  console.warn('No insights data received');