from flask import Flask, render_template, request, jsonify, Response
from dotenv import load_dotenv
import requests
from datetime import datetime
import os
import json
import openai
import numpy as np
import pickle
from pathlib import Path
import umap_config
import umap
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import time
import queue
import threading
load_dotenv()

app = Flask(__name__)

# Get API keys from environment variables
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

MAX_PATENTS = 300  # Limit number of patents to process
CACHE_FILE = 'patent_embeddings_cache.pkl'

# Global progress queue for SSE updates
progress_queue = queue.Queue()
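# Note: a single module-level queue means progress updates are shared across
# requests; adequate for a single-user demo, not for concurrent searches.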
if not SERPAPI_API_KEY:
    raise ValueError("SERPAPI_API_KEY environment variable is not set")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable is not set")

# Initialize OpenAI API key
openai.api_key = OPENAI_API_KEY
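# Note: the OpenAI calls below (openai.Embedding.create / openai.ChatCompletion.create)
# use the legacy openai<1.0 Python SDK interface.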
def load_cache():
    """Load cached embeddings from file"""
    try:
        if os.path.exists(CACHE_FILE):
            with open(CACHE_FILE, 'rb') as f:
                return pickle.load(f)
    except Exception as e:
        print(f"Error loading cache: {e}")
    return {}

def save_cache(cache):
    """Save embeddings cache to file"""
    try:
        with open(CACHE_FILE, 'wb') as f:
            pickle.dump(cache, f)
    except Exception as e:
        print(f"Error saving cache: {e}")
def get_embedding(text, cache):
    """Get embedding for text, using cache if available"""
    if not text or text.strip() == "":
        return None
    if text in cache:
        return cache[text]
    try:
        response = openai.Embedding.create(
            model="text-embedding-3-small",
            input=text
        )
        embedding = response['data'][0]['embedding']
        if embedding:  # Only cache if we got a valid embedding
            cache[text] = embedding
            save_cache(cache)  # Save cache after each new embedding
        return embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return None
def search_patents(keywords, page_size=100):
    """
    Search patents using SerpApi's Google Patents API with pagination and generate embeddings
    """
    # Load existing cache
    embedding_cache = load_cache()

    all_patents = []
    page = 1
    total_processed = 0

    while len(all_patents) < MAX_PATENTS:
        update_progress('search', f'Fetching page {page} of patents...')

        # SerpApi Google Patents API endpoint
        api_url = "https://serpapi.com/search"
        params = {
            "engine": "google_patents",
            "q": keywords,
            "api_key": SERPAPI_API_KEY,
            "num": page_size,
            "start": (page - 1) * page_size
        }

        try:
            response = requests.get(api_url, params=params)
            response_data = response.json()

            if "error" in response_data:
                print(f"API returned error: {response_data['error']}")
                break

            patents_data = response_data.get('organic_results', [])
            if not patents_data:
                print(f"No more patents found on page {page}")
                break

            for idx, patent in enumerate(patents_data):
                if len(all_patents) >= MAX_PATENTS:
                    break

                # Format filing date
                filing_date = patent.get('filing_date', '')
                filing_year = 'N/A'
                if filing_date:
                    try:
                        filing_year = datetime.strptime(filing_date, '%Y-%m-%d').year
                    except ValueError:
                        pass

                # Get assignee
                assignee = patent.get('assignee', 'N/A')
                if isinstance(assignee, list) and assignee:
                    assignee = assignee[0]

                # Format title and abstract for embedding
                title = patent.get('title', '').strip()
                abstract = patent.get('snippet', '').strip()
                combined_text = f"{title}\n{abstract}".strip()

                # Get embedding for combined text
                total_processed += 1
                if total_processed % 10 == 0:  # Update progress every 10 patents
                    update_progress('embedding', f'Processing patent {total_processed} of {MAX_PATENTS}...')
                embedding = get_embedding(combined_text, embedding_cache)

                formatted_patent = {
                    'title': title,
                    'assignee': assignee,
                    'filing_year': filing_year,
                    'abstract': abstract,
                    'link': patent.get('patent_link', '') or patent.get('link', ''),
                    'embedding': embedding
                }
                all_patents.append(formatted_patent)

            print(f"Retrieved {len(patents_data)} patents from page {page}")

            # Check if there are more pages
            if not response_data.get('serpapi_pagination', {}).get('next'):
                break
            page += 1

        except Exception as e:
            print(f"Error searching patents: {e}")
            break

    # Save final cache state
    save_cache(embedding_cache)
    print(f"Total patents retrieved and embedded: {len(all_patents)}")
    return all_patents
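# Note: generate_summary below is a standalone helper; no route in this file calls it.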
def generate_summary(patents):
    """
    Generate a summary of the patents using ChatGPT
    """
    if not patents:
        return "No patents to summarize."

    # Prepare the prompt with patent information
    prompt = "Please provide a concise summary of these patents:\n\n"
    for patent in patents[:5]:  # Limit to first 5 patents to stay within token limits
        prompt += f"Title: {patent['title']}\n"
        prompt += f"Abstract: {patent['abstract']}\n"
        prompt += f"Assignee: {patent['assignee']}\n"
        prompt += f"Year: {patent['filing_year']}\n\n"

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a patent expert. Provide a clear and concise summary of the following patents, highlighting key innovations and common themes."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.7
        )
        print("Finish reason:", response.choices[0].finish_reason)
        return response.choices[0].message['content']
    except Exception as e:
        print(f"Error generating summary: {str(e)}")
        return "Error generating summary."
def analyze_clusters(df, labels, embeddings_3d):
    """
    Generate descriptions for patent clusters and identify opportunity zones
    """
    unique_labels = np.unique(labels)
    cluster_insights = []

    # Analyze each cluster (including noise points labeled as -1)
    for label in unique_labels:
        cluster_mask = labels == label
        cluster_patents = df[cluster_mask]
        cluster_points = embeddings_3d[cluster_mask]

        if label == -1:
            # Analyze sparse regions (potential opportunity zones)
            if len(cluster_patents) > 0:
                titles = "\n".join(cluster_patents['title'].tolist())
                assignees = ", ".join(cluster_patents['assignee'].unique())
                years = f"{cluster_patents['year'].min()} - {cluster_patents['year'].max()}"

                prompt = f"""Analyze these {len(cluster_patents)} patents that are in sparse regions of the technology landscape:

Patents:
{titles}

Key assignees: {assignees}
Years: {years}

Please provide:
1. A brief description of these isolated technologies
2. Potential innovation opportunities in this space
3. Why these areas might be underexplored

Keep the response concise (max 3 sentences per point)."""

                try:
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[
                            {"role": "system", "content": "You are a patent and technology expert analyzing innovation opportunities."},
                            {"role": "user", "content": prompt}
                        ],
                        max_tokens=300,
                        temperature=0.7
                    )
                    cluster_insights.append({
                        'type': 'opportunity_zone',
                        'size': len(cluster_patents),
                        'description': response['choices'][0]['message']['content']
                    })
                except Exception as e:
                    print(f"Error generating opportunity zone analysis: {e}")
        else:
            # Analyze regular clusters
            if len(cluster_patents) > 0:
                titles = "\n".join(cluster_patents['title'].tolist())
                assignees = ", ".join(cluster_patents['assignee'].unique())
                years = f"{cluster_patents['year'].min()} - {cluster_patents['year'].max()}"

                prompt = f"""Analyze this cluster of {len(cluster_patents)} related patents:

Patents:
{titles}

Key assignees: {assignees}
Years: {years}

Please provide a concise (2-3 sentences) summary of:
1. The main technology focus of this cluster
2. Current development status and trends"""

                try:
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[
                            {"role": "system", "content": "You are a patent and technology expert analyzing innovation clusters."},
                            {"role": "user", "content": prompt}
                        ],
                        max_tokens=200,
                        temperature=0.7
                    )
                    cluster_insights.append({
                        'type': 'cluster',
                        'id': int(label),
                        'size': len(cluster_patents),
                        'description': response['choices'][0]['message']['content']
                    })
                except Exception as e:
                    print(f"Error generating cluster analysis: {e}")

    return cluster_insights
def create_3d_visualization(patents):
    """
    Create a 3D visualization of patent embeddings using UMAP and Plotly
    """
    if not patents:
        return None

    update_progress('clustering', 'Extracting embeddings...')

    # Extract embeddings and metadata
    embeddings = []
    metadata = []
    for patent in patents:
        if patent['embedding'] is not None:
            embeddings.append(patent['embedding'])
            abstract = patent['abstract']
            if len(abstract) > 200:
                abstract = abstract[:200] + "..."
            metadata.append({
                'title': patent['title'],
                'assignee': patent['assignee'],
                'year': patent['filing_year'],
                'abstract': abstract,
                'link': patent['link']
            })

    if not embeddings:
        return None

    # Convert embeddings to numpy array
    embeddings_array = np.array(embeddings)

    update_progress('clustering', 'Applying UMAP dimensionality reduction...')

    # Apply UMAP dimensionality reduction
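    # n_components=3 yields coordinates for a 3-D scatter plot; random_state fixes
    # the layout between runs (recent umap-learn versions then run single-threaded).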
    reducer = umap.UMAP(n_components=3, random_state=42)
    embedding_3d = reducer.fit_transform(embeddings_array)

    update_progress('clustering', 'Performing DBSCAN clustering...')

    # Create DataFrame for plotting
    df = pd.DataFrame(metadata)
    df['x'] = embedding_3d[:, 0]
    df['y'] = embedding_3d[:, 1]
    df['z'] = embedding_3d[:, 2]

    # Apply DBSCAN clustering
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embedding_3d)
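    # DBSCAN runs on the standardized UMAP coordinates so eps is in comparable
    # units across axes; points labeled -1 are treated as sparse "opportunity zones".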
    dbscan = DBSCAN(eps=0.75, min_samples=5)
    clusters = dbscan.fit_predict(scaled_embeddings)

    update_progress('analysis', 'Analyzing clusters and opportunities...')

    # Print clustering statistics
    n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
    n_noise = list(clusters).count(-1)
    print(f"\nClustering Statistics:")
    print(f"Number of clusters: {n_clusters}")
    print(f"Number of patents in sparse regions: {n_noise}")
    print(f"Total number of patents: {len(clusters)}")

    if n_noise == 0:
        print("\nWarning: No sparse regions detected. Consider adjusting DBSCAN parameters.")
        dbscan = DBSCAN(eps=0.5, min_samples=7)
        clusters = dbscan.fit_predict(scaled_embeddings)
        n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
        n_noise = list(clusters).count(-1)
        print(f"\nRetrying with stricter parameters:")
        print(f"Number of clusters: {n_clusters}")
        print(f"Number of patents in sparse regions: {n_noise}")

    df['cluster'] = clusters

    update_progress('analysis', 'Generating cluster insights...')

    # Generate cluster insights
    cluster_insights = analyze_clusters(df, clusters, embedding_3d)

    update_progress('visualization', 'Creating interactive plot...')

    # Create hover text with cluster information
    hover_text = []
    for idx, row in df.iterrows():
        cluster_info = ""
        if row['cluster'] == -1:
            cluster_info = "<br><b>Region:</b> Sparse Area (Potential Innovation Zone)"
        else:
            cluster_info = f"<br><b>Cluster:</b> {row['cluster']}"
        text = (
            f"<b>{row['title']}</b><br><br>"
            f"<b>By:</b> {row['assignee']} ({row['year']})<br>"
            f"{cluster_info}<br><br>"
            f"<b>Abstract:</b><br>{row['abstract']}"
        )
        hover_text.append(text)

    # Create Plotly figure with clusters
    fig = go.Figure(data=[go.Scatter3d(
        x=df['x'],
        y=df['y'],
        z=df['z'],
        mode='markers',
        marker=dict(
            size=10,
            color=clusters,
            colorscale='Viridis',
            opacity=0.8,
            showscale=True,
            colorbar=dict(
                title="Clusters<br>(-1: Opportunity Zones)",
                tickfont=dict(size=10),
                titlefont=dict(size=10)
            )
        ),
        text=hover_text,
        hoverinfo='text',
        hoverlabel=dict(
            bgcolor="white",
            font_size=12,
            font_family="Arial",
            align="left"
        ),
        customdata=df['link'].tolist()
    )])

    # Update layout
    fig.update_layout(
        title="Patent Technology Landscape with Innovation Clusters",
        scene=dict(
            xaxis_title="UMAP 1",
            yaxis_title="UMAP 2",
            zaxis_title="UMAP 3",
            camera=dict(
                up=dict(x=0, y=0, z=1),
                center=dict(x=0, y=0, z=0),
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        ),
        margin=dict(l=0, r=0, b=0, t=30),
        showlegend=False,
        template="plotly_dark",
        hoverlabel_align='left',
        hoverdistance=100,
        hovermode='closest'
    )

    # Add hover template configuration
    fig.update_traces(
        hovertemplate='%{text}<extra></extra>'
    )

    update_progress('visualization', 'Finalizing visualization...')

    return {
        'plot': fig.to_json(),
        'insights': cluster_insights
    }
@app.route('/')
def home():
    return render_template('index.html')
@app.route('/progress')  # endpoint path assumed to match the front-end's EventSource URL
def get_progress():
    """Server-sent events endpoint for progress updates"""
    def generate():
        while True:
            try:
                data = progress_queue.get(timeout=30)  # 30 second timeout
                if data == 'DONE':
                    break
                yield f"data: {json.dumps(data)}\n\n"
            except queue.Empty:
                break
    return Response(generate(), mimetype='text/event-stream')
def update_progress(step, status='processing'):
    """Update progress through the progress queue"""
    progress_queue.put({
        'step': step,
        'status': status,
        'timestamp': datetime.now().strftime('%H:%M:%S')
    })
@app.route('/search', methods=['POST'])  # endpoint path assumed to match the front-end form
def search():
    keywords = request.form.get('keywords', '')
    if not keywords:
        return jsonify({'error': 'Please enter search keywords'})

    print(f"\nProcessing search request for keywords: {keywords}")

    try:
        # Clear any existing progress updates
        while not progress_queue.empty():
            progress_queue.get_nowait()

        # Search for patents
        update_progress('search')
        patents = search_patents(keywords)
        if not patents:
            return jsonify({'error': 'No patents found or an error occurred'})

        # Generate embeddings
        update_progress('embedding')
        # Cluster analysis
        update_progress('clustering')
        # Innovation analysis
        update_progress('analysis')
        # Create visualization
        update_progress('visualization')
        viz_data = create_3d_visualization(patents)
        if not viz_data:
            return jsonify({'error': 'Error creating visualization'})

        # Signal completion
        progress_queue.put('DONE')

        return jsonify({
            'visualization': viz_data['plot'],
            'insights': viz_data['insights']
        })
    except Exception as e:
        print(f"Error processing request: {e}")
        progress_queue.put('DONE')
        return jsonify({'error': str(e)})
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
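# To run locally (assuming a .env file with SERPAPI_API_KEY and OPENAI_API_KEY):
#   python app.py
# then open http://localhost:7860 in a browser.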