PhyllisPeh committed on
Commit
65801e7
·
1 Parent(s): 16858c5

added 3D interactive plot

Browse files
Files changed (3) hide show
  1. app.py +412 -120
  2. requirements.txt +7 -8
  3. templates/index.html +322 -45
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from flask import Flask, render_template, request, jsonify
2
  from dotenv import load_dotenv
3
  import requests
4
  from datetime import datetime
@@ -8,45 +8,56 @@ import openai
8
  import numpy as np
9
  import pickle
10
  from pathlib import Path
 
 
 
 
 
 
 
 
 
11
 
12
  load_dotenv()
13
 
14
  app = Flask(__name__)
15
 
16
  # Get API keys from environment variables
17
- PATENTSVIEW_API_KEY = os.getenv('PATENTSVIEW_API_KEY')
18
  OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
19
  MAX_PATENTS = 300 # Limit number of patents to process
20
  CACHE_FILE = 'patent_embeddings_cache.pkl'
21
- QUERY_CACHE_FILE = 'query_embeddings_cache.pkl'
22
 
23
- if not PATENTSVIEW_API_KEY:
24
- raise ValueError("PATENTSVIEW_API_KEY environment variable is not set")
 
 
 
25
  if not OPENAI_API_KEY:
26
  raise ValueError("OPENAI_API_KEY environment variable is not set")
27
 
28
  # Initialize OpenAI API key
29
  openai.api_key = OPENAI_API_KEY
30
 
31
- def load_cache(cache_file):
32
  """Load cached embeddings from file"""
33
  try:
34
- if os.path.exists(cache_file):
35
- with open(cache_file, 'rb') as f:
36
  return pickle.load(f)
37
  except Exception as e:
38
  print(f"Error loading cache: {e}")
39
  return {}
40
 
41
- def save_cache(cache, cache_file):
42
  """Save embeddings cache to file"""
43
  try:
44
- with open(cache_file, 'wb') as f:
45
  pickle.dump(cache, f)
46
  except Exception as e:
47
  print(f"Error saving cache: {e}")
48
 
49
- def get_embedding(text, cache, cache_file):
50
  """Get embedding for text, using cache if available"""
51
  if not text or text.strip() == "":
52
  return None
@@ -62,7 +73,7 @@ def get_embedding(text, cache, cache_file):
62
  embedding = response['data'][0]['embedding']
63
  if embedding: # Only cache if we got a valid embedding
64
  cache[text] = embedding
65
- save_cache(cache, cache_file) # Save cache after each new embedding
66
  return embedding
67
  except Exception as e:
68
  print(f"Error getting embedding: {e}")
@@ -70,119 +81,97 @@ def get_embedding(text, cache, cache_file):
70
 
71
  def search_patents(keywords, page_size=100):
72
  """
73
- Search patents using PatentsView API with semantic search capabilities
74
  """
75
- # Load separate caches for patents and queries
76
- patent_cache = load_cache(CACHE_FILE)
77
- query_cache = load_cache(QUERY_CACHE_FILE)
78
-
79
- # Get embedding for search query
80
- query_embedding = get_embedding(keywords, query_cache, QUERY_CACHE_FILE)
81
- if not query_embedding:
82
- return []
83
-
84
- # PatentsView API endpoint
85
- api_url = "https://search.patentsview.org/api/v1/patent/"
86
-
87
- # Create a broader search query using related terms
88
- # We'll search in both title and abstract with more flexible matching
89
- query = {
90
- "q": {
91
- "_or": [
92
- {"_text_any": {"patent_title": keywords.split()}},
93
- {"_text_any": {"patent_abstract": keywords.split()}}
94
- ]
95
- },
96
- "f": [
97
- "patent_title",
98
- "patent_abstract",
99
- "patent_date",
100
- "patent_id",
101
- "assignees"
102
- ],
103
- "o": {
104
- "page": 1,
105
- "size": MAX_PATENTS # Get maximum allowed patents for better semantic matching
106
- }
107
- }
108
-
109
  all_patents = []
110
- try:
111
- headers = {
112
- "Content-Type": "application/json",
113
- "X-Api-Key": PATENTSVIEW_API_KEY
114
- }
115
 
116
- response = requests.post(api_url, json=query, headers=headers)
117
- response_data = response.json()
118
 
119
- if response_data.get('error'):
120
- print(f"API returned error: {response_data}")
121
- return []
 
 
 
 
 
 
 
 
122
 
123
- patents_data = response_data.get('patents', [])
124
-
125
- # Process and embed all patents
126
- for patent in patents_data:
127
- # Format filing date
128
- date_str = patent.get('patent_date', '')
129
- filing_year = 'N/A'
130
- if date_str:
131
- try:
132
- filing_year = datetime.strptime(date_str, '%Y-%m-%d').year
133
- except ValueError:
134
- pass
135
-
136
- # Get first assignee organization if available
137
- assignee_org = 'N/A'
138
- assignees = patent.get('assignees', [])
139
- if assignees and len(assignees) > 0:
140
- assignee_org = assignees[0].get('assignee_organization', 'N/A')
141
-
142
- # Format patent ID for Google Patents URL
143
- patent_id = patent.get('patent_id', '')
144
- if patent_id and not patent_id.startswith('US'):
145
- patent_id = f"US{patent_id}"
146
-
147
- # Combine title and abstract for embedding
148
- title = patent.get('patent_title', '').strip()
149
- abstract = patent.get('patent_abstract', '').strip()
150
- combined_text = f"{title}\n{abstract}".strip()
151
-
152
- # Get embedding for combined text using patent cache
153
- patent_embedding = get_embedding(combined_text, patent_cache, CACHE_FILE)
154
 
155
- if patent_embedding:
156
- # Calculate cosine similarity with query
157
- similarity = np.dot(query_embedding, patent_embedding) / (
158
- np.linalg.norm(query_embedding) * np.linalg.norm(patent_embedding)
159
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
 
 
161
  formatted_patent = {
162
  'title': title,
163
- 'assignee': assignee_org,
164
  'filing_year': filing_year,
165
  'abstract': abstract,
166
- 'link': f"https://patents.google.com/patent/{patent_id}",
167
- 'embedding': patent_embedding,
168
- 'similarity': float(similarity) # Convert to float for JSON serialization
169
  }
170
  all_patents.append(formatted_patent)
171
-
172
- # Sort patents by similarity score
173
- all_patents.sort(key=lambda x: x['similarity'], reverse=True)
174
-
175
- # Take top MAX_PATENTS results
176
- all_patents = all_patents[:MAX_PATENTS]
177
- print(len(all_patents))
178
-
179
- except Exception as e:
180
- print(f"Error searching patents: {e}")
181
- return []
 
182
 
183
- # Save final cache states
184
- save_cache(patent_cache, CACHE_FILE)
185
- save_cache(query_cache, QUERY_CACHE_FILE)
186
 
187
  print(f"Total patents retrieved and embedded: {len(all_patents)}")
188
  return all_patents
@@ -218,10 +207,284 @@ def generate_summary(patents):
218
  print(f"Error generating summary: {str(e)}")
219
  return "Error generating summary."
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  @app.route('/')
222
  def home():
223
  return render_template('index.html')
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  @app.route('/search', methods=['POST'])
226
  def search():
227
  keywords = request.form.get('keywords', '')
@@ -229,16 +492,45 @@ def search():
229
  return jsonify({'error': 'Please enter search keywords'})
230
 
231
  print(f"\nProcessing search request for keywords: {keywords}")
232
- patents = search_patents(keywords)
233
- if not patents:
234
- return jsonify({'error': 'No patents found or an error occurred'})
235
 
236
- # Generate summary using ChatGPT
237
- # summary = generate_summary(patents)
238
- return jsonify({
239
- 'patents': patents,
240
- 'summary': None # Set to None since we're not generating summaries currently
241
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  if __name__ == '__main__':
244
  app.run(host='0.0.0.0', port=7860)
 
1
+ from flask import Flask, render_template, request, jsonify, Response
2
  from dotenv import load_dotenv
3
  import requests
4
  from datetime import datetime
 
8
  import numpy as np
9
  import pickle
10
  from pathlib import Path
11
+ import umap
12
+ import plotly.express as px
13
+ import plotly.graph_objects as go
14
+ import pandas as pd
15
+ from sklearn.cluster import DBSCAN
16
+ from sklearn.preprocessing import StandardScaler
17
+ import time
18
+ import queue
19
+ import threading
20
 
21
  load_dotenv()
22
 
23
  app = Flask(__name__)
24
 
25
  # Get API keys from environment variables
26
+ SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
27
  OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
28
  MAX_PATENTS = 300 # Limit number of patents to process
29
  CACHE_FILE = 'patent_embeddings_cache.pkl'
 
30
 
31
+ # Global progress queue for SSE updates
32
+ progress_queue = queue.Queue()
33
+
34
+ if not SERPAPI_API_KEY:
35
+ raise ValueError("SERPAPI_API_KEY environment variable is not set")
36
  if not OPENAI_API_KEY:
37
  raise ValueError("OPENAI_API_KEY environment variable is not set")
38
 
39
  # Initialize OpenAI API key
40
  openai.api_key = OPENAI_API_KEY
41
 
42
def load_cache():
    """Return the embeddings cache unpickled from CACHE_FILE.

    An empty dict is returned when the cache file does not exist or when
    reading it fails; read failures are reported on stdout.
    """
    try:
        # Guard clause: nothing to load when no cache has been written yet.
        if not os.path.exists(CACHE_FILE):
            return {}
        with open(CACHE_FILE, 'rb') as handle:
            cached = pickle.load(handle)
        return cached
    except Exception as e:
        print(f"Error loading cache: {e}")
    return {}
51
 
52
def save_cache(cache):
    """Save embeddings cache to file.

    The cache is pickled to a temporary sibling of CACHE_FILE and then moved
    into place with os.replace, so an interruption in the middle of a write
    cannot leave a truncated cache file behind. This matters because the
    cache is re-saved after every newly fetched embedding.

    Args:
        cache: the object to pickle (the text -> embedding mapping).

    Failures are reported on stdout and otherwise ignored (best effort).
    """
    tmp_path = f"{CACHE_FILE}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(cache, f)
        # Swap the finished file in only after the dump succeeded.
        os.replace(tmp_path, CACHE_FILE)
    except Exception as e:
        print(f"Error saving cache: {e}")
        # Do not leave a half-written temporary file lying around.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
59
 
60
+ def get_embedding(text, cache):
61
  """Get embedding for text, using cache if available"""
62
  if not text or text.strip() == "":
63
  return None
 
73
  embedding = response['data'][0]['embedding']
74
  if embedding: # Only cache if we got a valid embedding
75
  cache[text] = embedding
76
+ save_cache(cache) # Save cache after each new embedding
77
  return embedding
78
  except Exception as e:
79
  print(f"Error getting embedding: {e}")
 
81
 
82
def search_patents(keywords, page_size=100):
    """
    Search patents using SerpApi's Google Patents API with pagination and generate embeddings

    Args:
        keywords: query string sent as the ``q`` parameter.
        page_size: number of results requested per page (``num``); also used
            to compute the ``start`` offset of each page.

    Returns:
        A list of at most MAX_PATENTS dicts with the keys ``title``,
        ``assignee``, ``filing_year``, ``abstract``, ``link`` and
        ``embedding``. ``embedding`` holds whatever get_embedding returned
        for the entry. The list may be shorter, or empty, when the API
        reports an error, returns no results, or a request fails.
    """
    # Load existing cache
    embedding_cache = load_cache()

    # SerpApi Google Patents API endpoint
    api_url = "https://serpapi.com/search"

    all_patents = []
    page = 1
    total_processed = 0

    while len(all_patents) < MAX_PATENTS:
        update_progress('search', f'Fetching page {page} of patents...')

        params = {
            "engine": "google_patents",
            "q": keywords,
            "api_key": SERPAPI_API_KEY,
            "num": page_size,
            "start": (page - 1) * page_size
        }

        try:
            # A timeout keeps a stalled connection from blocking the request forever.
            response = requests.get(api_url, params=params, timeout=30)
            response_data = response.json()

            if "error" in response_data:
                print(f"API returned error: {response_data['error']}")
                break

            patents_data = response_data.get('organic_results', [])

            if not patents_data:
                print(f"No more patents found on page {page}")
                break

            for patent in patents_data:
                if len(all_patents) >= MAX_PATENTS:
                    break

                # Format filing date
                filing_date = patent.get('filing_date', '')
                filing_year = 'N/A'
                if filing_date:
                    try:
                        filing_year = datetime.strptime(filing_date, '%Y-%m-%d').year
                    except ValueError:
                        pass

                # Get assignee; a missing, null or empty value falls back to 'N/A'
                assignee = patent.get('assignee') or 'N/A'
                if isinstance(assignee, list):
                    assignee = assignee[0]

                # Format title and abstract for embedding; `or ''` guards
                # against explicit null values in the API response.
                title = (patent.get('title') or '').strip()
                abstract = (patent.get('snippet') or '').strip()
                combined_text = f"{title}\n{abstract}".strip()

                # Get embedding for combined text
                total_processed += 1
                if total_processed % 10 == 0:  # Update progress every 10 patents
                    update_progress('embedding', f'Processing patent {total_processed} of {MAX_PATENTS}...')

                embedding = get_embedding(combined_text, embedding_cache)

                formatted_patent = {
                    'title': title,
                    'assignee': assignee,
                    'filing_year': filing_year,
                    'abstract': abstract,
                    'link': patent.get('patent_link', '') or patent.get('link', ''),
                    'embedding': embedding
                }
                all_patents.append(formatted_patent)

            print(f"Retrieved {len(patents_data)} patents from page {page}")

            # Check if there are more pages
            if not response_data.get('serpapi_pagination', {}).get('next'):
                break

            page += 1

        except Exception as e:
            print(f"Error searching patents: {e}")
            break

    # Save final cache state
    save_cache(embedding_cache)

    print(f"Total patents retrieved and embedded: {len(all_patents)}")
    return all_patents
 
207
  print(f"Error generating summary: {str(e)}")
208
  return "Error generating summary."
209
 
210
def _cluster_prompt_fields(cluster_patents):
    """Return (titles, assignees, years) strings describing the given patents."""
    titles = "\n".join(str(title) for title in cluster_patents['title'].tolist())
    # Drop missing assignees and coerce to str so the join cannot fail.
    assignees = ", ".join(cluster_patents['assignee'].dropna().astype(str).unique())
    # The year column mixes ints with the 'N/A' placeholder; only the ints
    # can be compared, so the range is computed over those alone.
    numeric_years = [
        int(year) for year in cluster_patents['year']
        if isinstance(year, (int, np.integer)) and not isinstance(year, bool)
    ]
    if numeric_years:
        years = f"{min(numeric_years)} - {max(numeric_years)}"
    else:
        years = "N/A"
    return titles, assignees, years


def analyze_clusters(df, labels, embeddings_3d):
    """
    Generate descriptions for patent clusters and identify opportunity zones

    Args:
        df: DataFrame with at least the columns ``title``, ``assignee`` and
            ``year``, one row per patent.
        labels: array of cluster labels aligned with the rows of ``df``;
            the label -1 marks patents in sparse regions.
        embeddings_3d: accepted for interface compatibility; not used.

    Returns:
        A list of dicts. Entries for label -1 have ``type`` set to
        ``'opportunity_zone'`` plus ``size`` and ``description``; all other
        labels yield ``type`` ``'cluster'`` with ``id``, ``size`` and
        ``description``. A label whose completion request fails is reported
        on stdout and omitted from the result.
    """
    cluster_insights = []

    # Analyze each cluster (including noise points labeled as -1)
    for label in np.unique(labels):
        cluster_patents = df[labels == label]
        if len(cluster_patents) == 0:
            continue

        titles, assignees, years = _cluster_prompt_fields(cluster_patents)
        is_sparse = label == -1

        if is_sparse:
            # Analyze sparse regions (potential opportunity zones)
            prompt = (
                f"Analyze these {len(cluster_patents)} patents that are in sparse regions of the technology landscape:\n"
                "\n"
                "Patents:\n"
                f"{titles}\n"
                "\n"
                f"Key assignees: {assignees}\n"
                f"Years: {years}\n"
                "\n"
                "Please provide:\n"
                "1. A brief description of these isolated technologies\n"
                "2. Potential innovation opportunities in this space\n"
                "3. Why these areas might be underexplored\n"
                "Keep the response concise (max 3 sentences per point)."
            )
            system_message = "You are a patent and technology expert analyzing innovation opportunities."
            max_tokens = 300
            failure_label = "opportunity zone analysis"
        else:
            # Analyze regular clusters
            prompt = (
                f"Analyze this cluster of {len(cluster_patents)} related patents:\n"
                "\n"
                "Patents:\n"
                f"{titles}\n"
                "\n"
                f"Key assignees: {assignees}\n"
                f"Years: {years}\n"
                "\n"
                "Please provide a concise (2-3 sentences) summary of:\n"
                "1. The main technology focus of this cluster\n"
                "2. Current development status and trends"
            )
            system_message = "You are a patent and technology expert analyzing innovation clusters."
            max_tokens = 200
            failure_label = "cluster analysis"

        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_tokens,
                temperature=0.7
            )
            description = response['choices'][0]['message']['content']
            if is_sparse:
                insight = {
                    'type': 'opportunity_zone',
                    'size': len(cluster_patents),
                    'description': description
                }
            else:
                insight = {
                    'type': 'cluster',
                    'id': int(label),
                    'size': len(cluster_patents),
                    'description': description
                }
            cluster_insights.append(insight)
        except Exception as e:
            print(f"Error generating {failure_label}: {e}")

    return cluster_insights
300
+
301
def _cluster_stats(clusters):
    """Return (number of clusters excluding noise, number of noise points)."""
    n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
    n_noise = list(clusters).count(-1)
    return n_clusters, n_noise


def create_3d_visualization(patents):
    """
    Create a 3D visualization of patent embeddings using UMAP and Plotly

    Args:
        patents: list of patent dicts with the keys ``embedding``, ``title``,
            ``assignee``, ``filing_year``, ``abstract`` and ``link``.
            Entries whose ``embedding`` is None are ignored.

    Returns:
        None when ``patents`` is empty or no entry has an embedding;
        otherwise a dict with ``plot`` (the Plotly figure serialized to
        JSON) and ``insights`` (the result of analyze_clusters).
    """
    if not patents:
        return None

    update_progress('clustering', 'Extracting embeddings...')

    # Extract embeddings and metadata
    embeddings = []
    metadata = []
    for patent in patents:
        if patent['embedding'] is None:
            continue
        embeddings.append(patent['embedding'])

        # Keep hover text compact: long abstracts are cut to 200 characters.
        abstract = patent['abstract']
        if len(abstract) > 200:
            abstract = abstract[:200] + "..."

        metadata.append({
            'title': patent['title'],
            'assignee': patent['assignee'],
            'year': patent['filing_year'],
            'abstract': abstract,
            'link': patent['link']
        })

    if not embeddings:
        return None

    # Convert embeddings to numpy array
    embeddings_array = np.array(embeddings)

    update_progress('clustering', 'Applying UMAP dimensionality reduction...')

    # Apply UMAP dimensionality reduction
    reducer = umap.UMAP(n_components=3, random_state=42)
    embedding_3d = reducer.fit_transform(embeddings_array)

    update_progress('clustering', 'Performing DBSCAN clustering...')

    # Create DataFrame for plotting
    df = pd.DataFrame(metadata)
    df['x'] = embedding_3d[:, 0]
    df['y'] = embedding_3d[:, 1]
    df['z'] = embedding_3d[:, 2]

    # Apply DBSCAN clustering on standardized coordinates
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embedding_3d)
    dbscan = DBSCAN(eps=0.75, min_samples=5)
    clusters = dbscan.fit_predict(scaled_embeddings)

    update_progress('analysis', 'Analyzing clusters and opportunities...')

    # Print clustering statistics
    n_clusters, n_noise = _cluster_stats(clusters)
    print(f"\nClustering Statistics:")
    print(f"Number of clusters: {n_clusters}")
    print(f"Number of patents in sparse regions: {n_noise}")
    print(f"Total number of patents: {len(clusters)}")

    if n_noise == 0:
        # No point was labelled as noise: retry once with a smaller eps and a
        # larger min_samples so sparse regions can show up.
        print("\nWarning: No sparse regions detected. Consider adjusting DBSCAN parameters.")
        dbscan = DBSCAN(eps=0.5, min_samples=7)
        clusters = dbscan.fit_predict(scaled_embeddings)
        n_clusters, n_noise = _cluster_stats(clusters)
        print(f"\nRetrying with stricter parameters:")
        print(f"Number of clusters: {n_clusters}")
        print(f"Number of patents in sparse regions: {n_noise}")

    df['cluster'] = clusters

    update_progress('analysis', 'Generating cluster insights...')

    # Generate cluster insights
    cluster_insights = analyze_clusters(df, clusters, embedding_3d)

    update_progress('visualization', 'Creating interactive plot...')

    # Create hover text with cluster information
    hover_text = []
    for title, assignee, year, abstract, cluster in zip(
        df['title'], df['assignee'], df['year'], df['abstract'], df['cluster']
    ):
        if cluster == -1:
            cluster_info = "<br><b>Region:</b> Sparse Area (Potential Innovation Zone)"
        else:
            cluster_info = f"<br><b>Cluster:</b> {cluster}"

        hover_text.append(
            f"<b>{title}</b><br><br>"
            f"<b>By:</b> {assignee} ({year})<br>"
            f"{cluster_info}<br><br>"
            f"<b>Abstract:</b><br>{abstract}"
        )

    # Create Plotly figure with clusters
    fig = go.Figure(data=[go.Scatter3d(
        x=df['x'],
        y=df['y'],
        z=df['z'],
        mode='markers',
        marker=dict(
            size=10,
            color=clusters,
            colorscale='Viridis',
            opacity=0.8,
            showscale=True,
            colorbar=dict(
                # Nested title.font replaces the deprecated `titlefont` key.
                title=dict(
                    text="Clusters<br>(-1: Opportunity Zones)",
                    font=dict(size=10)
                ),
                tickfont=dict(size=10)
            )
        ),
        text=hover_text,
        hoverinfo='text',
        hoverlabel=dict(
            bgcolor="white",
            font_size=12,
            font_family="Arial",
            align="left"
        ),
        customdata=df['link'].tolist()
    )])

    # Update layout
    fig.update_layout(
        title="Patent Technology Landscape with Innovation Clusters",
        scene=dict(
            xaxis_title="UMAP 1",
            yaxis_title="UMAP 2",
            zaxis_title="UMAP 3",
            camera=dict(
                up=dict(x=0, y=0, z=1),
                center=dict(x=0, y=0, z=0),
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        ),
        margin=dict(l=0, r=0, b=0, t=30),
        showlegend=False,
        template="plotly_dark",
        hoverlabel_align='left',
        hoverdistance=100,
        hovermode='closest'
    )

    # Add hover template configuration
    fig.update_traces(
        hovertemplate='%{text}<extra></extra>'
    )

    update_progress('visualization', 'Finalizing visualization...')

    return {
        'plot': fig.to_json(),
        'insights': cluster_insights
    }
461
+
462
@app.route('/')
def home():
    """Serve the landing page by rendering the 'index.html' template."""
    landing_page = render_template('index.html')
    return landing_page
465
 
466
@app.route('/progress')
def get_progress():
    """Server-sent events endpoint for progress updates.

    Streams every item taken from the shared progress queue as one SSE
    ``data:`` frame containing its JSON encoding. The stream ends when the
    sentinel string 'DONE' is received or when no item arrives within
    30 seconds.
    """
    # Imported locally so the endpoint does not depend on a module-level
    # `import json`, which is not visible in this part of the file.
    import json

    def generate():
        while True:
            try:
                data = progress_queue.get(timeout=30)  # 30 second timeout
            except queue.Empty:
                break
            if data == 'DONE':
                break
            yield f"data: {json.dumps(data)}\n\n"

    return Response(
        generate(),
        mimetype='text/event-stream',
        headers={
            # Event streams must not be cached, and buffering reverse proxies
            # should pass each frame through as soon as it is produced.
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no',
        },
    )
479
+
480
def update_progress(step, status='processing'):
    """Publish a progress event on the shared progress queue.

    The event is a dict holding the given ``step`` and ``status`` together
    with the current local time formatted as HH:MM:SS.
    """
    event = {
        'step': step,
        'status': status,
        'timestamp': datetime.now().strftime('%H:%M:%S'),
    }
    progress_queue.put(event)
487
+
488
  @app.route('/search', methods=['POST'])
489
  def search():
490
  keywords = request.form.get('keywords', '')
 
492
  return jsonify({'error': 'Please enter search keywords'})
493
 
494
  print(f"\nProcessing search request for keywords: {keywords}")
 
 
 
495
 
496
+ try:
497
+ # Clear any existing progress updates
498
+ while not progress_queue.empty():
499
+ progress_queue.get_nowait()
500
+
501
+ # Search for patents
502
+ update_progress('search')
503
+ patents = search_patents(keywords)
504
+ if not patents:
505
+ return jsonify({'error': 'No patents found or an error occurred'})
506
+
507
+ # Generate embeddings
508
+ update_progress('embedding')
509
+
510
+ # Cluster analysis
511
+ update_progress('clustering')
512
+
513
+ # Innovation analysis
514
+ update_progress('analysis')
515
+
516
+ # Create visualization
517
+ update_progress('visualization')
518
+ viz_data = create_3d_visualization(patents)
519
+ if not viz_data:
520
+ return jsonify({'error': 'Error creating visualization'})
521
+
522
+ # Signal completion
523
+ progress_queue.put('DONE')
524
+
525
+ return jsonify({
526
+ 'visualization': viz_data['plot'],
527
+ 'insights': viz_data['insights']
528
+ })
529
+
530
+ except Exception as e:
531
+ print(f"Error processing request: {e}")
532
+ progress_queue.put('DONE')
533
+ return jsonify({'error': str(e)})
534
 
535
  if __name__ == '__main__':
536
  app.run(host='0.0.0.0', port=7860)
requirements.txt CHANGED
@@ -1,10 +1,9 @@
1
  flask==2.0.1
2
- Werkzeug==2.0.3
3
- requests==2.31.0
4
- gunicorn==20.1.0
5
- itsdangerous==2.0.1
6
- Jinja2==3.0.1
7
- MarkupSafe==2.0.1
8
  openai==0.28.1
9
- python-dotenv==1.0.1
10
- numpy==1.26.1
 
 
 
 
1
  flask==2.0.1
2
+ python-dotenv==0.19.0
3
+ requests==2.26.0
 
 
 
 
4
  openai==0.28.1
5
+ numpy==1.24.3
6
+ pandas==2.0.3
7
+ umap-learn==0.5.3
8
+ plotly==5.3.1
9
+ scikit-learn==1.3.0
templates/index.html CHANGED
@@ -3,43 +3,272 @@
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Patent Explorer</title>
7
  <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
8
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  </head>
10
- <body class="bg-gray-100 min-h-screen">
11
  <div class="container mx-auto px-4 py-8">
12
- <h1 class="text-4xl font-bold text-center text-blue-600 mb-8">Patent Explorer</h1>
13
 
14
  <!-- Search Form -->
15
  <div class="max-w-2xl mx-auto mb-8">
16
- <form id="searchForm" class="bg-white shadow-md rounded px-8 pt-6 pb-8 mb-4">
17
  <div class="mb-4">
18
  <input type="text" id="keywords" name="keywords"
19
- class="shadow appearance-none border rounded w-full py-2 px-3 text-gray-700 leading-tight focus:outline-none focus:shadow-outline"
20
- placeholder="Enter keywords to search patents...">
21
  </div>
22
  <div class="flex items-center justify-center">
23
  <button type="submit"
24
- class="bg-blue-500 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded focus:outline-none focus:shadow-outline">
25
- Search Patents
26
  </button>
27
  </div>
28
  </form>
29
  </div>
30
 
31
- <!-- Loading Spinner -->
32
- <div id="loading" class="hidden">
33
- <div class="flex justify-center items-center">
34
- <div class="animate-spin rounded-full h-12 w-12 border-b-2 border-blue-500"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  </div>
36
  </div>
37
 
38
- <!-- Results Container -->
39
- <div id="results" class="max-w-4xl mx-auto"></div>
 
 
 
40
  </div>
41
 
42
  <script>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  $(document).ready(function() {
44
  $('#searchForm').on('submit', function(e) {
45
  e.preventDefault();
@@ -50,54 +279,102 @@
50
  return;
51
  }
52
 
53
- // Show loading spinner
54
  $('#loading').removeClass('hidden');
55
- $('#results').empty();
 
 
 
 
 
 
 
 
 
56
 
57
  $.ajax({
58
  url: '/search',
59
  method: 'POST',
60
  data: { keywords: keywords },
61
  success: function(response) {
62
- $('#loading').addClass('hidden');
63
-
64
  if (response.error) {
65
- $('#results').html(`<div class="text-red-500 text-center">${response.error}</div>`);
 
 
66
  return;
67
  }
68
 
69
- if (!response.patents.length) {
70
- $('#results').html('<div class="text-center text-gray-600">No patents found.</div>');
71
- return;
 
 
 
 
 
 
 
 
72
  }
73
 
74
- const resultsHtml = response.patents.map((patent, index) => `
75
- <div class="bg-white shadow-md rounded-lg p-6 mb-4">
76
- <h2 class="text-xl font-bold text-blue-600 mb-2">
77
- <a href="${patent.link}" target="_blank" class="hover:underline">
78
- ${patent.title}
79
- </a>
80
- </h2>
81
- <div class="grid grid-cols-2 gap-4 mb-4 text-sm">
82
- <div>
83
- <span class="font-semibold">Assignee:</span> ${patent.assignee}
84
- </div>
85
- <div>
86
- <span class="font-semibold">Filing Year:</span> ${patent.filing_year}
87
- </div>
88
- </div>
89
- <div class="text-gray-600">
90
- <span class="font-semibold">Abstract:</span><br>
91
- ${patent.abstract}
92
- </div>
93
- </div>
94
- `).join('');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- $('#results').html(resultsHtml);
 
 
 
 
 
 
97
  },
98
  error: function() {
 
99
  $('#loading').addClass('hidden');
100
- $('#results').html('<div class="text-red-500 text-center">An error occurred while searching patents.</div>');
101
  }
102
  });
103
  });
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Patent Technology Landscape</title>
7
  <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
8
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
9
+ <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
10
+ <style>
11
+ .visualization-container {
12
+ width: 100%;
13
+ height: 50vh;
14
+ margin-bottom: 20px;
15
+ background-color: #1a1a1a;
16
+ border-radius: 8px;
17
+ overflow: hidden;
18
+ }
19
+ .insights-panel {
20
+ background-color: #2d2d2d;
21
+ border-radius: 8px;
22
+ height: calc(120vh - 40px);
23
+ overflow-y: auto;
24
+ transition: all 0.3s ease;
25
+ }
26
+ .cluster-card {
27
+ background-color: #3d3d3d;
28
+ border-radius: 6px;
29
+ margin-bottom: 10px;
30
+ transition: all 0.2s ease;
31
+ }
32
+ .cluster-card:hover {
33
+ transform: translateY(-2px);
34
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
35
+ }
36
+ .opportunity-card {
37
+ background-color: #2d4a3e;
38
+ border-radius: 6px;
39
+ margin-bottom: 10px;
40
+ transition: all 0.2s ease;
41
+ }
42
+ .opportunity-card:hover {
43
+ transform: translateY(-2px);
44
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
45
+ }
46
+ .loading-container {
47
+ background-color: #2d2d2d;
48
+ border-radius: 8px;
49
+ padding: 1.5rem;
50
+ margin-bottom: 1rem;
51
+ }
52
+ .progress-bar {
53
+ width: 100%;
54
+ height: 8px;
55
+ background-color: #4a5568;
56
+ border-radius: 4px;
57
+ margin-bottom: 1rem;
58
+ overflow: hidden;
59
+ }
60
+ .progress-fill {
61
+ height: 100%;
62
+ width: 0;
63
+ background-color: #4299e1;
64
+ transition: width 0.3s ease;
65
+ }
66
+ .status-list {
67
+ max-height: 150px;
68
+ overflow-y: auto;
69
+ }
70
+ .status-item {
71
+ display: flex;
72
+ align-items: center;
73
+ margin-bottom: 0.75rem;
74
+ padding: 0.5rem;
75
+ border-radius: 4px;
76
+ background-color: #374151;
77
+ opacity: 0.6;
78
+ transition: all 0.3s ease;
79
+ }
80
+ .status-item.active {
81
+ opacity: 1;
82
+ background-color: #3b4f7d;
83
+ }
84
+ .status-item.complete {
85
+ opacity: 0.8;
86
+ background-color: #2d4a3e;
87
+ }
88
+ .status-icon {
89
+ width: 24px;
90
+ height: 24px;
91
+ margin-right: 12px;
92
+ border-radius: 50%;
93
+ display: inline-flex;
94
+ align-items: center;
95
+ justify-content: center;
96
+ font-size: 12px;
97
+ font-weight: bold;
98
+ }
99
+ .status-text {
100
+ flex-grow: 1;
101
+ }
102
+ .status-time {
103
+ font-size: 0.8rem;
104
+ color: #9ca3af;
105
+ margin-left: 8px;
106
+ }
107
+ .status-pending {
108
+ background-color: #4a5568;
109
+ }
110
+ .status-processing {
111
+ background-color: #4299e1;
112
+ animation: pulse 1.5s infinite;
113
+ }
114
+ .status-complete {
115
+ background-color: #48bb78;
116
+ }
117
+ @keyframes pulse {
118
+ 0% { opacity: 0.6; }
119
+ 50% { opacity: 1; }
120
+ 100% { opacity: 0.6; }
121
+ }
122
+ /* Custom scrollbar for insights panel */
123
+ .insights-panel::-webkit-scrollbar {
124
+ width: 8px;
125
+ }
126
+ .insights-panel::-webkit-scrollbar-track {
127
+ background: #1a1a1a;
128
+ border-radius: 4px;
129
+ }
130
+ .insights-panel::-webkit-scrollbar-thumb {
131
+ background: #4a4a4a;
132
+ border-radius: 4px;
133
+ }
134
+ .insights-panel::-webkit-scrollbar-thumb:hover {
135
+ background: #555;
136
+ }
137
+ </style>
138
  </head>
139
+ <body class="bg-gray-900 text-gray-100 min-h-screen">
140
  <div class="container mx-auto px-4 py-8">
141
+ <h1 class="text-4xl font-bold text-center text-blue-400 mb-8">Patent Technology Landscape</h1>
142
 
143
  <!-- Search Form -->
144
  <div class="max-w-2xl mx-auto mb-8">
145
+ <form id="searchForm" class="bg-gray-800 shadow-lg rounded px-8 pt-6 pb-8 mb-4">
146
  <div class="mb-4">
147
  <input type="text" id="keywords" name="keywords"
148
+ class="w-full bg-gray-700 text-white rounded border border-gray-600 focus:border-blue-500 focus:ring-2 focus:ring-blue-900 py-2 px-4"
149
+ placeholder="Enter keywords to explore patent landscape...">
150
  </div>
151
  <div class="flex items-center justify-center">
152
  <button type="submit"
153
+ class="bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-6 rounded focus:outline-none focus:shadow-outline transform hover:scale-105 transition-transform duration-200">
154
+ Explore
155
  </button>
156
  </div>
157
  </form>
158
  </div>
159
 
160
+ <!-- Loading Status -->
161
+ <div id="loading" class="loading-container hidden">
162
+ <div class="mb-4">
163
+ <div class="flex justify-between items-center mb-2">
164
+ <span class="text-sm font-medium" id="progress-text">Initializing...</span>
165
+ <span class="text-sm font-medium" id="progress-percentage">0%</span>
166
+ </div>
167
+ <div class="progress-bar">
168
+ <div class="progress-fill" id="progress-fill"></div>
169
+ </div>
170
+ </div>
171
+ <div class="status-list">
172
+ <div class="status-item" data-step="search">
173
+ <div class="status-icon status-pending">1</div>
174
+ <div class="status-text">Searching for patents</div>
175
+ <div class="status-time"></div>
176
+ </div>
177
+ <div class="status-item" data-step="embedding">
178
+ <div class="status-icon status-pending">2</div>
179
+ <div class="status-text">Generating patent embeddings</div>
180
+ <div class="status-time"></div>
181
+ </div>
182
+ <div class="status-item" data-step="clustering">
183
+ <div class="status-icon status-pending">3</div>
184
+ <div class="status-text">Identifying technology clusters</div>
185
+ <div class="status-time"></div>
186
+ </div>
187
+ <div class="status-item" data-step="analysis">
188
+ <div class="status-icon status-pending">4</div>
189
+ <div class="status-text">Analyzing innovation opportunities</div>
190
+ <div class="status-time"></div>
191
+ </div>
192
+ <div class="status-item" data-step="visualization">
193
+ <div class="status-icon status-pending">5</div>
194
+ <div class="status-text">Creating interactive visualization</div>
195
+ <div class="status-time"></div>
196
+ </div>
197
  </div>
198
  </div>
199
 
200
+ <!-- Visualization Container -->
201
+ <div id="visualization" class="visualization-container"></div>
202
+
203
+ <!-- Insights Panel -->
204
+ <div id="insights" class="insights-panel p-4"></div>
205
  </div>
206
 
207
  <script>
208
+ let progressEventSource = null;
209
+
210
/**
 * Open a fresh server-sent-events stream on `/progress`.
 *
 * Any stream left over from an earlier search is closed first. Each message
 * is parsed as JSON and its `step` / `status` fields are forwarded to
 * `updateProgress`. On a stream error the connection is closed.
 */
function startProgressMonitoring() {
    // Drop the previous stream so only one connection is open at a time.
    if (progressEventSource) {
        progressEventSource.close();
    }

    progressEventSource = new EventSource('/progress');

    progressEventSource.onmessage = (event) => {
        const { step, status } = JSON.parse(event.data);
        updateProgress(step, status);
    };

    progressEventSource.onerror = () => {
        progressEventSource.close();
    };
}
225
+
226
/**
 * Close the active progress stream, if there is one, and forget it.
 *
 * Calling this when no stream is open is a no-op.
 */
function stopProgressMonitoring() {
    if (!progressEventSource) {
        return;
    }

    progressEventSource.close();
    progressEventSource = null;
}
232
+
233
/**
 * Reflect a reported pipeline step in the loading panel.
 *
 * Sets the progress bar and percentage label to the position of `step` in
 * the fixed five-step sequence, updates the headline text, and marks every
 * status row as complete (earlier steps), active (the current step) or
 * pending (later steps).
 *
 * @param {string} step - One of 'search', 'embedding', 'clustering',
 *     'analysis' or 'visualization'. Any other value is ignored.
 * @param {*} status - Accepted for the caller's convenience; not read here.
 */
function updateProgress(step, status) {
    const steps = ['search', 'embedding', 'clustering', 'analysis', 'visualization'];
    const stepIndex = steps.indexOf(step);

    // An unrecognised step used to fall through with index -1, which snapped
    // the bar back to 0% and reset every row while leaving the headline text
    // stale. Keep the panel as it is instead.
    if (stepIndex === -1) {
        return;
    }

    const progress = ((stepIndex + 1) / steps.length) * 100;

    // Update progress bar
    $('#progress-fill').css('width', `${progress}%`);
    $('#progress-percentage').text(`${Math.round(progress)}%`);

    // Update status text
    const statusTexts = {
        'search': 'Searching patent database...',
        'embedding': 'Generating patent embeddings...',
        'clustering': 'Identifying technology clusters...',
        'analysis': 'Analyzing innovation opportunities...',
        'visualization': 'Creating interactive visualization...'
    };
    $('#progress-text').text(statusTexts[step]);

    // Update status items
    steps.forEach((s, i) => {
        const item = $(`.status-item[data-step="${s}"]`);
        const time = item.find('.status-time');

        if (i < stepIndex) {
            item.removeClass('active').addClass('complete');
            // Stamp the completion time only once, the first time the step
            // is seen as finished.
            if (!time.text()) {
                time.text(new Date().toLocaleTimeString());
            }
        } else if (i === stepIndex) {
            item.addClass('active').removeClass('complete');
            time.text('In progress...');
        } else {
            item.removeClass('active complete');
            time.text('');
        }
    });
}
271
+
272
  $(document).ready(function() {
273
  $('#searchForm').on('submit', function(e) {
274
  e.preventDefault();
 
279
  return;
280
  }
281
 
282
+ // Reset and show loading status
283
  $('#loading').removeClass('hidden');
284
+ $('#visualization').empty();
285
+ $('#insights').empty();
286
+ $('.progress-fill').css('width', '0%');
287
+ $('#progress-percentage').text('0%');
288
+ $('#progress-text').text('Initializing...');
289
+ $('.status-item').removeClass('active complete');
290
+ $('.status-time').text('');
291
+
292
+ // Start progress monitoring
293
+ startProgressMonitoring();
294
 
295
  $.ajax({
296
  url: '/search',
297
  method: 'POST',
298
  data: { keywords: keywords },
299
  success: function(response) {
 
 
300
  if (response.error) {
301
+ stopProgressMonitoring();
302
+ $('#loading').addClass('hidden');
303
+ alert(response.error);
304
  return;
305
  }
306
 
307
+ // Display visualization
308
+ if (response.visualization) {
309
+ const vizData = JSON.parse(response.visualization);
310
+ Plotly.newPlot('visualization', vizData.data, vizData.layout);
311
+
312
+ document.getElementById('visualization').on('plotly_click', function(data) {
313
+ const link = data.points[0].customdata;
314
+ if (link) {
315
+ window.open(link, '_blank');
316
+ }
317
+ });
318
  }
319
 
320
+ // Display insights with two-column layout
321
+ if (response.insights) {
322
+ let insightsHtml = '<div class="grid grid-cols-1 lg:grid-cols-2 gap-6 p-6">';
323
+
324
+ // Left column: Innovation Opportunities
325
+ insightsHtml += '<div class="col-span-1">';
326
+ insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-green-400">Innovation Opportunities</h3>';
327
+
328
+ const opportunityZones = response.insights.filter(i => i.type === 'opportunity_zone');
329
+ if (opportunityZones.length > 0) {
330
+ insightsHtml += '<div class="space-y-4">';
331
+ opportunityZones.forEach(zone => {
332
+ insightsHtml += `
333
+ <div class="opportunity-card p-6 text-base">
334
+ <div class="text-green-300 text-lg font-bold mb-3">Sparse Region (${zone.size} patents)</div>
335
+ <div class="text-gray-300 whitespace-pre-line leading-relaxed">${zone.description}</div>
336
+ </div>
337
+ `;
338
+ });
339
+ insightsHtml += '</div>';
340
+ } else {
341
+ insightsHtml += '<p class="text-gray-400">No innovation opportunities identified.</p>';
342
+ }
343
+ insightsHtml += '</div>';
344
+
345
+ // Right column: Technology Clusters
346
+ insightsHtml += '<div class="col-span-1">';
347
+ insightsHtml += '<h3 class="text-2xl font-bold mb-4 text-blue-400">Technology Clusters</h3>';
348
+
349
+ const clusters = response.insights.filter(i => i.type === 'cluster');
350
+ if (clusters.length > 0) {
351
+ insightsHtml += '<div class="space-y-4">';
352
+ clusters.forEach(cluster => {
353
+ insightsHtml += `
354
+ <div class="cluster-card p-6 text-base">
355
+ <div class="text-blue-300 text-lg font-bold mb-3">Cluster ${cluster.id} (${cluster.size} patents)</div>
356
+ <div class="text-gray-300 whitespace-pre-line leading-relaxed">${cluster.description}</div>
357
+ </div>
358
+ `;
359
+ });
360
+ insightsHtml += '</div>';
361
+ } else {
362
+ insightsHtml += '<p class="text-gray-400">No technology clusters identified.</p>';
363
+ }
364
+ insightsHtml += '</div>';
365
 
366
+ insightsHtml += '</div>';
367
+ $('#insights').html(insightsHtml);
368
+ }
369
+
370
+ // Stop progress monitoring and hide loading status
371
+ stopProgressMonitoring();
372
+ $('#loading').addClass('hidden');
373
  },
374
  error: function() {
375
+ stopProgressMonitoring();
376
  $('#loading').addClass('hidden');
377
+ alert('An error occurred while analyzing patents.');
378
  }
379
  });
380
  });