Create main.py
main.py
ADDED
@@ -0,0 +1,100 @@
import os

from langchain.prompts import PromptTemplate
from langchain.output_parsers.list import NumberedListOutputParser
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_community.chat_models import ChatOpenAI
import serpapi

# model used for both search-term generation and similarity scoring
model_name = 'gpt-4-0125-preview'

# API keys are read from the environment
openai_key = os.getenv("OPENAI_API_KEY")
serpapi_key = os.getenv("SERPAPI_KEY")

# generate search terms using OpenAI
def generate_search_terms(input_text: str, number_of_generated_search_terms: int):
    llm = ChatOpenAI(model_name=model_name, temperature=0.0, openai_api_key=openai_key)
    output_parser = NumberedListOutputParser()
    format_instructions = output_parser.get_format_instructions()
    prompt = PromptTemplate(
        template="As a search specialist with expertise in optimizing searches in the Google Patents database, your task is to generate " + str(number_of_generated_search_terms) + " search queries (single keywords or keyword combinations) to find patents similar to the invention idea below. Choose terms broad enough to return at least 10 results per query; do not be too specific. Format the queries like `(rabbit toy), (coffee brew) AND (pot) OR (top), (stabilization system), (vr heading) OR (logic freq)`, using only parentheses, never \" or '. Invention idea: ---BEGINNING--- `{user_input}` ---END--- {format_instructions}\n",
        input_variables=["user_input"],
        partial_variables={"format_instructions": format_instructions}
    )

    output = llm.predict(text=prompt.format(user_input=input_text))
    output_list = output_parser.parse(output)
    return output_list

# search Google Patents using SerpApi
def search_on_google_patents(terms: list):
    search_terms_patents = {}
    for search_term in terms:
        params = {
            "engine": "google_patents",
            "q": search_term,
            "clustered": "true",
            "scholar": "true",
            "api_key": serpapi_key
        }
        results = serpapi.search(params)
        if results.get('error', False):
            # SerpApi reports failures as a message string, so wrap it in an exception
            raise RuntimeError(results['error'])
        organic_results = results.get("organic_results", [])

        patents = []
        for result in organic_results:
            if "patent_id" in result:
                # use .get() for the metadata fields, since some results omit them
                patent = {
                    "patentTitle": result.get("title"),
                    "patentNumber": result.get("publication_number"),
                    "inventors": [result.get("inventor")],
                    "assignee": result.get("assignee"),
                    "abstract": result.get("snippet"),
                    "publicationDate": result.get("publication_date"),
                    "filingDate": result.get("filing_date"),
                    "patentUrl": result.get("serpapi_link")
                }
                patents.append(patent)
        search_terms_patents[search_term] = patents
    return search_terms_patents

# check similarity of patents using OpenAI
def check_similarity_of_patents(input_text, patents: list):
    llm = ChatOpenAI(model_name=model_name, temperature=0.0, openai_api_key=openai_key)

    response_schemas = [
        ResponseSchema(
            name="listOfPatents",
            description="List of dicts of patentTitle, patentNumber and similarityScore (score out of 100): [{patentTitle: string, patentNumber: string, similarityScore: number}]",
            type="array(objects)"
        )
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()

    # build the candidate-abstract section of the prompt; escape braces so patent text
    # is not treated as template variables by PromptTemplate
    abstracts = '\n'.join(
        f"\n===BEGINNING=== {i+1} - {patent['patentTitle']} - {patent['patentNumber']} - {patent['abstract']} ===END==="
        for i, patent in enumerate(patents)
    ).replace("{", "{{").replace("}", "}}")

    prompt = PromptTemplate(
        template="Could you please generate a semantic similarity score out of 100 for the following patent information {user_input}, comparing it with the following abstracts:" + abstracts + "\n{format_instructions}\n",
        input_variables=["user_input"],
        partial_variables={"format_instructions": format_instructions}
    )
    output = llm.predict(text=prompt.format(user_input=input_text))
    output_list = output_parser.parse(output)
    return output_list

# merge patents with similarity data
def merge_patents_with_similarity(patents, similarity_data):
    merged_list = []
    for patent in patents:
        patent_number = patent['patentNumber']
        for similarity_patent in similarity_data['listOfPatents']:
            if similarity_patent['patentNumber'] == patent_number:
                patent['similarityScore'] = similarity_patent['similarityScore']
                patent['patentGoogleUrl'] = f"https://patents.google.com/patent/{patent_number}"
                break
        merged_list.append(patent)
    # patents the model did not score default to 0 so sorting cannot fail
    merged_list = sorted(merged_list, key=lambda x: x.get('similarityScore', 0), reverse=True)
    return merged_list

# sort patents by similarity score (highest first)
def sort_patents_by_similarity_score(data):
    sorted_patents = sorted(data, key=lambda x: x.get('similarityScore', 0), reverse=True)
    return sorted_patents
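
main.py defines the four pipeline steps but no entry point. A minimal driver sketch, assuming the functions above; the invention description, the query count of 5, and the de-duplication step are illustrative assumptions, not part of the commit:

if __name__ == "__main__":
    # hypothetical invention description (placeholder, not from the commit)
    idea = "A self-stabilizing camera mount for bicycles using gyroscopic sensors."

    # 1. generate search queries with the LLM (5 is an arbitrary example count)
    search_terms = generate_search_terms(idea, 5)

    # 2. query Google Patents once per generated term
    patents_by_term = search_on_google_patents(search_terms)

    # flatten the per-term results and de-duplicate by patent number
    unique_patents = {}
    for term_patents in patents_by_term.values():
        for patent in term_patents:
            unique_patents[patent["patentNumber"]] = patent
    all_patents = list(unique_patents.values())

    # 3. score similarity against the idea, then 4. merge and rank
    similarity = check_similarity_of_patents(idea, all_patents)
    ranked = merge_patents_with_similarity(all_patents, similarity)

    for patent in ranked:
        print(patent.get("similarityScore", 0), patent["patentTitle"])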