DrishtiSharma commited on
Commit
724f682
·
verified ·
1 Parent(s): 2c11c21

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +100 -0
main.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain.prompts import PromptTemplate
3
+ from langchain.output_parsers.list import NumberedListOutputParser
4
+ from langchain.output_parsers import ResponseSchema, StructuredOutputParser
5
+ from langchain_community.chat_models import ChatOpenAI
6
+ import serpapi
7
+
8
# Chat model identifier handed to ChatOpenAI by the functions in this module.
model_name = 'gpt-4-0125-preview'

# API credentials are read from the environment once, at import time.
# Each value is None when the corresponding variable is not set.
openai_key = os.environ.get("OPENAI_API_KEY")
serpapi_key = os.environ.get("SERPAPI_KEY")
12
+ # generate search terms using OpenAI
13
def generate_search_terms(input_text: str, number_of_generated_search_terms):
    """Ask the chat model for Google Patents search queries for an invention idea.

    Args:
        input_text: Description of the invention idea; substituted for the
            ``{user_input}`` placeholder of the prompt.
        number_of_generated_search_terms: How many queries the prompt asks
            for. It is converted with ``str()`` and spliced into the prompt
            text.

    Returns:
        The model's reply as parsed by ``NumberedListOutputParser.parse``.
    """
    chat_model = ChatOpenAI(model_name=model_name, temperature=0.0)
    list_parser = NumberedListOutputParser()

    # The requested count is baked into the template text itself; only the
    # invention idea and the parser's format instructions are placeholders.
    requested_count = str(number_of_generated_search_terms)
    template_text = "As a search specialist with expertise in optimizing searches in the Google Patents database, your task is to generate " + requested_count + " optimal keyword or keyword list like single and multiple keywords(please choose correct terms, i want to get at least 10 results for each query, don't be too specific) like, `(rabbit toy), (coffee brew) AND (pot) OR (top), (stabilization system), (vr heading) OR (logic freq)`, so dont use \" or ' use only phranthesis, searches to find similar patents for the following invention idea: ---BEGINNING--- `{user_input}` ---END--- {format_instructions}\n"

    search_prompt = PromptTemplate(
        template=template_text,
        input_variables=["user_input"],
        partial_variables={
            "format_instructions": list_parser.get_format_instructions(),
        },
    )

    reply = chat_model.predict(text=search_prompt.format(user_input=input_text))
    return list_parser.parse(reply)
26
+
27
+ # search Google Patents using SerpApi
28
def _patent_from_result(result):
    """Convert one SerpApi organic result into this module's patent dict."""
    inventor = result.get("inventor")
    return {
        "patentTitle": result.get("title"),
        "patentNumber": result.get("publication_number"),
        # Kept as a list for callers; empty when the result names no inventor.
        "inventors": [inventor] if inventor is not None else [],
        "assignee": result.get("assignee"),
        "abstract": result.get("snippet"),
        "publicationDate": result.get("publication_date"),
        "filingDate": result.get("filing_date"),
        "patentUrl": result.get("serpapi_link"),
    }


def search_on_google_patents(terms: list):
    """Search Google Patents through SerpApi, one request per search term.

    Args:
        terms: Query strings; each is sent as the ``q`` parameter of a
            ``google_patents`` engine request.

    Returns:
        A dict mapping every search term to a list of patent dicts with the
        keys ``patentTitle``, ``patentNumber``, ``inventors``, ``assignee``,
        ``abstract``, ``publicationDate``, ``filingDate`` and ``patentUrl``.
        Only results that carry a ``patent_id`` are kept. A field missing
        from a result is reported as ``None`` (``inventors`` becomes an empty
        list) instead of aborting the whole search. A term without any
        results maps to an empty list.

    Raises:
        RuntimeError: If the SerpApi response for a term contains an
            ``error`` entry.
    """
    search_terms_patterns = {}
    for search_term in terms:
        params = {
            "engine": "google_patents",
            "q": search_term,
            "clustered": "true",
            "scholar": "true",
            "api_key": serpapi_key,
        }
        results = serpapi.search(params)

        error = results.get('error', False)
        if error:
            # The error payload is not an exception instance, so it must be
            # wrapped; raising it directly would only produce a TypeError.
            raise RuntimeError(
                f"SerpApi search for {search_term!r} failed: {error}"
            )

        # A response may omit "organic_results" entirely when nothing matched.
        organic_results = results.get("organic_results") or []
        search_terms_patterns[search_term] = [
            _patent_from_result(result)
            for result in organic_results
            if "patent_id" in result
        ]
    return search_terms_patterns
60
+
61
+ # check similarity of patents using OpenAI
62
def check_similarity_of_patents(input_text, patents: list):
    """Ask the chat model to score how similar each patent is to the input.

    Args:
        input_text: Patent information / invention idea the patents are
            compared against; substituted for ``{user_input}`` in the prompt.
        patents: Patent dicts providing ``patentTitle``, ``patentNumber`` and
            ``abstract``.

    Returns:
        The model's reply as parsed by ``StructuredOutputParser.parse`` for a
        schema with the single field ``listOfPatents``. When ``patents`` is
        empty, ``{"listOfPatents": []}`` is returned without calling the
        model.
    """
    if not patents:
        # Nothing to compare against: skip the model call altogether.
        return {"listOfPatents": []}

    llm = ChatOpenAI(model_name=model_name, temperature=0.0)

    response_schemas = [
        ResponseSchema(
            name="listOfPatents",
            description="List of dicts of patentTitle, patentNumber and similarityScore (score over 100): [{patentTitle: string, patentNumber: string, similarityScore: number}]",
            type="array(objects)"
        )
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()

    # Patent titles and abstracts are external text. They are injected as a
    # partial variable rather than concatenated into the template so that any
    # "{" or "}" they contain is not parsed as a template placeholder.
    patent_listing = '\n'.join(
        f"\n===BEGINNING=== {position} - {patent['patentTitle']} - {patent['patentNumber']} - {patent['abstract']} ===END==="
        for position, patent in enumerate(patents, start=1)
    )
    prompt = PromptTemplate(
        template="Could you please generate a semantic similarity score out of 100 for the following patent information {user_input}, comparing it with the following abstracts:{patent_listing}\n{format_instructions}\n",
        input_variables=["user_input"],
        partial_variables={
            "format_instructions": format_instructions,
            "patent_listing": patent_listing,
        },
    )
    output = llm.predict(text=prompt.format(user_input=input_text))
    output_list = output_parser.parse(output)
    return output_list
82
+
83
+ # merge patents with similarity data
84
def merge_patents_with_similarity(patents, similarity_data):
    """Attach similarity scores to patents and order them best match first.

    Args:
        patents: Patent dicts, each with a ``patentNumber`` key. The dicts
            are updated in place.
        similarity_data: Dict whose ``listOfPatents`` entry is a list of
            dicts carrying ``patentNumber`` and ``similarityScore``.

    Returns:
        A new list holding the same patent dicts sorted by
        ``similarityScore`` in descending order. Every patent receives a
        ``patentGoogleUrl``. A patent with no entry in ``similarity_data``
        keeps any score it already had, otherwise it gets a score of 0, so
        that a patent the scorer skipped cannot break the sort.
    """
    # Index scores by patent number once instead of rescanning the list for
    # every patent. setdefault keeps the first entry when a number repeats.
    scores = {}
    for entry in similarity_data['listOfPatents']:
        number = entry.get('patentNumber')
        if number is None or 'similarityScore' not in entry:
            continue
        scores.setdefault(number, entry['similarityScore'])

    merged_list = []
    for patent in patents:
        patent_number = patent['patentNumber']
        if patent_number in scores:
            patent['similarityScore'] = scores[patent_number]
        else:
            patent.setdefault('similarityScore', 0)
        patent['patentGoogleUrl'] = f"https://patents.google.com/patent/{patent_number}"
        merged_list.append(patent)

    merged_list.sort(key=lambda item: item['similarityScore'], reverse=True)
    return merged_list
96
+
97
+ # sort patents by similarity score
98
def sort_patents_by_similarity_score(data):
    """Return a new list of the patents in ``data``, highest score first.

    Args:
        data: Iterable of patent dicts, each with a ``similarityScore`` key.

    Returns:
        A list ordered by ``similarityScore`` in descending order; the input
        itself is not reordered.
    """
    def score_of(patent):
        return patent['similarityScore']

    ranked = list(data)
    ranked.sort(key=score_of, reverse=True)
    return ranked