bhaskartripathi committed on
Commit
49d9e62
·
1 Parent(s): dbd6aa4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -80
app.py CHANGED
@@ -55,48 +55,53 @@ def text_to_chunks(texts, word_length=150, start_page=1):
55
 
56
  class SemanticSearch:
57
 
58
- def __init__(self):
59
- self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
 
 
60
  self.fitted = False
61
-
62
-
63
- def fit(self, data, batch=1000, n_neighbors=5):
64
  self.data = data
65
- self.embeddings = self.get_text_embedding(data, batch=batch)
66
  n_neighbors = min(n_neighbors, len(self.embeddings))
67
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
68
  self.nn.fit(self.embeddings)
69
  self.fitted = True
70
-
71
-
72
- def __call__(self, text, return_data=True):
73
- inp_emb = self.use([text])
74
- neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
75
-
76
- if return_data:
77
- return [self.data[i] for i in neighbors]
78
- else:
79
- return neighbors
80
-
81
-
82
- def get_text_embedding(self, texts, batch=1000):
83
  embeddings = []
 
 
 
 
 
 
84
  for i in range(0, len(texts), batch):
85
  text_batch = texts[i:(i+batch)]
86
- emb_batch = self.use(text_batch)
 
 
 
 
 
 
87
  embeddings.append(emb_batch)
 
88
  embeddings = np.vstack(embeddings)
89
  return embeddings
90
 
91
 
92
 
93
- def load_recommender(path, start_page=1):
 
94
  global recommender
95
  texts = pdf_to_text(path, start_page=start_page)
96
  chunks = text_to_chunks(texts, start_page=start_page)
97
- recommender.fit(chunks)
98
  return 'Corpus Loaded.'
99
 
 
100
  def generate_text(openAI_key,prompt, engine="text-davinci-003"):
101
  openai.api_key = openAI_key
102
  completions = openai.Completion.create(
@@ -110,7 +115,7 @@ def generate_text(openAI_key,prompt, engine="text-davinci-003"):
110
  message = completions.choices[0].text
111
  return message
112
 
113
- def generate_answer1(question,openAI_key):
114
  topn_chunks = recommender(question)
115
  prompt = ""
116
  prompt += 'search results:\n\n'
@@ -130,56 +135,6 @@ def generate_answer1(question,openAI_key):
130
  answer = generate_text(openAI_key, prompt,"text-davinci-003")
131
  return answer
132
 
133
- def generate_answer(question, openAI_key):
134
- topn_chunks = recommender(question)
135
-
136
- max_tokens = 4096 # Maximum tokens allowed for text-davinci-003
137
- completion_tokens = 512 # Tokens reserved for the completion
138
- tokenizer = OpenAITokenizer()
139
- max_prompt_tokens = max_tokens - completion_tokens
140
-
141
- # Split search results into groups based on token count
142
- search_results_groups = []
143
- current_group = []
144
- current_group_tokens = 0
145
-
146
- for c in topn_chunks:
147
- c_tokens = len(tokenizer.tokenize(c))
148
- if current_group_tokens + c_tokens <= max_prompt_tokens:
149
- current_group.append(c)
150
- current_group_tokens += c_tokens
151
- else:
152
- search_results_groups.append(current_group)
153
- current_group = [c]
154
- current_group_tokens = c_tokens
155
-
156
- if current_group:
157
- search_results_groups.append(current_group)
158
-
159
- # Generate response for each group of search results
160
- responses = []
161
- for search_results in search_results_groups:
162
- prompt = 'search results:\n\n'
163
- for c in search_results:
164
- prompt += c + '\n\n'
165
-
166
- prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
167
- "Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
168
- "Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
169
- "with the same name, create separate answers for each. Only include information found in the results and "\
170
- "don't add any additional information. Make sure the answer is correct and don't output false content. "\
171
- "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
172
- "search results which has nothing to do with the question. Only answer what is asked. The "\
173
- "answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
174
-
175
- response = generate_text(openAI_key, prompt, "text-davinci-003")
176
- responses.append(response)
177
-
178
- # Combine and clean up the responses
179
- final_answer = " ".join(responses).strip()
180
-
181
- return final_answer
182
-
183
 
184
  def question_answer(url, file, question,openAI_key):
185
  if openAI_key.strip()=='':
@@ -193,14 +148,10 @@ def question_answer(url, file, question,openAI_key):
193
  if url.strip() != '':
194
  glob_url = url
195
  download_pdf(glob_url, 'corpus.pdf')
196
- load_recommender('corpus.pdf')
197
 
198
  else:
199
- old_file_name = file.name
200
- file_name = file.name
201
- file_name = file_name[:-12] + file_name[-4:]
202
- os.rename(old_file_name, file_name)
203
- load_recommender(file_name)
204
 
205
  if question.strip() == '':
206
  return '[ERROR]: Question field is empty'
@@ -208,7 +159,9 @@ def question_answer(url, file, question,openAI_key):
208
  return generate_answer(question,openAI_key)
209
 
210
 
211
- recommender = SemanticSearch()
 
 
212
 
213
  title = 'PDF GPT'
214
  description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
 
55
 
56
class SemanticSearch:
    """Nearest-neighbour semantic search over text chunks.

    Embeddings come either from the Universal Sentence Encoder
    (embedding_method='use') or from the OpenAI embeddings API
    (embedding_method='openai').
    """

    def __init__(self, embedding_method='use'):
        self.embedding_method = embedding_method
        if embedding_method == 'use':
            # Only load the heavy TF-Hub model when it will actually be used.
            self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
        self.fitted = False

    def fit(self, data, openAI_key=None, batch=1000, n_neighbors=5):
        """Embed *data* (a list of text chunks) and build the NN index.

        openAI_key is required when embedding_method == 'openai'.
        """
        self.data = data
        # Remember the key so __call__ can embed queries at search time.
        self.openAI_key = openAI_key
        self.embeddings = self.get_text_embedding(data, openAI_key, batch=batch)
        # Cannot request more neighbours than there are indexed points.
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the chunks (or their indices) most similar to *text*.

        BUG FIX: this method was dropped in the refactor even though callers
        still invoke the instance directly (recommender(question)); without
        it every query raises TypeError.
        """
        inp_emb = self.get_text_embedding([text], getattr(self, 'openAI_key', None))
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, openAI_key=None, batch=1000):
        """Embed *texts* in batches of *batch*; returns a 2-D numpy array.

        Raises ValueError if embedding_method == 'openai' and no key is given.
        """
        embeddings = []
        if self.embedding_method == 'openai':
            if openAI_key is None:
                raise ValueError("OpenAI key is required when using OpenAI embeddings.")
            openai.api_key = openAI_key
        for i in range(0, len(texts), batch):
            text_batch = texts[i:(i + batch)]
            if self.embedding_method == 'use':
                emb_batch = self.use(text_batch)
            elif self.embedding_method == 'openai':
                # BUG FIX: openai.Embed.extract() does not exist in the openai
                # package; the embeddings endpoint is openai.Embedding.create,
                # which accepts a list of inputs in a single request.
                resp = openai.Embedding.create(
                    input=list(text_batch),
                    model='text-embedding-ada-002',
                )
                emb_batch = np.vstack([d['embedding'] for d in resp['data']])
            embeddings.append(emb_batch)
        embeddings = np.vstack(embeddings)
        return embeddings
93
 
94
 
95
 
96
+
97
def load_recommender(path, openAI_key=None, start_page=1):
    """Index the PDF at *path* into the module-level recommender.

    openAI_key is forwarded to the embedding step; pages before
    *start_page* are skipped.
    """
    global recommender
    page_texts = pdf_to_text(path, start_page=start_page)
    text_chunks = text_to_chunks(page_texts, start_page=start_page)
    recommender.fit(text_chunks, openAI_key=openAI_key)
    return 'Corpus Loaded.'
103
 
104
+
105
  def generate_text(openAI_key,prompt, engine="text-davinci-003"):
106
  openai.api_key = openAI_key
107
  completions = openai.Completion.create(
 
115
  message = completions.choices[0].text
116
  return message
117
 
118
+ def generate_answer(question,openAI_key):
119
  topn_chunks = recommender(question)
120
  prompt = ""
121
  prompt += 'search results:\n\n'
 
135
  answer = generate_text(openAI_key, prompt,"text-davinci-003")
136
  return answer
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  def question_answer(url, file, question,openAI_key):
140
  if openAI_key.strip()=='':
 
148
  if url.strip() != '':
149
  glob_url = url
150
  download_pdf(glob_url, 'corpus.pdf')
151
+ load_recommender('corpus.pdf', openAI_key=openAI_key)
152
 
153
  else:
154
+ load_recommender(file_name, openAI_key=openAI_key)
 
 
 
 
155
 
156
  if question.strip() == '':
157
  return '[ERROR]: Question field is empty'
 
159
  return generate_answer(question,openAI_key)
160
 
161
 
162
# Module-level singleton consumed by load_recommender() / question_answer().
# (Removed the commented-out default-'use' construction; switch the
# embedding_method argument instead of toggling dead code.)
recommender = SemanticSearch(embedding_method='openai')


title = 'PDF GPT'
description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""