bhaskartripathi committed on
Commit
d99b030
·
1 Parent(s): 49d9e62

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -41
app.py CHANGED
@@ -53,55 +53,70 @@ def text_to_chunks(texts, word_length=150, start_page=1):
53
  chunks.append(chunk)
54
  return chunks
55
 
 
56
  class SemanticSearch:
57
 
58
- def __init__(self, embedding_method='use'):
59
- self.embedding_method = embedding_method
60
- if embedding_method == 'use':
61
- self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
 
 
 
62
  self.fitted = False
63
-
64
- def fit(self, data, openAI_key=None, batch=1000, n_neighbors=5):
 
65
  self.data = data
66
- self.embeddings = self.get_text_embedding(data, openAI_key, batch=batch)
67
  n_neighbors = min(n_neighbors, len(self.embeddings))
68
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
69
  self.nn.fit(self.embeddings)
70
  self.fitted = True
71
-
72
- def get_text_embedding(self, texts, openAI_key=None, batch=1000):
73
- embeddings = []
 
 
74
 
75
- if self.embedding_method == 'openai':
76
- if openAI_key is None:
77
- raise ValueError("OpenAI key is required when using OpenAI embeddings.")
78
- openai.api_key = openAI_key
79
-
 
 
 
80
  for i in range(0, len(texts), batch):
81
  text_batch = texts[i:(i+batch)]
82
-
83
- if self.embedding_method == 'use':
84
- emb_batch = self.use(text_batch)
85
- elif self.embedding_method == 'openai':
86
- emb_batch = [openai.Embed.extract(prompt=text)["embeddings"] for text in text_batch]
87
- emb_batch = np.vstack(emb_batch)
88
-
89
  embeddings.append(emb_batch)
90
-
91
  embeddings = np.vstack(embeddings)
92
- return embeddings
93
-
94
-
95
-
96
-
97
- def load_recommender(path, openAI_key=None, start_page=1):
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  global recommender
99
  texts = pdf_to_text(path, start_page=start_page)
100
  chunks = text_to_chunks(texts, start_page=start_page)
101
- recommender.fit(chunks, openAI_key=openAI_key)
102
  return 'Corpus Loaded.'
103
 
104
-
105
  def generate_text(openAI_key,prompt, engine="text-davinci-003"):
106
  openai.api_key = openAI_key
107
  completions = openai.Completion.create(
@@ -148,10 +163,14 @@ def question_answer(url, file, question,openAI_key):
148
  if url.strip() != '':
149
  glob_url = url
150
  download_pdf(glob_url, 'corpus.pdf')
151
- load_recommender('corpus.pdf', openAI_key=openAI_key)
152
 
153
  else:
154
- load_recommender(file_name, openAI_key=openAI_key)
 
 
 
 
155
 
156
  if question.strip() == '':
157
  return '[ERROR]: Question field is empty'
@@ -159,20 +178,21 @@ def question_answer(url, file, question,openAI_key):
159
  return generate_answer(question,openAI_key)
160
 
161
 
162
- #recommender = SemanticSearch()
163
- recommender = SemanticSearch(embedding_method='openai')
164
 
 
 
165
 
166
  title = 'PDF GPT'
167
  description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
168
 
169
- with gr.Blocks() as demo:
 
170
 
 
171
  gr.Markdown(f'<center><h1>{title}</h1></center>')
172
  gr.Markdown(description)
173
-
174
  with gr.Row():
175
-
176
  with gr.Group():
177
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
178
  openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
@@ -182,10 +202,7 @@ with gr.Blocks() as demo:
182
  question = gr.Textbox(label='Enter your question here')
183
  btn = gr.Button(value='Submit')
184
  btn.style(full_width=True)
185
-
186
  with gr.Group():
187
  answer = gr.Textbox(label='The answer to your question is :')
188
-
189
  btn.click(question_answer, inputs=[url, file, question,openAI_key], outputs=[answer])
190
- #openai.api_key = os.getenv('Your_Key_Here')
191
  demo.launch()
 
53
  chunks.append(chunk)
54
  return chunks
55
 
56
+
57
  class SemanticSearch:
58
 
59
+ def __init__(self, embedder='openai'):
60
+ if embedder == 'openai':
61
+ self.embedder = openai.Engine("davinci")
62
+ elif embedder == 'use':
63
+ self.embedder = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
64
+ else:
65
+ raise ValueError("Invalid embedder. Must be either 'openai' or 'use'.")
66
  self.fitted = False
67
+
68
+
69
+ def fit(self, data, batch=1000, n_neighbors=5):
70
  self.data = data
71
+ self.embeddings = self.get_text_embedding(data, batch=batch)
72
  n_neighbors = min(n_neighbors, len(self.embeddings))
73
  self.nn = NearestNeighbors(n_neighbors=n_neighbors)
74
  self.nn.fit(self.embeddings)
75
  self.fitted = True
76
+
77
+
78
+ def __call__(self, text, return_data=True):
79
+ inp_emb = self.use([text])
80
+ neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
81
 
82
+ if return_data:
83
+ return [self.data[i] for i in neighbors]
84
+ else:
85
+ return neighbors
86
+
87
+
88
+ '''def get_text_embedding(self, texts, batch=1000):
89
+ embeddings = []
90
  for i in range(0, len(texts), batch):
91
  text_batch = texts[i:(i+batch)]
92
+ emb_batch = self.use(text_batch)
 
 
 
 
 
 
93
  embeddings.append(emb_batch)
 
94
  embeddings = np.vstack(embeddings)
95
+ return embeddings'''
96
+ def get_text_embedding(self, texts):
97
+ embeddings = []
98
+ if isinstance(self.embedder, openai.Engine):
99
+ for text in texts:
100
+ response = self.embedder.search(
101
+ documents=texts,
102
+ query=text,
103
+ max_rerank=1
104
+ )
105
+ embeddings.append(response["data"][0]["score"])
106
+ elif isinstance(self.embedder, hub.Module):
107
+ embeddings = self.embedder(texts)
108
+ else:
109
+ raise ValueError("Invalid embedder.")
110
+ return np.array(embeddings)
111
+
112
+
113
+ def load_recommender(path, start_page=1):
114
  global recommender
115
  texts = pdf_to_text(path, start_page=start_page)
116
  chunks = text_to_chunks(texts, start_page=start_page)
117
+ recommender.fit(chunks)
118
  return 'Corpus Loaded.'
119
 
 
120
  def generate_text(openAI_key,prompt, engine="text-davinci-003"):
121
  openai.api_key = openAI_key
122
  completions = openai.Completion.create(
 
163
  if url.strip() != '':
164
  glob_url = url
165
  download_pdf(glob_url, 'corpus.pdf')
166
+ load_recommender('corpus.pdf')
167
 
168
  else:
169
+ old_file_name = file.name
170
+ file_name = file.name
171
+ file_name = file_name[:-12] + file_name[-4:]
172
+ os.rename(old_file_name, file_name)
173
+ load_recommender(file_name)
174
 
175
  if question.strip() == '':
176
  return '[ERROR]: Question field is empty'
 
178
  return generate_answer(question,openAI_key)
179
 
180
 
181
+ recommender = SemanticSearch()
 
182
 
183
+ title = 'PDF GPT'
184
+ description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
185
 
186
  title = 'PDF GPT'
187
  description = """ PDF GPT allows you to chat with your PDF file using Universal Sentence Encoder and Open AI. It gives hallucination free response than other tools as the embeddings are better than OpenAI. The returned response can even cite the page number in square brackets([]) where the information is located, adding credibility to the responses and helping to locate pertinent information quickly."""
188
 
189
+ embedder = gr.Dropdown(['openai', 'use'], label='Select Embedder')
190
+ recommender = SemanticSearch(embedder=embedder)
191
 
192
+ with gr.Blocks() as demo:
193
  gr.Markdown(f'<center><h1>{title}</h1></center>')
194
  gr.Markdown(description)
 
195
  with gr.Row():
 
196
  with gr.Group():
197
  gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
198
  openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
 
202
  question = gr.Textbox(label='Enter your question here')
203
  btn = gr.Button(value='Submit')
204
  btn.style(full_width=True)
 
205
  with gr.Group():
206
  answer = gr.Textbox(label='The answer to your question is :')
 
207
  btn.click(question_answer, inputs=[url, file, question,openAI_key], outputs=[answer])
 
208
  demo.launch()