Spaces:
Runtime error
Runtime error
Implementation of multi-pdf chat
Browse files
I have added a simple implementation of multi-PDF chat by making it possible to upload multiple PDFs via file upload. It is not yet possible to use multiple URLs.
Changes
- Making sure chunks get added from text of all files
- Added a [file number] parameter to each chunk
- Adapted the prompts to also cite the file number
app.py
CHANGED
@@ -18,6 +18,7 @@ def preprocess(text):
|
|
18 |
return text
|
19 |
|
20 |
|
|
|
21 |
def pdf_to_text(path, start_page=1, end_page=None):
|
22 |
doc = fitz.open(path)
|
23 |
total_pages = doc.page_count
|
@@ -35,8 +36,8 @@ def pdf_to_text(path, start_page=1, end_page=None):
|
|
35 |
doc.close()
|
36 |
return text_list
|
37 |
|
38 |
-
|
39 |
-
def text_to_chunks(texts, word_length=150, start_page=1):
|
40 |
text_toks = [t.split(' ') for t in texts]
|
41 |
page_nums = []
|
42 |
chunks = []
|
@@ -49,7 +50,7 @@ def text_to_chunks(texts, word_length=150, start_page=1):
|
|
49 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
50 |
continue
|
51 |
chunk = ' '.join(chunk).strip()
|
52 |
-
chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
|
53 |
chunks.append(chunk)
|
54 |
return chunks
|
55 |
|
@@ -91,10 +92,12 @@ class SemanticSearch:
|
|
91 |
|
92 |
|
93 |
|
94 |
-
def load_recommender(
|
95 |
global recommender
|
96 |
-
texts =
|
97 |
-
chunks =
|
|
|
|
|
98 |
recommender.fit(chunks)
|
99 |
return 'Corpus Loaded.'
|
100 |
|
@@ -140,7 +143,7 @@ def generate_answer(question, openAI_key, model):
|
|
140 |
prompt += c + '\n\n'
|
141 |
|
142 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
143 |
-
"Cite each reference using [ Page Number] notation. "\
|
144 |
"Only answer what is asked. The answer should be short and concise. \n\nQuery: "
|
145 |
|
146 |
prompt += f"{question}\nAnswer:"
|
@@ -148,13 +151,15 @@ def generate_answer(question, openAI_key, model):
|
|
148 |
return answer
|
149 |
|
150 |
|
151 |
-
def question_answer(chat_history, url,
|
152 |
try:
|
|
|
|
|
153 |
if openAI_key.strip()=='':
|
154 |
return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
|
155 |
-
if url.strip() == '' and
|
156 |
return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
|
157 |
-
if url.strip() != '' and
|
158 |
return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
|
159 |
if model is None or model =='':
|
160 |
return '[ERROR]: You have not selected any model. Please choose an LLM model.'
|
@@ -163,11 +168,16 @@ def question_answer(chat_history, url, file, question, openAI_key, model):
|
|
163 |
download_pdf(glob_url, 'corpus.pdf')
|
164 |
load_recommender('corpus.pdf')
|
165 |
else:
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
171 |
if question.strip() == '':
|
172 |
return '[ERROR]: Question field is empty'
|
173 |
if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
|
@@ -203,7 +213,7 @@ def generate_answer_text_davinci_003(question,openAI_key):
|
|
203 |
prompt += c + '\n\n'
|
204 |
|
205 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
206 |
-
"Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
|
207 |
"Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
|
208 |
"with the same name, create separate answers for each. Only include information found in the results and "\
|
209 |
"don't add any additional information. Make sure the answer is correct and don't output false content. "\
|
@@ -212,6 +222,7 @@ def generate_answer_text_davinci_003(question,openAI_key):
|
|
212 |
"answer should be short and concise. \n\nQuery: {question}\nAnswer: "
|
213 |
|
214 |
prompt += f"Query: {question}\nAnswer:"
|
|
|
215 |
answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
|
216 |
return answer
|
217 |
|
@@ -242,15 +253,14 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
|
|
242 |
gr.Markdown(f'<center><h3>{title}</h3></center>')
|
243 |
gr.Markdown(description)
|
244 |
|
245 |
-
with gr.Row():
|
246 |
-
|
247 |
with gr.Group():
|
248 |
gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
|
249 |
with gr.Accordion("API Key"):
|
250 |
openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
|
251 |
-
url = gr.Textbox(label='Enter PDF URL here
|
252 |
gr.Markdown("<center><h4>OR<h4></center>")
|
253 |
-
|
254 |
question = gr.Textbox(label='Enter your question here')
|
255 |
gr.Examples(
|
256 |
[[q] for q in questions],
|
@@ -273,15 +283,12 @@ with gr.Blocks(css="""#chatbot { font-size: 14px; min-height: 1200; }""") as dem
|
|
273 |
with gr.Group():
|
274 |
chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
|
275 |
|
276 |
-
|
277 |
-
#
|
278 |
# Bind the click event of the button to the question_answer function
|
279 |
btn.click(
|
280 |
question_answer,
|
281 |
-
inputs=[chatbot, url,
|
282 |
outputs=[chatbot],
|
283 |
)
|
284 |
|
285 |
-
demo.launch()
|
286 |
-
|
287 |
-
|
|
|
18 |
return text
|
19 |
|
20 |
|
21 |
+
# converts pdf to text
|
22 |
def pdf_to_text(path, start_page=1, end_page=None):
|
23 |
doc = fitz.open(path)
|
24 |
total_pages = doc.page_count
|
|
|
36 |
doc.close()
|
37 |
return text_list
|
38 |
|
39 |
+
# one text converts a list of chunks
|
40 |
+
def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
|
41 |
text_toks = [t.split(' ') for t in texts]
|
42 |
page_nums = []
|
43 |
chunks = []
|
|
|
50 |
text_toks[idx+1] = chunk + text_toks[idx+1]
|
51 |
continue
|
52 |
chunk = ' '.join(chunk).strip()
|
53 |
+
chunk = f'[File no. {file_number}] [Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
|
54 |
chunks.append(chunk)
|
55 |
return chunks
|
56 |
|
|
|
92 |
|
93 |
|
94 |
|
95 |
+
def load_recommender(paths, start_page=1):
    """Build the semantic-search corpus from one or more PDF files.

    Args:
        paths: a single PDF path (str) or an iterable of PDF paths.
        start_page: first page (1-based) to extract from each PDF.

    Returns:
        A status string once the recommender has been fitted.
    """
    global recommender
    # BUG FIX: the URL branch still calls load_recommender('corpus.pdf') with a
    # bare string; iterating a string would feed single characters into
    # pdf_to_text. Accept a lone path for backward compatibility.
    if isinstance(paths, str):
        paths = [paths]
    chunks = []
    # file_number is 1-based so each chunk can be cited as "[File no. N]".
    for idx, path in enumerate(paths):
        chunks += text_to_chunks(pdf_to_text(path, start_page=start_page),
                                 start_page=start_page,
                                 file_number=idx + 1)
    recommender.fit(chunks)
    return 'Corpus Loaded.'
|
103 |
|
|
|
143 |
prompt += c + '\n\n'
|
144 |
|
145 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
146 |
+
"Cite each reference using [File number][ Page Number] notation. "\
|
147 |
"Only answer what is asked. The answer should be short and concise. \n\nQuery: "
|
148 |
|
149 |
prompt += f"{question}\nAnswer:"
|
|
|
151 |
return answer
|
152 |
|
153 |
|
154 |
+
def question_answer(chat_history, url, files, question, openAI_key, model):
|
155 |
try:
|
156 |
+
if files == None:
|
157 |
+
files = []
|
158 |
if openAI_key.strip()=='':
|
159 |
return '[ERROR]: Please enter your Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
|
160 |
+
if url.strip() == '' and files == []:
|
161 |
return '[ERROR]: Both URL and PDF is empty. Provide at least one.'
|
162 |
+
if url.strip() != '' and files is not []:
|
163 |
return '[ERROR]: Both URL and PDF is provided. Please provide only one (either URL or PDF).'
|
164 |
if model is None or model =='':
|
165 |
return '[ERROR]: You have not selected any model. Please choose an LLM model.'
|
|
|
168 |
download_pdf(glob_url, 'corpus.pdf')
|
169 |
load_recommender('corpus.pdf')
|
170 |
else:
|
171 |
+
# Rename the Gradio temp uploads to human-readable names, then index them.
filenames = []
for uploaded in files:
    temp_path = uploaded.name
    # Strip the 8-character random suffix Gradio inserts before the ".pdf"
    # extension, keeping the original file name.
    # NOTE(review): assumes the "<name><8 random chars>.pdf" temp-name
    # layout — confirm against the installed gradio version.
    clean_path = temp_path[:-12] + temp_path[-4:]
    os.rename(temp_path, clean_path)
    filenames.append(clean_path)
load_recommender(filenames)
|
179 |
+
|
180 |
+
|
181 |
if question.strip() == '':
|
182 |
return '[ERROR]: Question field is empty'
|
183 |
if model == "text-davinci-003" or model == "gpt-4" or model == "gpt-4-32k":
|
|
|
213 |
prompt += c + '\n\n'
|
214 |
|
215 |
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
|
216 |
+
"Cite each reference using [File number] [ Page Number] notation (every result has this number at the beginning). "\
|
217 |
"Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
|
218 |
"with the same name, create separate answers for each. Only include information found in the results and "\
|
219 |
"don't add any additional information. Make sure the answer is correct and don't output false content. "\
|
|
|
222 |
"answer should be short and concise. \n\nQuery: {question}\nAnswer: "
|
223 |
|
224 |
prompt += f"Query: {question}\nAnswer:"
|
225 |
+
# print("prompt == " + str(prompt))
|
226 |
answer = generate_text_text_davinci_003(openAI_key, prompt,"text-davinci-003")
|
227 |
return answer
|
228 |
|
|
|
253 |
gr.Markdown(f'<center><h3>{title}</h3></center>')
|
254 |
gr.Markdown(description)
|
255 |
|
256 |
+
with gr.Row():
|
|
|
257 |
with gr.Group():
|
258 |
gr.Markdown(f'<p style="text-align:center">Get your Open AI API key <a href="https://platform.openai.com/account/api-keys">here</a></p>')
|
259 |
with gr.Accordion("API Key"):
|
260 |
openAI_key = gr.Textbox(label='Enter your OpenAI API key here', password=True)
|
261 |
+
url = gr.Textbox(label='Enter PDF URL here (Example: https://arxiv.org/pdf/1706.03762.pdf )')
|
262 |
gr.Markdown("<center><h4>OR<h4></center>")
|
263 |
+
files = gr.File(label='Upload your PDF/ Research Paper / Book here', file_types=['.pdf'], file_count="multiple")
|
264 |
question = gr.Textbox(label='Enter your question here')
|
265 |
gr.Examples(
|
266 |
[[q] for q in questions],
|
|
|
283 |
with gr.Group():
|
284 |
chatbot = gr.Chatbot(placeholder="Chat History", label="Chat History", lines=50, elem_id="chatbot")
|
285 |
|
286 |
+
|
|
|
287 |
# Bind the click event of the button to the question_answer function
|
288 |
btn.click(
|
289 |
question_answer,
|
290 |
+
inputs=[chatbot, url, files, question, openAI_key, model],
|
291 |
outputs=[chatbot],
|
292 |
)
|
293 |
|
294 |
+
demo.launch()
|
|
|
|