albhu committed on
Commit a284200 · verified · 1 Parent(s): 359b434

Update search.py

Files changed (1)
  1. search.py +110 -4
search.py CHANGED
@@ -1,15 +1,19 @@
+import time
+import streamlit as st
+import pandas as pd
+import os
+from dotenv import load_dotenv
+import search # Import the search module
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from docx import Document
 from pdfminer.high_level import extract_text
-from transformers import GPT2Tokenizer
 from dataclasses import dataclass
 from typing import List
 from tqdm import tqdm
-import os
-import pandas as pd
 import re
 from sklearn.feature_extraction.text import TfidfVectorizer
-import numpy as np
+
+load_dotenv()
 
 tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)
@@ -133,3 +137,105 @@ def get_embedding(text, tokenizer):
         print("Error obtaining embedding:", e)
         embedding = []
     return embedding
+
+def save_as_pdf(conversation):
+    pdf_filename = "conversation.pdf"
+    c = canvas.Canvas(pdf_filename, pagesize=letter)
+
+    c.drawString(100, 750, "Conversation:")
+    y_position = 730
+    for q, a in conversation:
+        c.drawString(120, y_position, f"Q: {q}")
+        c.drawString(120, y_position - 20, f"A: {a}")
+        y_position -= 40
+
+    c.save()
+
+    st.markdown(f"Download [PDF](./{pdf_filename})")
+
+def save_as_docx(conversation):
+    doc = Document()
+    doc.add_heading('Conversation', 0)
+
+    for q, a in conversation:
+        doc.add_paragraph(f'Q: {q}')
+        doc.add_paragraph(f'A: {a}')
+
+    doc_filename = "conversation.docx"
+    doc.save(doc_filename)
+
+    st.markdown(f"Download [DOCX](./{doc_filename})")
+
+def save_as_xlsx(conversation):
+    df = pd.DataFrame(conversation, columns=["Question", "Answer"])
+    xlsx_filename = "conversation.xlsx"
+    df.to_excel(xlsx_filename, index=False)
+
+    st.markdown(f"Download [XLSX](./{xlsx_filename})")
+
+def save_as_txt(conversation):
+    txt_filename = "conversation.txt"
+    with open(txt_filename, "w") as txt_file:
+        for q, a in conversation:
+            txt_file.write(f"Q: {q}\nA: {a}\n\n")
+
+    st.markdown(f"Download [TXT](./{txt_filename})")
+
+def main():
+    st.markdown('<h1>Ask anything from Legal Texts</h1><p style="font-size: 12; color: gray;"></p>', unsafe_allow_html=True)
+    st.markdown("<h2>Upload documents</h2>", unsafe_allow_html=True)
+
+    uploaded_files = st.file_uploader("Upload one or more documents", type=['pdf', 'docx'], accept_multiple_files=True)
+    question = st.text_input("Ask a question based on the documents", key="question_input")
+
+    progress = st.progress(0)
+    for i in range(100):
+        progress.progress(i + 1)
+        time.sleep(0.01)
+
+    if uploaded_files:
+        df = pd.DataFrame(columns=["page_num", "paragraph_num", "content", "tokens"])
+        for uploaded_file in uploaded_files:
+            paragraphs = read_pdf_pdfminer(uploaded_file) if uploaded_file.type == "application/pdf" else read_docx(uploaded_file)
+            temp_df = pd.DataFrame(
+                [(p.page_num, p.paragraph_num, p.content, count_tokens(p.content, tokenizer))
+                 for p in paragraphs],
+                columns=["page_num", "paragraph_num", "content", "tokens"]
+            )
+            df = pd.concat([df, temp_df], ignore_index=True)
+
+        if "interactions" not in st.session_state:
+            st.session_state["interactions"] = []
+
+        answer = ""
+        if question != st.session_state.get("last_question", ""):
+            st.text("Searching...")
+            answer = answer_query_with_context(question, df, tokenizer, model)
+            st.session_state["interactions"].append((question, answer))
+            st.write(answer)
+
+        st.markdown("### Interaction History")
+        for q, a in st.session_state["interactions"]:
+            st.write(f"**Q:** {q}\n\n**A:** {a}")
+
+        st.session_state["last_question"] = question
+
+        st.markdown("<h2>Sample paragraphs</h2>", unsafe_allow_html=True)
+        sample_size = min(len(df), 5)
+        st.dataframe(df.sample(n=sample_size))
+
+        if st.button("Save as PDF"):
+            save_as_pdf(st.session_state["interactions"])
+        if st.button("Save as DOCX"):
+            save_as_docx(st.session_state["interactions"])
+        if st.button("Save as XLSX"):
+            save_as_xlsx(st.session_state["interactions"])
+        if st.button("Save as TXT"):
+            save_as_txt(st.session_state["interactions"])
+
+
+    else:
+        st.markdown("<h2>Please upload a document to proceed.</h2>", unsafe_allow_html=True)
+
+if __name__ == "__main__":
+    main()
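
Note on the export helpers added above: save_as_pdf uses canvas and letter, which come from reportlab; unless they are imported in an unchanged part of search.py (not visible in this diff), the function will raise a NameError when the "Save as PDF" button is pressed. The st.markdown "Download [...]" links also point at files written to the server's working directory, which Streamlit does not serve as relative URLs, so those links generally will not download anything. Below is a minimal sketch of one way to address both points, assuming the same list of (question, answer) pairs; the helper names conversation_pdf_bytes and offer_pdf_download are illustrative and not part of this commit.

# Sketch only -- assumes reportlab and streamlit are installed; not part of the commit.
from io import BytesIO

import streamlit as st
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas


def conversation_pdf_bytes(conversation):
    # Render the Q/A pairs into an in-memory PDF instead of a file on disk.
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    c.drawString(100, 750, "Conversation:")
    y_position = 730
    for q, a in conversation:
        c.drawString(120, y_position, f"Q: {q}")
        c.drawString(120, y_position - 20, f"A: {a}")
        y_position -= 40
    c.save()
    return buffer.getvalue()


def offer_pdf_download(conversation):
    # st.download_button hands the bytes straight to the browser,
    # so no relative link to a server-side file is needed.
    st.download_button(
        label="Download PDF",
        data=conversation_pdf_bytes(conversation),
        file_name="conversation.pdf",
        mime="application/pdf",
    )

The same pattern would carry over to the DOCX, XLSX and TXT helpers by swapping in the appropriate bytes producer and MIME type.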