Initial Push of small files
- app.py +244 -0
- opm_logo.png +0 -0
- requirements.txt +15 -0
- utils.py +279 -0
app.py
ADDED
@@ -0,0 +1,244 @@
import streamlit as st
from pypdf import PdfReader
# import replicate
import os
from pathlib import Path
from dotenv import load_dotenv
import pickle
import timeit
from PIL import Image
import datetime
import base64

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.utilities import SerpAPIWrapper

from utils import build_embedding_model, build_llm
from utils import load_ensemble_retriver, load_text_chunks, load_vectorstore, load_conversational_retrievel_agent

load_dotenv()
# Get the current timestamp to keep track of historical conversations
current_timestamp = datetime.datetime.now()
timestamp_string = current_timestamp.strftime("%Y-%m-%d %H:%M:%S")

# Directory paths
persist_directory = "Database/PDF_HTML_CHROMA_DB"
all_docs_pkl_directory = 'Database/text_chunks_html_pdf.pkl'

# Initialize Streamlit session state to cache expensive objects (models, stores,
# retrievers), so already-initialized objects are not rebuilt on every rerun.
if "llm" not in st.session_state:
    st.session_state["llm"] = build_llm()

if "embeddings" not in st.session_state:
    st.session_state["embeddings"] = build_embedding_model()

if "vector_db" not in st.session_state:
    st.session_state["vector_db"] = load_vectorstore(persist_directory=persist_directory, embeddings=st.session_state["embeddings"])

if "text_chunks" not in st.session_state:
    st.session_state["text_chunks"] = load_text_chunks(text_chunks_pkl_dir=all_docs_pkl_directory)

if "ensemble_retriver" not in st.session_state:
    st.session_state["ensemble_retriver"] = load_ensemble_retriver(text_chunks=st.session_state["text_chunks"], embeddings=st.session_state["embeddings"], chroma_vectorstore=st.session_state["vector_db"])

if "agent_executor" not in st.session_state:
    st.session_state["agent_executor"] = load_conversational_retrievel_agent(retriever=st.session_state["ensemble_retriver"], llm=st.session_state["llm"])

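# For illustration, the caching idiom used above in its minimal form (a sketch
# with a hypothetical build_resource(); not part of this app's logic):
#
#     if "resource" not in st.session_state:
#         st.session_state["resource"] = build_resource()  # built once, reused across reruns
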
# App title
st.set_page_config(
    page_title="OMP Search Bot",
    layout="wide",
    initial_sidebar_state="expanded",
)

st.markdown("""
<style>
.block-container {
    padding-top: 2.2rem}
</style>
""", unsafe_allow_html=True)

# Header of the app
col1, col2 = st.columns(2)

title1 = """
<p style="font-size: 26px;text-align: right; color: #0C3453; font-weight: bold">OPM Retirement Services Assistant</p>
"""

def clear_chat_history():
    st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?", "Source": ""}]

file_ = open("opm_logo.png", "rb")
contents = file_.read()
data_url = base64.b64encode(contents).decode("utf-8")
file_.close()

st.markdown(
    f"""
    <div style="background-color: white; padding: 15px; border-radius: 10px;">
        <div style="display: flex; justify-content: space-between;">
            <div>
                <img src="data:image/png;base64,{data_url}" style="max-width: 100%;" alt="OPM Logo" />
            </div>
            <div style="flex: 1; padding: 15px;">
                {title1}
    """,
    unsafe_allow_html=True
)
st.write("")


st.write('<p style="color: #B0B0B0;margin: 0;">OPM is here to help you transition from serving the American people to enjoying your retirement. This retirement services assistant reflects our commitment to supporting new and existing retirees throughout the retirement journey. Our assistant is trained on 1,500+ documents related to OPM retirement services and can answer your questions in a conversational style. Just ask away.</p>', unsafe_allow_html=True)

st.markdown("""---""")

text_html = """
<p style="font-size: 14px; text-align: center; color: #727477; margin: 0;">
    Type your question in a conversational style
</p>
<p style="font-size: 14px; text-align: center; color: #727477; margin: 0;">
    Example: What are interim benefits?
</p>
"""

st.write(text_html, unsafe_allow_html=True)


with st.sidebar:
    st.subheader("")

if st.session_state["vector_db"] and st.session_state["llm"]:
    # Store LLM-generated responses
    if "messages" not in st.session_state.keys():
        st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?", "Source": ""}]

    # Display or clear chat messages
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.write(message["content"])
            if message["Source"] == "":
                st.write("")
            else:
                with st.expander("source"):
                    for idx, item in enumerate(message["Source"]):
                        st.markdown(item["Page"])
                        st.markdown(item["Source"])
                        st.markdown(item["page_content"])
                        st.write("---")


    # Initialize the session state to store chat history
    if "stored_session" not in st.session_state:
        st.session_state["stored_session"] = []

    # Create a list to store expanders
    if "expanders" not in st.session_state:
        st.session_state["expanders"] = []

    # Define a function to add a new chat expander
    def add_chat_expander(chat_history):
        current_timestamp = datetime.datetime.now()
        timestamp_string = current_timestamp.strftime("%Y-%m-%d %H:%M:%S")
        st.session_state["expanders"].append({"timestamp": timestamp_string, "chat_history": chat_history})

    def clear_chat_history():
        """
        Remove the existing chat history and start a new conversation.
        """
        stored_session = []
        for dict_message in st.session_state.messages:
            if dict_message["role"] == "user":
                string_dialogue = "User: " + dict_message["content"] + "\n\n"
                st.session_state["stored_session"].append(string_dialogue)
                stored_session.append(string_dialogue)
            else:
                string_dialogue = "Assistant: " + dict_message["content"] + "\n\n"
                st.session_state["stored_session"].append(string_dialogue)
                stored_session.append(string_dialogue)

        # Add a new chat expander
        add_chat_expander(stored_session)
        st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?", "Source": ""}]

    st.sidebar.button('New chat', on_click=clear_chat_history, use_container_width=True)
    st.sidebar.text("")
    st.sidebar.write('<p style="font-size: 16px;text-align: center; color: #727477; font-weight: bold">Chat history</p>', unsafe_allow_html=True)
    # Display existing chat expanders
    for expander_info in st.session_state["expanders"]:
        with st.sidebar.expander("Conversation ended at:"+"\n\n"+expander_info["timestamp"]):
            for message in expander_info["chat_history"]:
                if message.startswith("User:"):
                    st.write(f'<span style="color: #EF6A6A;">{message}</span>', unsafe_allow_html=True)
                elif message.startswith("Assistant:"):
                    st.write(f'<span style="color: #F7BD45;">{message}</span>', unsafe_allow_html=True)
                else:
                    st.write(message)


    def generate_llm_response(agent_executor, prompt_input):
        result = agent_executor({"input": prompt_input})
        return [result['output'], result['intermediate_steps']]

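    # Note on the return shape above (hedged; based on LangChain agent executors
    # of this era, where the helper in utils.py enables intermediate-step
    # tracking): result['output'] is the final answer string, and
    # result['intermediate_steps'] is a list of (AgentAction, observation)
    # pairs, one per tool call the agent made.
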
    # User-provided prompt
    if prompt := st.chat_input(disabled=not st.session_state["vector_db"]):
        st.session_state.messages.append({"role": "user", "content": prompt, "Source": ""})
        with st.chat_message("user"):
            st.write(prompt)

    # Generate a new response if the last message is not from the assistant
    if st.session_state.messages[-1]["role"] != "assistant":
        with st.chat_message("assistant"):
            with st.spinner("Searching..."):
                start = timeit.default_timer()
                response = generate_llm_response(agent_executor=st.session_state["agent_executor"], prompt_input=prompt)
                placeholder = st.empty()
                full_response = ''
                for item in response[0]:
                    full_response += item
                    placeholder.markdown(full_response)
                # Source-attribution logic:
                # -- Check whether intermediate steps are present in the agent's output.
                # -- If they are absent, the agent used internet search as its tool,
                #    so there are no local sources to show.
                # -- If they are present, the agent queried the existing custom knowledge
                #    base for information retrieval, so we surface the source documents
                #    alongside the LLM's response.
                if len(response[1]) > 0:
                    st.text("-------------------------------------")
                    docs = st.session_state["ensemble_retriver"].get_relevant_documents(prompt)
                    source_doc_list = []
                    for doc in docs:
                        source_doc_list.append(doc.dict())
                    merged_source_doc = []
                    with st.expander("source"):
                        for idx, item in enumerate(source_doc_list):
                            source_doc = {"Page": f"Source {idx + 1}", "Source": f"**Source:** {item['metadata']['source'].split('/')[-1]}",
                                          "page_content": item["page_content"]}
                            merged_source_doc.append(source_doc)
                            st.markdown(f"Source {idx + 1}")
                            st.markdown(f"**Source:** {item['metadata']['source'].split('/')[-1]}")
                            st.markdown(item["page_content"])
                            st.write("---")  # Add a separator between entries
                    message = {"role": "assistant", "content": full_response, "Source": merged_source_doc}
                    st.session_state.messages.append(message)
                else:
                    message = {"role": "assistant", "content": full_response, "Source": ""}
                    st.session_state.messages.append(message)
                end = timeit.default_timer()
                print(f"Time to retrieve response: {end - start}")
opm_logo.png
ADDED
(binary image file: OPM logo)
requirements.txt
ADDED
@@ -0,0 +1,15 @@
chromadb==0.4.6
langchain==0.0.278
openai==0.27.8
numpy==1.25.2
pandas==2.0.3
Pillow==9.5.0
pypdf==3.15.1
PyPDF2==3.0.1
python-dotenv==1.0.0
sentence-transformers==2.2.2
streamlit==1.25.0
streamlit-chat==0.1.1
rank-bm25==0.2.2
google-search-results==2.4.2
tiktoken
utils.py
ADDED
@@ -0,0 +1,279 @@
import streamlit as st
from pypdf import PdfReader
import os
from pathlib import Path
from dotenv import load_dotenv
import pickle
import timeit
from PIL import Image
import zipfile
import datetime
import shutil
from collections import defaultdict
import pandas as pd

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts.prompt import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.utilities import SerpAPIWrapper
from langchain.agents import Tool
from langchain.agents import load_tools

load_dotenv()


current_timestamp = datetime.datetime.now()
timestamp_string = current_timestamp.strftime("%Y-%m-%d %H:%M:%S")


def build_llm():
    '''
    Load the OpenAI chat model.
    '''
    # llm = OpenAI(temperature=0.2)
    llm = ChatOpenAI(temperature=0, max_tokens=256)
    return llm

def build_embedding_model():
    '''
    Load the sentence-transformer model used for text embedding.
    '''
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                       model_kwargs={'device': 'cpu'})
    return embeddings

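# Quick sanity check of the embedder (a sketch, not executed here;
# all-MiniLM-L6-v2 produces 384-dimensional vectors):
#     emb = build_embedding_model()
#     len(emb.embed_query("retirement benefits"))  # -> 384
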
def unzip_opm():
    # Specify the path to your ZIP file
    zip_file_path = r'OPM_Files/OPM_Retirement_backup-20230902T130906Z-001.zip'

    # Get the directory where the ZIP file is located
    extract_path = os.path.dirname(zip_file_path)

    # Create a folder with the same name as the ZIP file (without the .zip extension)
    extract_folder = os.path.splitext(os.path.basename(zip_file_path))[0]
    extract_folder_path = os.path.join(extract_path, extract_folder)

    # Create the folder if it doesn't exist
    if not os.path.exists(extract_folder_path):
        os.makedirs(extract_folder_path)

    # Open the ZIP file for reading
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Extract all the contents into the created folder
        zip_ref.extractall(extract_folder_path)

    print(f'Unzipped {zip_file_path} to {extract_folder_path}')
    return extract_folder_path

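# For reference (just what the path logic above derives): with the ZIP path as
# given, extract_folder_path resolves to
# 'OPM_Files/OPM_Retirement_backup-20230902T130906Z-001'.
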
def count_files_by_type(folder_path):
    '''
    Count files by file type in the specified folder.
    '''
    file_count_by_type = defaultdict(int)

    for root, _, files in os.walk(folder_path):
        for file in files:
            _, extension = os.path.splitext(file)
            file_count_by_type[extension] += 1

    return file_count_by_type

def generate_file_count_table(file_count_by_type):
    '''
    Generate a table of file counts per file type.
    '''
    data = {"File Type": [], "Number of Files": []}
    for extension, count in file_count_by_type.items():
        data["File Type"].append(extension)
        data["Number of Files"].append(count)

    df = pd.DataFrame(data)
    df = df.sort_values(by="Number of Files", ascending=False)  # Sort by number of files
    return df

def move_files_to_folders(folder_path):
    '''
    Copy files into folders by type: PDF docs into a PDFs folder and HTML docs
    into an HTMLs folder, placed under folder_path where the loaders below
    expect them.
    '''
    for root, _, files in os.walk(folder_path):
        for file in files:
            _, extension = os.path.splitext(file)
            source_path = os.path.join(root, file)

            if extension == '.pdf':
                dest_folder = os.path.join(folder_path, "PDFs")
            elif extension == '.html':
                dest_folder = os.path.join(folder_path, "HTMLs")
            else:
                continue

            dest_path = os.path.join(dest_folder, file)
            os.makedirs(dest_folder, exist_ok=True)
            shutil.copy(source_path, dest_path)

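# Example (illustrative counts only): generate_file_count_table({'.pdf': 1200, '.html': 300})
# returns a two-column DataFrame ('File Type', 'Number of Files') sorted so '.pdf' comes first.
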
def load_vectorstore(persist_directory, embeddings):
    '''
    Try first to load the Chroma database from disk. If it does not exist,
    do the following:
    1) Load the PDFs
    2) Create text chunks
    3) Index them and store them in a Chroma DB
    4) Perform the same steps for the HTML files
    5) Persist the final Chroma DB to disk
    '''
    if os.path.exists(persist_directory):
        print("Using existing vector store for these documents.")
        vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        print("Chroma DB loaded from the disk")
        return vectorstore
    else:
        folder_path = unzip_opm()
        print("Vector store is not available. Creating a new one.")
        file_count_by_type = count_files_by_type(folder_path)
        file_count_table = generate_file_count_table(file_count_by_type)
        print("File Count Table:")
        print(file_count_table)
        # Move files into their respective folders
        move_files_to_folders(folder_path)
        print("PDF and HTML files copied to separate folders.")

        # Load the PDF files from the PDF folder in order to create the new Chroma DB
        pdf_folder_path = f"{folder_path}/PDFs"    # PDF folder
        html_folder_path = f"{folder_path}/HTMLs"  # HTML folder
        pdf_dir_loader = PyPDFDirectoryLoader(pdf_folder_path)
        pdf_pages = pdf_dir_loader.load()
        print("PDF files are loaded from the folder.")

        # Load the HTML files from the HTML folder in order to create the new Chroma DB
        HTML_docs_path_list = [os.path.join(html_folder_path, f) for f in os.listdir(html_folder_path) if os.path.isfile(os.path.join(html_folder_path, f))]

        html_loaders = []
        for html_file in HTML_docs_path_list:
            loader = UnstructuredHTMLLoader(html_file)
            html_loaders.append(loader)

        html_pages = []
        docs_cannot_load = []
        for loader in html_loaders:
            try:
                html_pages.extend(loader.load())
            except Exception:
                print("Cannot load the file:", loader)
                docs_cannot_load.append(loader)
        print("HTML files are loaded from the folder.")
        # Create text chunks from the PDF docs
        text_splitter = RecursiveCharacterTextSplitter(
            # Chunk size in characters, with overlap to preserve context across chunks
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            is_separator_regex=False,
        )

        pdf_texts = text_splitter.transform_documents(pdf_pages)
        # Create text chunks from the HTML docs
        html_texts = text_splitter.transform_documents(html_pages)
        # Merge all the text chunks (HTML + PDF)
        all_texts = pdf_texts + html_texts
        print("PDF and HTML docs are split into chunks and merged into a final list representing all the chunks.")

        # Create embeddings for all the text chunks and store them in a Chroma DB
        vectorstore = Chroma.from_documents(all_texts,
                                            embeddings,
                                            persist_directory=persist_directory)
        vectorstore.persist()
        print("Chroma DB created and loaded")
        return vectorstore

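# Usage sketch (the query string is an invented example):
#     vectorstore = load_vectorstore("Database/PDF_HTML_CHROMA_DB", embeddings)
#     hits = vectorstore.similarity_search("interim benefits", k=2)
#     sources = [h.metadata["source"] for h in hits]
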
def load_text_chunks(text_chunks_pkl_dir):
    '''
    Load the pickle file that holds all the documents from the disk.
    If it does not exist, create a new one.
    Text documents are required to create the BM25 retriever, but loading all
    the documents in every session is time-consuming, so we store all the docs
    in a pickle file and load that file from the disk to avoid the repeated work.
    '''
    try:
        print("Text chunks are loading from the disk")
        with open(text_chunks_pkl_dir, 'rb') as file:
            cached_text_chunks = pickle.load(file)
        # Now `cached_text_chunks` contains the cached documents
        print("Text chunks are loaded from the disk")
        return cached_text_chunks
    except Exception:
        print("Creating text chunks from the docs and caching them.")
        folder_path = unzip_opm()
        pdf_folder_path = f"{folder_path}/PDFs"    # PDF folder
        html_folder_path = f"{folder_path}/HTMLs"  # HTML folder
        pdf_dir_loader = PyPDFDirectoryLoader(pdf_folder_path)
        pdf_pages = pdf_dir_loader.load()
        HTML_docs_path_list = [os.path.join(html_folder_path, f) for f in os.listdir(html_folder_path) if os.path.isfile(os.path.join(html_folder_path, f))]

        html_loaders = []
        for html_file in HTML_docs_path_list:
            loader = UnstructuredHTMLLoader(html_file)
            html_loaders.append(loader)

        html_pages = []
        for loader in html_loaders:
            try:
                html_pages.extend(loader.load())
            except Exception:
                print("Cannot load the file:", loader)
        all_texts = pdf_pages + html_pages
        # Cache the list to the same path we try to load from above
        with open(text_chunks_pkl_dir, 'wb') as file:
            pickle.dump(all_texts, file)
        print("Text chunks are created and cached")
        return all_texts

def load_ensemble_retriver(text_chunks, embeddings, chroma_vectorstore):
    """Load an ensemble retriever with BM25 and Chroma as the individual retrievers."""
    bm25_retriever = BM25Retriever.from_documents(text_chunks)
    bm25_retriever.k = 1
    chroma_retriever = chroma_vectorstore.as_retriever(search_kwargs={"k": 1})
    ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.3, 0.7])
    return ensemble_retriever


def load_conversational_retrievel_agent(retriever, llm):
    '''Load a conversational retrieval agent with the following tasks as tools:
    1) OPM knowledge base query
    2) Internet search with SerpAPI
    This agent combines RAG, chat interfaces, and agents.
    '''
    retriever_tool = create_retriever_tool(
        retriever,
        "Search_US_Office_of_Personnel_Management_Document",
        "Searches and returns documents regarding the U.S. Office of Personnel Management (OPM).")
    search_api = SerpAPIWrapper()
    search_api_tool = Tool(
        name="Current_Search",
        func=search_api.run,
        description="useful for when you need to answer questions about current events or the current state of the world"
    )
    # Expose both the knowledge-base tool and the web-search tool, as the docstring describes
    tools = [retriever_tool, search_api_tool]
    agent_executor = create_conversational_retrieval_agent(llm, tools, verbose=True, max_token_limit=512)
    return agent_executor
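A minimal end-to-end sketch of how these helpers compose (assumptions: the Database/ artifacts exist or the OPM_Files ZIP is present, and .env provides OPENAI_API_KEY and SERPAPI_API_KEY; the question reuses the example from app.py):

    from utils import build_llm, build_embedding_model, load_vectorstore, \
        load_text_chunks, load_ensemble_retriver, load_conversational_retrievel_agent

    llm = build_llm()
    embeddings = build_embedding_model()
    vector_db = load_vectorstore("Database/PDF_HTML_CHROMA_DB", embeddings)
    text_chunks = load_text_chunks("Database/text_chunks_html_pdf.pkl")
    retriever = load_ensemble_retriver(text_chunks, embeddings, vector_db)
    agent = load_conversational_retrievel_agent(retriever, llm)

    result = agent({"input": "What are interim benefits?"})
    print(result["output"])
    print(len(result["intermediate_steps"]))  # non-empty when a tool was invoked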