Slfagrouche commited on
Commit
cde15ae
·
verified ·
1 Parent(s): e558654

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +266 -0
app.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Said Lfagrouche_RAG_Based_on_ syllabi_app
3
+ """
4
+
5
+ import getpass
6
+ import gradio as gr
7
+ import os
8
+ import pprint
9
+ import sys
10
+
11
+ from google.colab import drive
12
+ from gradio.themes.base import Base
13
+ from icecream import ic
14
+ from pymongo import MongoClient
15
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
16
+ from weaviate.embedded import EmbeddedOptions
17
+
18
+ # langchain imports
19
+ from langchain.callbacks.tracers import ConsoleCallbackHandler
20
+ from langchain.document_loaders import PyPDFLoader, TextLoader
21
+ from langchain.embeddings import OpenAIEmbeddings
22
+ from langchain.prompts import PromptTemplate, ChatPromptTemplate
23
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
24
+ from langchain.vectorstores import MongoDBAtlasVectorSearch, Weaviate
25
+ from langchain_core.messages import HumanMessage, SystemMessage
26
+ from langchain_core.output_parsers import StrOutputParser
27
+ from langchain_core.runnables import RunnablePassthrough
28
+ from langchain_openai import ChatOpenAI
29
+
30
+ # langchain_community imports
31
+ from langchain_community.embeddings import HuggingFaceEmbeddings
32
+ from langchain_community.llms import HuggingFacePipeline
33
+
34
# Get secret keys.
# Prompt only for secrets that are not already present in the environment:
# an unconditional getpass() would overwrite keys that are already configured
# and cannot be answered when the script runs without an interactive terminal.
for _secret_name in ("OPENAI_API_KEY", "MONGO_URI"):
    if not os.environ.get(_secret_name):
        # Label the prompt so the user knows which secret is being asked for.
        os.environ[_secret_name] = getpass.getpass(f"{_secret_name}: ")

# Retrieve environment variables.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MONGO_URI = os.getenv("MONGO_URI")
41
+
42
# For Google Colab.
# Mount (connect) our Google Drive to our Colab environment so the project
# folder becomes accessible from the notebook's filesystem.
drive.mount("/content/drive/")

# For Google Colab.
# Directory that is listed, added to sys.path, and scanned for PDF files.
directory_path = "/content/drive/MyDrive/RAG Project"

# Show the folder contents. This replaces the notebook-only `! ls` shell
# escape, which is not valid Python in a plain .py file.
print(os.listdir(directory_path))

# Append our directory path to the Python system path.
sys.path.append(directory_path)

# Print the updated system path to the console.
print("sys.path =", sys.path)

# Get the PDF filenames under our directory path. Only *.pdf entries are kept
# because every name in my_pdfs is later handed to PyPDFLoader.
my_pdfs = [
    name for name in os.listdir(directory_path) if name.lower().endswith(".pdf")
]
print("my_pdfs =", my_pdfs)
62
+
63
# Connect to MongoDB Atlas cluster using the connection string.
cluster = MongoClient(MONGO_URI)

# Define the MongoDB database and collection name.
DB_NAME = "pdfs"
COLLECTION_NAME = "pdfs_collection"

# Connect to the specific collection in the database.
MONGODB_COLLECTION = cluster[DB_NAME][COLLECTION_NAME]

# NOTE(review): nothing visible in this file reads MONGODB_COLLECTION or
# vector_search_index; the vector store used by the app is Weaviate. Confirm
# whether the MongoDB setup is still needed.
vector_search_index = "vector_index"
74
+
75
# Create one PyPDFLoader per PDF file found in the project directory.
loaders = [
    PyPDFLoader(os.path.join(directory_path, my_pdf)) for my_pdf in my_pdfs
]
print("len(loaders) =", len(loaders))

# Load the PDFs.
# data holds one entry per PDF file: the result of that loader's load() call.
data = [loader.load() for loader in loaders]
print("len(data) =", len(data), "\n")
96
+
97
# Initialize the text splitter.
# Uses a text splitter to split the data into smaller documents.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)

# docs is a list of lists where each list stores all the documents (chunks)
# produced from one PDF file.
docs = [text_splitter.split_documents(pdf_pages) for pdf_pages in data]

# Debugging purposes.
# Print the number of documents per PDF and the total to be stored in the
# vector database, e.g. "3 + 4 + 5 = 12  total documents".
chunk_counts = [len(pdf_chunks) for pdf_chunks in docs]
total = sum(chunk_counts)
print(" + ".join(str(count) for count in chunk_counts), end="")
print(" =", total, " total documents\n")

# Print the documents of the first PDF. Guarded so that an empty project
# folder does not raise IndexError here.
if docs:
    print(docs[0], "\n\n\n")

# Print the total number of PDF files.
print(len(docs))
128
+
129
# Merge the documents to be embedded and stored in the vector database.
# docs is nested (one list per PDF); flatten it into a single list.
merged_documents = []
for pdf_chunks in docs:
    merged_documents.extend(pdf_chunks)

# Print the merged list of all the documents.
print("len(merged_documents) =", len(merged_documents))
print(merged_documents)
138
+
139
# Hugging Face model for embeddings, configured to run on the CPU.
# NOTE(review): `embeddings` is not referenced anywhere else in this file;
# the Weaviate store is built with OpenAIEmbeddings(). Confirm which embedding
# model is intended.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)
146
+
147
import weaviate
from weaviate.embedded import EmbeddedOptions

# Create a Weaviate client configured with embedded options.
client = weaviate.Client(embedded_options=EmbeddedOptions())

# Embed the merged documents with OpenAI embeddings and index them in Weaviate.
# NOTE(review): by_text=False presumably makes the store search by the query's
# embedding vector rather than by raw text — confirm against the LangChain
# Weaviate documentation.
vector_search = Weaviate.from_documents(
    client=client,
    documents=merged_documents,
    embedding=OpenAIEmbeddings(),
    by_text=False,
)
# At this point, the documents are split and indexed in Weaviate, enabling search.
161
+
162
# Semantic Search.
# Fetch the 10 documents most similar to the query.
query = "What are the professor names for this semester"
results = vector_search.similarity_search(query=query, k=10)

print("\n")
pprint.pprint(results)

# Semantic Search with Score.
# Same kind of lookup (10 most similar documents) through the *_with_score
# variant of the call.
query = "Where is operating system exam taken?"
results = vector_search.similarity_search_with_score(query=query, k=10)

pprint.pprint(results)
180
+
181
# Filter on metadata.
# Semantic search with filtering on the page number.
# The LangChain Weaviate store takes its filter through `where_filter`.
# `pre_filter` with a `$eq` document is the MongoDB Atlas form, which the
# Weaviate store does not document, so the filter was presumably ignored.
# NOTE(review): `valueNumber` assumes Weaviate's auto-schema stored the `page`
# metadata as a number property; use `valueInt` if it is typed int — confirm.
query = "Where is Data tools and algorithm exam taken?"

results = vector_search.similarity_search_with_score(
    query=query,
    k=10,  # 10 most similar documents.
    where_filter={"path": ["page"], "operator": "Equal", "valueNumber": 1},
)

pprint.pprint(results)
193
+
194
# Instantiate Weaviate Vector Search as a retriever.
# search_type options: similarity, mmr, similarity_score_threshold. See
# https://api.python.langchain.com/en/latest/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.as_retriever
# NOTE(review): "score_threshold" looks like it only applies to
# search_type="similarity_score_threshold"; with "similarity" it is presumably
# ignored — confirm before relying on the 0.89 cut-off.
retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5, "score_threshold": 0.89},
)
199
+
200
# Define a prompt template.
# Define a LangChain prompt template to instruct the LLM to use our documents as the context.
# LangChain passes these documents to the {context} input variable and the user's query to the {question} variable.
template = """
Use the following pieces of context to answer the question at the end.
If you do not know the answer, just say that you do not know, do not try to make up an answer.

{context}

Question: {question}
"""

custom_rag_prompt = PromptTemplate.from_template(template)

# A low temperature keeps answers close to deterministic; raising it makes the
# model's output more varied.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)
215
+
216
def format_docs(docs) -> str:
    """Join the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by two
        newline characters. An empty iterable yields an empty string.
    """
    return "\n\n".join(doc.page_content for doc in docs)
220
+
221
# Regular chain format is defined as: chain = context_setup | prompt_template | model | output_parser
rag_chain = (
    # Build the prompt inputs: the retrieved documents joined by format_docs
    # become the context, and the incoming query is passed through unchanged.
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt  # Fill the prompt template with context and question.
    | llm  # Send the formatted prompt to the language model (LLM).
    | StrOutputParser()  # Reduce the LLM's reply to a plain string.
)
229
+
230
# Prompt the chain.
query = "What is student favourite class"
answer = rag_chain.invoke(query)

print("\nQuestion: " + query)
print("Answer: " + answer)

# Return the source documents the retriever finds for the same query.
# invoke() is the same Runnable entry point used for rag_chain;
# get_relevant_documents() is the older, deprecated spelling.
documents = retriever.invoke(query)

print("\nSource documents:")
pprint.pprint(documents)
242
+
243
def get_response(query: str) -> str:
    """Answer a question by running it through the RAG chain.

    Args:
        query: The user's question.

    Returns:
        Whatever ``rag_chain.invoke(query)`` produces; the chain ends with a
        string output parser.
    """
    return rag_chain.invoke(query)
247
+
248
# Gradio application.
with gr.Blocks(theme=Base(), title="RAG QA App Using Spring 2024 Syllabuses PDFs, Weaviate As The Vector Database, and Gradio") as demo:
    # The heading names Weaviate so it agrees with the window title and with
    # the vector store the chain queries (it previously said MongoDB).
    gr.Markdown(
        "# RAG Question Answering App Using PDF Files, Weaviate As The Vector Database, and Gradio"
    )
    textbox = gr.Textbox(label="Question:")
    with gr.Row():
        button = gr.Button("Submit", variant="primary")
    with gr.Column():
        output1 = gr.Textbox(lines=1, max_lines=10, label="Answer:")

    # Call get_response function upon clicking the Submit button.
    # Registered inside the Blocks context so the listener is attached to this app.
    button.click(get_response, textbox, outputs=[output1])

demo.launch(share=True)
265
+
266
+