added EvalDataset Generation
app.py CHANGED
@@ -56,6 +56,7 @@ from huggingface_hub import login
 login(token=HF_KEY)
 vectorstore=None
 rerankingModel=None
+bm25_retriever=None
 
 class BSIChatbot:
     embedding_model = None
@@ -158,14 +159,14 @@ class BSIChatbot:
         ## Figure out what to do with the start index and how to add metadata
         docs_processed = []
         for doc in RAW_KNOWLEDGE_BASE:
-
+            #newprint(f"Word-Length in doc:{len(doc.page_content.split())}")
             doc_cache = markdown_splitter.split_text(doc.page_content)
             # print(f"Word-Length in doc_cache after MarkdownSplitter:{len(doc_cache.split())}")
             doc_cache = text_splitter.split_documents(doc_cache)
             # print(f"Word-Length in doc_cache after text_splitter:{len(doc_cache.split())}")
             for chunk in doc_cache:
                 chunk.metadata.update({"source": doc.metadata['source']})
-
+                #newprint(f"Chunk_Debug len: {len(chunk.page_content.split())} and Chunk:{chunk}")
             # DEBUG:
             # print(f"doc_cache after Metadata added:{doc_cache}\n")
             docs_processed += doc_cache
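For context, the loop in this hunk is a two-stage chunking pipeline: split on markdown headers first, then enforce a size cap, propagating each source filename into every chunk. A minimal sketch follows; the diff never shows how `markdown_splitter` and `text_splitter` are constructed, so the classes, header levels, and size parameters below are assumptions.

```python
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Assumed construction; the real app.py may use different headers and sizes.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "h1"), ("##", "h2")]
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)

docs_processed = []
for doc in RAW_KNOWLEDGE_BASE:  # loaded elsewhere in app.py
    # Stage 1: header-aware split keeps sections intact.
    doc_cache = markdown_splitter.split_text(doc.page_content)
    # Stage 2: cap the chunk size for the embedding model.
    doc_cache = text_splitter.split_documents(doc_cache)
    # Keep the original filename so answers can cite their source later.
    for chunk in doc_cache:
        chunk.metadata.update({"source": doc.metadata["source"]})
    docs_processed += doc_cache
```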
@@ -197,7 +198,7 @@ class BSIChatbot:
         #for doc in RAW_KNOWLEDGE_BASE:
         #    docs_processed += text_splitter.split_documents([doc])
 
-
+        #newprint(f"Docs processed:{len(docs_processed)}")
         # Max_Sequence_Length of e5 large instr = 512 Tokens
 
 
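The `Max_Sequence_Length` comment matters because e5 silently truncates input past 512 tokens, so any chunk longer than that loses content at embedding time. A hedged sanity check, assuming the `intfloat/multilingual-e5-large-instruct` tokenizer (the diff only says "e5 large instr", so the exact model id is an assumption):

```python
from transformers import AutoTokenizer

# Assumed model id; the diff only names "e5 large instr".
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large-instruct")

def oversized_chunks(chunks, limit=512):
    """Return the chunks whose tokenized length the embedder would truncate."""
    return [c for c in chunks if len(tokenizer.encode(c.page_content)) > limit]
```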
@@ -230,14 +231,14 @@ class BSIChatbot:
             vectorstore.save_local(self.embedPath)
             #self.vectorstore.index = index_gpu
             end = time.time()
-
+            #newprint("Saving Embeddings took", end-start, "seconds!")
         else:
             start = time.time()
             vectorstore = FAISS.load_local(self.embedPath, self.embedding_model, allow_dangerous_deserialization=True)
             #self.vectorstore.index = index_gpu
             end = time.time()
 
-
+            #newprint("Loading Embeddings took", end - start, "seconds!")
 
     def retrieveSimiliarEmbedding(self, query):
         global vectorstore
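The branch above embeds once and reloads the index from disk on later runs, which is what the added timing prints measure. A standalone sketch of that pattern, assuming LangChain's FAISS wrapper; the helper name, path handling, and `FAISS.from_documents` call are illustrative, not taken from app.py:

```python
import os
import time

from langchain_community.vectorstores import FAISS

def load_or_build_index(docs, embedding_model, embed_path):
    start = time.time()
    if not os.path.exists(embed_path):
        # First run: embed the corpus and cache the index on disk.
        vs = FAISS.from_documents(docs, embedding_model)
        vs.save_local(embed_path)
        print("Saving embeddings took", time.time() - start, "seconds!")
    else:
        # load_local deserializes a pickle; only enable this flag for
        # index files the application itself wrote.
        vs = FAISS.load_local(embed_path, embedding_model,
                              allow_dangerous_deserialization=True)
        print("Loading embeddings took", time.time() - start, "seconds!")
    return vs
```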
@@ -254,7 +255,7 @@ class BSIChatbot:
         #    finalchunks.append(chunk)
         #retrieved_chunks = finalchunks
         end = time.time()
-
+        #newprint("Retrieving Chunks with similar embeddings took", end - start, "seconds!")
         #print("\n==================================Top document==================================")
         #print(retrieved_chunks[0].page_content)
         #print(retrieved_chunks[1].page_content)
@@ -263,7 +264,7 @@ class BSIChatbot:
         #print(retrieved_chunks[0].metadata)
         #print(retrieved_chunks[1].metadata)
         #print(retrieved_chunks[2].metadata)
-
+        #newprint(f"printing first chunk to see what's inside: {retrieved_chunks[0]}")
         return retrieved_chunks
 
     def retrieveDocFromFaiss(self):
@@ -325,17 +326,19 @@ class BSIChatbot:
 
     def retrieval(self, query, rerankingStep, hybridSearch):
         global vectorstore
+        global bm25_retriever
         if hybridSearch == True:
             allDocs = self.retrieveDocFromFaiss()
-            bm25_retriever
+            if bm25_retriever == None:
+                bm25_retriever = BM25Retriever.from_documents(allDocs)
             #TODO!
             retriever_k=15
             bm25_retriever.k= retriever_k
             vectordb = vectorstore.as_retriever(search_kwargs={"k":retriever_k})
             ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, vectordb], weights=[0.5, 0.5])
             retrieved_chunks = ensemble_retriever.get_relevant_documents(query)
-
-
+            #newprint("DBG: Number of Chunks retrieved")
+            #newprint(len(retrieved_chunks))
         else:
             retrieved_chunks = self.retrieveSimiliarEmbedding(query)
         retrieved_chunks_text = []
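This hunk is the core of the change: `bm25_retriever` becomes a module-level global built lazily on the first hybrid query, because `BM25Retriever.from_documents` re-tokenizes the entire corpus and would otherwise run on every call. A standalone sketch of the same wiring, assuming the LangChain imports below; `build_hybrid_retriever` is illustrative, not a function in app.py:

```python
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever  # needs rank_bm25

def build_hybrid_retriever(all_docs, vectorstore, k=15):
    bm25 = BM25Retriever.from_documents(all_docs)  # lexical/keyword side
    bm25.k = k
    dense = vectorstore.as_retriever(search_kwargs={"k": k})  # embedding side
    # Equal weighting of the lexical and dense result lists, as in the diff.
    return EnsembleRetriever(retrievers=[bm25, dense], weights=[0.5, 0.5])

# retriever = build_hybrid_retriever(allDocs, vectorstore)
# retrieved_chunks = retriever.get_relevant_documents(query)
```

Note that recent LangChain releases deprecate `get_relevant_documents` in favor of `invoke`, so depending on the pinned version the last call may need that swap.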
@@ -442,6 +445,7 @@ class BSIChatbot:
 
         # Everything except the last user request; a regular query
         query, context = self.retrieval(query, rerankingStep, True)
+
         if stepBackPrompt == True:
             stepBackQuery = self.stepBackPrompt(query)
             print("DBG stepBackQuery:" + stepBackQuery)
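`stepBackPrompt` is only called here; its body is not part of this diff. For orientation, step-back prompting asks the model for a more generic version of the user's question and retrieves with that as well. A hypothetical sketch, where the helper, its prompt wording, and the `llm` callable are all assumptions:

```python
def step_back_prompt(llm, query: str) -> str:
    # Hypothetical: `llm` is any callable mapping a prompt string to text.
    prompt = (
        "Rewrite the following question as a more general question "
        "about the same underlying topic:\n" + query
    )
    return llm(prompt)
```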
@@ -474,7 +478,7 @@ class BSIChatbot:
         # question=query, context=context, history=history[:-1]
         # )
 
-
+        #newprint(f"Query:\n{final_prompt}")
         pattern = r"Filename:(.*?);"
         last_value = final_prompt[-1]["content"]
 
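The `Filename:(.*?);` pattern lazily captures whatever sits between `Filename:` and the next semicolon, which is how source files are recovered from the context block in `final_prompt`. A worked example with an invented filename:

```python
import re

pattern = r"Filename:(.*?);"
# Hypothetical context line; real values come from the chunk metadata.
text = "Filename:ops_1_1_patchmanagement.md; ...chunk text..."
print(re.findall(pattern, text))  # ['ops_1_1_patchmanagement.md']
```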