added EvalDataset Generation
app.py CHANGED
@@ -56,6 +56,7 @@ from huggingface_hub import login
 login(token=HF_KEY)
 vectorstore=None
 rerankingModel=None
+bm25_retriever=None
 
 class BSIChatbot:
     embedding_model = None
@@ -158,14 +159,14 @@ class BSIChatbot:
         ## Figure out what to do with the start index and how to add metadata
         docs_processed = []
         for doc in RAW_KNOWLEDGE_BASE:
-
+            #newprint(f"Word-Length in doc:{len(doc.page_content.split())}")
             doc_cache = markdown_splitter.split_text(doc.page_content)
             # print(f"Word-Length in doc_cache after MarkdownSplitter:{len(doc_cache.split())}")
             doc_cache = text_splitter.split_documents(doc_cache)
             # print(f"Word-Length in doc_cache after text_splitter:{len(doc_cache.split())}")
             for chunk in doc_cache:
                 chunk.metadata.update({"source": doc.metadata['source']})
-
+                #newprint(f"Chunk_Debug len: {len(chunk.page_content.split())} and Chunk:{chunk}")
                 # DEBUG:
                 # print(f"doc_cache after Metadata added:{doc_cache}\n")
             docs_processed += doc_cache
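The loop in this hunk is the app's two-stage chunking: split each raw document along its Markdown headers, re-split the pieces to a size cap, and stamp every chunk with its source file. A minimal self-contained sketch of that pattern (the header list and chunk sizes here are illustrative assumptions, not values taken from app.py):

```python
from langchain_core.documents import Document
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Stage 1: split along the Markdown outline (header names are placeholders).
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")]
)
# Stage 2: enforce a size cap so chunks fit the embedding model's window.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

def chunk_document(doc: Document) -> list[Document]:
    pieces = markdown_splitter.split_text(doc.page_content)  # returns Documents
    pieces = text_splitter.split_documents(pieces)
    for piece in pieces:
        # Carry provenance forward so retrieval hits can cite their file.
        piece.metadata.update({"source": doc.metadata["source"]})
    return pieces
```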
@@ -197,7 +198,7 @@ class BSIChatbot:
         #for doc in RAW_KNOWLEDGE_BASE:
         #    docs_processed += text_splitter.split_documents([doc])
 
-
+        #newprint(f"Docs processed:{len(docs_processed)}")
         # Max_Sequence_Length of e5 large instr = 512 Tokens
 
 
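The comment above records why the size cap matters: e5 large instruct embeds at most 512 tokens, and anything longer is silently truncated. A sketch of how one could audit the processed chunks against that limit; the checkpoint name is an assumption (the public `intfloat/multilingual-e5-large-instruct`), and the app may use a different variant:

```python
from transformers import AutoTokenizer

def audit_chunk_lengths(docs, model_name="intfloat/multilingual-e5-large-instruct"):
    # Tokenizer matching the embedding model (checkpoint name is an assumption).
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    for i, chunk in enumerate(docs):
        n_tokens = len(tokenizer.encode(chunk.page_content))
        if n_tokens > 512:
            print(f"chunk {i} is {n_tokens} tokens; e5 will truncate it")
```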
@@ -230,14 +231,14 @@ class BSIChatbot:
             vectorstore.save_local(self.embedPath)
             #self.vectorstore.index = index_gpu
             end = time.time()
-
+            #newprint("Saving Embeddings took", end-start, "seconds!")
         else:
             start = time.time()
             vectorstore = FAISS.load_local(self.embedPath, self.embedding_model, allow_dangerous_deserialization=True)
             #self.vectorstore.index = index_gpu
             end = time.time()
 
-
+        #newprint("Loading Embeddings took", end - start, "seconds!")
 
     def retrieveSimiliarEmbedding(self, query):
         global vectorstore
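Both branches above implement an embed-once cache: the first run builds the FAISS index and writes it to `self.embedPath`; later runs reload it, which is why `allow_dangerous_deserialization=True` appears (FAISS stores are pickled, so the flag asserts the file is trusted). A condensed sketch of the same pattern as a free function, with placeholder names:

```python
import os
import time
from langchain_community.vectorstores import FAISS

def get_vectorstore(embed_path, embedding_model, docs):
    start = time.time()
    if os.path.exists(embed_path):
        # Reload the pickled index instead of re-embedding the corpus.
        store = FAISS.load_local(embed_path, embedding_model,
                                 allow_dangerous_deserialization=True)
    else:
        # First run: embed every chunk and persist the index to disk.
        store = FAISS.from_documents(docs, embedding_model)
        store.save_local(embed_path)
    print("Vectorstore ready in", time.time() - start, "seconds!")
    return store
```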
@@ -254,7 +255,7 @@ class BSIChatbot:
         # finalchunks.append(chunk)
         #retrieved_chunks = finalchunks
         end = time.time()
-
+        #newrint("Retrieving Chunks with similiar embeddings took", end - start, "seconds!")
         #print("\n==================================Top document==================================")
         #print(retrieved_chunks[0].page_content)
         #print(retrieved_chunks[1].page_content)
@@ -263,7 +264,7 @@ class BSIChatbot:
         #print(retrieved_chunks[0].metadata)
         #print(retrieved_chunks[1].metadata)
         #print(retrieved_chunks[2].metadata)
-
+        #newprint(f"printing first chunk to see whats inside: {retrieved_chunks[0]}")
         return retrieved_chunks
 
     def retrieveDocFromFaiss(self):
@@ -325,17 +326,19 @@ class BSIChatbot:
 
     def retrieval(self, query, rerankingStep, hybridSearch):
         global vectorstore
+        global bm25_retriever
         if hybridSearch == True:
             allDocs = self.retrieveDocFromFaiss()
-            bm25_retriever = BM25Retriever.from_documents(allDocs)
+            if bm25_retriever == None:
+                bm25_retriever = BM25Retriever.from_documents(allDocs)
             #TODO!
             retriever_k=15
             bm25_retriever.k= retriever_k
             vectordb = vectorstore.as_retriever(search_kwargs={"k":retriever_k})
             ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, vectordb], weights=[0.5, 0.5])
             retrieved_chunks = ensemble_retriever.get_relevant_documents(query)
-
-
+            #newprint("DBG: Number of Chunks retrieved")
+            #newprint(len(retrieved_chunks))
         else:
             retrieved_chunks = self.retrieveSimiliarEmbedding(query)
         retrieved_chunks_text = []
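This is the substantive change of the commit: instead of rebuilding BM25 over every FAISS document on each query, the retriever is constructed once and cached in the module-level `bm25_retriever`, since `BM25Retriever.from_documents` re-tokenizes the whole corpus. A sketch of the resulting hybrid retrieval path; `k=15` and the 50/50 weights mirror the diff, while the corpus argument is assumed to come from `retrieveDocFromFaiss`:

```python
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

bm25_retriever = None  # module-level cache, built once per process

def hybrid_retrieve(query, vectorstore, all_docs, k=15):
    global bm25_retriever
    if bm25_retriever is None:
        # Expensive step: tokenizes the entire corpus, hence the cache.
        bm25_retriever = BM25Retriever.from_documents(all_docs)
    bm25_retriever.k = k
    vectordb = vectorstore.as_retriever(search_kwargs={"k": k})
    # Blend lexical (BM25) and dense (FAISS) rankings with equal weight.
    ensemble = EnsembleRetriever(retrievers=[bm25_retriever, vectordb],
                                 weights=[0.5, 0.5])
    return ensemble.get_relevant_documents(query)
```

Note that `is None` is the idiomatic test; the diff's `== None` works, but PEP 8 recommends identity comparison for singletons.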
@@ -442,6 +445,7 @@ class BSIChatbot:
 
         # Everything except the last user request; normal query
         query, context = self.retrieval(query, rerankingStep, True)
+
         if stepBackPrompt == True:
             stepBackQuery = self.stepBackPrompt(query)
             print("DBG stepBackQuery:" + stepBackQuery)
@@ -474,7 +478,7 @@ class BSIChatbot:
         # question=query, context=context, history=history[:-1]
         # )
 
-
+        #newprint(f"Query:\n{final_prompt}")
         pattern = r"Filename:(.*?);"
         last_value = final_prompt[-1]["content"]
 
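For context on the hunk above: the non-greedy pattern pulls each value between a `Filename:` tag and the following semicolon out of the prompt's last message, for example (the sample string is hypothetical):

```python
import re

pattern = r"Filename:(.*?);"
last_value = "Filename:BSI_200-1.md; Filename:BSI_200-2.md;"  # hypothetical content
print(re.findall(pattern, last_value))
# -> ['BSI_200-1.md', 'BSI_200-2.md']
```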