MikeMann committed on
Commit
bca051f
·
1 Parent(s): 5527c4e

added EvalDataset Generation

Browse files
Files changed (1) hide show
  1. app.py +15 -11
app.py CHANGED
@@ -56,6 +56,7 @@ from huggingface_hub import login
56
  login(token=HF_KEY)
57
  vectorstore=None
58
  rerankingModel=None
 
59
 
60
  class BSIChatbot:
61
  embedding_model = None
@@ -158,14 +159,14 @@ class BSIChatbot:
158
  ##Was macht man mit start Index herausfinden und wie metadata adden
159
  docs_processed = []
160
  for doc in RAW_KNOWLEDGE_BASE:
161
- print(f"Word-Length in doc:{len(doc.page_content.split())}")
162
  doc_cache = markdown_splitter.split_text(doc.page_content)
163
  # print(f"Word-Length in doc_cache after MarkdownSplitter:{len(doc_cache.split())}")
164
  doc_cache = text_splitter.split_documents(doc_cache)
165
  # print(f"Word-Length in doc_cache after text_splitter:{len(doc_cache.split())}")
166
  for chunk in doc_cache:
167
  chunk.metadata.update({"source": doc.metadata['source']})
168
- print(f"Chunk_Debug len: {len(chunk.page_content.split())} and Chunk:{chunk}")
169
  # DEBUG:
170
  # print(f"doc_cache after Metadata added:{doc_cache}\n")
171
  docs_processed += doc_cache
@@ -197,7 +198,7 @@ class BSIChatbot:
197
  #for doc in RAW_KNOWLEDGE_BASE:
198
  # docs_processed += text_splitter.split_documents([doc])
199
 
200
- print(f"Docs processed:{len(docs_processed)}")
201
  # Max_Sequence_Length of e5 large instr = 512 Tokens
202
 
203
 
@@ -230,14 +231,14 @@ class BSIChatbot:
230
  vectorstore.save_local(self.embedPath)
231
  #self.vectorstore.index = index_gpu
232
  end = time.time()
233
- print("Saving Embeddings took", end-start, "seconds!")
234
  else:
235
  start = time.time()
236
  vectorstore = FAISS.load_local(self.embedPath, self.embedding_model, allow_dangerous_deserialization=True)
237
  #self.vectorstore.index = index_gpu
238
  end = time.time()
239
 
240
- print("Loading Embeddings took", end - start, "seconds!")
241
 
242
  def retrieveSimiliarEmbedding(self, query):
243
  global vectorstore
@@ -254,7 +255,7 @@ class BSIChatbot:
254
  # finalchunks.append(chunk)
255
  #retrieved_chunks = finalchunks
256
  end = time.time()
257
- print("Retrieving Chunks with similiar embeddings took", end - start, "seconds!")
258
  #print("\n==================================Top document==================================")
259
  #print(retrieved_chunks[0].page_content)
260
  #print(retrieved_chunks[1].page_content)
@@ -263,7 +264,7 @@ class BSIChatbot:
263
  #print(retrieved_chunks[0].metadata)
264
  #print(retrieved_chunks[1].metadata)
265
  #print(retrieved_chunks[2].metadata)
266
- print(f"printing first chunk to see whats inside: {retrieved_chunks[0]}")
267
  return retrieved_chunks
268
 
269
  def retrieveDocFromFaiss(self):
@@ -325,17 +326,19 @@ class BSIChatbot:
325
 
326
  def retrieval(self, query, rerankingStep, hybridSearch):
327
  global vectorstore
 
328
  if hybridSearch == True:
329
  allDocs = self.retrieveDocFromFaiss()
330
- bm25_retriever = BM25Retriever.from_documents(allDocs)
 
331
  #TODO!
332
  retriever_k=15
333
  bm25_retriever.k= retriever_k
334
  vectordb = vectorstore.as_retriever(search_kwargs={"k":retriever_k})
335
  ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, vectordb], weights=[0.5, 0.5])
336
  retrieved_chunks = ensemble_retriever.get_relevant_documents(query)
337
- print("DBG: Number of Chunks retrieved")
338
- print(len(retrieved_chunks))
339
  else:
340
  retrieved_chunks = self.retrieveSimiliarEmbedding(query)
341
  retrieved_chunks_text = []
@@ -442,6 +445,7 @@ class BSIChatbot:
442
 
443
  # Alles außer letzte Useranfrage, Normaler Query
444
  query, context = self.retrieval(query, rerankingStep, True)
 
445
  if stepBackPrompt == True:
446
  stepBackQuery = self.stepBackPrompt(query)
447
  print("DBG stepBackQuery:" + stepBackQuery)
@@ -474,7 +478,7 @@ class BSIChatbot:
474
  # question=query, context=context, history=history[:-1]
475
  # )
476
 
477
- print(f"Query:\n{final_prompt}")
478
  pattern = r"Filename:(.*?);"
479
  last_value = final_prompt[-1]["content"]
480
 
 
56
  login(token=HF_KEY)
57
  vectorstore=None
58
  rerankingModel=None
59
+ bm25_retriever=None
60
 
61
  class BSIChatbot:
62
  embedding_model = None
 
159
  ##Was macht man mit start Index herausfinden und wie metadata adden
160
  docs_processed = []
161
  for doc in RAW_KNOWLEDGE_BASE:
162
+ #newprint(f"Word-Length in doc:{len(doc.page_content.split())}")
163
  doc_cache = markdown_splitter.split_text(doc.page_content)
164
  # print(f"Word-Length in doc_cache after MarkdownSplitter:{len(doc_cache.split())}")
165
  doc_cache = text_splitter.split_documents(doc_cache)
166
  # print(f"Word-Length in doc_cache after text_splitter:{len(doc_cache.split())}")
167
  for chunk in doc_cache:
168
  chunk.metadata.update({"source": doc.metadata['source']})
169
+ #newprint(f"Chunk_Debug len: {len(chunk.page_content.split())} and Chunk:{chunk}")
170
  # DEBUG:
171
  # print(f"doc_cache after Metadata added:{doc_cache}\n")
172
  docs_processed += doc_cache
 
198
  #for doc in RAW_KNOWLEDGE_BASE:
199
  # docs_processed += text_splitter.split_documents([doc])
200
 
201
+ #newprint(f"Docs processed:{len(docs_processed)}")
202
  # Max_Sequence_Length of e5 large instr = 512 Tokens
203
 
204
 
 
231
  vectorstore.save_local(self.embedPath)
232
  #self.vectorstore.index = index_gpu
233
  end = time.time()
234
+ #newprint("Saving Embeddings took", end-start, "seconds!")
235
  else:
236
  start = time.time()
237
  vectorstore = FAISS.load_local(self.embedPath, self.embedding_model, allow_dangerous_deserialization=True)
238
  #self.vectorstore.index = index_gpu
239
  end = time.time()
240
 
241
+ #newprint("Loading Embeddings took", end - start, "seconds!")
242
 
243
  def retrieveSimiliarEmbedding(self, query):
244
  global vectorstore
 
255
  # finalchunks.append(chunk)
256
  #retrieved_chunks = finalchunks
257
  end = time.time()
258
+ #newrint("Retrieving Chunks with similiar embeddings took", end - start, "seconds!")
259
  #print("\n==================================Top document==================================")
260
  #print(retrieved_chunks[0].page_content)
261
  #print(retrieved_chunks[1].page_content)
 
264
  #print(retrieved_chunks[0].metadata)
265
  #print(retrieved_chunks[1].metadata)
266
  #print(retrieved_chunks[2].metadata)
267
+ #newprint(f"printing first chunk to see whats inside: {retrieved_chunks[0]}")
268
  return retrieved_chunks
269
 
270
  def retrieveDocFromFaiss(self):
 
326
 
327
  def retrieval(self, query, rerankingStep, hybridSearch):
328
  global vectorstore
329
+ global bm25_retriever
330
  if hybridSearch == True:
331
  allDocs = self.retrieveDocFromFaiss()
332
+ if bm25_retriever == None:
333
+ bm25_retriever = BM25Retriever.from_documents(allDocs)
334
  #TODO!
335
  retriever_k=15
336
  bm25_retriever.k= retriever_k
337
  vectordb = vectorstore.as_retriever(search_kwargs={"k":retriever_k})
338
  ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, vectordb], weights=[0.5, 0.5])
339
  retrieved_chunks = ensemble_retriever.get_relevant_documents(query)
340
+ #newprint("DBG: Number of Chunks retrieved")
341
+ #newprint(len(retrieved_chunks))
342
  else:
343
  retrieved_chunks = self.retrieveSimiliarEmbedding(query)
344
  retrieved_chunks_text = []
 
445
 
446
  # Alles außer letzte Useranfrage, Normaler Query
447
  query, context = self.retrieval(query, rerankingStep, True)
448
+
449
  if stepBackPrompt == True:
450
  stepBackQuery = self.stepBackPrompt(query)
451
  print("DBG stepBackQuery:" + stepBackQuery)
 
478
  # question=query, context=context, history=history[:-1]
479
  # )
480
 
481
+ #newprint(f"Query:\n{final_prompt}")
482
  pattern = r"Filename:(.*?);"
483
  last_value = final_prompt[-1]["content"]
484