Update function/chatbot.py

function/chatbot.py (+19 -19)
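All hard-coded filesystem paths in function/chatbot.py move from /code/... to /code/temp/...: the per-id document folders and the Chroma and FAISS vector-database persist directories.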
@@ -116,7 +116,7 @@ def text_preprocessing(text):
     text = ' '.join(words)
     return text
 def find_matching_files_in_docs_12_id(text, id):
-    folder_path = f"/code/{id}"
+    folder_path = f"/code/temp/{id}"
     search_terms = []
     search_terms_old = []
     matching_index = []
@@ -169,7 +169,7 @@ def save_list_CSV_id(file_list, id):
     text = ""
     for x in file_list:
         if x.endswith('.xlsx'):
-            old = f"/code/{id}/{x}"
+            old = f"/code/temp/{id}/{x}"
             new = old.replace(".xlsx", ".csv")
             convert_xlsx_to_csv(old, new)
             x = x.replace(".xlsx", ".csv")
@@ -279,7 +279,7 @@ def question_answer(question):
     return answer

 def check_persist_directory(id, file_name):
-    directory_path = f"/code/vector_database/{id}/{file_name}"
+    directory_path = f"/code/temp/vector_database/{id}/{file_name}"
     return os.path.exists(directory_path)

 from langchain_community.vectorstores import FAISS
@@ -303,11 +303,11 @@ def aws1_all_id(new_dict, text_alls, id, thread_id):
     texts_data = text_splitter.split_text(data)

     if check_persist_directory(id, file_name):
-        vectordb_query = Chroma(persist_directory=f"/code/vector_database/{id}/{file_name}", embedding_function=embeddings)
+        vectordb_query = Chroma(persist_directory=f"/code/temp/vector_database/{id}/{file_name}", embedding_function=embeddings)
     else:
         vectordb_query = Chroma.from_texts(texts_data,
                                            embedding=embeddings,
-                                           persist_directory=f"/code/vector_database/{id}/{file_name}")
+                                           persist_directory=f"/code/temp/vector_database/{id}/{file_name}")

     k_1 = len(texts_data)
     retriever = vectordb_query.as_retriever(search_kwargs={f"k": k_1})
@@ -317,12 +317,12 @@ def aws1_all_id(new_dict, text_alls, id, thread_id):
                                            weights=[0.6, 0.4])
     docs = ensemble_retriever.get_relevant_documents(f"{query}")

-    path = f"/code/vector_database/FAISS/{id}/{file_name}"
+    path = f"/code/temp/vector_database/FAISS/{id}/{file_name}"
     if check_path_exists(path):
         docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
     else:
         docsearch = FAISS.from_documents(docs, embeddings)
-        docsearch.save_local(f"/code/vector_database/FAISS/{id}/{file_name}")
+        docsearch.save_local(f"/code/temp/vector_database/FAISS/{id}/{file_name}")
         docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)

     k_2 = len(docs)
@@ -443,7 +443,7 @@ def load_file(loader):

 def extract_data2(id):
     documents = []
-    directory_path = f"/code/{id}"
+    directory_path = f"/code/temp/{id}"
     if not os.path.exists(directory_path) or not any(
             os.path.isfile(os.path.join(directory_path, f)) for f in os.listdir(directory_path)):
         return False
@@ -490,11 +490,11 @@ def extract_data2(id):
     texts = text_splitter.split_documents(documents)
     Chroma.from_documents(documents=texts,
                           embedding=embeddings,
-                          persist_directory=f"/code/vector_database/{id}")
+                          persist_directory=f"/code/temp/vector_database/{id}")
     return texts

 def generate_question(id):
-    directory_path = f"/code/{id}"
+    directory_path = f"/code/temp/{id}"
     if not os.path.exists(directory_path) or not any(
             os.path.isfile(os.path.join(directory_path, f)) for f in os.listdir(directory_path)):
         return False
@@ -593,19 +593,19 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
     print(relevance_score_float)
     if relevance_score_float <= 0.12:
         documents1 = []
-        for file in os.listdir(f"/code/{id}"):
+        for file in os.listdir(f"/code/temp/{id}"):
             if file.endswith('.csv'):
-                csv_path = f"/code/{id}/" + file
+                csv_path = f"/code/temp/{id}/" + file
                 loader = UnstructuredCSVLoader(csv_path)
                 documents1.extend(loader.load())
             elif file.endswith('.xlsx'):
-                excel_path = f"/code/{id}/" + file
+                excel_path = f"/code/temp/{id}/" + file
                 loader = UnstructuredExcelLoader(excel_path)
                 documents1.extend(loader.load())
         text_splitter_csv = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2200, chunk_overlap=1500)
         texts_csv = text_splitter_csv.split_documents(documents1)
         vectordb_csv = Chroma.from_documents(documents=texts_csv,
-                                             embedding=embeddings, persist_directory=f'/code/vector_database/csv/{thread_id}')
+                                             embedding=embeddings, persist_directory=f'/code/temp/vector_database/csv/{thread_id}')
         k = len(texts_csv)
         retriever_csv = vectordb_csv.as_retriever(search_kwargs={"k": k})
         llm = Cohere(temperature=0)
@@ -660,12 +660,12 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
         k_1 = len(texts)
         file_name = os.path.basename(file_path)
         if check_persist_directory(id, file_name):
-            vectordb_file = Chroma(persist_directory=f"/code/vector_database/{id}/{file_name}",
+            vectordb_file = Chroma(persist_directory=f"/code/temp/vector_database/{id}/{file_name}",
                                    embedding_function=embeddings)
         else:
             vectordb_file = Chroma.from_documents(texts,
                                                   embedding=embeddings,
-                                                  persist_directory=f"/code/vector_database/{id}/{file_name}")
+                                                  persist_directory=f"/code/temp/vector_database/{id}/{file_name}")
         retriever_file = vectordb_file.as_retriever(search_kwargs={f"k": k_1})
         bm25_retriever = BM25Retriever.from_documents(texts)
         bm25_retriever.k = k_1
@@ -673,12 +673,12 @@ def handle_query(question, text_all, compression_retriever, id, thread_id):
                                                weights=[0.6, 0.4])
         docs = ensemble_retriever.get_relevant_documents(f"{query}")

-        path = f"/code/vector_database/FAISS/{id}/{file_name}"
+        path = f"/code/temp/vector_database/FAISS/{id}/{file_name}"
         if check_path_exists(path):
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
         else:
             docsearch = FAISS.from_documents(docs, embeddings)
-            docsearch.save_local(f"/code/vector_database/FAISS/{id}/{file_name}")
+            docsearch.save_local(f"/code/temp/vector_database/FAISS/{id}/{file_name}")
             docsearch = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
         k_2 = len(docs)
         retrieve3 = docsearch.as_retriever(search_kwargs={f"k": k_2})
@@ -701,7 +701,7 @@ def handle_query_upgrade_keyword_old(query_all, text_all, id,chat_history):
     test = query_analyzer(query_all)
     test_string = str(test)
     matches = re.findall(r"'([^']*)'", test_string)
-    vectordb = Chroma(persist_directory=f"/code/vector_database/{id}", embedding_function=embeddings)
+    vectordb = Chroma(persist_directory=f"/code/temp/vector_database/{id}", embedding_function=embeddings)
     k = len(text_all)
     retriever = vectordb.as_retriever(search_kwargs={"k": k})
     compressor = CohereRerank(top_n=5, model="rerank-english-v2.0")
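The Chroma hunks above all apply one load-or-build caching pattern: if the persist directory for (id, file_name) already exists, reopen the persisted store; otherwise embed the texts and persist them under the new /code/temp root. A minimal sketch of that pattern, assuming the module's embeddings object; the helper name load_or_build_chroma is hypothetical:

import os

from langchain_community.vectorstores import Chroma

VECTOR_ROOT = "/code/temp/vector_database"  # root directory this commit switches to

def load_or_build_chroma(texts, id, file_name, embeddings):
    # Hypothetical helper mirroring the check_persist_directory pattern in the diff.
    persist_directory = os.path.join(VECTOR_ROOT, str(id), file_name)
    if os.path.exists(persist_directory):
        # Directory was persisted by an earlier call: reopen it instead of re-embedding.
        return Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    # First call for this file: embed the texts and persist them for subsequent calls.
    return Chroma.from_texts(texts, embedding=embeddings, persist_directory=persist_directory)

The FAISS hunks (@@ -317 and @@ -673) follow the same idea with FAISS.from_documents / save_local / load_local; note they reload the index from disk immediately after saving it, so both branches return an index deserialized from the persisted path.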