"""Retrieval utilities: FAISS vector store, BM25 index, and hybrid search."""
import config
import utils
from nltk.tokenize import word_tokenize
from typing import List
import nltk
import torch
import pickle
from langchain.docstore.document import Document as LangchainDocument
from rank_bm25 import BM25Okapi
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
import os
def create_vector_db(docs: List[LangchainDocument]):
    """Load the FAISS vector store from disk, or build and persist a new one.

    Args:
        docs: Documents to index. Required only when no store exists yet at
            ``config.DB_PATH``; may be ``None`` when a cached store is loaded.

    Returns:
        The loaded or freshly built FAISS vector store.

    Raises:
        ValueError: if no store exists on disk and ``docs`` is empty or None.
    """
    db_path: str = config.DB_PATH
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The embedding model is needed in both paths: to embed queries against a
    # loaded store, and to embed documents when building a new one.
    embedding_model = HuggingFaceEmbeddings(
        model_name=config.EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": device},
        # Normalized embeddings are required for cosine distance to be meaningful.
        encode_kwargs={"normalize_embeddings": True},
    )
    if os.path.exists(db_path):
        print(f"Завантаження векторної бази даних з {db_path}")
        # NOTE(review): allow_dangerous_deserialization=True unpickles data from
        # db_path — acceptable only because the index is produced locally by this
        # same function; never point db_path at untrusted files.
        return FAISS.load_local(
            db_path,
            embedding_model,
            allow_dangerous_deserialization=True,
        )
    # Truthiness check (not `is not None`) so an empty document list raises the
    # helpful ValueError below instead of crashing inside FAISS.from_documents.
    if docs:
        print("Створення нової векторної бази даних")
        knowledge_vector_database = FAISS.from_documents(
            docs, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
        knowledge_vector_database.save_local(db_path)
        print(f"Векторна база даних збережена в {db_path}")
        return knowledge_vector_database
    raise ValueError(
        """Documents are missing!
        Please load the documents and set get_data=True in app.py."""
    )
def create_bm25(docs: List[LangchainDocument]):
    """Load a cached BM25 index from disk, or build one from *docs* and cache it.

    Args:
        docs: Documents to index. Required only when no pickled index exists
            yet at ``config.BM25_PATH``; may be ``None`` when loading.

    Returns:
        A ``BM25Okapi`` index over the lower-cased, word-tokenized documents.

    Raises:
        ValueError: if no index exists on disk and ``docs`` is empty or None.
    """
    bm25_path: str = config.BM25_PATH
    if os.path.exists(bm25_path):
        print(f"Завантаження BM25 індексу з {bm25_path}")
        # NOTE(review): pickle.load executes arbitrary code from the file —
        # safe only because bm25_path is written exclusively by this function.
        with open(bm25_path, "rb") as file:
            return pickle.load(file)
    # Truthiness check (not `is not None`) so an empty document list raises the
    # helpful ValueError below instead of silently building a useless index.
    if docs:
        print("Створення нового BM25 індексу")
        # BM25 operates on token lists; lower-case for case-insensitive matching.
        tokenized_docs = [word_tokenize(doc.page_content.lower()) for doc in docs]
        bm25 = BM25Okapi(tokenized_docs)
        with open(bm25_path, "wb") as file:
            pickle.dump(bm25, file)
        print(f"BM25 індекс збережено в {bm25_path}")
        return bm25
    raise ValueError(
        """Documents are missing!
        Please load the documents and set get_data=True in app.py."""
    )
def search(docs_processed, bm_25: BM25Okapi, vector_db: FAISS, query, top_k, use_bm25=True, use_semantic_search=True):
    """Retrieve documents for *query* via BM25, semantic search, or both.

    Args:
        docs_processed: Corpus of documents (with ``page_content``) to search.
        bm_25: Prebuilt BM25 index; used only in the BM25-only branch.
        vector_db: FAISS store for dense retrieval.
        query: Free-text search query.
        top_k: Number of results each retriever should return.
        use_bm25: Enable lexical (BM25) retrieval.
        use_semantic_search: Enable dense (FAISS) retrieval.

    Returns:
        Hybrid mode: documents from an equal-weight ensemble of both
        retrievers. BM25-only: a list of raw ``page_content`` strings.
        Semantic-only: FAISS similarity-search documents. Neither: ``[]``.
    """
    if use_bm25 and use_semantic_search:
        # Hybrid: combine a lexical and a dense retriever with equal weights.
        lexical = BM25Retriever.from_documents(docs_processed)
        lexical.k = top_k
        dense = vector_db.as_retriever(search_kwargs={"k": top_k})
        hybrid = EnsembleRetriever(
            retrievers=[lexical, dense],
            weights=[0.5, 0.5]
        )
        return hybrid.invoke(query)
    if use_bm25:
        # Lexical only: rank raw page contents with the prebuilt index.
        query_tokens = word_tokenize(query.lower())
        corpus = [doc.page_content for doc in docs_processed]
        return bm_25.get_top_n(query_tokens, corpus, n=top_k)
    if use_semantic_search:
        # Dense only: straight FAISS similarity search.
        return vector_db.similarity_search(query, k=top_k)
    return []