File size: 3,597 Bytes
7f0844d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import pickle
from typing import List, Optional

import nltk
import torch
from langchain.docstore.document import Document as LangchainDocument
from langchain.retrievers import EnsembleRetriever
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi

import config
import utils


def create_vector_db(docs: Optional[List[LangchainDocument]] = None) -> FAISS:
    """Load the FAISS vector database from disk, or build and persist a new one.

    If an index already exists at ``config.DB_PATH`` it is loaded and *docs*
    is ignored. Otherwise a new index is built from *docs* (cosine distance)
    and saved to the same path.

    Args:
        docs: Documents to index when no saved database exists. May be
            ``None`` only when a saved database is present on disk.

    Returns:
        The loaded or newly created FAISS vector store.

    Raises:
        ValueError: If no saved database exists and *docs* is ``None``.
    """
    db_path: str = config.DB_PATH
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    embedding_model = HuggingFaceEmbeddings(
        model_name=config.EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": device},
        # Normalized embeddings so cosine similarity reduces to a dot product.
        encode_kwargs={"normalize_embeddings": True},
    )

    if os.path.exists(db_path):
        print(f"Завантаження векторної бази даних з {db_path}")
        # SECURITY: FAISS.load_local unpickles data — only load index files
        # this application created itself, never untrusted input.
        return FAISS.load_local(
            db_path,
            embedding_model,
            allow_dangerous_deserialization=True,
        )

    if docs is not None:
        print("Створення нової векторної бази даних")
        knowledge_vector_database = FAISS.from_documents(
            docs, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
        knowledge_vector_database.save_local(db_path)
        print(f"Векторна база даних збережена в {db_path}")
        return knowledge_vector_database

    raise ValueError(
        """Documents are missing! 
            Please load the documents and set get_data=True in app.py."""
    )


def create_bm25(docs: Optional[List[LangchainDocument]] = None) -> BM25Okapi:
    """Load the pickled BM25 index from disk, or build and persist a new one.

    If an index already exists at ``config.BM25_PATH`` it is unpickled and
    *docs* is ignored. Otherwise a new index is built from the lowercased,
    word-tokenized documents and pickled to the same path.

    Args:
        docs: Documents to index when no saved index exists. May be ``None``
            only when a saved index is present on disk.

    Returns:
        The loaded or newly created ``BM25Okapi`` index.

    Raises:
        ValueError: If no saved index exists and *docs* is ``None``.
    """
    bm25_path: str = config.BM25_PATH

    if os.path.exists(bm25_path):
        print(f"Завантаження BM25 індексу з {bm25_path}")
        # SECURITY: pickle.load executes arbitrary code from the file — only
        # load index files this application created itself.
        with open(bm25_path, "rb") as file:
            bm25 = pickle.load(file)
        return bm25

    if docs is not None:
        print("Створення нового BM25 індексу")
        # BM25 operates on token lists; lowercase to make matching
        # case-insensitive, consistent with query handling in search().
        tokenized_docs = [word_tokenize(doc.page_content.lower()) for doc in docs]
        bm25 = BM25Okapi(tokenized_docs)
        with open(bm25_path, "wb") as file:
            pickle.dump(bm25, file)
        print(f"BM25 індекс збережено в {bm25_path}")
        return bm25

    raise ValueError(
        """Documents are missing! 
            Please load the documents and set get_data=True in app.py."""
    )


def search(docs_processed, bm_25: BM25Okapi, vector_db: FAISS, query, top_k, use_bm25=True, use_semantic_search=True):
    """Retrieve documents for *query* via BM25, semantic search, or both.

    Args:
        docs_processed: The corpus as LangchainDocument objects.
        bm_25: Pre-built BM25 index (used only on the BM25-only path).
        vector_db: FAISS vector store for semantic search.
        query: The search query string.
        top_k: Number of results to retrieve per retriever.
        use_bm25: Enable lexical (BM25) retrieval.
        use_semantic_search: Enable semantic (FAISS) retrieval.

    Returns:
        Hybrid and semantic paths return Document objects; the BM25-only
        path returns raw ``page_content`` strings. Both flags off returns
        an empty list. Callers must handle this per-branch difference.
    """
    if use_bm25 and use_semantic_search:
        # Hybrid retrieval: equally weighted fusion of lexical and semantic
        # results. NOTE(review): this path builds a fresh BM25Retriever from
        # docs_processed and does not use the pre-built bm_25 index — confirm
        # this duplication is intended.
        bm25_retriever = BM25Retriever.from_documents(docs_processed)
        bm25_retriever.k = top_k
        faiss_retriever = vector_db.as_retriever(search_kwargs={"k": top_k})
        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, faiss_retriever],
            weights=[0.5, 0.5],
        )
        return ensemble_retriever.invoke(query)

    if use_bm25:
        # Lowercase the query to match the case-insensitive tokenization
        # used when the BM25 index was built in create_bm25().
        tokenized_query = word_tokenize(query.lower())
        # Returns raw page_content strings, unlike the other branches.
        return bm_25.get_top_n(
            tokenized_query,
            [doc.page_content for doc in docs_processed],
            n=top_k,
        )

    if use_semantic_search:
        return vector_db.similarity_search(query, k=top_k)

    return []