# rag/z_embedding.py
from glob import glob
from typing import List, Optional

from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoTokenizer

from z_document_reader import read_wiki_html

EMBEDDING_MODEL_NAME = "thenlper/gte-small"


def get_embedding_model():
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=False,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},  # set to True for cosine similarity
    )
    return embedding_model
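
# A minimal usage sketch (illustrative, not executed on import): the returned
# embedder follows LangChain's Embeddings interface, e.g.
#   emb = get_embedding_model()
#   query_vec = emb.embed_query("example query")                   # one normalized vector
#   doc_vecs = emb.embed_documents(["passage one", "passage two"])  # one vector per passage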


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the list of
    chunks, with exact duplicates removed.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # 10% overlap between consecutive chunks
        add_start_index=True,
        strip_whitespace=True,
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicate chunks (identical page content)
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique


def construct_vector_db(docs_processed, emb_model):
    """Build a FAISS index over the processed chunks using cosine distance."""
    vdb = FAISS.from_documents(
        docs_processed, emb_model, distance_strategy=DistanceStrategy.COSINE
    )
    return vdb


def get_data_files(location: str = "_data/") -> list:
    """
    Return the HTML file paths found under `location`.
    """
    files = glob(location + "*.html")
    files += glob(location + "*.htm")
    return files


def generate_and_save_vector_store(vector_store_location: str = "cache_vector_store"):
    """
    One-time function that builds the text and image vector stores and saves them to disk.
    """
    data_files = get_data_files()
    TEXT_KBs, IMAGE_KBs = list(), list()
    for file in data_files:
        TEXT_KB, IMAGE_KB = read_wiki_html(file)
        TEXT_KBs.extend(TEXT_KB)
        IMAGE_KBs.extend(IMAGE_KB)

    docs_text_processed = split_documents(
        512,  # chunk size chosen to fit the embedding model's context window
        TEXT_KBs,
        tokenizer_name=EMBEDDING_MODEL_NAME,
    )
    docs_imgs_processed = split_documents(
        512,  # same chunking for the image-description documents
        IMAGE_KBs,
        tokenizer_name=EMBEDDING_MODEL_NAME,
    )

    emb_model = get_embedding_model()
    vector_store_text = construct_vector_db(docs_text_processed, emb_model)
    vector_store_images = construct_vector_db(docs_imgs_processed, emb_model)
    vector_store_text.save_local(vector_store_location + "_text")
    vector_store_images.save_local(vector_store_location + "_images")
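
# After a successful run, two FAISS folders are expected on disk (names derived from
# `vector_store_location`); with LangChain's FAISS.save_local each folder typically
# holds an `index.faiss` and an `index.pkl` file:
#   cache_vector_store_text/
#   cache_vector_store_images/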


def load_vector_store(vector_store_location: str = "cache_vector_store"):
    """Return two vector stores: one for text and one for images."""
    emb_model = get_embedding_model()
    vs_text = FAISS.load_local(
        vector_store_location + "_text", emb_model, allow_dangerous_deserialization=True
    )
    vs_image = FAISS.load_local(
        vector_store_location + "_images", emb_model, allow_dangerous_deserialization=True
    )
    return vs_text, vs_image


if __name__ == "__main__":
    generate_and_save_vector_store()
    vs_text, vs_image = load_vector_store()
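    # A minimal retrieval sketch (illustrative; the query string is only a
    # placeholder and k=3 is an arbitrary choice):
    sample_hits = vs_text.similarity_search("example question about the indexed pages", k=3)
    for hit in sample_hits:
        print(hit.metadata.get("source", "?"), "->", hit.page_content[:80])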