# Builds and caches FAISS vector stores (text + image) from local wiki HTML files.
from typing import List, Optional
from uuid import uuid4
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from langchain.docstore.document import Document as LangchainDocument
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from z_document_reader import read_wiki_html

EMBEDDING_MODEL_NAME = "thenlper/gte-small"
def get_embedding_model():
    """Build the CPU HuggingFace embedding model used for indexing and querying.

    Embeddings are L2-normalized so that FAISS cosine-distance retrieval
    behaves correctly.
    """
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=False,
        model_kwargs={"device": "cpu"},
        # `normalize_embeddings=True` is required for cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """Split documents into token-bounded chunks and drop duplicate chunks.

    Args:
        chunk_size: Maximum chunk length, measured in tokens of `tokenizer_name`.
        knowledge_base: Documents to split.
        tokenizer_name: HuggingFace tokenizer used to count tokens.

    Returns:
        De-duplicated chunks (first occurrence wins); each chunk carries a
        `start_index` in its metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        # 10% overlap preserves context across chunk boundaries.
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
    )
    # The splitter accepts the whole list; no need to split one doc at a time.
    docs_processed = text_splitter.split_documents(knowledge_base)

    # Remove exact-text duplicates while preserving first-occurrence order.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)
    return docs_processed_unique
def construct_vector_db(docs_processed, emb_model):
    """Index the given document chunks in a FAISS store using cosine distance."""
    return FAISS.from_documents(
        docs_processed,
        emb_model,
        distance_strategy=DistanceStrategy.COSINE,
    )
def get_data_files(location: str = "_data/") -> List[str]:
    """Return paths of the HTML files directly under `location`.

    Args:
        location: Directory prefix; expected to end with a path separator
            (the glob pattern is built by plain string concatenation).

    Returns:
        Sorted list of `*.html` and `*.htm` file paths. Sorting makes the
        downstream indexing order deterministic — raw `glob` order is
        filesystem-dependent.
    """
    from glob import glob

    files = glob(location + "*.html") + glob(location + "*.htm")
    return sorted(files)
def generate_and_save_vector_store(vector_store_location: str = "cache_vector_store"):
    """One-time job: read the wiki HTML files, chunk them, and persist two
    FAISS stores — `<location>_text` and `<location>_images`.
    """
    text_docs, image_docs = [], []
    for path in get_data_files():
        page_text, page_images = read_wiki_html(path)
        text_docs.extend(page_text)
        image_docs.extend(page_images)

    # 512-token chunks — a chunk size adapted to the embedding model.
    chunked_text = split_documents(512, text_docs, tokenizer_name=EMBEDDING_MODEL_NAME)
    chunked_images = split_documents(512, image_docs, tokenizer_name=EMBEDDING_MODEL_NAME)

    embedder = get_embedding_model()
    construct_vector_db(chunked_text, embedder).save_local(vector_store_location + "_text")
    construct_vector_db(chunked_images, embedder).save_local(vector_store_location + "_images")
def load_vector_store(vector_store_location: str = "cache_vector_store"):
    """Return two FAISS vector stores: one for text chunks, one for image docs."""
    embedder = get_embedding_model()

    def _load(suffix: str):
        # Cache files are produced locally by this module, so
        # deserialization is trusted.
        return FAISS.load_local(
            vector_store_location + suffix,
            embedder,
            allow_dangerous_deserialization=True,
        )

    return _load("_text"), _load("_images")
if __name__ == "__main__":
    # Build the caches, then load them back as a smoke test.
    generate_and_save_vector_store()
    load_vector_store()