import os

from datasets import load_dataset
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.vectorstores import FAISS


def _load_chunked_documents(csv_path):
    """Return the CSV at *csv_path* as a list of chunked documents.

    If the file does not exist yet, the "ShubhamChoksi/IMDB_Movies" dataset
    is fetched and its "train" split is written to *csv_path* first.
    """
    if not os.path.exists(csv_path):
        dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
        # Write to the same path that was just checked so the existence
        # test and the export can never drift apart.
        dataset["train"].to_csv(csv_path)

    documents = CSVLoader(file_path=csv_path).load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100
    )
    return text_splitter.split_documents(documents)


def fetch_retriever_or_load_local_retriever():
    """Return a retriever backed by a FAISS index stored in ``local_index``.

    When the ``local_index`` path already exists, the index is loaded from
    it. Otherwise the index is built from the chunked contents of
    ``./imdb.csv`` (exporting that CSV from the dataset first if needed) and
    saved to ``local_index`` for later calls.

    Embeddings come from ``OpenAIEmbeddings`` wrapped in a
    ``CacheBackedEmbeddings`` whose byte store lives under ``./cache/``.

    Returns:
        The result of ``as_retriever()`` on the loaded or newly built
        FAISS vector store.
    """
    csv_path = "./imdb.csv"
    index_path = "local_index"

    embedding_model = OpenAIEmbeddings()
    store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        embedding_model, store, namespace=embedding_model.model
    )

    if os.path.exists(index_path):
        # NOTE(security): this flag opts in to deserializing the saved index
        # from disk. Only point index_path at a directory this application
        # wrote itself; never load an index obtained from an untrusted source.
        vector_store = FAISS.load_local(
            index_path, cached_embedder, allow_dangerous_deserialization=True
        )
    else:
        # The documents are only needed to build a fresh index, so the
        # dataset download, CSV parsing and chunking are skipped entirely
        # when a saved index is available.
        chunked_documents = _load_chunked_documents(csv_path)
        vector_store = FAISS.from_documents(chunked_documents, cached_embedder)
        vector_store.save_local(index_path)

    return vector_store.as_retriever()