import os

from datasets import load_dataset
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.vectorstores import FAISS

def fetch_retriever_or_load_local_retriever():
    # Download the IMDB dataset from the Hugging Face Hub once and cache it as a CSV.
    csv_path = "./imdb.csv"
    if not os.path.exists(csv_path):
        dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
        dataset["train"].to_csv(csv_path)

    # Load each CSV row as a document and split into overlapping chunks.
    loader = CSVLoader(file_path=csv_path)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunked_documents = text_splitter.split_documents(data)

    # Cache OpenAI embeddings on disk so repeated runs don't re-embed identical text.
    embedding_model = OpenAIEmbeddings()
    store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        embedding_model, store, namespace=embedding_model.model
    )

    # Reuse a previously saved FAISS index if one exists; otherwise build and persist it.
    index_path = "local_index"
    if os.path.exists(index_path):
        vector_store = FAISS.load_local(
            index_path, cached_embedder, allow_dangerous_deserialization=True
        )
    else:
        vector_store = FAISS.from_documents(chunked_documents, cached_embedder)
        vector_store.save_local(index_path)

    return vector_store.as_retriever()
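

# --- Minimal usage sketch (an illustrative addition, not part of the original function) ---
# Builds or loads the retriever, then fetches the chunks most similar to a query.
# Assumes a recent LangChain release where retrievers are Runnables exposing .invoke(),
# and that OPENAI_API_KEY is set in the environment so OpenAIEmbeddings can authenticate.
if __name__ == "__main__":
    retriever = fetch_retriever_or_load_local_retriever()
    # The query string is a hypothetical example.
    docs = retriever.invoke("feel-good movies about friendship")
    for doc in docs:
        # Each result is a Document holding one chunk of a CSV row.
        print(doc.page_content[:200])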