Spaces:
Sleeping
Sleeping
File size: 1,396 Bytes
2d71cfe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import os
from datasets import load_dataset
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.vectorstores import FAISS
def fetch_retriever_or_load_local_retriever():
    """Return a retriever backed by a FAISS index of the IMDB movies CSV.

    If the directory ``local_index`` exists, the index is loaded from it and
    the source data is not touched. Otherwise the index is built: the CSV at
    ``./imdb.csv`` is downloaded from the ``ShubhamChoksi/IMDB_Movies``
    dataset when missing, loaded, split into chunks (1000 characters with
    100 overlap), embedded, and the resulting index is saved to
    ``local_index`` for later calls.

    Embeddings go through a ``CacheBackedEmbeddings`` wrapper that stores
    its data under ``./cache/``, namespaced by the embedding model name.

    Returns:
        The result of ``vector_store.as_retriever()`` for the loaded or
        freshly built FAISS vector store.
    """
    csv_path = "./imdb.csv"
    index_path = "local_index"

    # The embedder is needed on both paths: FAISS.load_local takes it as an
    # argument, and FAISS.from_documents uses it to embed the chunks.
    embedding_model = OpenAIEmbeddings()
    store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        embedding_model, store, namespace=embedding_model.model
    )

    if os.path.exists(index_path):
        # NOTE(security): allow_dangerous_deserialization=True opts in to
        # deserializing whatever is stored at index_path. Only keep this if
        # the index directory is always one this application wrote itself.
        vector_store = FAISS.load_local(
            index_path, cached_embedder, allow_dangerous_deserialization=True
        )
        return vector_store.as_retriever()

    # No saved index: only now is it worth fetching, parsing and chunking
    # the source data.
    if not os.path.exists(csv_path):
        dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
        # Write to the same path that is checked above and read below.
        dataset["train"].to_csv(csv_path)

    data = CSVLoader(file_path=csv_path).load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunked_documents = text_splitter.split_documents(data)

    vector_store = FAISS.from_documents(chunked_documents, cached_embedder)
    vector_store.save_local(index_path)
    return vector_store.as_retriever()
|