File size: 1,396 Bytes
2d71cfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
from datasets import load_dataset
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.vectorstores import FAISS


def _load_chunked_documents(csv_path):
    """Return the IMDB CSV at *csv_path* split into chunks for embedding.

    If the CSV file is missing, the "ShubhamChoksi/IMDB_Movies" dataset is
    fetched with ``load_dataset`` and its "train" split is written to
    *csv_path* first.
    """
    if not os.path.exists(csv_path):
        dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
        # Write to the same path that was just checked and is read below,
        # so the existence test and the export can never drift apart.
        dataset["train"].to_csv(csv_path)

    documents = CSVLoader(file_path=csv_path).load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return text_splitter.split_documents(documents)


def fetch_retriever_or_load_local_retriever():
    """Return a retriever over the IMDB movies data, reusing a saved index.

    When the ``local_index`` path exists, the FAISS vector store is loaded
    from it. Otherwise the CSV is prepared (downloaded if absent), chunked,
    embedded and indexed, and the new index is saved to ``local_index`` for
    later calls.

    Embeddings come from ``OpenAIEmbeddings`` wrapped in a
    ``CacheBackedEmbeddings`` backed by a ``LocalFileStore`` at ``./cache/``,
    namespaced by the embedding model name.

    Returns:
        The result of ``vector_store.as_retriever()``.
    """
    csv_path = "./imdb.csv"
    index_path = "local_index"

    embedding_model = OpenAIEmbeddings()
    store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        embedding_model, store, namespace=embedding_model.model
    )

    if os.path.exists(index_path):
        # NOTE(security): allow_dangerous_deserialization=True permits
        # pickle-based loading. This is only acceptable because the index is
        # expected to be one this function saved itself; never point
        # index_path at files from an untrusted source.
        vector_store = FAISS.load_local(
            index_path, cached_embedder, allow_dangerous_deserialization=True
        )
    else:
        # The corpus is only needed to build a new index, so the download,
        # CSV parsing and splitting are skipped when a saved index exists.
        chunked_documents = _load_chunked_documents(csv_path)
        vector_store = FAISS.from_documents(chunked_documents, cached_embedder)
        vector_store.save_local(index_path)

    return vector_store.as_retriever()