# imdb_bot / build_db.py
# Matt Carroll
# IMDB chat bot — works locally with 'chainlit run app.py'
# fc67742
from datasets import load_dataset
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from pathlib import Path
from functools import reduce
import os
def build_db(openai_api_key):
    """Build the IMDB FAISS vector store end-to-end and sanity-check it.

    Loads the dataset, creates a cache-backed embedder, builds (or reuses)
    the on-disk vector store, then runs a test similarity query against it.

    Args:
        openai_api_key: OpenAI API key forwarded to the embedding model.
    """
    documents = load_imdb_data()
    embedder, embedding_model = create_embedder(openai_api_key)
    build_vectore_store(documents, embedder)
    store = load_vector_store(embedder)
    run_test_query(embedding_model, store)
def load_imdb_data():
    """Download the IMDB movies dataset, export it to CSV, and load it
    back as LangChain documents.

    Returns:
        list: LangChain ``Document`` objects, one per CSV row.
    """
    print("Loading IMDB dataset")
    dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
    # Make sure the output directory exists before writing the CSV
    # (a fresh checkout has no data/ directory and to_csv would fail).
    os.makedirs("data", exist_ok=True)
    dataset["train"].to_csv('data/imdb.csv')
    print("")
    print("Creating dataframe")
    movies_dataframe = pd.read_csv('data/imdb.csv')
    print(movies_dataframe.head())
    print("")
    print("Loading data from CSV")
    loader = CSVLoader(file_path='data/imdb.csv')
    data = loader.load()
    print("Done loading data...")
    # Sanity checks: confirm the CSV actually loaded into LangChain Documents.
    print("Length: " + str(len(data)))
    print("Data list type: " + str(type(data)))
    print("Data type: " + str(type(data[0])))
    print(data[0])
    print("")
    print("Calculating total length of data")
    # sum() over a generator replaces the previous reduce + lambda idiom.
    total_length = sum(len(doc.page_content) for doc in data)
    print("Total number of characters in dataset: " + str(total_length))
    print("Total divided by 1,000: " + str(total_length / 1000))
    print("")
    return data
def create_embedder(openai_api_key):
    """Create a cache-backed OpenAI embedder.

    Args:
        openai_api_key: API key passed through to ``OpenAIEmbeddings``.

    Returns:
        tuple: ``(embedder, embedding_model)`` where ``embedder`` caches
        computed embeddings in a local file store, and ``embedding_model``
        is the underlying OpenAI model (used directly for query embedding).
    """
    embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
    # LocalFileStore creates the backing directory on demand, so there is
    # no need to pre-create the path ourselves.
    store_path = str(Path.cwd() / 'data/embedding-store')
    store = LocalFileStore(store_path)
    embedder = CacheBackedEmbeddings.from_bytes_store(
        embedding_model,
        store
    )
    return (embedder, embedding_model)
def build_vectore_store(data, embedder):
    """Build (or reuse) the FAISS vector store for the IMDB documents.

    Documents are split into overlapping 1000-character chunks before
    embedding. If a previously saved store exists under ``data/week2-movies``
    it is reused; otherwise a new one is computed and saved there.

    Args:
        data: LangChain documents to index.
        embedder: Embeddings implementation (cache-backed in this app).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    chunked_documents = text_splitter.split_documents(data)
    # Report the chunk count (the old bare len(...) expression was discarded).
    print("Split data into " + str(len(chunked_documents)) + " chunks")
    print("Trying to load vector store from file...")
    vector_store = None
    try:
        os.makedirs("data/week2-movies", exist_ok=True)
        vector_store = FAISS.load_local("data/week2-movies", embedder, allow_dangerous_deserialization=True)
    except Exception:
        # No usable saved store (missing/corrupt files) — build a fresh one.
        vector_store = None
    if vector_store is None:
        print("No local vector store found - computing a new one...")
        # Bug fix: embed the chunked documents, not the raw un-split data —
        # the original discarded the splitter's output entirely.
        vector_store = FAISS.from_documents(chunked_documents, embedder)
        print("Done computing new vectore store. Saving to local file.")
        vector_store.save_local("data/week2-movies")
    else:
        print("Found vector store in local file. Using that.")
    print("")
def load_vector_store(embedder):
    """Load the previously saved FAISS store from ``data/week2-movies``."""
    return FAISS.load_local(
        "data/week2-movies",
        embedder,
        allow_dangerous_deserialization=True,
    )
def run_test_query(embedding_model, vector_store):
    """Run a fixed similarity query against the store and print the hits.

    Args:
        embedding_model: Model used to embed the query text.
        vector_store: FAISS store to search by the query vector.
    """
    print("Verifying that we can query the vectore dB...")
    query = "I have a need. A need for speed."
    query_vector = embedding_model.embed_query(query)
    matches = vector_store.similarity_search_by_vector(query_vector)
    for match in matches:
        print(str(match.page_content))
    print("")
if __name__ == "__main__":
    # Read the key from the environment; fail fast with a clear message
    # rather than letting OpenAIEmbeddings raise deep inside build_db.
    openai_api_key = os.getenv("openai_api_key")
    if not openai_api_key:
        raise SystemExit("Environment variable 'openai_api_key' is not set.")
    build_db(openai_api_key)