Adding initial app code
Browse files
app.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
from langchain_community.document_loaders.csv_loader import CSVLoader
|
3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
+
from langchain.embeddings import CacheBackedEmbeddings
|
5 |
+
from langchain.storage import LocalFileStore
|
6 |
+
from langchain_openai import OpenAIEmbeddings
|
7 |
+
from langchain_community.vectorstores import FAISS
|
8 |
+
|
9 |
+
# Load the IMDB movies dataset with the `datasets` library. Both names are
# bound to the same object, as in the original script.
dataset = dataset_dict = load_dataset('ShubhamChoksi/IMDB_Movies')

# Export the "train" split to a local CSV file; the CSV loader below reads it.
dataset["train"].to_csv(path_or_buf='imdb.csv')
|
12 |
+
|
13 |
+
# Read the exported CSV file into LangChain documents.
loader = CSVLoader(file_path='imdb.csv')
data = loader.load()

# Split the loaded documents into overlapping chunks before embedding.
# NOTE(review): sizes are in the splitter's length units (presumably
# characters, the library default) - confirm if token limits matter.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
chunked_documents = text_splitter.split_documents(documents=data)
|
21 |
+
|
22 |
+
import os

# Read the OpenAI API key from the environment. The lower-case name is the one
# this script has always used and still takes precedence; fall back to the
# conventional upper-case OPENAI_API_KEY so the script also works where only
# that name is set (environment variable names are case-sensitive on POSIX).
openai_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=openai_api_key,
)

# Wrap the embedding model with a file-backed cache stored under ./cache/.
# The namespace is the model name, so entries are keyed per embedding model.
store = LocalFileStore("./cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embedding_model,
    store,
    namespace=embedding_model.model,
)
|
29 |
+
|
30 |
+
# Folder that the FAISS index is saved into.
vector_file = "local_vector"

# Embed the chunks through the cache-backed embedder, build the FAISS index
# from them, and write the index to disk.
vector_store = FAISS.from_documents(
    documents=chunked_documents,
    embedding=cached_embedder,
)
vector_store.save_local(folder_path=vector_file)
|
33 |
+
|
34 |
+
query = "What are some good sci-fi movies from the 1980s?"

# Embed the query with the underlying embedding model (not the cache-backed
# wrapper), then search the index for the stored chunks closest to that vector.
embedded_query = embedding_model.embed_query(text=query)
similar_documents = vector_store.similarity_search_by_vector(
    embedding=embedded_query,
)

# Print the text of each matching chunk.
for page in similar_documents:
    print(page.page_content)
|
42 |
+
|