santhoshs committed on
Commit
51eae94
·
1 Parent(s): a477937

Adding initial app code

Browse files
Files changed (1) hide show
  1. app.py +42 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build a FAISS index over the IMDB movies dataset and run a sample query.

Running this script performs, in order:

1. Download the ``ShubhamChoksi/IMDB_Movies`` dataset and dump its ``train``
   split to ``imdb.csv``.
2. Load the CSV as LangChain documents and split them into overlapping chunks.
3. Embed the chunks with OpenAI's ``text-embedding-3-small`` model through a
   file-backed embedding cache stored under ``./cache/``.
4. Save the resulting FAISS index to ``local_vector`` and print the documents
   most similar to a sample question.
"""

import os

from datasets import load_dataset
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# --- 1. Fetch the dataset and materialise it as a CSV file -----------------
dataset = load_dataset('ShubhamChoksi/IMDB_Movies')
dataset_dict = dataset
dataset_dict["train"].to_csv('imdb.csv')

# --- 2. Load the CSV into documents and chunk them -------------------------
loader = CSVLoader(file_path='imdb.csv')
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100
)
chunked_documents = text_splitter.split_documents(data)

# --- 3. Set up the (cached) embedding model --------------------------------
# NOTE: os.getenv returns None when the variable is unset; that None is passed
# through to OpenAIEmbeddings unchanged, so make sure "openai_api_key" is
# defined in the environment before running.
openai_api_key = os.getenv("openai_api_key")
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)

# Document embeddings are persisted under ./cache/, namespaced by model name.
store = LocalFileStore("./cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(embedding_model, store, namespace=embedding_model.model)

# --- 4. Build and persist the vector index ---------------------------------
vector_file = "local_vector"
vector_store = FAISS.from_documents(chunked_documents, cached_embedder)
vector_store.save_local(vector_file)

# --- 5. Run a sample similarity query --------------------------------------
query = "What are some good sci-fi movies from the 1980s?"

# The query is embedded with the underlying model directly (not the cached
# wrapper), then the index is searched with the resulting vector.
embedded_query = embedding_model.embed_query(query)

similar_documents = vector_store.similarity_search_by_vector(embedded_query)

for page in similar_documents:
    print(page.page_content)