Victor Hom committed on
Commit
af34abb
·
1 Parent(s): cd03d4e
Files changed (1) hide show
  1. app.py +62 -0
app.py CHANGED
@@ -7,6 +7,15 @@ import chainlit as cl # importing chainlit for our app
7
  from chainlit.prompt import Prompt, PromptMessage # importing prompt tools
8
  from chainlit.playground.providers import ChatOpenAI # importing ChatOpenAI tools
9
  from dotenv import load_dotenv
 
 
 
 
 
 
 
 
 
10
 
11
  load_dotenv()
12
 
@@ -18,6 +27,59 @@ user_template = """{input}
18
  Think through your response step by step.
19
  """
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  @cl.on_chat_start # marks a function that will be executed at the start of a user session
23
  async def start_chat():
 
7
  from chainlit.prompt import Prompt, PromptMessage # importing prompt tools
8
  from chainlit.playground.providers import ChatOpenAI # importing ChatOpenAI tools
9
  from dotenv import load_dotenv
10
+ from langchain.document_loaders import CSVLoader
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain_openai import OpenAIEmbeddings
13
+ from langchain.embeddings import CacheBackedEmbeddings
14
+ from langchain.storage import LocalFileStore
15
+ from langchain_community.vectorstores import FAISS
16
+ from datasets import load_dataset
17
+
18
+
19
 
20
  load_dotenv()
21
 
 
27
  Think through your response step by step.
28
  """
29
 
30
def setup():
    """Build and persist a FAISS vector store over the IMDB movies dataset.

    Pipeline: download ShubhamChoksi/IMDB_Movies from Hugging Face, export
    the train split to imdb.csv, load the rows as documents, chunk them,
    embed them with a disk-cache-backed OpenAI embedding model, persist the
    FAISS index to ./vector_store, reload it, and run a smoke-test
    similarity search.

    Returns:
        A retriever over the persisted FAISS vector store. (The original
        code built the retriever and never used it; returning it is a
        backward-compatible fix since the function previously returned None
        and no visible caller consumes a return value.)
    """
    dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
    print(dataset["train"][0])
    print("data from huggingface dataset\n")

    # Export the train split to CSV so it can be re-loaded as LangChain documents.
    dataset["train"].to_csv("imdb.csv")
    loader = CSVLoader(file_path="imdb.csv")
    data = loader.load()
    # Original had a bare no-op `len(data)` expression; report the count instead.
    print(f"loaded {len(data)} rows from csv")
    print(data[0])
    print("loaded data from csv\n")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        # Overlap preserves sentence context across chunk boundaries.
        chunk_overlap=100,
    )
    chunked_documents = text_splitter.split_documents(data)
    # Original had a bare no-op `len(chunked_documents)`; report the count instead.
    print(f"split into {len(chunked_documents)} chunks")
    print(chunked_documents[0])

    # NOTE(review): assumes `import os` exists above the visible hunk — confirm.
    openai_api_key = os.getenv("OPENAI_API_KEY")
    embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)

    # Cache embeddings on local disk so repeated runs don't re-call the OpenAI API
    # for chunks that were already embedded.
    store = LocalFileStore("./cache/")
    embedder = CacheBackedEmbeddings.from_bytes_store(
        embedding_model, store, namespace=embedding_model.model
    )

    vector_store = FAISS.from_documents(chunked_documents, embedder)
    vector_store.save_local("./vector_store")

    # Round-trip through disk to verify the persisted index loads cleanly.
    # allow_dangerous_deserialization is required because FAISS persistence
    # uses pickle; only safe here because we just wrote the file ourselves.
    vector_store = FAISS.load_local(
        "./vector_store", embedder, allow_dangerous_deserialization=True
    )
    retriever = vector_store.as_retriever()

    # Smoke test: embed a query by hand and list the nearest chunks.
    query = "What are some good westerns movies?"
    embedded_query = embedding_model.embed_query(query)
    similar_documents = vector_store.similarity_search_by_vector(embedded_query)
    for page in similar_documents:
        # Original printed each page twice separated by a stray "00-----0000"
        # debug marker; print each result once.
        print(page)
        print("-------------")

    return retriever
83
 
84
  @cl.on_chat_start # marks a function that will be executed at the start of a user session
85
  async def start_chat():