xke committed
Commit 780f5aa · 1 Parent(s): 849e183

try Chromadb version

Files changed (3)
  1. .gitignore +2 -0
  2. app.py +50 -11
  3. requirements.txt +1 -1
.gitignore ADDED
@@ -0,0 +1,2 @@
+
+.env
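
Ignoring .env suggests the OpenAI credentials for this Space live in a local env file rather than in the code. A minimal sketch of how that file would typically be consumed, assuming python-dotenv (which is not in requirements.txt, so this is an assumption rather than part of the commit):

```python
# Hypothetical sketch: load OPENAI_API_KEY from the git-ignored .env file.
# python-dotenv is an assumption here; it is not listed in requirements.txt.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env into the process environment
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY must be set for OpenAIEmbeddings/ChatOpenAI"
```

OpenAIEmbeddings and ChatOpenAI pick the key up from the environment automatically, so nothing else in app.py has to reference it.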
app.py CHANGED
@@ -1,20 +1,48 @@
 import chainlit as cl
 from datasets import load_dataset
 from langchain_community.document_loaders import CSVLoader
+from langchain_community.vectorstores.chroma import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_openai import OpenAIEmbeddings
-from langchain.embeddings import CacheBackedEmbeddings
-from langchain.storage import LocalFileStore
-from langchain_community.vectorstores import FAISS
-from langchain_core.runnables.base import RunnableSequence
+#from langchain.embeddings import CacheBackedEmbeddings
+#from langchain.storage import LocalFileStore
+#from langchain_community.vectorstores import FAISS
+#from langchain_core.runnables.base import RunnableSequence
 from langchain_core.runnables.passthrough import RunnablePassthrough
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_openai import ChatOpenAI
 from langchain.schema.runnable import Runnable, RunnablePassthrough, RunnableConfig
 from langchain.callbacks.base import BaseCallbackHandler
+from langchain.indexes import SQLRecordManager, index
+
+# def setup_data():
+#     dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
+#     dataset_dict = dataset
+#     dataset_dict["train"].to_csv("imdb.csv")
+#
+#     loader = CSVLoader(file_path="imdb.csv")
+#     data = loader.load()
+#
+#     text_splitter = RecursiveCharacterTextSplitter(
+#         chunk_size=1000,
+#         chunk_overlap=100
+#     )
+#
+#     chunked_documents = text_splitter.split_documents(data)
+#     embedding_model = OpenAIEmbeddings()
+#
+#     store = LocalFileStore("./cache/")
+#     embedder = CacheBackedEmbeddings.from_bytes_store(embedding_model, store, namespace=embedding_model.model)
+#
+#     vector_store = FAISS.from_documents(chunked_documents, embedder)
+#     vector_store.save_local("faiss_index")
+#
+#     return vector_store
+
 
 def setup_data():
+
     dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
     dataset_dict = dataset
     dataset_dict["train"].to_csv("imdb.csv")
@@ -27,17 +55,28 @@ def setup_data():
         chunk_overlap=100
     )
 
-    chunked_documents = text_splitter.split_documents(data)
-    embedding_model = OpenAIEmbeddings()
+    docs = text_splitter.split_documents(data)  # chunked documents
+    embeddings_model = OpenAIEmbeddings()
 
-    store = LocalFileStore("./cache/")
-    embedder = CacheBackedEmbeddings.from_bytes_store(embedding_model, store, namespace=embedding_model.model)
+    doc_search = Chroma.from_documents(docs, embeddings_model)
 
-    vector_store = FAISS.from_documents(chunked_documents, embedder)
-    vector_store.save_local("faiss_index")
+    namespace = "chromadb/my_documents"
+    record_manager = SQLRecordManager(
+        namespace, db_url="sqlite:///record_manager_cache.sql"
+    )
+    record_manager.create_schema()
+
+    index_result = index(
+        docs,
+        record_manager,
+        doc_search,
+        cleanup="incremental",
+        source_id_key="source",
+    )
 
-    return vector_store
+    print(f"Indexing stats: {index_result}")
 
+    return doc_search
 
 doc_search = setup_data()
 model = ChatOpenAI(model_name="gpt-4o", temperature=0, streaming=True)
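
Besides swapping FAISS for Chroma, the new setup_data() adopts LangChain's indexing API: index() with a SQLRecordManager keyed on the "source" metadata field and cleanup="incremental" records which chunks have already been embedded, so re-running the function should skip unchanged chunks instead of re-embedding the whole CSV. A minimal sketch of that behaviour in isolation (collection and database names here are illustrative, not taken from the commit):

```python
# Sketch of the incremental-indexing pattern used by the new setup_data().
# Names like "chromadb/demo" and demo_cache.sql are illustrative only.
from langchain_core.documents import Document
from langchain_community.vectorstores.chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.indexes import SQLRecordManager, index

docs = [Document(page_content="a movie chunk", metadata={"source": "imdb.csv"})]

# Start from an empty collection; index() takes care of adding documents.
vectorstore = Chroma(collection_name="demo", embedding_function=OpenAIEmbeddings())
record_manager = SQLRecordManager("chromadb/demo", db_url="sqlite:///demo_cache.sql")
record_manager.create_schema()

first = index(docs, record_manager, vectorstore, cleanup="incremental", source_id_key="source")
second = index(docs, record_manager, vectorstore, cleanup="incremental", source_id_key="source")
print(first)   # first pass: the chunk shows up under num_added
print(second)  # second, unchanged pass: it shows up under num_skipped
```

Note that in the committed code Chroma.from_documents() already embeds the chunks before index() is called over the same collection, so the record manager mainly pays off on subsequent runs against the same SQLite cache.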
requirements.txt CHANGED
@@ -5,4 +5,4 @@ langchain_openai
 faiss-cpu
 tiktoken
 chainlit
-
+chromadb
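
With chromadb added as a dependency, the rest of app.py (outside this diff) presumably turns doc_search into a retriever and streams answers through the already-imported prompt, model, and parser pieces. A rough sketch of such a chain, assuming a simple single-question prompt; the prompt wording and chain layout are assumptions, only the imports and the doc_search/model names come from app.py:

```python
# Hypothetical retrieval chain over the Chroma store returned by setup_data().
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.passthrough import RunnablePassthrough

retriever = doc_search.as_retriever()

prompt = ChatPromptTemplate.from_template(
    "Answer the question using only this context:\n{context}\n\nQuestion: {question}"
)

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model  # the streaming gpt-4o ChatOpenAI defined above
    | StrOutputParser()
)

# Example usage:
# chain.invoke("Recommend a highly rated science fiction movie.")
```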