Spaces:
Runtime error
Runtime error
xke
committed on
Commit
·
780f5aa
1
Parent(s):
849e183
try Chromadb version
Browse files
- .gitignore +2 -0
- app.py +50 -11
- requirements.txt +1 -1
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
|
2 |
+
.env
|
app.py
CHANGED
@@ -1,20 +1,48 @@
|
|
1 |
import chainlit as cl
|
2 |
from datasets import load_dataset
|
3 |
from langchain_community.document_loaders import CSVLoader
|
|
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
from langchain_openai import OpenAIEmbeddings
|
6 |
-
from langchain.embeddings import CacheBackedEmbeddings
|
7 |
-
from langchain.storage import LocalFileStore
|
8 |
-
from langchain_community.vectorstores import FAISS
|
9 |
-
from langchain_core.runnables.base import RunnableSequence
|
10 |
from langchain_core.runnables.passthrough import RunnablePassthrough
|
11 |
from langchain_core.output_parsers import StrOutputParser
|
12 |
from langchain_core.prompts import ChatPromptTemplate
|
13 |
from langchain_openai import ChatOpenAI
|
14 |
from langchain.schema.runnable import Runnable, RunnablePassthrough, RunnableConfig
|
15 |
from langchain.callbacks.base import BaseCallbackHandler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def setup_data():
|
|
|
18 |
dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
|
19 |
dataset_dict = dataset
|
20 |
dataset_dict["train"].to_csv("imdb.csv")
|
@@ -27,17 +55,28 @@ def setup_data():
|
|
27 |
chunk_overlap=100
|
28 |
)
|
29 |
|
30 |
-
|
31 |
-
|
32 |
|
33 |
-
|
34 |
-
embedder = CacheBackedEmbeddings.from_bytes_store(embedding_model, store, namespace=embedding_model.model)
|
35 |
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
-
|
40 |
|
|
|
41 |
|
42 |
doc_search = setup_data()
|
43 |
model = ChatOpenAI(model_name="gpt-4o", temperature=0, streaming=True)
|
|
|
1 |
import chainlit as cl
|
2 |
from datasets import load_dataset
|
3 |
from langchain_community.document_loaders import CSVLoader
|
4 |
+
from langchain_community.vectorstores.chroma import Chroma
|
5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
from langchain_openai import OpenAIEmbeddings
|
7 |
+
#from langchain.embeddings import CacheBackedEmbeddings
|
8 |
+
#from langchain.storage import LocalFileStore
|
9 |
+
#from langchain_community.vectorstores import FAISS
|
10 |
+
#from langchain_core.runnables.base import RunnableSequence
|
11 |
from langchain_core.runnables.passthrough import RunnablePassthrough
|
12 |
from langchain_core.output_parsers import StrOutputParser
|
13 |
from langchain_core.prompts import ChatPromptTemplate
|
14 |
from langchain_openai import ChatOpenAI
|
15 |
from langchain.schema.runnable import Runnable, RunnablePassthrough, RunnableConfig
|
16 |
from langchain.callbacks.base import BaseCallbackHandler
|
17 |
+
from langchain.indexes import SQLRecordManager, index
|
18 |
+
|
19 |
+
# def setup_data():
|
20 |
+
# dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
|
21 |
+
# dataset_dict = dataset
|
22 |
+
# dataset_dict["train"].to_csv("imdb.csv")
|
23 |
+
|
24 |
+
# loader = CSVLoader(file_path="imdb.csv")
|
25 |
+
# data = loader.load()
|
26 |
+
|
27 |
+
# text_splitter = RecursiveCharacterTextSplitter(
|
28 |
+
# chunk_size=1000,
|
29 |
+
# chunk_overlap=100
|
30 |
+
# )
|
31 |
+
|
32 |
+
# chunked_documents = text_splitter.split_documents(data)
|
33 |
+
# embedding_model = OpenAIEmbeddings()
|
34 |
+
|
35 |
+
# store = LocalFileStore("./cache/")
|
36 |
+
# embedder = CacheBackedEmbeddings.from_bytes_store(embedding_model, store, namespace=embedding_model.model)
|
37 |
+
|
38 |
+
# vector_store = FAISS.from_documents(chunked_documents, embedder)
|
39 |
+
# vector_store.save_local("faiss_index")
|
40 |
+
|
41 |
+
# return vector_store
|
42 |
+
|
43 |
|
44 |
def setup_data():
|
45 |
+
|
46 |
dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
|
47 |
dataset_dict = dataset
|
48 |
dataset_dict["train"].to_csv("imdb.csv")
|
|
|
55 |
chunk_overlap=100
|
56 |
)
|
57 |
|
58 |
+
docs = text_splitter.split_documents(data) # chunked documents
|
59 |
+
embeddings_model = OpenAIEmbeddings()
|
60 |
|
61 |
+
doc_search = Chroma.from_documents(docs, embeddings_model)
|
|
|
62 |
|
63 |
+
namespace = "chromadb/my_documents"
|
64 |
+
record_manager = SQLRecordManager(
|
65 |
+
namespace, db_url="sqlite:///record_manager_cache.sql"
|
66 |
+
)
|
67 |
+
record_manager.create_schema()
|
68 |
+
|
69 |
+
index_result = index(
|
70 |
+
docs,
|
71 |
+
record_manager,
|
72 |
+
doc_search,
|
73 |
+
cleanup="incremental",
|
74 |
+
source_id_key="source",
|
75 |
+
)
|
76 |
|
77 |
+
print(f"Indexing stats: {index_result}")
|
78 |
|
79 |
+
return doc_search
|
80 |
|
81 |
doc_search = setup_data()
|
82 |
model = ChatOpenAI(model_name="gpt-4o", temperature=0, streaming=True)
|
requirements.txt
CHANGED
@@ -5,4 +5,4 @@ langchain_openai
|
|
5 |
faiss-cpu
|
6 |
tiktoken
|
7 |
chainlit
|
8 |
-
|
|
|
5 |
faiss-cpu
|
6 |
tiktoken
|
7 |
chainlit
|
8 |
+
chromadb
|