svijayanand committed on
Commit 6a67a80 · verified · 1 Parent(s): 911581c

Upload 4 files

Files changed (4)
  1. .gitignore +1 -0
  2. app.py +106 -0
  3. ingest_data.py +104 -0
  4. requirements.txt +101 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ huggingface-env/
app.py ADDED
@@ -0,0 +1,106 @@
+ import asyncio
+ from dotenv import load_dotenv
+ from pathlib import Path
+ from ingest_data import download_data_and_create_embedding
+
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.runnables.passthrough import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import ChatOpenAI
+ from ingest_data import underlying_embeddings, openai_api_key
+
+ import chainlit as cl
+
+ # load env variables
+ load_dotenv()
+
+ # path of the saved FAISS index, if it exists
+ file_path = Path("./faiss_index/index.faiss")
+
+ # if the index already exists, load it; otherwise download the data and build it
+ if file_path.exists():
+     print("Embeddings already done, use the saved index")
+     vector_store = FAISS.load_local(
+         "faiss_index", underlying_embeddings, allow_dangerous_deserialization=True
+     )
+ else:
+     vector_store = download_data_and_create_embedding()
+
+
+ # create a prompt template that combines the documents from our retriever with the
+ # question we ask the chat model
+ prompt_template = ChatPromptTemplate.from_template(
+     "Answer the {question} based on the following {context}."
+ )
+
+ # create a retriever for our documents
+ retriever = vector_store.as_retriever()
+
+ # create a chat model / LLM
+ chat_model = ChatOpenAI(
+     model="gpt-4o-2024-05-13", temperature=0, api_key=openai_api_key
+ )
+
+ # create a parser to parse the output of our LLM
+ parser = StrOutputParser()
+
+ # 💻 Create the sequence (recipe): retriever -> prompt -> chat model -> output parser
+ runnable_chain = (
+     {"context": retriever, "question": RunnablePassthrough()}
+     | prompt_template
+     | chat_model
+     | parser
+ )
+
+
+ # Asynchronous execution (e.g., for a better chatbot user experience)
+ async def call_chain_async(question):
+     output_chunks = await runnable_chain.ainvoke(question)
+     return output_chunks
+
+
+ # output_stream = asyncio.run(call_chain_async("What are some good sci-fi movies from the 1980s?"))
+ # print("".join(output_stream))
+
+
+ @cl.on_chat_start
+ async def on_chat_start():
+     # note: the active on_message handler below calls runnable_chain directly,
+     # so this session-scoped runnable is currently unused
+     model = ChatOpenAI(streaming=True)
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             (
+                 "system",
+                 "You're a very knowledgeable historian who provides accurate and eloquent answers to historical questions.",
+             ),
+             ("human", "{question}"),
+         ]
+     )
+     runnable = prompt | model | StrOutputParser()
+     cl.user_session.set("runnable", runnable)
+
+
+ # @cl.on_message
+ # async def on_message(message: cl.Message):
+ #     runnable = cl.user_session.get("runnable")  # type: Runnable
+ #
+ #     msg = cl.Message(content="")
+ #
+ #     async for chunk in runnable.astream(
+ #         {"question": message.content},
+ #         config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
+ #     ):
+ #         await msg.stream_token(chunk)
+ #
+ #     await msg.send()
+
+
+ @cl.on_message
+ async def main(question: cl.Message):
+     response = await call_chain_async(question.content)
+     await cl.Message(content=response).send()
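
The active on_message handler above waits for the complete answer before sending a single message. A minimal sketch of a streaming variant is shown below; it assumes the runnable_chain defined in app.py and Chainlit's stream_token API, and it would replace the main handler in app.py rather than be registered alongside it:

import chainlit as cl

@cl.on_message
async def main(question: cl.Message):
    # stream the RAG chain's answer chunk by chunk instead of awaiting the full response
    msg = cl.Message(content="")
    async for chunk in runnable_chain.astream(question.content):
        await msg.stream_token(chunk)
    await msg.send()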
ingest_data.py ADDED
@@ -0,0 +1,104 @@
+ import os
+ from dotenv import load_dotenv
+
+ from datasets import load_dataset
+ from langchain_community.document_loaders.csv_loader import (
+     CSVLoader,
+ )  # loader for our imdb.csv file
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_openai import OpenAIEmbeddings
+ from langchain.embeddings import CacheBackedEmbeddings
+ from langchain.storage import LocalFileStore
+ from langchain_community.vectorstores import FAISS
+
+ load_dotenv()
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ underlying_embeddings = OpenAIEmbeddings(api_key=openai_api_key)
+
+
+ def download_data_and_create_embedding():
+     # download the ShubhamChoksi/IMDB_Movies dataset from the Hugging Face Hub
+     dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
+     print(dataset)
+
+     # store the train split of ShubhamChoksi/IMDB_Movies as imdb.csv
+     dataset_dict = dataset
+     dataset_dict["train"].to_csv("imdb.csv")
+
+     # load the exported csv file into documents
+     loader = CSVLoader("imdb.csv")
+     data = loader.load()
+     print(len(data))  # ensure the data was loaded into a format LangChain can recognize
+
+     """# Chunk the loaded data to improve retrieval performance
+     In a RAG system, the model needs to quickly and accurately retrieve relevant information
+     from a knowledge base or other data sources to help generate high-quality responses.
+     However, working with large, unstructured datasets can be computationally expensive and
+     time-consuming, especially during retrieval.
+
+     By splitting the data into smaller, overlapping chunks, the RAG system can search for and
+     retrieve the most relevant information more efficiently. This can improve performance,
+     because the model does not have to process the entire dataset at once and can focus on
+     the most relevant parts of the data.
+     """
+
+     # create a text splitter with 1000-character chunks and 100-character overlap
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+     chunked_documents = text_splitter.split_documents(data)
+     print(len(chunked_documents))  # ensure the data was actually split into chunks
+
+     """# Use OpenAI embeddings to create a vector store
+     The first step in creating a vector store is to create embeddings from the data that the
+     RAG system should be able to retrieve. This is done with an embedding model, which
+     transforms text into a high-dimensional vector representation. Each piece of text (such
+     as a document, paragraph, or sentence) is converted into a vector that captures its
+     semantic meaning. For this exercise, we use OpenAI's embedding models.
+     """
+
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     # create our embedding model
+     # note: this model is defined but not used below; the cache-backed embedder wraps
+     # `underlying_embeddings` instead
+     embedding_model = OpenAIEmbeddings(
+         model="text-embedding-3-large", api_key=openai_api_key
+     )
+
+     """# Create embedder
+     We create our embedder with the `CacheBackedEmbeddings` class, which optimizes embedding
+     generation by caching the results of expensive embedding computations. This caching
+     mechanism avoids recomputing embeddings for the same text multiple times, which can be
+     computationally expensive and time-consuming.
+     """
+
+     # create a local file store for our cached embeddings
+     store = LocalFileStore("./cache/")
+     embedder = CacheBackedEmbeddings.from_bytes_store(
+         underlying_embeddings, store, namespace=underlying_embeddings.model
+     )
+
+     # create the vector store using Facebook AI Similarity Search (FAISS)
+     vector_store = FAISS.from_documents(documents=chunked_documents, embedding=embedder)
+     print(vector_store.index.ntotal)  # number of embedded vectors in the index
+
+     # save our vector store locally
+     vector_store.save_local("faiss_index")
+
+     query_embedding(vector_store=vector_store)
+
+     return vector_store
+
+
+ def query_embedding(vector_store) -> None:
+     # Ask your RAG system a question!
+     query = "What are some good sci-fi movies from the 1980s?"
+
+     # embed our query and run a similarity search against the vector store
+     embedded_query = underlying_embeddings.embed_query(query)
+     similar_documents = vector_store.similarity_search_by_vector(embedded_query)
+
+     # print the similar documents returned by the similarity search
+     for page in similar_documents:
+         print(page.page_content)
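
download_data_and_create_embedding is only invoked from app.py when ./faiss_index/index.faiss is missing. A hypothetical one-off script (not part of this commit) could build the index ahead of time; it assumes OPENAI_API_KEY is provided via a local .env file, which is what both modules expect through load_dotenv():

# build_index.py -- hypothetical helper, not included in this commit
from ingest_data import download_data_and_create_embedding

if __name__ == "__main__":
    # downloads the IMDB CSV, chunks it, embeds it, and writes the ./faiss_index/ directory
    download_data_and_create_embedding()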
requirements.txt ADDED
@@ -0,0 +1,101 @@
+ aiofiles==23.2.1
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==3.7.1
+ async-timeout==4.0.3
+ asyncer==0.0.2
+ attrs==23.2.0
+ bidict==0.23.1
+ certifi==2024.2.2
+ chainlit==1.1.202
+ charset-normalizer==3.3.2
+ chevron==0.14.0
+ click==8.1.7
+ dataclasses-json==0.5.14
+ datasets==2.19.1
+ Deprecated==1.2.14
+ dill==0.3.8
+ distro==1.9.0
+ exceptiongroup==1.2.1
+ fastapi==0.110.3
+ fastapi-socketio==0.0.10
+ filelock==3.14.0
+ filetype==1.2.0
+ frozenlist==1.4.1
+ fsspec==2024.3.1
+ googleapis-common-protos==1.63.0
+ greenlet==3.0.3
+ grpcio==1.64.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.23.1
+ idna==3.7
+ importlib-metadata==7.0.0
+ jsonpatch==1.33
+ jsonpointer==2.4
+ langchain==0.2.0
+ langchain-community==0.2.0
+ langchain-core==0.2.1
+ langchain-openai==0.1.7
+ langchain-text-splitters==0.2.0
+ langsmith==0.1.61
+ Lazify==0.4.0
+ literalai==0.0.601
+ marshmallow==3.21.2
+ multidict==6.0.5
+ multiprocess==0.70.16
+ mypy-extensions==1.0.0
+ nest-asyncio==1.6.0
+ numpy==1.26.4
+ openai==1.30.1
+ opentelemetry-api==1.24.0
+ opentelemetry-exporter-otlp==1.24.0
+ opentelemetry-exporter-otlp-proto-common==1.24.0
+ opentelemetry-exporter-otlp-proto-grpc==1.24.0
+ opentelemetry-exporter-otlp-proto-http==1.24.0
+ opentelemetry-instrumentation==0.45b0
+ opentelemetry-proto==1.24.0
+ opentelemetry-sdk==1.24.0
+ opentelemetry-semantic-conventions==0.45b0
+ orjson==3.10.3
+ packaging==23.2
+ pandas==2.2.2
+ protobuf==4.25.3
+ pyarrow==16.1.0
+ pyarrow-hotfix==0.6
+ pydantic==2.7.1
+ pydantic_core==2.18.2
+ PyJWT==2.8.0
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-engineio==4.9.1
+ python-multipart==0.0.9
+ python-socketio==5.11.2
+ pytz==2024.1
+ PyYAML==6.0.1
+ regex==2024.5.15
+ requests==2.32.2
+ simple-websocket==1.0.0
+ six==1.16.0
+ sniffio==1.3.1
+ SQLAlchemy==2.0.30
+ starlette==0.37.2
+ syncer==2.0.3
+ tenacity==8.3.0
+ tiktoken==0.7.0
+ tomli==2.0.1
+ tqdm==4.66.4
+ typing-inspect==0.9.0
+ typing_extensions==4.11.0
+ tzdata==2024.1
+ uptrace==1.24.0
+ urllib3==2.2.1
+ uvicorn==0.25.0
+ watchfiles==0.20.0
+ wrapt==1.16.0
+ wsproto==1.2.0
+ xxhash==3.4.1
+ yarl==1.9.4
+ zipp==3.18.2
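
These pins cover the Chainlit and LangChain 0.2 stack used by app.py and ingest_data.py. A typical local setup, consistent with the huggingface-env/ entry in .gitignore, is to create a virtual environment, run pip install -r requirements.txt, and start the UI with chainlit run app.py -w. Note that the FAISS vector store used in ingest_data.py also requires the faiss-cpu package, which is not pinned in this list.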