svijayanand committed on
Commit 6a67a80 · verified · 1 Parent(s): 911581c

Upload 4 files

Files changed (4)
  1. .gitignore +1 -0
  2. app.py +106 -0
  3. ingest_data.py +104 -0
  4. requirements.txt +101 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ huggingface-env/
app.py ADDED
@@ -0,0 +1,106 @@
+ import asyncio
+ from dotenv import load_dotenv
+ from pathlib import Path
+ from ingest_data import download_data_and_create_embedding
+
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.runnables.passthrough import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import ChatOpenAI
+ from ingest_data import underlying_embeddings, openai_api_key
+
+ import chainlit as cl
+
+ # load env variables
+ load_dotenv()
+
+ # path of the saved FAISS index, if it exists
+ file_path = Path("./faiss_index/index.faiss")
+
+ # if the index already exists, load it; otherwise download the data and build it
+ if file_path.exists():
+     print("Embeddings already done, use the saved index")
+     vector_store = FAISS.load_local(
+         "faiss_index", underlying_embeddings, allow_dangerous_deserialization=True
+     )
+ else:
+     vector_store = download_data_and_create_embedding()
+
+
+ # create a prompt template that combines the documents from our retriever with the
+ # question we ask the chat model
+ prompt_template = ChatPromptTemplate.from_template(
+     "Answer the {question} based on the following {context}."
+ )
+
+ # create a retriever for our documents
+ retriever = vector_store.as_retriever()
+
+ # create a chat model / LLM
+ chat_model = ChatOpenAI(
+     model="gpt-4o-2024-05-13", temperature=0, api_key=openai_api_key
+ )
+
+ # create a parser to parse the output of our LLM
+ parser = StrOutputParser()
+
+ # 💻 Create the sequence (recipe): retriever -> prompt -> chat model -> output parser
+ runnable_chain = (
+     {"context": retriever, "question": RunnablePassthrough()}
+     | prompt_template
+     | chat_model
+     | parser
+ )
+
+
+ # Asynchronous execution (e.g., for a better chatbot user experience)
+ async def call_chain_async(question):
+     output_chunks = await runnable_chain.ainvoke(question)
+     return output_chunks
+
+
+ # output_stream = asyncio.run(call_chain_async("What are some good sci-fi movies from the 1980s?"))
+ # print("".join(output_stream))
+
+
+ @cl.on_chat_start
+ async def on_chat_start():
+     # note: the active on_message handler below calls runnable_chain directly,
+     # so this session-scoped runnable is currently unused
+     model = ChatOpenAI(streaming=True)
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             (
+                 "system",
+                 "You're a very knowledgeable historian who provides accurate and eloquent answers to historical questions.",
+             ),
+             ("human", "{question}"),
+         ]
+     )
+     runnable = prompt | model | StrOutputParser()
+     cl.user_session.set("runnable", runnable)
+
+
+ # @cl.on_message
+ # async def on_message(message: cl.Message):
+ #     runnable = cl.user_session.get("runnable")  # type: Runnable
+ #
+ #     msg = cl.Message(content="")
+ #
+ #     async for chunk in runnable.astream(
+ #         {"question": message.content},
+ #         config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
+ #     ):
+ #         await msg.stream_token(chunk)
+ #
+ #     await msg.send()
+
+
+ @cl.on_message
+ async def main(question: cl.Message):
+     response = await call_chain_async(question.content)
+     await cl.Message(content=response).send()
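
The active on_message handler above waits for the complete answer before sending a single message. A minimal sketch of a streaming variant is shown below; it assumes the runnable_chain defined in app.py and Chainlit's stream_token API, and it would replace the main handler in app.py rather than be registered alongside it:

import chainlit as cl

@cl.on_message
async def main(question: cl.Message):
    # stream the RAG chain's answer chunk by chunk instead of awaiting the full response
    msg = cl.Message(content="")
    async for chunk in runnable_chain.astream(question.content):
        await msg.stream_token(chunk)
    await msg.send()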
ingest_data.py ADDED
@@ -0,0 +1,104 @@
+ import os
+ from dotenv import load_dotenv
+
+ from datasets import load_dataset
+ from langchain_community.document_loaders.csv_loader import (
+     CSVLoader,
+ )  # loader for our imdb.csv file
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_openai import OpenAIEmbeddings
+ from langchain.embeddings import CacheBackedEmbeddings
+ from langchain.storage import LocalFileStore
+ from langchain_community.vectorstores import FAISS
+
+ load_dotenv()
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ underlying_embeddings = OpenAIEmbeddings(api_key=openai_api_key)
+
+
+ def download_data_and_create_embedding():
+     # download the ShubhamChoksi/IMDB_Movies dataset from the Hugging Face Hub
+     dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
+     print(dataset)
+
+     # store the train split of ShubhamChoksi/IMDB_Movies as imdb.csv
+     dataset_dict = dataset
+     dataset_dict["train"].to_csv("imdb.csv")
+
+     # load the exported csv file into documents
+     loader = CSVLoader("imdb.csv")
+     data = loader.load()
+     print(len(data))  # ensure the data was loaded into a format LangChain can recognize
+
+     """# Chunk the loaded data to improve retrieval performance
+     In a RAG system, the model needs to quickly and accurately retrieve relevant information
+     from a knowledge base or other data sources to help generate high-quality responses.
+     However, working with large, unstructured datasets can be computationally expensive and
+     time-consuming, especially during retrieval.
+
+     By splitting the data into smaller, overlapping chunks, the RAG system can search for and
+     retrieve the most relevant information more efficiently. This can improve performance,
+     because the model does not have to process the entire dataset at once and can focus on
+     the most relevant parts of the data.
+     """
+
+     # create a text splitter with 1000-character chunks and 100-character overlap
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+     chunked_documents = text_splitter.split_documents(data)
+     print(len(chunked_documents))  # ensure the data was actually split into chunks
+
+     """# Use OpenAI embeddings to create a vector store
+     The first step in creating a vector store is to create embeddings from the data that the
+     RAG system should be able to retrieve. This is done with an embedding model, which
+     transforms text into a high-dimensional vector representation. Each piece of text (such
+     as a document, paragraph, or sentence) is converted into a vector that captures its
+     semantic meaning. For this exercise, we use OpenAI's embedding models.
+     """
+
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     # create our embedding model
+     # note: this model is defined but not used below; the cache-backed embedder wraps
+     # `underlying_embeddings` instead
+     embedding_model = OpenAIEmbeddings(
+         model="text-embedding-3-large", api_key=openai_api_key
+     )
+
+     """# Create embedder
+     We create our embedder with the `CacheBackedEmbeddings` class, which optimizes embedding
+     generation by caching the results of expensive embedding computations. This caching
+     mechanism avoids recomputing embeddings for the same text multiple times, which can be
+     computationally expensive and time-consuming.
+     """
+
+     # create a local file store for our cached embeddings
+     store = LocalFileStore("./cache/")
+     embedder = CacheBackedEmbeddings.from_bytes_store(
+         underlying_embeddings, store, namespace=underlying_embeddings.model
+     )
+
+     # create the vector store using Facebook AI Similarity Search (FAISS)
+     vector_store = FAISS.from_documents(documents=chunked_documents, embedding=embedder)
+     print(vector_store.index.ntotal)  # number of embedded vectors in the index
+
+     # save our vector store locally
+     vector_store.save_local("faiss_index")
+
+     query_embedding(vector_store=vector_store)
+
+     return vector_store
+
+
+ def query_embedding(vector_store) -> None:
+     # Ask your RAG system a question!
+     query = "What are some good sci-fi movies from the 1980s?"
+
+     # embed our query and run a similarity search against the vector store
+     embedded_query = underlying_embeddings.embed_query(query)
+     similar_documents = vector_store.similarity_search_by_vector(embedded_query)
+
+     # print the similar documents returned by the similarity search
+     for page in similar_documents:
+         print(page.page_content)
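
download_data_and_create_embedding is only invoked from app.py when ./faiss_index/index.faiss is missing. A hypothetical one-off script (not part of this commit) could build the index ahead of time; it assumes OPENAI_API_KEY is provided via a local .env file, which is what both modules expect through load_dotenv():

# build_index.py -- hypothetical helper, not included in this commit
from ingest_data import download_data_and_create_embedding

if __name__ == "__main__":
    # downloads the IMDB CSV, chunks it, embeds it, and writes the ./faiss_index/ directory
    download_data_and_create_embedding()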
requirements.txt ADDED
@@ -0,0 +1,101 @@
+ aiofiles==23.2.1
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==3.7.1
+ async-timeout==4.0.3
+ asyncer==0.0.2
+ attrs==23.2.0
+ bidict==0.23.1
+ certifi==2024.2.2
+ chainlit==1.1.202
+ charset-normalizer==3.3.2
+ chevron==0.14.0
+ click==8.1.7
+ dataclasses-json==0.5.14
+ datasets==2.19.1
+ Deprecated==1.2.14
+ dill==0.3.8
+ distro==1.9.0
+ exceptiongroup==1.2.1
+ fastapi==0.110.3
+ fastapi-socketio==0.0.10
+ filelock==3.14.0
+ filetype==1.2.0
+ frozenlist==1.4.1
+ fsspec==2024.3.1
+ googleapis-common-protos==1.63.0
+ greenlet==3.0.3
+ grpcio==1.64.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.23.1
+ idna==3.7
+ importlib-metadata==7.0.0
+ jsonpatch==1.33
+ jsonpointer==2.4
+ langchain==0.2.0
+ langchain-community==0.2.0
+ langchain-core==0.2.1
+ langchain-openai==0.1.7
+ langchain-text-splitters==0.2.0
+ langsmith==0.1.61
+ Lazify==0.4.0
+ literalai==0.0.601
+ marshmallow==3.21.2
+ multidict==6.0.5
+ multiprocess==0.70.16
+ mypy-extensions==1.0.0
+ nest-asyncio==1.6.0
+ numpy==1.26.4
+ openai==1.30.1
+ opentelemetry-api==1.24.0
+ opentelemetry-exporter-otlp==1.24.0
+ opentelemetry-exporter-otlp-proto-common==1.24.0
+ opentelemetry-exporter-otlp-proto-grpc==1.24.0
+ opentelemetry-exporter-otlp-proto-http==1.24.0
+ opentelemetry-instrumentation==0.45b0
+ opentelemetry-proto==1.24.0
+ opentelemetry-sdk==1.24.0
+ opentelemetry-semantic-conventions==0.45b0
+ orjson==3.10.3
+ packaging==23.2
+ pandas==2.2.2
+ protobuf==4.25.3
+ pyarrow==16.1.0
+ pyarrow-hotfix==0.6
+ pydantic==2.7.1
+ pydantic_core==2.18.2
+ PyJWT==2.8.0
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-engineio==4.9.1
+ python-multipart==0.0.9
+ python-socketio==5.11.2
+ pytz==2024.1
+ PyYAML==6.0.1
+ regex==2024.5.15
+ requests==2.32.2
+ simple-websocket==1.0.0
+ six==1.16.0
+ sniffio==1.3.1
+ SQLAlchemy==2.0.30
+ starlette==0.37.2
+ syncer==2.0.3
+ tenacity==8.3.0
+ tiktoken==0.7.0
+ tomli==2.0.1
+ tqdm==4.66.4
+ typing-inspect==0.9.0
+ typing_extensions==4.11.0
+ tzdata==2024.1
+ uptrace==1.24.0
+ urllib3==2.2.1
+ uvicorn==0.25.0
+ watchfiles==0.20.0
+ wrapt==1.16.0
+ wsproto==1.2.0
+ xxhash==3.4.1
+ yarl==1.9.4
+ zipp==3.18.2
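
These pins cover the Chainlit and LangChain 0.2 stack used by app.py and ingest_data.py. A typical local setup, consistent with the huggingface-env/ entry in .gitignore, is to create a virtual environment, run pip install -r requirements.txt, and start the UI with chainlit run app.py -w. Note that the FAISS vector store used in ingest_data.py also requires the faiss-cpu package, which is not pinned in this list.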