from dotenv import dotenv_values
import chainlit as cl
from langchain.embeddings import CacheBackedEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableMap
from langchain.storage import LocalFileStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Only needed for the one-time dataset download below
# from datasets import load_dataset
# import pandas as pd

# get keys
my_secrets = dotenv_values("key.env")
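# key.env is assumed to hold the OpenAI key under the name used below, e.g.:
#   OPEN_API_KEY=sk-...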

# One-time download: pull the IMDB movies dataset from Hugging Face and save it as a CSV
# dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
# data = dataset["train"]  # change "train" to whichever split you want to save
# df = pd.DataFrame(data)
# df.to_csv("data.csv", index=False)

# Load the CSV as plain text; CSVLoader (one document per row) is an alternative
loader = TextLoader("data.csv")
documents = loader.load()

# Split into overlapping 1,000-character chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

docs = text_splitter.split_documents(documents)
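
# Optional sanity check: confirm the split produced reasonable chunks
# print(len(docs), docs[0].page_content[:100])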

# Create embeddings, cached on disk so repeated runs don't re-embed the same chunks
underlying_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002", api_key=my_secrets["OPEN_API_KEY"]
)
store = LocalFileStore("./cache/")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, store, namespace=underlying_embeddings.model
)

# Build the FAISS index over the chunked documents
db = FAISS.from_documents(docs, cached_embedder)
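
# Optional: persist the index to disk so later runs can skip re-embedding.
# save_local/load_local are the FAISS vectorstore's own (de)serialization helpers;
# "faiss_index" is just a directory name chosen for this sketch.
# db.save_local("faiss_index")
# db = FAISS.load_local("faiss_index", cached_embedder, allow_dangerous_deserialization=True)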

# Build a retriever that returns the top 10 most similar chunks per question
retriever = db.as_retriever(search_kwargs={"k": 10})
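
# Optional sanity check with a hypothetical query before wiring up the chain:
# for doc in retriever.get_relevant_documents("movies about space travel"):
#     print(doc.page_content[:80])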


@cl.on_chat_start
def start():
    # Prompt template: keep the model grounded in the retrieved context only
    template = """
        You're a helpful AI assistant tasked with answering the user's questions about movies.
        You can only make conversation based on the provided context about movies. If an answer
        cannot be formed strictly from the context, politely say you don't have knowledge about
        that topic under the 'ANSWER:' tag.

        Always return both an answer under an 'ANSWER:' tag and a bulleted list of citations
        under a 'CITATIONS:' tag, each tag starting on its own line. A citation consists of a
        VERBATIM quote that justifies the answer and the ID of the quoted article. Return a
        citation for every quote, across all articles, that justifies the answer. Use the
        following format for your final output:

        ANSWER:

        CITATIONS:

        CONTEXT:
        {context}

        QUESTION: {question}

        YOUR ANSWER:
    """

    prompt = ChatPromptTemplate.from_messages([("system", template)])

    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, api_key=my_secrets["OPEN_API_KEY"])

    # Feed the chain both the retrieved context and the original question
    inputs = RunnableMap({
        "context": lambda x: retriever.get_relevant_documents(x["question"]),
        "question": lambda x: x["question"],
    })

    # Compose the chain: retrieve -> prompt -> model -> plain string output
    runnable_chain = inputs | prompt | llm | StrOutputParser()

    # Stash the chain on the user session so the message handler can reuse it
    cl.user_session.set("runnable_chain", runnable_chain)


@cl.on_message
async def on_message(message: cl.Message):
    runnable_chain = cl.user_session.get("runnable_chain")

    # Use the async invocation so the Chainlit event loop isn't blocked
    result = await runnable_chain.ainvoke({"question": message.content})

    await cl.Message(content=result).send()
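
# To run this app locally (assuming the file is saved as app.py):
#   chainlit run app.py -w
# The -w flag reloads the app when the file changes.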