from datasets import load_dataset  # used by the one-off download block below
import pandas as pd  # used by the one-off download block below
import chainlit as cl
from dotenv import dotenv_values
from langchain.embeddings import CacheBackedEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableMap
from langchain.storage import LocalFileStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Get keys from a local .env-style file
my_secrets = dotenv_values("key.env")

# One-off: download the IMDB dataset and save it to CSV (uncomment to run once)
# dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
# split_name = "train"  # change this to the split you want to save
# data = dataset[split_name]
# df = pd.DataFrame(data)  # convert the dataset to a pandas DataFrame
# csv_file_path = "data.csv"  # path where the CSV file is saved
# df.to_csv(csv_file_path, index=False)

# Load the CSV as plain text (each chunk below is a slice of the raw file)
loader = TextLoader("data.csv")
documents = loader.load()

# Split into overlapping chunks using the recursive character splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
docs = text_splitter.split_documents(documents)

# Create embeddings, cached on disk so repeated runs don't re-embed
underlying_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002", api_key=my_secrets["OPEN_API_KEY"]
)
store = LocalFileStore("./cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, store, namespace=underlying_embeddings.model
)
db = FAISS.from_documents(docs, cached_embedder)

# Get the retriever for the chat model
retriever = db.as_retriever(search_kwargs={"k": 10})
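
# Quick sanity check of the retriever: a minimal sketch, assuming this file is
# executed directly (e.g. `python app.py`) rather than via `chainlit run`; the
# sample question is illustrative and not taken from the dataset.
if __name__ == "__main__":
    sample_question = "Which movies are about space travel?"
    for doc in retriever.get_relevant_documents(sample_question):
        print(doc.page_content[:200], "\n---")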

@cl.on_chat_start
def start():
    # Prompt template: restrict answers strictly to the retrieved context
    template = """
You're a helpful AI assistant tasked with answering the user's questions about movies.
You can only make conversation based on the provided context about movies. If a
response cannot be formed strictly using the context, politely say you don't have
knowledge about that topic.

You must return both an answer under an 'ANSWER:' tag (prefixed with a newline) and
a bulleted list of citations under a 'CITATIONS:' tag. A citation consists of a
VERBATIM quote that justifies the answer and the ID of the quoted article. Return a
citation for every quote across all articles that justifies the answer. Add a
newline after all citations.

Use the following format for your final output:

ANSWER:
CITATIONS:

CONTEXT: {context}
QUESTION: {question}

YOUR ANSWER:
"""
    prompt = ChatPromptTemplate.from_messages([("system", template)])

    llm = ChatOpenAI(
        model="gpt-3.5-turbo-0125",
        temperature=0,
        api_key=my_secrets["OPEN_API_KEY"],
    )

    # Map the incoming question to retrieved context plus the question itself
    inputs = RunnableMap({
        "context": lambda x: retriever.get_relevant_documents(x["question"]),
        "question": lambda x: x["question"],
    })

    # Compose the runnable chain: inputs -> prompt -> model -> plain string
    runnable_chain = inputs | prompt | llm | StrOutputParser()
    cl.user_session.set("runnable_chain", runnable_chain)


@cl.on_message
async def on_message(message: cl.Message):
    runnable_chain = cl.user_session.get("runnable_chain")
    msg = message.content
    # Use the async invocation so the Chainlit event loop isn't blocked
    result = await runnable_chain.ainvoke({"question": msg})
    await cl.Message(content=result).send()
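
# To launch the chat UI (assuming this file is saved as app.py), the usual
# Chainlit invocation is:
#   chainlit run app.py -w
# The -w flag reloads the app when the file changes.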