from datasets import load_dataset  # only needed for the one-time data prep below
import pandas as pd  # only needed for the one-time data prep below
from dotenv import dotenv_values
import chainlit as cl
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableMap
from langchain.schema.output_parser import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# get keys
my_secrets = dotenv_values("key.env")
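# key.env is expected to define the key name referenced below (the name comes
# from this script; the value shown is just a placeholder):
# OPEN_API_KEY=sk-...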
# One-time data prep: download the IMDB movies dataset and save it as data.csv
# dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
# split_name = "train"  # change this to the split you want to save
# data = dataset[split_name]
# df = pd.DataFrame(data)  # convert the dataset to a pandas DataFrame
# csv_file_path = "data.csv"
# df.to_csv(csv_file_path, index=False)  # save the DataFrame to a CSV file
# Load the CSV as plain text (the whole file becomes a single document)
loader = TextLoader("data.csv")
documents = loader.load()
# Split into overlapping ~1000-character chunks with the recursive character splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
docs = text_splitter.split_documents(documents)
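# Optional sanity check, a minimal sketch (not needed for the app to run):
# print(f"Split {len(documents)} document(s) into {len(docs)} chunks")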
# Create embeddings, cached on disk so repeated runs don't re-embed (and re-bill) the same chunks
underlying_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002", api_key=my_secrets["OPEN_API_KEY"]
)
store = LocalFileStore("./cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, store, namespace=underlying_embeddings.model
)
db = FAISS.from_documents(docs, cached_embedder)
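# The index is rebuilt from scratch on every start. If that becomes slow, FAISS
# vector stores can be persisted and reloaded; a sketch, assuming the directory
# name "faiss_index" and a recent langchain_community (which requires the
# explicit deserialization opt-in):
# db.save_local("faiss_index")
# db = FAISS.load_local("faiss_index", cached_embedder, allow_dangerous_deserialization=True)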
# Expose the vector store as a retriever that returns the 10 nearest chunks
retriever = db.as_retriever(
    search_kwargs={"k": 10}
)
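# Quick usage sketch (retrievers are runnables, so .invoke works; the query
# below is made up for illustration):
# hits = retriever.invoke("movies about space travel")
# print(hits[0].page_content[:200])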
@cl.on_chat_start
def start():
    # Prompt template: constrain answers strictly to the retrieved context
    template = """
You're a helpful AI assistant tasked with answering the user's questions about movies.
You may only hold conversations based on the provided context about movies. If a response cannot be formed strictly from the context, politely say you don't have knowledge about that topic under the 'ANSWER:' tag.
You must return both an answer under the 'ANSWER:' tag and a bulleted list of citations under the 'CITATIONS:' tag. A citation consists of a VERBATIM quote that justifies the answer and the ID of the quoted article. Return a citation for every quote, across all articles, that justifies the answer. Separate the two sections with new line characters and add a new line character after all citations. Use the following format for your final output:

ANSWER:
CITATIONS:

CONTEXT:
{context}

QUESTION: {question}

YOUR ANSWER:
"""
    prompt = ChatPromptTemplate.from_messages([("system", template)])
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, api_key=my_secrets["OPEN_API_KEY"])

    # Map the incoming question to the prompt variables: retrieved docs + question
    inputs = RunnableMap({
        'context': lambda x: retriever.get_relevant_documents(x['question']),
        'question': lambda x: x['question']
    })

    # Compose the LCEL chain: inputs -> prompt -> model -> plain string
    runnable_chain = (
        inputs |
        prompt |
        llm |
        StrOutputParser()
    )
    cl.user_session.set("runnable_chain", runnable_chain)
@cl.on_message
async def on_message(message: cl.Message):
    runnable_chain = cl.user_session.get("runnable_chain")
    msg = message.content
    # Use the async ainvoke so the Chainlit event loop isn't blocked
    result = await runnable_chain.ainvoke({"question": msg})
    # print(str(result))
    await cl.Message(content=result).send()
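# To run: Chainlit serves this script from the command line (assuming the file
# is saved as app.py; -w enables auto-reload during development):
# chainlit run app.py -w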