import os
import tempfile

import streamlit as st
from clarifai.modules.css import ClarifaiStreamlitCSS
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Clarifai

st.set_page_config(page_title="Chat with Documents", page_icon="🦜")
st.title("🦜 RAG with Clarifai and Langchain")
ClarifaiStreamlitCSS.insert_default_css(st)


# 1. Data Organization: chunk documents
@st.cache_resource(ttl="1h")
def load_chunk_pdf(uploaded_files):
    """Save uploaded PDFs to a temp dir, load them, and split into chunks.

    Args:
        uploaded_files: Streamlit UploadedFile objects from st.file_uploader.

    Returns:
        A list of LangChain Documents, split into ~1000-character chunks
        with no overlap.
    """
    documents = []
    # Context manager guarantees the temp dir (and the copied PDFs) is
    # removed as soon as the documents have been loaded into memory;
    # the original left cleanup to the GC finalizer.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in uploaded_files:
            temp_filepath = os.path.join(temp_dir, file.name)
            # PyPDFLoader needs a real file path, so persist the upload first.
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())
            loader = PyPDFLoader(temp_filepath)
            documents.extend(loader.load())

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(documents)


def vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT):
    """Create a vector store on Clarifai from chunked documents (step 2).

    Args:
        USER_ID: Clarifai user id owning the app.
        APP_ID: Clarifai app id that will hold the vectors.
        docs: chunked LangChain Documents from load_chunk_pdf.
        CLARIFAI_PAT: Clarifai personal access token.

    Returns:
        A Clarifai vector store; retrieval returns up to 3 documents.
    """
    return Clarifai.from_documents(
        user_id=USER_ID,
        app_id=APP_ID,
        documents=docs,
        pat=CLARIFAI_PAT,
        number_of_docs=3,
    )


def QandA(CLARIFAI_PAT, clarifai_vector_db):
    """Build a RetrievalQA chain over the vector store using GPT-4.

    Args:
        CLARIFAI_PAT: Clarifai personal access token.
        clarifai_vector_db: vector store produced by vectorstore().

    Returns:
        A RetrievalQA chain ready to answer questions via .run().
    """
    from langchain.llms import Clarifai

    USER_ID = "openai"
    APP_ID = "chat-completion"
    MODEL_ID = "GPT-4"

    # LLM to use (GPT-4, served through Clarifai's chat-completion app)
    clarifai_llm = Clarifai(
        pat=CLARIFAI_PAT, user_id=USER_ID, app_id=APP_ID, model_id=MODEL_ID)

    # The "stuff" chain combines all retrieved chunks and prepends them
    # to the prompt in a single LLM call.
    return RetrievalQA.from_chain_type(
        llm=clarifai_llm,
        chain_type="stuff",
        retriever=clarifai_vector_db.as_retriever(),
    )


def main():
    """Render the UI and drive the chunk → vectorize → query pipeline."""
    # Fixed: the prompt previously said "GPT 3.5 Turbo" although QandA()
    # hard-codes MODEL_ID = "GPT-4".
    user_question = st.text_input(
        "Ask a question to the GPT-4 model about your documents "
        "and click on get the response")

    with st.sidebar:
        # Fixed: this string literal was broken across a line boundary.
        st.subheader("Add your Clarifai PAT, USER ID, APP ID "
                     "along with the documents")

        # Get the USER_ID, APP_ID, Clarifai API Key
        CLARIFAI_PAT = st.text_input("Clarifai PAT", type="password")
        USER_ID = st.text_input("Clarifai user id")
        APP_ID = st.text_input("Clarifai app id")
        uploaded_files = st.file_uploader(
            "Upload your PDFs here", accept_multiple_files=True)

    if not (CLARIFAI_PAT and USER_ID and APP_ID and uploaded_files):
        st.info("Please add your Clarifai PAT, USER_ID, APP_ID and upload files to continue.")
    elif st.button("Get the response"):
        with st.spinner("Processing"):
            # process pdfs
            docs = load_chunk_pdf(uploaded_files)
            # create a vector store
            clarifai_vector_db = vectorstore(USER_ID, APP_ID, docs, CLARIFAI_PAT)
            # 2. Vector Creation: create Q&A chain
            conversation = QandA(CLARIFAI_PAT, clarifai_vector_db)
            # 3. Querying: ask the question to the GPT-4 model.
            # This step also combines 4. retrieval and 5. prepending the context.
            response = conversation.run(user_question)
            st.write(response)


if __name__ == '__main__':
    main()