"""Utilities for loading PDFs into a vector store and assembling a RAG QA chain.

Expects ``OPENAI_API_KEY`` and ``HUGGINGFACEHUB_API_TOKEN`` to be set in the
process environment before the corresponding model backends are used.
"""

import os
from uuid import uuid4

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma  # noqa: F401 -- kept: re-exported for importers of this module
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

from setup.environment import default_model

# Shared sentence-embedding model for callers constructing a vector store.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Registry of every document id handed out by getPDF().
# NOTE(review): this grows for the lifetime of the process and is shared
# across all calls — confirm that accumulation is intended.
allIds = []


def getPDF(file_path):
    """Load a PDF, split it into overlapping ~1000-char chunks, and tag each chunk.

    Each resulting Document chunk receives a fresh UUID4 string as its ``id``;
    the same id is appended to the module-level ``allIds`` list.

    Args:
        file_path: Path to the PDF file to load.

    Returns:
        The list of split Document chunks, each with a unique ``id``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    loader = PyPDFLoader(file_path, extract_images=False)
    pages = loader.load_and_split(text_splitter)
    for page in pages:
        document_id = str(uuid4())
        allIds.append(document_id)
        page.id = document_id
    return pages


def create_retriever(documents, vectorstore):
    """Index ``documents`` into ``vectorstore`` and return a retriever over it.

    Args:
        documents: Document chunks to add (e.g. the output of ``getPDF``).
        vectorstore: A LangChain vector store supporting ``add_documents``.

    Returns:
        A retriever created from the vector store with default search settings.
    """
    vectorstore.add_documents(documents=documents)
    return vectorstore.as_retriever(
        # search_type="similarity",
        # search_kwargs={"k": 3},
    )


def create_prompt_llm_chain(system_prompt, modelParam):
    """Build a stuff-documents question-answering chain for the requested model.

    Uses ``ChatOpenAI`` when ``modelParam`` equals the project's
    ``default_model``; otherwise a ``HuggingFaceEndpoint`` text-generation
    model authenticated via ``HUGGINGFACEHUB_API_TOKEN``.

    Args:
        system_prompt: System instructions; ``"{context}"`` is appended so the
            retrieved documents are stuffed into the prompt.
        modelParam: Model identifier (OpenAI model name or HF repo id).

    Returns:
        The chain produced by ``create_stuff_documents_chain(model, prompt)``.
    """
    if modelParam == default_model:
        model = ChatOpenAI(model=modelParam)
    else:
        model = HuggingFaceEndpoint(
            repo_id=modelParam,
            task="text-generation",
            # max_new_tokens=100,
            do_sample=False,
            huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
        )
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt + "\n\n" + "{context}"),
            ("human", "{input}"),
        ]
    )
    return create_stuff_documents_chain(model, prompt)