# NOTE(review): "Spaces: / Running / Running" below was Hugging Face Spaces UI
# residue from a copy-paste, not code; kept as a comment so the module parses.
# Spaces: Running Running
"""RAG helpers: load PDFs into a vector store and build a QA chain.

Exposes ``getPDF`` (load + split + id-tag a PDF), ``create_retriever``
(index chunks and return a retriever) and ``create_prompt_llm_chain``
(prompt + LLM stuff-documents chain).
"""

import os
import warnings
from uuid import uuid4

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

from setup.environment import default_model

# The original code called os.environ.get(...) and discarded the result — a
# no-op that validated nothing. Warn (non-fatally, to preserve import-time
# behavior) when a required key is missing so failures surface early.
for _key in ("OPENAI_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if not os.environ.get(_key):
        warnings.warn(f"Environment variable {_key} is not set; "
                      "model calls that need it will fail.")

# Shared embedding model used when populating the vector store.
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Module-level record of every chunk id ever assigned by getPDF.
allIds = []
def getPDF(file_path):
    """Load a PDF, split it into chunks, and give each chunk a UUID id.

    As a side effect, every generated id is appended to the module-level
    ``allIds`` list, so ids accumulate across calls.

    Args:
        file_path: Path to the PDF file to load.

    Returns:
        list: The split LangChain ``Document`` chunks, each with a fresh
        string ``.id``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    loader = PyPDFLoader(file_path, extract_images=False)
    pages = loader.load_and_split(text_splitter)
    for page in pages:
        # uuid4 guarantees uniqueness across calls, unlike a page index.
        # (Removed: dead `documentId = 0` initializer and a per-chunk debug
        # print of the whole growing allIds list — quadratic console output.)
        page.id = str(uuid4())
        allIds.append(page.id)
    return pages
def create_retriever(documents, vectorstore):
    """Index *documents* into *vectorstore* and return a retriever over it.

    Args:
        documents: Iterable of LangChain ``Document`` chunks to index.
        vectorstore: Object exposing ``add_documents`` and ``as_retriever``
            (e.g. a Chroma instance).

    Returns:
        The retriever produced by ``vectorstore.as_retriever()`` with its
        default search settings.
    """
    # (Removed leftover debug prints of the first two documents.)
    vectorstore.add_documents(documents=documents)
    # Defaults are used; tune search_type / search_kwargs (e.g. {"k": 3})
    # here if retrieval quality needs adjusting.
    return vectorstore.as_retriever()
def create_prompt_llm_chain(system_prompt, modelParam):
    """Build a stuff-documents question-answering chain.

    Chooses the OpenAI chat model when *modelParam* equals the project's
    ``default_model``; any other value is treated as a Hugging Face Hub
    repo id and served through ``HuggingFaceEndpoint``.

    Args:
        system_prompt: System instructions; ``{context}`` is appended so the
            retrieved documents are stuffed into the prompt.
        modelParam: Model identifier (OpenAI model name or HF repo id).

    Returns:
        The chain produced by ``create_stuff_documents_chain``.
    """
    if modelParam == default_model:
        llm = ChatOpenAI(model=modelParam)
    else:
        llm = HuggingFaceEndpoint(
            repo_id=modelParam,
            task="text-generation",
            do_sample=False,
            huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
        )

    # Placeholder {context} receives the stuffed documents at run time.
    full_system_prompt = f"{system_prompt}\n\n{{context}}"
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", full_system_prompt),
            ("human", "{input}"),
        ]
    )
    return create_stuff_documents_chain(llm, chat_prompt)