ChatWithYourPDF

Runtime error

App Files Files Community

ChatWithYourPDF / app.py

tdeshane

fix pdf handling

a44b0d7 8 months ago

raw

history blame contribute delete

5.23 kB

	import os
	from typing import List

	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.vectorstores import Chroma
	from langchain.chains import (
	ConversationalRetrievalChain,
	)
	from langchain.document_loaders import PyPDFLoader
	from langchain.chat_models import ChatOpenAI
	from langchain.prompts.chat import (
	ChatPromptTemplate,
	SystemMessagePromptTemplate,
	HumanMessagePromptTemplate,
	)
	from langchain.docstore.document import Document
	from langchain.memory import ChatMessageHistory, ConversationBufferMemory


	import chainlit as cl

	# Module-level splitter shared by the app, configured with chunk_size=1000
	# and chunk_overlap=100.
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

	# System prompt text: tells the model to answer from the supplied context, to
	# admit when it does not know, to always include a "SOURCES" part, and to
	# reply to greetings. It ends with a `{summaries}` placeholder.
	system_template = """Use the following pieces of context to answer the users question.
	If you don't know the answer, just say that you don't know, don't try to make up an answer.
	ALWAYS return a "SOURCES" part in your answer.
	The "SOURCES" part should be a reference to the source of the document from which you got your answer.

	And if the user greets with greetings like Hi, hello, How are you, etc reply accordingly as well.

	Example of your response should be:

	The answer is foo
	SOURCES: xyz


	Begin!
	----------------
	{summaries}"""
	# Chat prompt layout: the system template above followed by a human message
	# that carries the `{question}` placeholder.
	messages = [
	SystemMessagePromptTemplate.from_template(system_template),
	HumanMessagePromptTemplate.from_template("{question}"),
	]
	# NOTE(review): `prompt` and `chain_type_kwargs` do not appear to be passed to
	# the chain built in the visible handlers -- confirm whether they should be
	# wired in (and whether `{summaries}` matches the variable that chain expects).
	prompt = ChatPromptTemplate.from_messages(messages)
	chain_type_kwargs = {"prompt": prompt}


	def process_file(file_path: str) -> List[str]:
	    """Extract the text of a PDF file, one string per page.

	    Pages for which extraction yields nothing (``None`` or only whitespace,
	    e.g. pages without a text layer) are skipped so that callers never
	    receive blank entries to embed or index.

	    Args:
	        file_path: Path of the PDF file on disk.

	    Returns:
	        The extracted text of every non-blank page, in page order. The list
	        is empty when no page contains extractable text.
	    """
	    # Imported here, as in the original code, so PyPDF2 is only required
	    # when a PDF is actually processed.
	    from PyPDF2 import PdfReader

	    reader = PdfReader(file_path)
	    page_texts = []

	    for page in reader.pages:
	        # Normalise a missing result to "" before testing for blank content.
	        content = page.extract_text() or ""
	        if content.strip():
	            page_texts.append(content)

	    return page_texts




	@cl.on_chat_start
	async def on_chat_start():
	    """Ask for a text or PDF file, index it, and store a retrieval chain.

	    The uploaded file is read (plain text) or parsed (PDF), split into
	    chunks with the module-level ``text_splitter``, embedded into a Chroma
	    vector store, and wrapped in a ``ConversationalRetrievalChain`` that is
	    saved in the user session under the key ``"chain"``.

	    The handler returns early, without storing a chain, when the file type
	    is unsupported or when no text could be extracted from the file.
	    """
	    uploaded = None

	    # Keep prompting until the user actually provides a file; the send()
	    # call can come back without any files.
	    while uploaded is None:
	        files = await cl.AskFileMessage(
	            content="Please upload a text or PDF file to begin!",
	            accept=["text/plain", "application/pdf"],  # This line is for UI guidance
	            max_size_mb=20,
	            timeout=180,
	        ).send()
	        if files:
	            uploaded = files[0]  # Only the first uploaded file is used

	    filename = uploaded.name
	    # Compare extensions case-insensitively so "REPORT.PDF" is accepted too.
	    lowered_name = filename.lower()

	    if lowered_name.endswith(".txt"):
	        # Handle as text file
	        with open(uploaded.path, "r", encoding="utf-8") as f:
	            text = f.read()
	        raw_texts = [text]
	        await cl.Message(content=f"`{filename}` uploaded, it contains {len(text)} characters!").send()
	    elif lowered_name.endswith(".pdf"):
	        # PDF parsing is synchronous; run it through make_async so it does
	        # not block the event loop while the file is being read.
	        raw_texts = await cl.make_async(process_file)(uploaded.path)
	    else:
	        await cl.Message(content="Unsupported file type uploaded. Please upload a text or PDF file.").send()
	        return  # Exit if the file type is not supported

	    # Break every document/page into retrieval-sized chunks. Without this a
	    # whole text file (or a whole PDF page) would be embedded and later
	    # stuffed into the prompt as one single piece.
	    texts = [
	        chunk
	        for raw_text in raw_texts
	        if raw_text
	        for chunk in text_splitter.split_text(raw_text)
	    ]

	    if not texts:
	        # Nothing to index (e.g. an empty file or a PDF without a text layer).
	        await cl.Message(
	            content=f"No readable text could be extracted from `{filename}`. Please start a new chat and upload a different file."
	        ).send()
	        return

	    # Create a Chroma vector store; each chunk gets a synthetic source id.
	    embeddings = OpenAIEmbeddings()
	    docsearch = await cl.make_async(Chroma.from_texts)(
	        texts, embeddings, metadatas=[{"source": f"{i}-pl"} for i in range(len(texts))]
	    )

	    # Conversation memory; output_key="answer" selects which chain output is
	    # recorded, since the chain also returns the source documents.
	    message_history = ChatMessageHistory()
	    memory = ConversationBufferMemory(
	        memory_key="chat_history",
	        output_key="answer",
	        chat_memory=message_history,
	        return_messages=True,
	    )

	    # Set up the conversational retrieval chain
	    chain = ConversationalRetrievalChain.from_llm(
	        ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
	        chain_type="stuff",
	        retriever=docsearch.as_retriever(),
	        memory=memory,
	        return_source_documents=True,
	    )

	    # Let the user know that the system is ready
	    await cl.Message(content=f"Your file `{filename}` is now ready for questions!").send()

	    # Save the chain in the user session for later use
	    cl.user_session.set("chain", chain)


	@cl.on_message
	async def main(message):
	    """Answer a user message with the retrieval chain stored in the session.

	    The chain saved under the session key ``"chain"`` is called with the
	    message content. The returned source documents are attached to the
	    reply as ``cl.Text`` elements named ``source_<index>``, and the reply
	    text is suffixed with either the list of source names or
	    "No sources found".

	    Args:
	        message: Incoming Chainlit message; only ``message.content`` is read.
	    """
	    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain

	    if chain is None:
	        # The chat-start handler can finish without storing a chain (for
	        # example after an unsupported upload); answer gracefully instead
	        # of failing on a None chain.
	        await cl.Message(
	            content="No document has been loaded yet. Please start a new chat and upload a text or PDF file."
	        ).send()
	        return

	    cb = cl.AsyncLangchainCallbackHandler()

	    res = await chain.acall(message.content, callbacks=[cb])
	    answer = res["answer"]
	    # Treat a missing or empty result as "no sources" rather than failing.
	    source_documents = res.get("source_documents") or []  # type: List[Document]

	    # One text element per retrieved document, referenced by name in the reply.
	    text_elements = [
	        cl.Text(content=source_doc.page_content, name=f"source_{source_idx}")
	        for source_idx, source_doc in enumerate(source_documents)
	    ]  # type: List[cl.Text]
	    source_names = [text_el.name for text_el in text_elements]

	    # Built unconditionally so the fallback below is reachable when the
	    # retriever returned nothing.
	    if source_names:
	        answer += f"\nSources: {', '.join(source_names)}"
	    else:
	        answer += "\nNo sources found"

	    await cl.Message(content=answer, elements=text_elements).send()