Spaces:

albhu
/

tdocaibot

Sleeping

App Files Files Community

tdocaibot / app.py

albhu

Update app.py

9d48d5a verified over 1 year ago

raw

history blame

4.18 kB

	import time
	import streamlit as st
	import pandas as pd
	from docx import Document
	from reportlab.lib.pagesizes import letter
	from reportlab.pdfgen import canvas
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import search # Import the search module

	# Hugging Face model id used for both the tokenizer and the language model.
	_MODEL_NAME = "microsoft/phi-2"


	@st.cache_resource
	def _load_tokenizer_and_model():
	    """Load the tokenizer and causal LM once per server process.

	    Streamlit re-executes this script from top to bottom on every user
	    interaction. Without caching, the model weights would be loaded again
	    on every rerun; ``st.cache_resource`` keeps a single shared instance.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    # NOTE(review): trust_remote_code=True runs Python code shipped in the
	    # model repository; consider pinning a revision.
	    loaded_tokenizer = AutoTokenizer.from_pretrained(
	        _MODEL_NAME, trust_remote_code=True
	    )
	    loaded_model = AutoModelForCausalLM.from_pretrained(
	        _MODEL_NAME, trust_remote_code=True
	    )
	    return loaded_tokenizer, loaded_model


	# Module-level names kept so the rest of the file can keep using them.
	tokenizer, model = _load_tokenizer_and_model()

	def save_as_pdf(conversation):
	    """Write the conversation to ``conversation.pdf`` and offer it for download.

	    Each question/answer pair is drawn as two lines. A new page is started
	    whenever the next pair would fall below the bottom margin, so long
	    conversations are no longer drawn off the page.

	    Args:
	        conversation: Iterable of ``(question, answer)`` pairs.
	    """
	    pdf_filename = "conversation.pdf"
	    top_y = 750          # baseline of the first line on every page
	    bottom_margin = 60   # lowest baseline allowed before a page break
	    line_gap = 20        # vertical distance between consecutive lines

	    c = canvas.Canvas(pdf_filename, pagesize=letter)
	    c.drawString(100, top_y, "Conversation:")
	    y_position = top_y - line_gap

	    for q, a in conversation:
	        # The answer is drawn one line below the question, so both lines
	        # must fit above the bottom margin.
	        if y_position - line_gap < bottom_margin:
	            c.showPage()
	            y_position = top_y
	        # NOTE: drawString does not wrap; very long lines still run past
	        # the right edge of the page.
	        c.drawString(120, y_position, f"Q: {q}")
	        c.drawString(120, y_position - line_gap, f"A: {a}")
	        y_position -= 2 * line_gap

	    c.save()

	    # Streamlit does not serve files from the working directory, so a
	    # markdown link to the file would be dead; send the bytes through a
	    # download button instead.
	    with open(pdf_filename, "rb") as pdf_file:
	        st.download_button(
	            "Download PDF",
	            data=pdf_file.read(),
	            file_name=pdf_filename,
	            mime="application/pdf",
	        )

	def save_as_docx(conversation):
	    """Write the conversation to ``conversation.docx`` and offer it for download.

	    The document gets a title-level heading followed by one paragraph for
	    each question and one for each answer.

	    Args:
	        conversation: Iterable of ``(question, answer)`` pairs.
	    """
	    doc_filename = "conversation.docx"

	    doc = Document()
	    doc.add_heading('Conversation', 0)
	    for q, a in conversation:
	        doc.add_paragraph(f'Q: {q}')
	        doc.add_paragraph(f'A: {a}')
	    doc.save(doc_filename)

	    # Streamlit does not serve files from the working directory, so a
	    # markdown link to the file would be dead; send the bytes through a
	    # download button instead.
	    with open(doc_filename, "rb") as doc_file:
	        st.download_button(
	            "Download DOCX",
	            data=doc_file.read(),
	            file_name=doc_filename,
	            mime=(
	                "application/vnd.openxmlformats-officedocument."
	                "wordprocessingml.document"
	            ),
	        )

	def save_as_xlsx(conversation):
	    """Write the conversation to ``conversation.xlsx`` and offer it for download.

	    The sheet has two columns, ``Question`` and ``Answer``, with one row
	    per interaction and no index column.

	    Args:
	        conversation: Sequence of ``(question, answer)`` pairs.
	    """
	    xlsx_filename = "conversation.xlsx"

	    df = pd.DataFrame(conversation, columns=["Question", "Answer"])
	    df.to_excel(xlsx_filename, index=False)

	    # Streamlit does not serve files from the working directory, so a
	    # markdown link to the file would be dead; send the bytes through a
	    # download button instead.
	    with open(xlsx_filename, "rb") as xlsx_file:
	        st.download_button(
	            "Download XLSX",
	            data=xlsx_file.read(),
	            file_name=xlsx_filename,
	            mime=(
	                "application/vnd.openxmlformats-officedocument."
	                "spreadsheetml.sheet"
	            ),
	        )

	def save_as_txt(conversation):
	    """Write the conversation to ``conversation.txt`` and offer it for download.

	    Each interaction is written as a ``Q:`` line, an ``A:`` line and a
	    blank separator line.

	    Args:
	        conversation: Iterable of ``(question, answer)`` pairs.
	    """
	    txt_filename = "conversation.txt"

	    # Explicit UTF-8 so non-ASCII text is written the same way regardless
	    # of the platform's default locale encoding.
	    with open(txt_filename, "w", encoding="utf-8") as txt_file:
	        txt_file.writelines(
	            f"Q: {q}\nA: {a}\n\n" for q, a in conversation
	        )

	    # Streamlit does not serve files from the working directory, so a
	    # markdown link to the file would be dead; send the bytes through a
	    # download button instead.
	    with open(txt_filename, "rb") as txt_file:
	        st.download_button(
	            "Download TXT",
	            data=txt_file.read(),
	            file_name=txt_filename,
	            mime="text/plain",
	        )

	def _load_paragraphs(uploaded_files):
	    """Parse every uploaded file into a single DataFrame of paragraphs.

	    PDF uploads go through ``search.read_pdf``; everything else goes
	    through ``search.read_docx``. Rows are collected first and the frame is
	    built once, instead of concatenating a new frame per file.

	    Args:
	        uploaded_files: Non-empty list of files from ``st.file_uploader``.

	    Returns:
	        DataFrame with columns ``page_num``, ``paragraph_num``, ``content``
	        and ``tokens``.
	    """
	    rows = []
	    for uploaded_file in uploaded_files:
	        if uploaded_file.type == "application/pdf":
	            paragraphs = search.read_pdf(uploaded_file)
	        else:
	            paragraphs = search.read_docx(uploaded_file)
	        rows.extend(
	            (p.page_num, p.paragraph_num, p.content, search.count_tokens(p.content))
	            for p in paragraphs
	        )
	    return pd.DataFrame(
	        rows, columns=["page_num", "paragraph_num", "content", "tokens"]
	    )


	def main():
	    """Render the app: upload documents, ask questions, export the history.

	    Reads and writes two ``st.session_state`` keys: ``interactions`` (list
	    of ``(question, answer)`` pairs) and ``last_question`` (the question
	    seen on the previous run, used to avoid re-querying on reruns).
	    """
	    st.markdown('<h1>Ask anything from Legal Texts</h1><p style="font-size: 12; color: gray;"></p>', unsafe_allow_html=True)
	    st.markdown("<h2>Upload documents</h2>", unsafe_allow_html=True)
	    uploaded_files = st.file_uploader("Upload one or more documents", type=['pdf', 'docx'], accept_multiple_files=True)
	    question = st.text_input("Ask a question based on the documents", key="question_input")

	    # Purely cosmetic progress animation (about one second per rerun).
	    progress = st.progress(0)
	    for i in range(100):
	        progress.progress(i + 1)
	        time.sleep(0.01)

	    if not uploaded_files:
	        st.markdown("<h2>Please upload a document to proceed.</h2>", unsafe_allow_html=True)
	        return

	    df = _load_paragraphs(uploaded_files)

	    if "interactions" not in st.session_state:
	        st.session_state["interactions"] = []

	    # Query the model only for a new, non-empty question. Clearing the
	    # input box also changes the value, and must not trigger a query.
	    is_new_question = question != st.session_state.get("last_question", "")
	    if question.strip() and is_new_question:
	        st.text("Searching...")
	        answer = search.answer_query_with_context(question, df, tokenizer, model)
	        st.session_state["interactions"].append((question, answer))
	        st.write(answer)

	    st.markdown("### Interaction History")
	    for q, a in st.session_state["interactions"]:
	        st.write(f"Q: {q}\n\nA: {a}")

	    # Remember the current input so button-triggered reruns do not re-query.
	    st.session_state["last_question"] = question

	    st.markdown("<h2>Sample paragraphs</h2>", unsafe_allow_html=True)
	    sample_size = min(len(df), 5)
	    st.dataframe(df.sample(n=sample_size))

	    if st.button("Save as PDF"):
	        save_as_pdf(st.session_state["interactions"])
	    if st.button("Save as DOCX"):
	        save_as_docx(st.session_state["interactions"])
	    if st.button("Save as XLSX"):
	        save_as_xlsx(st.session_state["interactions"])
	    if st.button("Save as TXT"):
	        save_as_txt(st.session_state["interactions"])

	# Script entry point: build the page only when this file is executed as
	# the main script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()