"""Streamlit app for asking questions about uploaded legal documents (PDF/DOCX)."""

import time
import streamlit as st
import pandas as pd
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from transformers import AutoTokenizer, AutoModelForCausalLM
import search  # Import the search module

# Load the tokenizer and the causal language model at import time; both
# module-level objects are handed to the search module by main().
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# load presumably repeats per rerun -- consider caching it; confirm against
# the Streamlit version in use.
_CHECKPOINT = "microsoft/phi-2"
_LOAD_OPTIONS = {"trust_remote_code": True}

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT, **_LOAD_OPTIONS)
model = AutoModelForCausalLM.from_pretrained(_CHECKPOINT, **_LOAD_OPTIONS)

def save_as_pdf(conversation):
    """Write the conversation to ``conversation.pdf`` and show a link to it.

    Every question and answer is wrapped to the printable width and a new
    page is started whenever the current one is full, so long answers and
    long conversations are no longer drawn outside the page.

    Args:
        conversation: Iterable of ``(question, answer)`` pairs.
    """
    # Function-scope import: reportlab is already a dependency of this file,
    # and keeping the import local makes this block self-contained.
    from reportlab.lib.utils import simpleSplit

    pdf_filename = "conversation.pdf"

    # Set explicitly so the wrapping width below is measured with the same
    # font that is used for drawing (Helvetica 12 is expected to match the
    # canvas default -- TODO confirm if rl_config is customised).
    font_name, font_size = "Helvetica", 12

    page_width, _page_height = letter
    text_x = 120          # left edge of the Q/A lines
    header_y = 750        # baseline of the heading / first line of later pages
    first_line_y = 730    # baseline of the first Q/A line on the first page
    bottom_margin = 50    # do not draw below this baseline
    right_margin = 72
    line_height = 20
    max_text_width = page_width - text_x - right_margin

    c = canvas.Canvas(pdf_filename, pagesize=letter)
    c.setFont(font_name, font_size)
    c.drawString(100, header_y, "Conversation:")

    y_position = first_line_y
    for q, a in conversation:
        for entry in (f"Q: {q}", f"A: {a}"):
            # simpleSplit breaks the text on newlines and wraps each piece
            # so that it fits within max_text_width.
            for line in simpleSplit(entry, font_name, font_size, max_text_width):
                if y_position < bottom_margin:
                    # Page is full: flush it and continue at the top of a
                    # new one. showPage resets the graphics state, so the
                    # font has to be selected again.
                    c.showPage()
                    c.setFont(font_name, font_size)
                    y_position = header_y
                c.drawString(text_x, y_position, line)
                y_position -= line_height

    c.save()

    st.markdown(f"Download [PDF](./{pdf_filename})")

def save_as_docx(conversation):
    """Write the conversation to ``conversation.docx`` and show a link to it.

    The document gets a level-0 heading followed by one paragraph per
    question and one per answer, in conversation order.

    Args:
        conversation: Iterable of ``(question, answer)`` pairs.
    """
    doc_filename = "conversation.docx"

    document = Document()
    document.add_heading('Conversation', 0)
    for asked, replied in conversation:
        for paragraph_text in (f'Q: {asked}', f'A: {replied}'):
            document.add_paragraph(paragraph_text)
    document.save(doc_filename)

    st.markdown(f"Download [DOCX](./{doc_filename})")

def save_as_xlsx(conversation):
    """Write the conversation to ``conversation.xlsx`` and show a link to it.

    The sheet has a "Question" and an "Answer" column and no index column.

    Args:
        conversation: Sequence of ``(question, answer)`` pairs.
    """
    xlsx_filename = "conversation.xlsx"
    sheet = pd.DataFrame(conversation, columns=["Question", "Answer"])
    sheet.to_excel(xlsx_filename, index=False)

    st.markdown(f"Download [XLSX](./{xlsx_filename})")

def save_as_txt(conversation):
    """Write the conversation to ``conversation.txt`` and show a link to it.

    Each pair is written as a ``Q:`` line, an ``A:`` line and a blank line.

    Args:
        conversation: Iterable of ``(question, answer)`` pairs.
    """
    txt_filename = "conversation.txt"
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # non-ASCII characters in the questions or answers.
    with open(txt_filename, "w", encoding="utf-8") as txt_file:
        txt_file.writelines(f"Q: {q}\nA: {a}\n\n" for q, a in conversation)

    st.markdown(f"Download [TXT](./{txt_filename})")

def _load_paragraphs(uploaded_files):
    """Parse all uploaded files into one DataFrame with a row per paragraph."""
    columns = ["page_num", "paragraph_num", "content", "tokens"]
    rows = []
    for uploaded_file in uploaded_files:
        # Anything that is not reported as a PDF is read as DOCX; the
        # uploader only accepts those two extensions.
        if uploaded_file.type == "application/pdf":
            paragraphs = search.read_pdf(uploaded_file)
        else:
            paragraphs = search.read_docx(uploaded_file)
        rows.extend(
            (p.page_num, p.paragraph_num, p.content, search.count_tokens(p.content))
            for p in paragraphs
        )
    # Build the frame once instead of concatenating onto an empty frame
    # inside the loop.
    return pd.DataFrame(rows, columns=columns)


def main():
    """Render the question-answering page.

    Shows the upload widget and question box, answers a newly entered
    question against the uploaded documents, lists the interaction history
    kept in ``st.session_state``, displays a random sample of parsed
    paragraphs and offers buttons to export the history.
    """
    st.markdown('<h1>Ask anything from Legal Texts</h1><p style="font-size: 12; color: gray;"></p>', unsafe_allow_html=True)
    st.markdown("<h2>Upload documents</h2>", unsafe_allow_html=True)
    uploaded_files = st.file_uploader("Upload one or more documents", type=['pdf', 'docx'], accept_multiple_files=True)
    question = st.text_input("Ask a question based on the documents", key="question_input")

    # Cosmetic progress animation; it does not track any actual work.
    progress = st.progress(0)
    for i in range(100):
        progress.progress(i + 1)
        time.sleep(0.01)

    if not uploaded_files:
        st.markdown("<h2>Please upload a document to proceed.</h2>", unsafe_allow_html=True)
        return

    df = _load_paragraphs(uploaded_files)

    if "interactions" not in st.session_state:
        st.session_state["interactions"] = []

    # Only query when the question is non-blank and differs from the one
    # handled on the previous run; this avoids re-answering on every rerun
    # and avoids sending an empty query after the box is cleared.
    is_new_question = question != st.session_state.get("last_question", "")
    if question.strip() and is_new_question:
        st.text("Searching...")
        answer = search.answer_query_with_context(question, df, tokenizer, model)
        st.session_state["interactions"].append((question, answer))
        st.write(answer)

    st.markdown("### Interaction History")
    for q, a in st.session_state["interactions"]:
        st.write(f"**Q:** {q}\n\n**A:** {a}")

    st.session_state["last_question"] = question

    st.markdown("<h2>Sample paragraphs</h2>", unsafe_allow_html=True)
    sample_size = min(len(df), 5)
    st.dataframe(df.sample(n=sample_size))

    # One button per export format, rendered in this order.
    exporters = (
        ("Save as PDF", save_as_pdf),
        ("Save as DOCX", save_as_docx),
        ("Save as XLSX", save_as_xlsx),
        ("Save as TXT", save_as_txt),
    )
    for label, exporter in exporters:
        if st.button(label):
            exporter(st.session_state["interactions"])

# Render the page only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()