# Write the Streamlit app script
# Write the Streamlit app script
import streamlit as st
import pdfplumber
import torch
from transformers import RobertaTokenizer, RobertaModel
import nltk
import openai
from torch import nn
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize

# Download the 'punkt' package
nltk.download('punkt')

openai.api_key = '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'

# Define your model architecture
class Bert_model(nn.Module):
    def __init__(self, hidden_size, dropout_rate):
        super(Bert_model, self).__init__()
        self.hidden_size = hidden_size
        self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')
        self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, input_ids, attention_mask):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_sequence_output = bert_outputs.last_hidden_state
        bert_pooled_output = bert_sequence_output[:, 0, :]
        pooled_output = self.cls_prj(bert_pooled_output)
        pooled_output = self.cls_dropout(pooled_output)
        logits = self.cls_final(pooled_output)
        return logits

# Load the model
model_path = "/content/model.pt"  # Replace with your actual model path
state_dict = torch.load(model_path)
device = "cuda"  # or "cpu" if GPU is not available

# Instantiate the model architecture
model = Bert_model(hidden_size=768, dropout_rate=0.1)  # Adjust the hidden size to match the saved model
model = nn.DataParallel(model)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')

# Function to preprocess PDF text
def preprocess_pdf(pdf_path, tokenizer):
    with pdfplumber.open(pdf_path) as pdf:
        text = " ".join([page.extract_text() for page in pdf.pages[2:]])
        tokenized_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids = torch.tensor([tokenized_text['input_ids']])
        attention_mask = torch.tensor([tokenized_text['attention_mask']])
        return input_ids, attention_mask, text

def translate_text(text, target_language):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that translates English text to other languages."},
            {"role": "user", "content": f'Translate the following English text to {target_language}: "{text}"'},
        ],
    )
    return response.choices[0].message['content']

def explain_term(term):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that provides definitions."
            },
            {
                "role": "user",
                "content": f"Explain the term: {term}"
            },
        ],
    )
    return response['choices'][0]['message']['content']

# Streamlit code to upload file
st.title('FinQA (Financial Question-Answering)')
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

# Select language
language = st.selectbox('Select your language', ['English', 'French','Chinese','Korean','Spanish','German','Japanese'])

if uploaded_file is not None:
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())
    input_ids, attention_mask, text = preprocess_pdf("temp.pdf", tokenizer)
    st.write('File successfully uploaded and processed')

    # Ask a question
    question = st.text_input("Enter your question:")

    if question:
        sentences = sent_tokenize(text)
        predictions = []

        for sentence in sentences:
            inputs = tokenizer.encode_plus(question, sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask)
                probabilities = F.softmax(outputs, dim=1)
                max_value, max_index = torch.max(probabilities, dim=1)
                prediction = max_index.item()
                predictions.append((sentence, prediction, probabilities[0].tolist()))

        predictions.sort(key=lambda pair: pair[1], reverse=True)
        top_5_sentences = predictions[:13]

        #st.write("Top 5 Relevant Sentences:")
        #for sentence, prediction, probabilities in top_5_sentences:
            #st.write(f"Sentence: {sentence}, Prediction: {prediction}, Probability: {probabilities[prediction]}")

        # Prepare the chat history with the top 3 sentences
        chat_history = "\n".join([sentence[0] for sentence in top_5_sentences])

        # Ask the question using OpenAI API
        openai.api_key = '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'  # Replace with your actual OpenAI API key

        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful generator which read the short paragraphs and answer the question."},
                {"role": "user", "content": chat_history},
                {"role": "user", "content": question},
            ]
        )

        if language != 'English':
            response_content = translate_text(response.choices[0].message['content'], language)
        else:
            response_content = response.choices[0].message['content']

        st.text("Answer: " + response_content)

term = st.text_input("Enter a term you want to define:")

if term:
    # Define the term using OpenAI API
    definition = explain_term(term)

    if language != 'English':
        definition = translate_text(definition, language)

    st.text("Definition: " + definition)