# Write the Streamlit app script

import os

import nltk
import openai
import pdfplumber
import streamlit as st
import torch
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize
from torch import nn
from transformers import RobertaModel, RobertaTokenizer

# Download the 'punkt' sentence tokenizer
nltk.download('punkt')

# Read the OpenAI API key from the environment rather than hard-coding it
openai.api_key = os.getenv("OPENAI_API_KEY")

# Define the model architecture: a RoBERTa encoder with a two-class
# classification head that scores each sentence for relevance
class Bert_model(nn.Module):
    def __init__(self, hidden_size, dropout_rate):
        super(Bert_model, self).__init__()
        self.hidden_size = hidden_size
        self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')
        self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, input_ids, attention_mask):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_sequence_output = bert_outputs.last_hidden_state
        # Use the first (<s>) token representation as the pooled output
        bert_pooled_output = bert_sequence_output[:, 0, :]
        pooled_output = self.cls_prj(bert_pooled_output)
        pooled_output = self.cls_dropout(pooled_output)
        logits = self.cls_final(pooled_output)
        return logits

# Load the fine-tuned weights
model_path = "/content/model.pt"  # Replace with your actual model path
device = "cuda" if torch.cuda.is_available() else "cpu"
state_dict = torch.load(model_path, map_location=device)

# Instantiate the model architecture
model = Bert_model(hidden_size=768, dropout_rate=0.1)  # Hidden size must match the saved model
model = nn.DataParallel(model)  # The checkpoint was saved from a DataParallel-wrapped model
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')

# Extract and tokenize the PDF text (the first two pages are skipped)
def preprocess_pdf(pdf_path, tokenizer):
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for empty pages, so fall back to ""
        text = " ".join([page.extract_text() or "" for page in pdf.pages[2:]])
    tokenized_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids = torch.tensor([tokenized_text['input_ids']])
    attention_mask = torch.tensor([tokenized_text['attention_mask']])
    return input_ids, attention_mask, text

def translate_text(text, target_language):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that translates English text to other languages."},
            {"role": "user", "content": f'Translate the following English text to {target_language}: "{text}"'},
        ],
    )
    return response.choices[0].message['content']

def explain_term(term):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that provides definitions."},
}, { "role": "user", "content": f"Explain the term: {term}" }, ], ) return response['choices'][0]['message']['content'] # Streamlit code to upload file st.title('FinQA (Financial Question-Answering)') uploaded_file = st.file_uploader("Choose a PDF file", type="pdf") # Select language language = st.selectbox('Select your language', ['English', 'French','Chinese','Korean','Spanish','German','Japanese']) if uploaded_file is not None: with open("temp.pdf", "wb") as f: f.write(uploaded_file.getbuffer()) input_ids, attention_mask, text = preprocess_pdf("temp.pdf", tokenizer) st.write('File successfully uploaded and processed') # Ask a question question = st.text_input("Enter your question:") if question: sentences = sent_tokenize(text) predictions = [] for sentence in sentences: inputs = tokenizer.encode_plus(question, sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=512) input_ids = inputs['input_ids'].to(device) attention_mask = inputs['attention_mask'].to(device) with torch.no_grad(): outputs = model(input_ids, attention_mask) probabilities = F.softmax(outputs, dim=1) max_value, max_index = torch.max(probabilities, dim=1) prediction = max_index.item() predictions.append((sentence, prediction, probabilities[0].tolist())) predictions.sort(key=lambda pair: pair[1], reverse=True) top_5_sentences = predictions[:13] #st.write("Top 5 Relevant Sentences:") #for sentence, prediction, probabilities in top_5_sentences: #st.write(f"Sentence: {sentence}, Prediction: {prediction}, Probability: {probabilities[prediction]}") # Prepare the chat history with the top 3 sentences chat_history = "\n".join([sentence[0] for sentence in top_5_sentences]) # Ask the question using OpenAI API openai.api_key = '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' # Replace with your actual OpenAI API key response = openai.ChatCompletion.create( model="gpt-4", messages=[ {"role": "system", "content": "You are a helpful generator which read the short paragraphs and answer the question."}, {"role": "user", "content": chat_history}, {"role": "user", "content": question}, ] ) if language != 'English': response_content = translate_text(response.choices[0].message['content'], language) else: response_content = response.choices[0].message['content'] st.text("Answer: " + response_content) term = st.text_input("Enter a term you want to define:") if term: # Define the term using OpenAI API definition = explain_term(term) if language != 'English': definition = translate_text(definition, language) st.text("Definition: " + definition)