|
import streamlit as st |
|
import transformers |
|
import numpy as np |
|
import pandas as pd |
|
from scipy.special import softmax |
|
import torch |
|
|
|
|
|
def preprocess(text):
    """Mask user mentions and links in *text*.

    The text is split on single spaces. A token that starts with ``@`` and
    is longer than one character becomes ``@user``; any other token that
    starts with ``http`` becomes ``http``. Tokens are re-joined with single
    spaces, so the spacing of the input is preserved.
    """
    def _mask(token):
        # The mention check comes first, so "@http..." is treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
|
|
|
|
|
# --- Page layout -------------------------------------------------------------
st.title("Toxicity Classification App")

user_input = st.text_input("Input texts to analyze", "Cody Jiang is a happy boy!")

model_names = ['distilbert-base-uncased-finetuned-sst-2-english', 'Codys-Finetuning-Language-Model']
model_name = st.selectbox("Select a pretrained model", model_names)

# --- Model loading -----------------------------------------------------------
if model_name == "Codys-Finetuning-Language-Model":
    # Weights come from the local ./myModel/ directory; the tokenizer is the
    # stock "bert-base-uncased" one.
    model = transformers.BertForSequenceClassification.from_pretrained("./myModel/")
    tokenizer = transformers.BertTokenizerFast.from_pretrained("bert-base-uncased")
else:
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    # The config supplies the id -> label mapping used when reporting results.
    config = transformers.AutoConfig.from_pretrained(model_name)
    model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

# --- Classification ----------------------------------------------------------
if st.button("Submit"):
    columns = ["Tweet", "Toxicity Class", "Probability"]

    if model_name == "Codys-Finetuning-Language-Model":
        input_val = tokenizer(user_input, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            output_val = model(**input_val)
        # A sigmoid is applied to every logit independently, so the scores
        # are per-label and do not sum to one.
        probabilities = torch.sigmoid(output_val.logits)
        result_list = probabilities.tolist()[0]
        toxicity_class = ["toxic", "severe toxic", "obscene", "threat", "insult", "identity hate"]
        top_probability = max(result_list)
        top_label = toxicity_class[result_list.index(top_probability)]
    else:
        text = preprocess(user_input)
        # Truncate like the custom-model branch does; without it, text that
        # tokenizes to more positions than the model supports makes the
        # forward pass fail.
        encoded_input = tokenizer(text, truncation=True, max_length=512, return_tensors='pt')
        with torch.no_grad():
            output = model(**encoded_input)
        scores = softmax(output[0][0].detach().numpy())
        # Only the best-scoring label is displayed, so a full sort is not needed.
        top_index = int(np.argmax(scores))
        top_label = config.id2label[top_index]
        top_probability = scores[top_index]

    # Single-row table; the original (unpreprocessed) input is what is shown.
    result_df = pd.DataFrame(columns=columns)
    result_df.loc[0] = [user_input, top_label, top_probability]
    st.table(result_df)
|
|