import streamlit as st
import transformers
import numpy as np
import pandas as pd
from scipy.special import softmax
import torch
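
# Replace @mentions and URLs with placeholder tokens before tokenization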
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
st.title("Toxicity Classification App")
user_input = st.text_input("Input texts to analyze", "Cody Jiang is a happy boy!")
model_names = ['distilbert-base-uncased-finetuned-sst-2-english', 'Codys-Finetuning-Language-Model']
model_name = st.selectbox("Select a pretrained model", model_names)
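
# Load the selected model: either the locally fine-tuned BERT checkpoint
# or a pretrained checkpoint pulled from the Hugging Face Hub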
if model_name == "Codys-Finetuning-Language-Model":
    model = transformers.BertForSequenceClassification.from_pretrained("./myModel/")
    tokenizer = transformers.BertTokenizerFast.from_pretrained("bert-base-uncased")
else:
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    config = transformers.AutoConfig.from_pretrained(model_name)
    model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)
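
# Run inference only when the user presses Submit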
if st.button("Submit"):
if model_name == "Codys-Finetuning-Language-Model":
input_val = tokenizer(user_input, padding=True, truncation=True, max_length=512, return_tensors="pt")
output_val = model(**input_val)
probabilities = torch.sigmoid(output_val.logits)
result_list = probabilities.tolist()[0]
columns = ["Tweet", "Toxicity Class", "Probability"]
toxicity_class = ["toxic", "severe toxic", "obscene", "threat", "insult", "identity hate"]
result_df = pd.DataFrame(columns=columns)
result_df.loc[0] = [user_input, toxicity_class[result_list.index(max(result_list))], max(result_list)]
st.table(result_df)
else:
text = preprocess(user_input)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
ranking = np.argsort(scores)
ranking = ranking[::-1]
columns = ["Tweet", "Toxicity Class", "Probability"]
toxicity_class = config.id2label
result_list = []
columns_list = []
for i in range(scores.shape[0]):
l = toxicity_class[ranking[i]]
s = scores[ranking[i]]
result_list.append(s)
columns_list.append(l)
result_df = pd.DataFrame(columns=columns)
result_df.loc[0] = [user_input, columns_list[0], result_list[0]]
st.table(result_df)