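"""Streamlit app for classifying the toxicity of a short text.

The user picks either a locally fine-tuned BERT checkpoint or a pretrained
Hugging Face model; on Submit, the app runs the selected model on the input
text and shows the top predicted class with its probability.

Run with: streamlit run <path to this file>
"""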
import streamlit as st
import transformers
import numpy as np
import pandas as pd
from scipy.special import softmax
import torch


def preprocess(text):
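    """Normalize a tweet: mask @-mentions as '@user' and URLs as 'http'."""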
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


st.title("Toxicity Classification App")

user_input = st.text_input("Input texts to analyze", "Cody Jiang is a happy boy!")

model_names = ['distilbert-base-uncased-finetuned-sst-2-english', 'Codys-Finetuning-Language-Model']
model_name = st.selectbox("Select a pretrained model", model_names)

# Load the selected model and tokenizer: the fine-tuned checkpoint is read from
# a local directory, the default model is downloaded from the Hugging Face Hub.
if model_name == "Codys-Finetuning-Language-Model":
    model = transformers.BertForSequenceClassification.from_pretrained("./myModel/")
    tokenizer = transformers.BertTokenizerFast.from_pretrained("bert-base-uncased")
else:
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    config = transformers.AutoConfig.from_pretrained(model_name)  # id2label maps class ids to label names
    model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

# Run inference only when the user clicks Submit.
if st.button("Submit"):
    if model_name == "Codys-Finetuning-Language-Model":
        # Sigmoid scores each toxicity class independently (multi-label setup),
        # rather than normalizing across classes with a softmax.
        input_val = tokenizer(user_input, padding=True, truncation=True, max_length=512, return_tensors="pt")
        output_val = model(**input_val)
        probabilities = torch.sigmoid(output_val.logits)
        result_list = probabilities.tolist()[0]
        columns = ["Tweet", "Toxicity Class", "Probability"]
        toxicity_class = ["toxic", "severe toxic", "obscene", "threat", "insult", "identity hate"]
        # Report only the highest-scoring class and its probability.
        result_df = pd.DataFrame(columns=columns)
        result_df.loc[0] = [user_input, toxicity_class[result_list.index(max(result_list))], max(result_list)]
        st.table(result_df)

    else:
        # Single-label model: normalize the logits with a softmax and rank
        # the classes from most to least probable.
        text = preprocess(user_input)
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        # Sort class indices by descending probability.
        ranking = np.argsort(scores)[::-1]
        columns = ["Tweet", "Toxicity Class", "Probability"]
        id2label = config.id2label
        result_list = []
        labels_list = []
        for i in range(scores.shape[0]):
            labels_list.append(id2label[int(ranking[i])])
            result_list.append(scores[ranking[i]])
        # Report only the top-ranked class and its probability.
        result_df = pd.DataFrame(columns=columns)
        result_df.loc[0] = [user_input, labels_list[0], result_list[0]]
        st.table(result_df)