File size: 5,692 Bytes
e6bfe5c
 
85d8808
8d47d74
e6bfe5c
 
d25abcf
e6bfe5c
1b711d9
 
e6bfe5c
670cc01
e6bfe5c
 
271b1e6
e6bfe5c
125e609
 
 
 
b8834e9
f474e98
e6bfe5c
f474e98
 
 
a834bc3
4874aa0
6bef59a
 
a931c89
4874aa0
f474e98
443b5af
f474e98
4874aa0
a834bc3
f474e98
f33ab56
1eab2d6
f474e98
 
 
e6bfe5c
f474e98
 
aabb16c
 
 
 
 
 
 
900bd10
02c44c0
900bd10
 
 
 
 
 
 
 
aabb16c
e6bfe5c
f193a60
e6bfe5c
271b1e6
 
f1ea600
5b3d11c
f193a60
d84f151
 
 
 
4874aa0
f193a60
1b711d9
4874aa0
 
 
 
32e294e
3598e61
 
4874aa0
 
1b711d9
 
 
e6bfe5c
8961cd3
1b711d9
 
 
 
 
 
4874aa0
443b5af
e6bfe5c
 
d25abcf
e6bfe5c
d25abcf
 
 
 
 
 
 
4874aa0
bc06b7e
 
c558c48
 
 
 
a931c89
c558c48
 
 
 
7ae6e02
c558c48
d25abcf
9c13337
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Setswana NER Demo for the PuoBERTa-NER Model

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, DebertaV2Tokenizer, DebertaV2Model
import sentencepiece
import streamlit as st
import pandas as pd
import spacy

# Page-level configuration; the wide layout is set before any other output.
st.set_page_config(layout="wide")

# Sentences offered in the "Select from Examples" input mode.
example_list = [
 "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"   
]

st.title("Demo for Setswana NER Models")

st.write("A Setswana Language Model Finetuned on MasakhaNER-2 for Named Entity Recognition")
st.write("Co authors :  Vukosi Marivate (@vukosi), Moseli Mots'Oehli (@MoseliMotsoehli) , Valencia Wagner, Richard Lastrucci and Isheanesu Dzingirai")
st.write("Link to model:  https://huggingface.co/dsfsi/PuoBERTa")

# Checkpoints selectable from the sidebar.
model_list = ['dsfsi/PuoBERTa-NER']

st.sidebar.header("Select NER Model")
# The header above already names the control, so the widget's own label is
# hidden rather than left empty (Streamlit discourages empty labels).
model_checkpoint = st.sidebar.radio(
    "Select NER Model", model_list, label_visibility="collapsed"
)


# Choose the pipeline aggregation strategy for the selected checkpoint.
if model_checkpoint in (
    "xlm-roberta-large-finetuned-conll03-english",
    "asahi417/tner-xlm-roberta-base-ontonotes5",
):
    # These checkpoints additionally get an explanatory note in the sidebar.
    aggregation = "simple"
    st.sidebar.write("")
    st.sidebar.write("The selected NER model is included just to show the zero-shot transfer learning capability of XLM-Roberta pretrained language model.")
elif model_checkpoint in (
    "akdeniz27/xlm-roberta-base-turkish-ner",
    "dsfsi/PuoBERTa-NER",
):
    aggregation = "simple"
else:
    # Any checkpoint not listed above uses the "first" strategy.
    aggregation = "first"
    
st.subheader("Select Text Input Method")

# Start from an empty string so `input_text` is always defined, even when the
# CSV branch has no (usable) file yet; the Run handler compares it against "".
input_text = ""

# The subheader above names the control, so the widget label is hidden
# rather than left empty (Streamlit discourages empty labels).
input_method = st.radio(
    "Select Text Input Method",
    ('Select from Examples', 'Write or Paste New Text', 'Upload CSV File'),
    label_visibility="collapsed",
)

if input_method == 'Select from Examples':
    selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
    st.subheader("Text to Run")
    input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)
elif input_method == "Write or Paste New Text":
    st.subheader("Text to Run")
    input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)
elif input_method == "Upload CSV File":
    st.subheader("Upload CSV File")
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

    if uploaded_file is not None:
        try:
            df_csv = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            # Report an unreadable upload instead of crashing the app.
            st.error(f"Could not read the CSV file: {exc}")
        else:
            st.write(df_csv)
            # Flatten every non-missing cell (row by row, column by column)
            # into a list of candidate texts. Cells are converted to `str`
            # because numeric cells cannot be fed to the NER pipeline.
            sentences = [
                str(cell)
                for _, row in df_csv.iterrows()
                for cell in row
                if pd.notna(cell)
            ]
            if sentences:
                # The options are individual cell values, not column names.
                input_text = st.selectbox("Select the text to run", sentences)
            else:
                st.warning("The uploaded CSV file contains no text.")


@st.cache_resource
def setModel(model_checkpoint, aggregation):
    """Build a token-classification pipeline for the given checkpoint.

    Args:
        model_checkpoint: Hugging Face model identifier to load. A falsy
            value falls back to ``"dsfsi/PuoBERTa-NER"``.
        aggregation: Value passed to the pipeline as ``aggregation_strategy``.

    Returns:
        The ``transformers`` pipeline built from the loaded model and
        tokenizer.
    """
    # Load the checkpoint that was actually requested, so the cached pipeline
    # always corresponds to the arguments that key the cache.
    checkpoint = model_checkpoint or "dsfsi/PuoBERTa-NER"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    return pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy=aggregation)

@st.cache_resource
def get_html(html: str):
    """Return *html* wrapped in a bordered, horizontally scrollable ``<div>``.

    Every newline in *html* is replaced by a single space before wrapping.
    """
    single_line = " ".join(html.split("\n"))
    template = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
    return template.format(single_line)
    
@st.cache_resource
def entity_comb(output):
    """Merge directly adjacent entities that share the same entity group.

    An entity is folded into the previously kept one when its ``"start"``
    equals the preceding entity's ``"end"`` and both have the same
    ``"entity_group"``. The merged entry's ``"word"`` is the concatenation of
    the two words (no separator) and its ``"end"`` is extended; all other
    fields (for example ``"score"``) keep the value of the first entity.

    Args:
        output: Sequence of entity dicts with at least the keys ``"word"``,
            ``"start"``, ``"end"`` and ``"entity_group"``.

    Returns:
        A new list of entity dicts. The dicts in ``output`` are copied, so
        the caller's data is never modified.
    """
    output_comb = []
    previous = None
    for entity in output:
        touches_previous = (
            previous is not None
            and entity["start"] == previous["end"]
            and entity["entity_group"] == previous["entity_group"]
        )
        if touches_previous:
            merged = output_comb[-1]
            merged["word"] = merged["word"] + entity["word"]
            merged["end"] = entity["end"]
        else:
            # Shallow copy: merging into this entry later must not mutate
            # the dict owned by the caller.
            output_comb.append(dict(entity))
        previous = entity
    return output_comb
        
Run_Button = st.button("Run", key=None)

# Run the model only on an explicit click and when there is text to process.
if Run_Button and input_text != "":

    ner_pipeline = setModel(model_checkpoint, aggregation)
    output = ner_pipeline(input_text)

    output_comb = entity_comb(output)

    cols_to_keep = ['word', 'entity_group', 'score', 'start', 'end']
    # Passing `columns` to the constructor selects these fields and, unlike
    # indexing an empty frame, still works when no entity was recognized.
    df_final = pd.DataFrame(output_comb, columns=cols_to_keep)

    st.subheader("Recognized Entities")
    st.dataframe(df_final)

    st.subheader("Spacy Style Display")

    tner_entity_list = ["person", "group", "facility", "organization", "geopolitical area", "location", "product", "event", "work of art", "law", "language", "date", "time", "percent", "money", "quantity", "ordinal number", "cardinal number"]
    spacy_entity_list = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "MISC"]

    # Translate model labels to the labels displaCy is asked to show.
    if model_checkpoint == "asahi417/tner-xlm-roberta-base-ontonotes5":
        # Position-wise mapping between the two lists above.
        label_map = dict(zip(tner_entity_list, spacy_entity_list))
    else:
        label_map = {"PER": "PERSON"}

    # Manual-mode displaCy input: the raw text plus character-offset spans.
    # Labels missing from the mapping are passed through unchanged.
    spacy_display = {
        "ents": [
            {
                "start": entity["start"],
                "end": entity["end"],
                "label": label_map.get(entity["entity_group"], entity["entity_group"]),
            }
            for entity in output_comb
        ],
        "text": input_text,
        "title": None,
    }

    # colors = {'PER': '#85DCDF', 'LOC': '#DF85DC', 'ORG': '#DCDF85', 'MISC': '#85ABDF',}
    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True, options={"ents": spacy_entity_list}) # , "colors": colors})
    style = "<style>mark.entity { display: inline-block }</style>"
    st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)