# Setswana NER Demo
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import streamlit as st
import pandas as pd
import spacy
st.set_page_config(layout="wide")
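# Default example sentence in Setswana (roughly: "This morning on the hourly
# news, wake up with Oarabile Moamogwe starting from 05:00 - 10:00").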
example_list = [
"Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
]
st.title("Demo for Setswana NER Models")
st.write("A Setswana Language Model Finetuned on MasakhaNER-2 for Named Entity Recognition")
st.write("Co-authors: Vukosi Marivate (@vukosi), Moseli Mots'Oehli (@MoseliMotsoehli), Valencia Wagner, Richard Lastrucci and Isheanesu Dzingirai")
st.write("Link to model: https://huggingface.co/dsfsi/PuoBERTa-NER")
model_list = ['dsfsi/PuoBERTa-NER']
st.sidebar.header("Select NER Model")
model_checkpoint = st.sidebar.radio("", model_list)
# Only PuoBERTa-NER is listed, so the aggregation strategy is fixed.
aggregation = "simple"
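# Note: Hugging Face token-classification pipelines accept the aggregation
# strategies "none", "simple", "first", "average" and "max"; "simple" merges
# adjacent sub-tokens that the model assigns the same entity type.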
st.subheader("Select Text Input Method")
input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text', 'Upload CSV File'))
if input_method == 'Select from Examples':
    selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
    st.subheader("Text to Run")
    input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)
elif input_method == "Write or Paste New Text":
    st.subheader("Text to Run")
    input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)
elif input_method == "Upload CSV File":
    st.subheader("Upload CSV File")
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    input_text = ""
    if uploaded_file is not None:
        df_csv = pd.read_csv(uploaded_file)
        st.write(df_csv)
        # Flatten every non-empty cell of the CSV into a list of candidate sentences.
        sentences = []
        for index, row in df_csv.iterrows():
            for col in df_csv.columns:
                sentence = row[col]
                if pd.notna(sentence):  # skip empty or NaN cells
                    sentences.append(sentence)
        input_text = st.selectbox("Select the sentence to run", sentences)
@st.cache_resource
def setModel(model_checkpoint, aggregation):
    # Load the selected checkpoint once and cache the resulting pipeline.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    return pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy=aggregation)
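# With aggregation enabled, the pipeline returns one dict per entity span, e.g.
# {"entity_group": "PER", "score": 0.99, "word": "Oarabile Moamogwe", "start": 38, "end": 55}
# (the values above are illustrative, not actual model output).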
@st.cache_resource
def get_html(html: str):
    # Wrap the rendered displaCy markup in a scrollable, bordered container.
    WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
    html = html.replace("\n", " ")
    return WRAPPER.format(html)
@st.cache_resource
def entity_comb(output):
    # Merge adjacent entity spans of the same type that touch each other
    # (i.e. one span starts exactly where the previous one ends).
    output_comb = []
    for ind, entity in enumerate(output):
        if ind == 0:
            output_comb.append(entity)
        elif output[ind]["start"] == output[ind-1]["end"] and output[ind]["entity_group"] == output[ind-1]["entity_group"]:
            output_comb[-1]["word"] = output_comb[-1]["word"] + output[ind]["word"]
            output_comb[-1]["end"] = output[ind]["end"]
        else:
            output_comb.append(entity)
    return output_comb
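# Illustrative (hypothetical) example of the merge: the adjacent sub-word spans
#   {"word": "Oara", "start": 38, "end": 42, "entity_group": "PER"}
#   {"word": "bile", "start": 42, "end": 46, "entity_group": "PER"}
# would be combined into
#   {"word": "Oarabile", "start": 38, "end": 46, "entity_group": "PER"}.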
Run_Button = st.button("Run", key=None)
if Run_Button and input_text != "":
    ner_pipeline = setModel(model_checkpoint, aggregation)
    output = ner_pipeline(input_text)
    output_comb = entity_comb(output)
    if not output_comb:
        st.info("No entities were recognized in the input text.")
    else:
        df = pd.DataFrame.from_dict(output_comb)
        cols_to_keep = ['word', 'entity_group', 'score', 'start', 'end']
        df_final = df[cols_to_keep]
        st.subheader("Recognized Entities")
        st.dataframe(df_final)
        st.subheader("Spacy Style Display")
        spacy_display = {}
        spacy_display["ents"] = []
        spacy_display["text"] = input_text
        spacy_display["title"] = None
        for entity in output_comb:
            spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": entity["entity_group"]})
        spacy_entity_list = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "MISC"]
        # MasakhaNER-2 tags people as "PER"; displaCy's default scheme uses "PERSON".
        for ent in spacy_display["ents"]:
            if ent["label"] == "PER":
                ent["label"] = "PERSON"
        # colors = {'PER': '#85DCDF', 'LOC': '#DF85DC', 'ORG': '#DCDF85', 'MISC': '#85ABDF',}
        html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True, options={"ents": spacy_entity_list})  # , "colors": colors})
        style = "<style>mark.entity { display: inline-block }</style>"
        st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)