# Turkish NER Demo for Various Models from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, DebertaV2Tokenizer, DebertaV2Model import sentencepiece import streamlit as st import pandas as pd import spacy st.set_page_config(layout="wide") example_list = [ "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00" ] st.title("Demo for Sestwana NER Models") st.write("A Setswana Langage Model Finetuned on MasakhaNER-2 for Named Entity Recognition") st.write("Co authors : Vukosi Marivate (@vukosi), Moseli Mots'Oehli (@MoseliMotsoehli) , Valencia Wagner, Richard Lastrucci and Isheanesu Dzingirai") st.write("Link to model: https://huggingface.co/dsfsi/PuoBERTa") model_list = ['dsfsi/PuoBERTa-NER'] st.sidebar.header("Select NER Model") model_checkpoint = st.sidebar.radio("", model_list) if model_checkpoint == "akdeniz27/xlm-roberta-base-turkish-ner": aggregation = "simple" elif model_checkpoint == "dsfsi/PuoBERTa-NER": aggregation = "simple" elif model_checkpoint == "xlm-roberta-large-finetuned-conll03-english" or model_checkpoint == "asahi417/tner-xlm-roberta-base-ontonotes5": aggregation = "simple" st.sidebar.write("") st.sidebar.write("The selected NER model is included just to show the zero-shot transfer learning capability of XLM-Roberta pretrained language model.") else: aggregation = "first" st.subheader("Select Text Input Method") input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text','Upload CSV File')) if input_method == 'Select from Examples': selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1) st.subheader("Text to Run") input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2) elif input_method == "Write or Paste New Text": st.subheader("Text to Run") input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2) elif input_method == "Upload CSV File": st.subheader("Upload CSV File") uploaded_file = st.file_uploader("Choose a CSV file", type="csv") if uploaded_file is not None: df_csv = pd.read_csv(uploaded_file) st.write(df_csv) sentences = [] for index, row in df_csv.iterrows(): for col in df_csv.columns: # Add each sentence from the row and columns into the list sentence = row[col] if pd.notna(sentence): # Ensure it is not empty or NaN sentences.append(sentence) text_column = st.selectbox("Select the column containing text", sentences) input_text = text_column @st.cache_resource def setModel(model_checkpoint, aggregation): tokenizer = AutoTokenizer.from_pretrained("dsfsi/PuoBERTa-NER") model = AutoModelForTokenClassification.from_pretrained("dsfsi/PuoBERTa-NER") return pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy=aggregation) @st.cache_resource def get_html(html: str): WRAPPER = """
{}
""" html = html.replace("\n", " ") return WRAPPER.format(html) @st.cache_resource def entity_comb(output): output_comb = [] for ind, entity in enumerate(output): if ind == 0: output_comb.append(entity) elif output[ind]["start"] == output[ind-1]["end"] and output[ind]["entity_group"] == output[ind-1]["entity_group"]: output_comb[-1]["word"] = output_comb[-1]["word"] + output[ind]["word"] output_comb[-1]["end"] = output[ind]["end"] else: output_comb.append(entity) return output_comb Run_Button = st.button("Run", key=None) if Run_Button and input_text != "": ner_pipeline = setModel(model_checkpoint, aggregation) output = ner_pipeline(input_text) output_comb = entity_comb(output) df = pd.DataFrame.from_dict(output_comb) cols_to_keep = ['word','entity_group','score','start','end'] df_final = df[cols_to_keep] st.subheader("Recognized Entities") st.dataframe(df_final) st.subheader("Spacy Style Display") spacy_display = {} spacy_display["ents"] = [] spacy_display["text"] = input_text spacy_display["title"] = None for entity in output_comb: spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": entity["entity_group"]}) tner_entity_list = ["person", "group", "facility", "organization", "geopolitical area", "location", "product", "event", "work of art", "law", "language", "date", "time", "percent", "money", "quantity", "ordinal number", "cardinal number"] spacy_entity_list = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "MISC"] for ent in spacy_display["ents"]: if model_checkpoint == "asahi417/tner-xlm-roberta-base-ontonotes5": ent["label"] = spacy_entity_list[tner_entity_list.index(ent["label"])] else: if ent["label"] == "PER": ent["label"] = "PERSON" # colors = {'PER': '#85DCDF', 'LOC': '#DF85DC', 'ORG': '#DCDF85', 'MISC': '#85ABDF',} html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True, options={"ents": spacy_entity_list}) # , "colors": colors}) style = "" st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)