# Setswana NER demo: PuoBERTa fine-tuned on MasakhaNER 2.0
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import sentencepiece  # required at runtime by some Hugging Face tokenizers
import streamlit as st
import pandas as pd
import spacy

st.set_page_config(layout="wide")

example_list = [
    "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
]

st.title("Demo for Setswana NER Models")
st.write("A Setswana Language Model Fine-tuned on MasakhaNER 2.0 for Named Entity Recognition")
st.write("Co-authors: Vukosi Marivate (@vukosi), Moseli Mots'Oehli (@MoseliMotsoehli), Valencia Wagner, Richard Lastrucci and Isheanesu Dzingirai")
st.write("Link to model: https://huggingface.co/dsfsi/PuoBERTa")

model_list = ['dsfsi/PuoBERTa-NER']

st.sidebar.header("Select NER Model")
model_checkpoint = st.sidebar.radio("", model_list)

# Pick the aggregation strategy used to merge sub-word predictions into entities.
# Only 'dsfsi/PuoBERTa-NER' is currently in model_list; the other checkpoints are
# kept in case they are added back.
if model_checkpoint == "akdeniz27/xlm-roberta-base-turkish-ner":
    aggregation = "simple"
elif model_checkpoint == "dsfsi/PuoBERTa-NER":
    aggregation = "simple"
elif model_checkpoint in ("xlm-roberta-large-finetuned-conll03-english", "asahi417/tner-xlm-roberta-base-ontonotes5"):
    aggregation = "simple"
    st.sidebar.write("")
    st.sidebar.write("The selected NER model is included just to show the zero-shot transfer learning capability of the XLM-RoBERTa pretrained language model.")
else:
    aggregation = "first"

st.subheader("Select Text Input Method")
input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text', 'Upload CSV File'))

if input_method == 'Select from Examples':
    selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
    st.subheader("Text to Run")
    input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)
elif input_method == "Write or Paste New Text":
    st.subheader("Text to Run")
    input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)
elif input_method == "Upload CSV File":
    st.subheader("Upload CSV File")
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is not None:
        df_csv = pd.read_csv(uploaded_file)
        st.write(df_csv)
        # Add each non-empty cell from every row and column into the list of candidate sentences
        sentences = []
        for index, row in df_csv.iterrows():
            for col in df_csv.columns:
                sentence = row[col]
                if pd.notna(sentence):  # Ensure it is not empty or NaN
                    sentences.append(sentence)
        text_column = st.selectbox("Select the text to run", sentences)
        input_text = text_column

@st.cache_resource
def setModel(model_checkpoint, aggregation):
    # Load the selected checkpoint and build a token-classification pipeline.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    return pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy=aggregation)

@st.cache_resource
def get_html(html: str):
    WRAPPER = """