vukosi committed on
Commit
295300a
·
verified ·
1 Parent(s): 2157d2a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -123
app.py CHANGED
@@ -1,136 +1,84 @@
1
- # Turkish NER Demo for Various Models
2
-
3
- from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, DebertaV2Tokenizer, DebertaV2Model
4
- import sentencepiece
5
  import streamlit as st
 
6
  import pandas as pd
7
  import spacy
8
 
 
9
  st.set_page_config(layout="wide")
10
 
11
- example_list = [
12
- "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
13
- ]
14
-
15
- #logo
16
- img1, img2, img3 = st.columns(3)
17
- with img2:
18
- with st.container(border=False):
19
- st.image("logo_transparent_small.png")
20
-
21
-
22
- st.title("Demo for Sestwana NER Models")
23
 
24
- st.write("A Setswana Langage Model Finetuned on MasakhaNER-2 for Named Entity Recognition")
25
- st.write("Co authors : Vukosi Marivate (@vukosi), Moseli Mots'Oehli (@MoseliMotsoehli) , Valencia Wagner, Richard Lastrucci and Isheanesu Dzingirai")
26
- st.write("Link to model: https://arxiv.org/abs/2310.09141")
27
 
 
28
  model_list = ['dsfsi/PuoBERTa-NER']
29
-
30
- st.sidebar.header("Select NER Model")
31
- model_checkpoint = st.sidebar.radio("", model_list)
32
-
33
-
34
- if model_checkpoint == "akdeniz27/xlm-roberta-base-turkish-ner":
35
- aggregation = "simple"
36
- elif model_checkpoint == "dsfsi/PuoBERTa-NER":
37
- aggregation = "simple"
38
- elif model_checkpoint == "xlm-roberta-large-finetuned-conll03-english" or model_checkpoint == "asahi417/tner-xlm-roberta-base-ontonotes5":
39
- aggregation = "simple"
40
- st.sidebar.write("")
41
- st.sidebar.write("The selected NER model is included just to show the zero-shot transfer learning capability of XLM-Roberta pretrained language model.")
42
- else:
43
- aggregation = "first"
44
-
45
- st.subheader("Select Text Input Method")
46
- input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text','Upload CSV File'))
47
- if input_method == 'Select from Examples':
48
- selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
49
- st.subheader("Text to Run")
50
- input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)
51
- elif input_method == "Write or Paste New Text":
52
- st.subheader("Text to Run")
53
- input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)
54
- elif input_method == "Upload CSV File":
55
- st.subheader("Upload CSV File")
56
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
57
-
58
- if uploaded_file is not None:
59
- df_csv = pd.read_csv(uploaded_file)
60
- st.write(df_csv)
61
- sentences = []
62
- for index, row in df_csv.iterrows():
63
- for col in df_csv.columns:
64
- # Add each sentence from the row and columns into the list
65
- sentence = row[col]
66
- if pd.notna(sentence): # Ensure it is not empty or NaN
67
- sentences.append(sentence)
68
-
69
- text_column = st.selectbox("Select the column containing text", sentences)
70
- input_text = text_column
71
-
72
-
73
- @st.cache_resource
74
- def setModel(model_checkpoint, aggregation):
75
- tokenizer = AutoTokenizer.from_pretrained("dsfsi/PuoBERTa-NER")
76
- model = AutoModelForTokenClassification.from_pretrained("dsfsi/PuoBERTa-NER")
77
- return pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy=aggregation)
78
 
79
  @st.cache_resource
80
- def get_html(html: str):
81
- WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
82
- html = html.replace("\n", " ")
83
- return WRAPPER.format(html)
84
-
85
- @st.cache_resource
86
- def entity_comb(output):
87
- output_comb = []
88
- for ind, entity in enumerate(output):
89
- if ind == 0:
90
- output_comb.append(entity)
91
- elif output[ind]["start"] == output[ind-1]["end"] and output[ind]["entity_group"] == output[ind-1]["entity_group"]:
92
- output_comb[-1]["word"] = output_comb[-1]["word"] + output[ind]["word"]
93
- output_comb[-1]["end"] = output[ind]["end"]
94
  else:
95
- output_comb.append(entity)
96
- return output_comb
97
-
98
- Run_Button = st.button("Run", key=None)
99
-
100
- if Run_Button and input_text != "":
101
-
102
- ner_pipeline = setModel(model_checkpoint, aggregation)
103
- output = ner_pipeline(input_text)
104
-
105
- output_comb = entity_comb(output)
106
-
107
- df = pd.DataFrame.from_dict(output_comb)
108
- cols_to_keep = ['word','entity_group','score','start','end']
109
- df_final = df[cols_to_keep]
110
-
111
- st.subheader("Recognized Entities")
112
- st.dataframe(df_final)
113
-
114
- st.subheader("Spacy Style Display")
115
- spacy_display = {}
116
- spacy_display["ents"] = []
117
- spacy_display["text"] = input_text
118
- spacy_display["title"] = None
119
-
120
- for entity in output_comb:
121
- spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": entity["entity_group"]})
122
-
123
- tner_entity_list = ["person", "group", "facility", "organization", "geopolitical area", "location", "product", "event", "work of art", "law", "language", "date", "time", "percent", "money", "quantity", "ordinal number", "cardinal number"]
124
- spacy_entity_list = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "MISC"]
125
-
126
- for ent in spacy_display["ents"]:
127
- if model_checkpoint == "asahi417/tner-xlm-roberta-base-ontonotes5":
128
- ent["label"] = spacy_entity_list[tner_entity_list.index(ent["label"])]
129
  else:
130
- if ent["label"] == "PER": ent["label"] = "PERSON"
131
-
132
- # colors = {'PER': '#85DCDF', 'LOC': '#DF85DC', 'ORG': '#DCDF85', 'MISC': '#85ABDF',}
133
- html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True, options={"ents": spacy_entity_list}) # , "colors": colors})
134
- style = "<style>mark.entity { display: inline-block }</style>"
135
- st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)
136
-
 
 
 
 
 
1
import streamlit as st
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
import spacy

# -------------------- PAGE CONFIG --------------------
st.set_page_config(layout="wide")

# -------------------- UI HEADER --------------------
# NOTE(review): `use_column_width` is deprecated in newer Streamlit releases in
# favour of `use_container_width` — confirm the Streamlit version pinned for
# this Space before switching.
st.image("logo_transparent_small.png", use_column_width="always")
st.title("Demo for Setswana NER Models")
st.markdown("""
A Setswana Language Model fine-tuned on MasakhaNER-2 for Named Entity Recognition.

**Co-authors**: Vukosi Marivate (@vukosi), Moseli Mots'Oehli (@MoseliMotsoehli), Valencia Wagner, Richard Lastrucci, and Isheanesu Dzingirai
**Model link**: [arXiv:2310.09141](https://arxiv.org/abs/2310.09141)
""")

# -------------------- MODEL SELECTION --------------------
# Only one checkpoint is offered today; the sidebar radio keeps the UI ready
# for additional models to be appended to this list.
model_list = ['dsfsi/PuoBERTa-NER']
model_checkpoint = st.sidebar.radio("Select NER Model", model_list)
# Aggregation strategy passed to the token-classification pipeline (groups
# sub-word tokens into whole-entity spans).
aggregation_strategy = "simple"

# -------------------- TEXT INPUT --------------------
input_method = st.radio("Select Input Method", ['Example Text', 'Write Text', 'Upload CSV'])
27
def get_input_text():
    """Collect the text to analyse, according to the selected input method.

    Returns an empty string when nothing usable has been provided yet
    (e.g. no CSV uploaded), so callers can simply test truthiness.
    """
    if input_method == 'Example Text':
        example_sentences = [
            "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
        ]
        return st.selectbox("Example Sentences", example_sentences)

    if input_method == 'Write Text':
        return st.text_area("Enter text", height=128)

    if input_method == 'Upload CSV':
        uploaded = st.file_uploader("Upload CSV", type="csv")
        if uploaded:
            frame = pd.read_csv(uploaded)
            chosen_col = st.selectbox("Choose column with text", frame.columns)
            # One sentence per non-empty cell, joined into a single document.
            return "\n".join(frame[chosen_col].dropna().astype(str).tolist())

    return ""

input_text = get_input_text()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
@st.cache_resource
def load_ner_pipeline(model_checkpoint, strategy):
    """Build — and cache across reruns — a token-classification pipeline
    for the given checkpoint with the given sub-word aggregation strategy."""
    tok = AutoTokenizer.from_pretrained(model_checkpoint)
    mdl = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    return pipeline(
        "token-classification",
        model=mdl,
        tokenizer=tok,
        aggregation_strategy=strategy,
    )
50
+
51
def merge_entities(output):
    """Fuse consecutive pipeline entities into single spans.

    Two neighbouring entities are merged when the second starts at exactly
    the character offset where the first ended AND both share the same
    entity_group; the surviving entry keeps the first entity's score.
    Note: entries in the returned list are the same dicts as in `output`
    (merging mutates them in place).
    """
    combined = []
    previous = None
    for entity in output:
        adjacent = previous is not None and entity["start"] == previous["end"]
        same_group = previous is not None and entity["entity_group"] == previous["entity_group"]
        if adjacent and same_group:
            combined[-1]["word"] += entity["word"]
            combined[-1]["end"] = entity["end"]
        else:
            combined.append(entity)
        previous = entity
    return combined
60
+
61
+ if st.button("Run NER") and input_text.strip():
62
+ with st.spinner("Running NER..."):
63
+ ner = load_ner_pipeline(model_checkpoint, aggregation_strategy)
64
+ output = ner(input_text)
65
+ entities = merge_entities(output)
66
+
67
+ if entities:
68
+ df = pd.DataFrame(entities)[['word','entity_group','score','start','end']]
69
+ st.subheader("Recognized Entities")
70
+ st.dataframe(df)
71
+
72
+ # -------------------- SPACY STYLE VISUAL --------------------
73
+ spacy_display = {"text": input_text, "ents": [], "title": None}
74
+ for ent in entities:
75
+ label = ent["entity_group"]
76
+ if label == "PER":
77
+ label = "PERSON"
78
+ spacy_display["ents"].append({"start": ent["start"], "end": ent["end"], "label": label})
79
+
80
+ html = spacy.displacy.render(spacy_display, style="ent", manual=True, minify=True)
81
+ styled_html = f"<style>mark.entity {{ display: inline-block; }}</style><div style='overflow-x:auto;'>{html}</div>"
82
+ st.markdown(styled_html, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
83
  else:
84
+ st.info("No entities recognized in the input.")