|
import gradio as gr |
|
import torch |
|
from transformers import AutoTokenizer, AutoModelForTokenClassification |
|
|
|
# Load the IndicNER tokenizer from the Hugging Face Hub (downloads on first
# run, cached afterwards). Shared by every request handled by the app.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")



# Token-classification model fine-tuned for NER on Indian languages; used
# read-only (inference) by get_ner below.
model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")
|
|
|
|
|
def get_ner(sentence):
    """Run IndicNER over ``sentence`` and return per-word entity labels.

    Args:
        sentence: Input text. Words are assumed to be separated by single
            spaces, since labels are aligned back to ``sentence.split(' ')``.

    Returns:
        A list of ``(word, label)`` tuples (one per whitespace-separated
        word) suitable for a Gradio "highlight" output, where ``label`` is
        the model's IOB tag string from ``model.config.id2label``.
    """
    tok_sentence = tokenizer(sentence, return_tensors='pt')

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**tok_sentence).logits.argmax(-1)
        predicted_tokens_classes = [
            model.config.id2label[t.item()] for t in logits[0]]

    # The tokenizer may split one word into several sub-word pieces. Keep
    # the label of the FIRST piece of each word; skip the remaining pieces
    # and the special tokens (whose word id is None).
    #
    # previous_word_id starts at None (not 0, as before): with the old
    # sentinel of 0 the first word's label would be silently dropped if the
    # sequence did not begin with a special token.
    predicted_labels = []
    previous_word_id = None
    for token_index, word_id in enumerate(tok_sentence.word_ids()):
        if word_id is not None and word_id != previous_word_id:
            predicted_labels.append(predicted_tokens_classes[token_index])
        previous_word_id = word_id

    # Pair each word with its label. zip truncates to the shorter sequence,
    # so a rare word/label count mismatch no longer raises IndexError.
    return list(zip(sentence.split(' '), predicted_labels))
|
|
|
|
|
# Gradio UI: a single free-text input wired to get_ner, rendered as a
# highlighted-entities output. Examples cover Hindi and Kannada.
iface = gr.Interface(get_ner,

                     gr.Textbox(placeholder="Enter sentence here..."),

                     ["highlight"], description='The 11 languages covered by IndicNER are: Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Oriya, Punjabi, Tamil, Telugu.',

                     examples=['लगातार हमलावर हो रहे शिवपाल और राजभर को सपा की दो टूक, चिट्ठी जारी कर कहा- जहां जाना चाहें जा सकते हैं', 'ಶರಣ್ ರ ನೀವು ನೋಡಲೇಬೇಕಾದ ಟಾಪ್ 5 ಕಾಮಿಡಿ ಚಲನಚಿತ್ರಗಳು'], title='IndicNER',

                     article='IndicNER is a model trained to complete the task of identifying named entities from sentences in Indian languages. Our model is specifically fine-tuned to the 11 Indian languages mentioned above over millions of sentences. The model is then benchmarked over a human annotated testset and multiple other publicly available Indian NER datasets.'

                     )



# Start the web server, queueing concurrent requests.
# NOTE(review): enable_queue= was deprecated in newer Gradio releases in
# favour of iface.queue().launch() — confirm against the pinned version.
iface.launch(enable_queue=True)
|
|