import nltk
nltk.download('punkt')  # sentence-tokenizer data used by sent_tokenize below

import numpy as np
import pandas as pd
import networkx as nx
import gradio as gr
import en_core_sci_lg
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# en_core_sci_lg is scispaCy's large biomedical model (installed as a
# separate package); it supplies the entity recognizer and stopword list.
nlp = en_core_sci_lg.load()
all_stopwords = nlp.Defaults.stop_words
def keyphrase_generator(article, model_1, model_2, max_num_keywords):
    element = []
    document = []
    text = []
    model_1 = SentenceTransformer(model_1)
    model_2 = SentenceTransformer(model_2)

    # Split the article into sentences and strip everything but letters
    # before embedding.
    corpus = sent_tokenize(article)
    clean_sentences_new = pd.Series(corpus).str.replace("[^a-zA-Z]", " ", regex=True).tolist()

    # Embed each cleaned sentence and build a pairwise similarity matrix.
    corpus_embeddings = model_1.encode(clean_sentences_new)
    sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
    for i in range(len(clean_sentences_new)):
        for j in range(len(clean_sentences_new)):
            if i != j:
                # reshape(1, -1) rather than a hard-coded (1, 768) so the
                # code works with any embedding dimension.
                sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1, -1),
                                                  corpus_embeddings[j].reshape(1, -1))[0, 0]
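    # A vectorized alternative to the loop above (a sketch, not the original
    # author's code): compute all pairwise similarities at once and zero the
    # diagonal.
    # sim_mat = cosine_similarity(corpus_embeddings)
    # np.fill_diagonal(sim_mat, 0.0)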
    # TextRank: run PageRank over the sentence-similarity graph.
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(corpus)), reverse=True)
    for elem in ranked_sentences:
        element.append(elem[1])

    # Keep the top 10% of ranked sentences, with a floor of 5 and a cap at
    # the number of sentences actually available.
    a = int((10 * len(element)) / 100.0)
    total = min(max(a, 5), len(element))
    for i in range(total):
        document.append(element[i])
    doc = " ".join(document)
    # Collect scispaCy entities from the selected sentences as keyphrase
    # candidates, then drop stopwords and duplicates (order-preserving).
    for i in document:
        doc_1 = nlp(i)
        text.append([X.text for X in doc_1.ents])
    entity_list = [item for sublist in text for item in sublist]
    entity_list = [word for word in entity_list if word not in all_stopwords]
    entity_list = list(dict.fromkeys(entity_list))
    # Embed the condensed document and the candidates, then keep the
    # candidates most similar to the document embedding.
    doc_embedding = model_2.encode([doc])
    candidates = entity_list
    candidate_embeddings = model_2.encode(candidates)
    similarities = cosine_similarity(doc_embedding, candidate_embeddings)
    top_n = max_num_keywords
    keyword_list = [candidates[index] for index in similarities.argsort()[0][-top_n:]]
    keywords = '\n'.join(keyword_list)
    return keywords
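# A minimal usage sketch (the article text here is a hypothetical placeholder;
# the model names are the app's defaults from the SBERT model hub):
# sample_article = "Aspirin is widely used for secondary prevention of cardiovascular disease. ..."
# print(keyphrase_generator(sample_article, "all-mpnet-base-v2", "all-distilroberta-v1", 10))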
igen = gr.Interface(
    keyphrase_generator,
    inputs=[
        gr.Textbox(lines=10, placeholder="Provide article text here", value="", label="Article text"),
        gr.Textbox(lines=1, placeholder="SBERT model", value="all-mpnet-base-v2",
                   label="Model for TextRank (e.g. all-mpnet-base-v2)"),
        gr.Textbox(lines=1, placeholder="SBERT model", value="all-distilroberta-v1",
                   label="Model for keyphrases (e.g. all-distilroberta-v1)"),
        gr.Slider(minimum=5, maximum=30, step=1, value=10, label="Max keywords"),
    ],
    outputs="text",
    title="Scientific Article Keyphrase Generator",
    description="Generates the keyphrases that best describe an article.",
    article=("This work is based on part of the paper "
             "<a href='https://dl.acm.org/doi/10.1145/3487664.3487701'>Unsupervised Keyword Combination "
             "Query Generation from Online Health Related Content for Evidence-Based Fact Checking</a>. "
             "It uses the TextRank algorithm with SBERT to find the top sentences, then extracts "
             "keyphrases from those sentences using scispaCy and SBERT. "
             "Valid SBERT model names for the two textboxes are listed at "
             "<a href='https://www.sbert.net/docs/pretrained_models.html'>SBERT pre-trained models</a>; "
             "the defaults can be replaced with any model from that list. "
             "The number of output keyphrases can be set between 5 and 30 (default 10)."))
igen.launch(share=True)  # share=True creates a public link when run locally
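# A sketch of calling the running app programmatically with gradio_client
# (assumes the server is at the default local URL and exposes the default
# "/predict" endpoint; the article text is a hypothetical placeholder):
# from gradio_client import Client
# client = Client("http://127.0.0.1:7860/")
# result = client.predict("...article text...", "all-mpnet-base-v2",
#                         "all-distilroberta-v1", 10, api_name="/predict")
# print(result)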