import json
import string

import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import spacy
from transformers import AutoTokenizer, AutoModel
from adapters import AutoAdapterModel


def restore_inverted_abstract(inverted_abstr):
    """Rebuild a plain-text abstract from an OpenAlex-style inverted index."""
    all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
    if len(all_indexes) > 0:
        length = max(all_indexes) + 1
    else:
        return None
    abstract_words = ["" for _ in range(length)]
    for word, indexes in inverted_abstr.items():
        for index in indexes:
            abstract_words[index] = word
    return " ".join(abstract_words)


def extract_title_abstract(oa_object):
    abstract = oa_object["abstract_inverted_index"]
    title_abstract_obj = {
        "title": oa_object["title"],
        "abstract": (None if abstract is None else restore_inverted_abstract(abstract)),
    }
    return title_abstract_obj


def preprocess_batch(batch, tokenizer, input_is_context=False):
    # papers = [{'title': 'BERT', 'abstract': 'We introduce a new language representation model called BERT'},
    #           {'title': 'Attention is all you need', 'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks'}]
    # Concatenate title and abstract unless the batch already contains plain context strings.
    if not input_is_context:
        batch = [(d['title'] or '') + tokenizer.sep_token + (d.get('abstract') or '') for d in batch]
    tokenized_batch = tokenizer(
        batch,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_token_type_ids=False,
        max_length=512,
    )
    return tokenized_batch


def sent_is_mostly_known_tokens(tokens, tokenizer, threshold=0.7):
    return get_fraction_of_known_tokens(tokens, tokenizer) >= threshold


def get_fraction_of_known_tokens(tokens, tokenizer):
    total_tokens = len(tokens)
    if total_tokens == 0:
        return 0.0  # Avoid division by zero
    # Clean tokens and check if they exist in the tokenizer's vocab.
    known_tokens = sum(
        1 for token in tokens
        if token.text.lower().strip(string.punctuation) in tokenizer.vocab
    )
    return known_tokens / total_tokens


def prune_contexts(contexts, spacy_model, tokenizer):
    """Keep only the sentence containing the citation mention (plus the preceding sentence)."""
    chosen_sents = []
    fractions = []
    for _, context in tqdm(contexts.iterrows(), total=len(contexts)):
        text = (context["left_context"] + context["mention"] + context["right_context"]).replace("\n", " ")
        citation_start = len(context["left_context"]) + 1
        spacied = spacy_model(text)
        chosen_sent = None
        previous_sent = ""
        kt_fraction = None
        for sent in spacied.sents:
            if citation_start < sent.end_char and citation_start >= sent.start_char:
                chosen_sent = previous_sent + sent.text
                kt_fraction = get_fraction_of_known_tokens(sent, tokenizer)
                break
            previous_sent = sent.text
        if chosen_sent is None or len(chosen_sent.split()) < 5:
            print(f" - no context found: {spacied.text}")
            chosen_sent = None
        chosen_sents.append(chosen_sent)
        fractions.append(kt_fraction)
    return chosen_sents, fractions


def embed_contexts(contexts, model, tokenizer, batch_size=16):
    embeddings = []
    # Process in batches
    with torch.no_grad():  # Disable gradient tracking to save memory
        for i in tqdm(range(0, len(contexts), batch_size)):
            batch = contexts[i:i + batch_size]
            try:
                inputs = preprocess_batch(batch, tokenizer, input_is_context=True)
            except Exception as e:
                print(e)
                breakpoint()
            batch_embeddings = embed_batch(inputs, model)
            embeddings.append(batch_embeddings)
    # Concatenate all batches back together
    return torch.cat(embeddings, dim=0)


def embed_batch(tokenized_batch, model):
    output = model(**tokenized_batch)
    # Take the first token ([CLS]) of each sequence as its embedding.
    embeddings = output.last_hidden_state[:, 0, :]
    return embeddings

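
# Minimal sanity-check sketch (hypothetical helper, not called by the pipeline):
# illustrates the OpenAlex inverted-index format that restore_inverted_abstract expects,
# i.e. a mapping from word to the list of positions where it occurs.
def _example_restore_inverted_abstract():
    inverted = {"Deep": [0], "learning": [1, 3], "for": [2]}  # word -> positions
    assert restore_inverted_abstract(inverted) == "Deep learning for learning"
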
def embed_abstracts(abstract_title_list, model, tokenizer, batch_size=16):
    embeddings = []
    # Process in batches
    with torch.no_grad():  # Disable gradient tracking to save memory
        for i in tqdm(range(0, len(abstract_title_list), batch_size)):
            batch = abstract_title_list[i:i + batch_size]
            inputs = preprocess_batch(batch, tokenizer)
            batch_embeddings = embed_batch(inputs, model)
            embeddings.append(batch_embeddings)
    # Concatenate all batches back together
    return torch.cat(embeddings, dim=0)


def calculate_distances(embeddings_a, embeddings_b, indices, batch_size=512):
    # Initialize a list to store the results
    all_distances = [None] * len(indices)
    # Loop over the embedding pairs in batches
    num_batches = len(indices) // batch_size + (1 if len(indices) % batch_size != 0 else 0)
    for i in range(num_batches):
        # Get the current batch
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(indices))
        batch_a, batch_b, batch_positions = [], [], []
        for idx, (a, b) in enumerate(indices[start_idx:end_idx]):
            if a is None or b is None:
                all_distances[start_idx + idx] = np.nan  # Assign NaN directly in place
            else:
                batch_a.append(embeddings_a[a])
                batch_b.append(embeddings_b[b])
                batch_positions.append(start_idx + idx)
        if batch_a and batch_b:
            batch_a = torch.from_numpy(np.array(batch_a)).float()
            batch_b = torch.from_numpy(np.array(batch_b)).float()
            # Compute L2 (Euclidean) distance for the batch
            distances_batch = torch.norm(batch_a - batch_b, p=2, dim=1).numpy().astype(float)
            # Assign computed distances in the correct positions
            for pos, dist in zip(batch_positions, distances_batch):
                all_distances[pos] = dist
    return all_distances


def add_distances_to_df(df, index_left, index_right, embeddings, column_name):
    if column_name == "abstract_abstract_l2_distance":
        indices = [
            (index_left.index(doi), index_right.index(cite_id))
            for doi, cite_id in zip(df["cited_in_doi"], df["citation_id"])
        ]
        print("calculate distances...")
        distances = calculate_distances(embeddings["original_abstract"], embeddings["citation_abstract"], indices)
        df[column_name] = distances
    elif column_name == "context_abstract_l2_distance":
        # Assumes df has a default RangeIndex, so the positional counter i from
        # enumerate() matches the index labels passed in via index_left.
        indices = [
            (index_left.index(i), index_right.index(cite_id)) if i in index_left else (None, None)
            for i, cite_id in enumerate(df["citation_id"])
        ]
        print("calculate distances...")
        distances = calculate_distances(embeddings["citation_context_base"], embeddings["citation_abstract"], indices)
        df[column_name] = distances
    return df


def add_pruned_contexts_to_df(df, df_name):
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
    nlp = spacy.load("en_core_web_lg")
    df["pruned_contexts"], df["known_tokens_fraction"] = prune_contexts(df, nlp, tokenizer)
    df.to_parquet(df_name, compression='gzip')

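
# Minimal sanity-check sketch (hypothetical helper, not called by the pipeline):
# shows how calculate_distances pairs rows of two embedding matrices via
# (row_in_a, row_in_b) index tuples and returns NaN for unresolvable pairs.
def _example_calculate_distances():
    embeddings_a = np.array([[0.0, 0.0], [1.0, 1.0]])
    embeddings_b = np.array([[3.0, 4.0], [1.0, 1.0]])
    indices = [(0, 0), (1, 1), (None, 0)]
    distances = calculate_distances(embeddings_a, embeddings_b, indices)
    # distances == [5.0, 0.0, nan]: ||(0,0)-(3,4)|| = 5, identical rows give 0,
    # and a missing index falls through to NaN.
    return distances
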
def main_specter(retracted, overwrite=True):
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
    model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
    # model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)
    if not overwrite:
        embeddings_from_disk = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')

    ## Paper abstracts
    if retracted:
        data = pd.read_parquet("25_02_05_retractions_abstracts_cleaned.gzip")
    else:
        # data = pd.read_parquet("24_11_30_reference_articles.gzip")
        data = pd.read_parquet("24_12_31_reference_articles_most_cited.gzip")
    print("embedding original abstracts...")
    if not overwrite and "original_abstract" in embeddings_from_disk:
        paper_abstract_embedding = embeddings_from_disk["original_abstract"]
    else:
        paper_abstract_embedding = embed_abstracts(
            [{"title": r["Title"], "abstract": r["Abstract"]} for _, r in data.iterrows()],
            model,
            tokenizer,
            batch_size=4,
        ).detach().numpy()

    ## Cited papers abstracts
    if retracted:
        citations_df_name = "retraction_citation_mentions.gzip"
        with open("retractions_citations.json") as jsonfile:
            cite_data = json.load(jsonfile)
        citations = pd.read_parquet(citations_df_name)
    else:
        citations_df_name = "reference_mc_citation_mentions.gzip"
        # with open("reference_citations.json") as jsonfile:
        with open("reference_most_cited_citations.json") as jsonfile:
            cite_data = json.load(jsonfile)
        citations = pd.read_parquet(citations_df_name)
    cite_data = {entry["id"]: entry for cite in cite_data.values() for entry in cite}
    print("embedding cited abstracts...")
    if not overwrite and "citation_abstract" in embeddings_from_disk:
        citation_abstract_embedding = embeddings_from_disk["citation_abstract"]
    else:
        citation_abstract_embedding = embed_abstracts(
            [
                {
                    "title": cite_data[cite]["title"],
                    "abstract": (restore_inverted_abstract(cite_data[cite]["abstract_inverted_index"])
                                 if cite_data[cite]["abstract_inverted_index"] is not None else None),
                }
                for cite in citations["citation_id"].unique()
            ],
            model,
            tokenizer,
            batch_size=4,
        ).detach().numpy()

    print("embedding citation contexts base...")
    if not overwrite and "citation_context_base" in embeddings_from_disk:
        citation_context_embedding_base = embeddings_from_disk["citation_context_base"]
    else:
        citation_context_embedding_base = embed_contexts(
            citations[
                (citations["known_tokens_fraction"] >= 0.7) & (~citations["pruned_contexts"].isna())
            ]["pruned_contexts"].to_list(),
            model,
            tokenizer,
        ).detach().numpy()

    print("embedding citation contexts...")
    if not overwrite and "citation_context" in embeddings_from_disk:
        citation_context_embedding = embeddings_from_disk["citation_context"]
    else:
        model.load_adapter("allenai/specter2_adhoc_query", source="hf", load_as="adhoc", set_active=True)
        citation_context_embedding = embed_contexts(
            citations[
                (citations["known_tokens_fraction"] >= 0.7) & (~citations["pruned_contexts"].isna())
            ]["pruned_contexts"].to_list(),
            model,
            tokenizer,
        ).detach().numpy()

    # Save
    np.savez(
        f'{("retractions" if retracted else "reference")}_embeddings_specter.npz',
        original_abstract=paper_abstract_embedding,
        citation_context=citation_context_embedding,
        citation_abstract=citation_abstract_embedding,
        citation_context_base=citation_context_embedding_base,
    )
    # Load back and report shapes
    data = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')
    print(data["original_abstract"].shape)      # (num_papers, hidden_size)
    print(data["citation_context"].shape)       # (num_contexts, hidden_size)
    print(data["citation_context_base"].shape)  # (num_contexts, hidden_size)
    print(data["citation_abstract"].shape)      # (num_cited_papers, hidden_size)

def main_scibert(retracted, overwrite=True):
    tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
    if not overwrite:
        embeddings_from_disk = np.load(f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz')

    ## Paper abstracts
    if retracted:
        data = pd.read_parquet("25_02_05_retractions_abstracts_cleaned.gzip")
    else:
        # data = pd.read_parquet("24_11_30_reference_articles.gzip")
        data = pd.read_parquet("24_12_31_reference_articles_most_cited.gzip")
    print("embedding original abstracts...")
    if not overwrite and "original_abstract" in embeddings_from_disk:
        paper_abstract_embedding = embeddings_from_disk["original_abstract"]
    else:
        paper_abstract_embedding = embed_abstracts(
            [{"title": r["Title"], "abstract": r["Abstract"]} for _, r in data.iterrows()],
            model,
            tokenizer,
            batch_size=4,
        ).detach().numpy()

    ## Cited papers abstracts
    if retracted:
        citations_df_name = "retraction_citation_mentions.gzip"
        with open("retractions_citations.json") as jsonfile:
            cite_data = json.load(jsonfile)
        citations = pd.read_parquet(citations_df_name)
    else:
        citations_df_name = "reference_mc_citation_mentions.gzip"
        # with open("reference_citations.json") as jsonfile:
        with open("reference_most_cited_citations.json") as jsonfile:
            cite_data = json.load(jsonfile)
        citations = pd.read_parquet(citations_df_name)
    cite_data = {entry["id"]: entry for cite in cite_data.values() for entry in cite}
    print("embedding cited abstracts...")
    if not overwrite and "citation_abstract" in embeddings_from_disk:
        citation_abstract_embedding = embeddings_from_disk["citation_abstract"]
    else:
        citation_abstract_embedding = embed_abstracts(
            [
                {
                    "title": cite_data[cite]["title"],
                    "abstract": (restore_inverted_abstract(cite_data[cite]["abstract_inverted_index"])
                                 if cite_data[cite]["abstract_inverted_index"] is not None else None),
                }
                for cite in citations["citation_id"].unique()
            ],
            model,
            tokenizer,
            batch_size=4,
        ).detach().numpy()

    print("embedding citation contexts...")
    if not overwrite and "citation_context" in embeddings_from_disk:
        citation_context_embedding = embeddings_from_disk["citation_context"]
    else:
        citation_context_embedding = embed_contexts(
            citations[
                (citations["known_tokens_fraction"] >= 0.7) & (~citations["pruned_contexts"].isna())
            ]["pruned_contexts"].to_list(),
            model,
            tokenizer,
        ).detach().numpy()

    # Save
    np.savez(
        f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz',
        original_abstract=paper_abstract_embedding,
        citation_context=citation_context_embedding,
        citation_abstract=citation_abstract_embedding,
    )
    # Load back and report shapes (no citation_context_base array is saved for SciBERT)
    data = np.load(f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz')
    print(data["original_abstract"].shape)  # (num_papers, hidden_size)
    print(data["citation_context"].shape)   # (num_contexts, hidden_size)
    print(data["citation_abstract"].shape)  # (num_cited_papers, hidden_size)

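
# Usage sketch (the script file name below is illustrative): the pipeline picks the
# corpus from the first CLI argument, e.g.
#   python embed_citations.py retracted    # retracted papers
#   python embed_citations.py reference    # reference (most-cited) papers
# Any first argument other than "retracted" runs the reference branch.
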
if __name__ == "__main__":
    import sys
    retracted = (sys.argv[1] == "retracted")
    if retracted:
        print("Running embedding pipeline for retractions.")
    else:
        print("Running embedding pipeline for reference.")
    df = pd.read_parquet(f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip')
    # add_pruned_contexts_to_df(df, f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip')
    main_scibert(retracted, overwrite=False)
    # main_specter(retracted, overwrite=False)
    embeddings = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')
    print(embeddings["original_abstract"].shape)  # (num_papers, hidden_size)
    print(embeddings["citation_context"].shape)   # (num_contexts, hidden_size)
    print(embeddings["citation_abstract"].shape)  # (num_cited_papers, hidden_size)
    # original_dois = pd.read_parquet(f'{("25_02_05_retractions_abstracts_cleaned" if retracted else "24_11_30_reference_articles")}.gzip', columns=["OriginalPaperDOI"])["OriginalPaperDOI"].tolist()
    original_dois = pd.read_parquet(
        f'{("25_02_05_retractions_abstracts_cleaned" if retracted else "24_12_31_reference_articles_most_cited")}.gzip',
        columns=["OriginalPaperDOI"],
    )["OriginalPaperDOI"].tolist()
    # df = add_distances_to_df(
    #     df,
    #     [doi.replace("https://doi.org/", "") for doi in original_dois],
    #     df["citation_id"].unique().tolist(),
    #     embeddings,
    #     "abstract_abstract_l2_distance"
    # )
    df = add_distances_to_df(
        df,
        df.index[(df["known_tokens_fraction"] >= 0.7) & (~df["pruned_contexts"].isna())].tolist(),
        df["citation_id"].unique().tolist(),
        embeddings,
        "context_abstract_l2_distance",
    )
    df.to_parquet(f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip', compression='gzip')