import json
import re
import sys

import numpy as np
from pathlib import Path
from typing import NamedTuple

import pandas as pd

# Greek-letter words that are collapsed to a placeholder before fuzzy title matching
TITLE_NORMALIZE = [
    "alpha", "beta", "gamma", "delta", "epsilon", "kappa", "lambda"
]


class Context(NamedTuple):
    left: str
    right: str


# Boundaries for context extraction: a markdown heading ("#", "##", ...) or a bracketed
# numeric citation such as "[12]" or "[3, 5-7]". The *_left pattern is the mirror image,
# because the left-hand context is scanned in reverse.
# Note: the atomic groups "(?>...)" require Python 3.11+.
split_right_pattern = re.compile(r"(?:#+)|(?:\[(?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\])")
split_left_pattern = re.compile(r"(?:#+)|(?:\](?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\[)")

# In-text citation styles: bracketed numbers ("[1], [3-5]") and author-year groups
# ("(Smith, 2019; Jones, 2020)").
ieee_style_pattern = re.compile(r"(?>\[(?>[^A-Za-z0-9\[\]\.]*(\d{1,3})[^A-Za-z0-9\[\]\.]*)+\][^A-Za-z0-9\[\]]*)+")
auth_year_style_pattern = re.compile(r"(?>\((?>[^()]+?[,\s][1-2][0-9]{3})+\)[^()A-Za-z0-9]*)+")


def filter_page_breaks(content):
    find_page_breaks = re.compile(
        r"""
        \n*
        \n              # empty line
        -----\n         # 5 dashes
        \n              # empty line
        (?:.*?\n)?      # optional footer/header line
        \n*
        """,
        re.VERBOSE | re.M
    )
    return re.sub(find_page_breaks, " ", content)


def get_author_title_year_patterns_from_citation(cite):
    """Build loose search patterns (first author, title, year) from an OpenAlex work record."""
    title = cite['title']
    for w in TITLE_NORMALIZE:
        title = title.replace(w, "$")
    # collapse every run of non-alphanumeric characters to "_" so the title can be
    # matched against the same normalization of the reference entries
    title = re.sub(r"[^a-zA-Z0-9]+", "_", title)
    # title = title.replace(" ", r"[^a-zA-Z0-9]+?")
    year = str(cite['publication_year'])
    try:
        first_author = cite['authorships'][0]['author']['display_name']
        # keep only the last name
        first_author = re.sub(r"[^a-zA-Z0-9]+", "_", first_author.split(" ")[-1])
    except (IndexError, TypeError):
        first_author = None
    return first_author, title, year


def extract_potential_citations(paper):
    """Collect every in-text citation candidate, both numeric ("IEEE") and author-year style."""
    ieee_style = ieee_style_pattern.finditer(paper)
    ieee_style_buckets = []
    for match in ieee_style:
        # use the full bracket group, e.g. "[1, 3-5]", not just the last captured number
        possible = set(int(n) for n in re.findall(r"\d{1,3}", match.group(0)))
        # expand ranges such as "3-5" into the individual numbers
        ranges = re.findall(r"(\d{1,3})[–——-]+(\d{1,3})", match.group(0))
        if len(ranges) > 0:
            for start, end in ranges:
                possible |= set(range(int(start), int(end) + 1))
        ieee_style_buckets.append((match.start(), match.end(), match.group(0), possible))

    auth_year_style = auth_year_style_pattern.finditer(paper)
    auth_year_style_buckets = []
    for match in auth_year_style:
        possible = set(re.split(r"(\b[1-2]\d{3}\b)\W*", match.group(0)))
        auth_year_style_buckets.append((match.start(), match.end(), match.group(0), possible))
    return ieee_style_buckets, auth_year_style_buckets


def find_reference_in_reference_section(paper, cite, references):
    """
    Search the extracted reference-section entries for one that matches the citation's
    first author, title, and publication year using fuzzy matching. Returns the (possibly
    shortened) paper text and a match tuple.
    """
    patterns = get_author_title_year_patterns_from_citation(cite)
    if any(p is None for p in patterns):
        return paper, None
    author, title, year = patterns
    patterns = [author, title, year]
    # Try finding all the patterns within a single enumerated reference entry
    for full_ref, enum, ref_body in references:
        normalized = ref_body
        for w in TITLE_NORMALIZE:
            normalized = normalized.replace(w, "$")
        fuzzy_ref = re.sub(r"[^a-zA-Z0-9]+", "_", normalized)
        if all(re.search(pattern, fuzzy_ref, re.IGNORECASE | re.MULTILINE | re.DOTALL) for pattern in patterns):
            match = (cite["id"], author, title, year, enum, ref_body)
            # remove the reference from the paper so it can't be matched again
            paper = paper.replace(full_ref, "")
            return paper, match
    return paper, (cite["id"], author, title, year, None, None)


def find_mentions_by_pointer(doi, ref, paper, ieee_possible):
    """
    Find the in-text mentions that point to this reference by bracketed number
    and extract their surrounding context.
    """
    mentions = []
    (oa_id, _, _, _, ref_num, r) = ref
    for start, end, match, possible_numbers in ieee_possible:
        if int(ref_num) in possible_numbers:
            context = create_context(start, end, paper)
            mentions.append((doi, oa_id, ref_num, r, start, end, context.left, match, context.right))
    return mentions


def find_mentions_direct(doi, ref, paper, auth_style_possible):
    """
    Find the in-text mentions that cite this reference directly by author name and year
    and extract their surrounding context.
    """
    mentions = []
    (oa_id, a, _, y, _, _) = ref
    for start, end, match, possibilities in auth_style_possible:
        for possibility in possibilities:
            if y in possibility and a in possibility:
                context = create_context(start, end, paper)
                mentions.append((doi, oa_id, None, None, start, end, context.left, match, context.right))
    return mentions


def create_context(start, end, paper):
    """Return up to 500 characters of context on each side of a mention."""
    left = paper[max(0, start - 500):start]
    right = paper[end:end + min(len(paper) - end, 500)]
    # only take context until the next section begins or another citation appears;
    # the left-hand side is searched in reverse to find the nearest boundary
    splitleft = split_left_pattern.search(left[::-1])
    if splitleft is not None:
        left = left[len(left) - splitleft.start():]
    splitright = split_right_pattern.search(right)
    if splitright is not None:
        right = right[:splitright.start()]
    return Context(left=left, right=right)


def restore_inverted_abstract(inverted_abstr):
    """Rebuild the plain-text abstract from an OpenAlex abstract_inverted_index."""
    all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
    if len(all_indexes) > 0:
        length = max(all_indexes) + 1
    else:
        return None
    abstract_words = ["" for _ in range(length)]
    for word, indexes in inverted_abstr.items():
        for index in indexes:
            abstract_words[index] = word
    return " ".join(abstract_words)


def extract_title_abstract(oa_object):
    abstract = oa_object["abstract_inverted_index"]
    title_abstract_obj = {
        "title": oa_object["title"],
        "abstract": (None if abstract is None else restore_inverted_abstract(abstract))
    }
    return title_abstract_obj


def extract_citation_contexts(cites, paper):
    """
    Match the cited works in `cites` against the reference section of the markdown `paper`,
    locate their in-text mentions, and return them with surrounding context as a DataFrame.
    """
    counter = 0
    extracted_citations = []
    references_pattern = re.compile(
        r'(\n\W*(\d{1,3})\W(.+?)(?=(?:\n\n)|(?:\n\W*\d{1,3}\W)|\Z))',
        re.VERBOSE | re.I | re.M | re.S
    )
    for doi in cites:
        # for doi in ["10.1155/2021/4883509"]:
        counter += 1
        if paper is None:
            continue
        paper = filter_page_breaks(paper)
        # print(paper)
        # remove title and authors from the beginning of the paper
        paper = paper[750:]
        citations = cites[doi]
        # references = re.findall(r'\n\s*(\d+)\.(.*?)(?=(?:\n\s*\d+\.)|\Z)', paper, re.VERBOSE | re.I | re.M | re.S)
        references = references_pattern.findall(paper)
        found = 0
        n_mentions = 0
        has_abstract_title = 0
        in_ref_section_refs = []
        for cite in citations:
            embedding_input = extract_title_abstract(cite)
            if embedding_input["abstract"] is None or embedding_input["title"] is None:
                in_ref_section_refs.append(None)
                continue
            has_abstract_title += 1
            paper, in_ref_section_ref = find_reference_in_reference_section(paper, cite, references)
            in_ref_section_refs.append(in_ref_section_ref)
        ieee, auth_year = extract_potential_citations(paper)
        for ref in in_ref_section_refs:
            if ref is not None:
                if ref[4] is not None:
                    mentions = find_mentions_by_pointer(doi, ref, paper, ieee)
                else:
                    mentions = []
                mentions += find_mentions_direct(doi, ref, paper, auth_year)
                extracted_citations += mentions
                if len(mentions) > 0:
                    n_mentions += len(mentions)
                    found += 1
        print(f"{counter}/{len(cites)} - {doi}: {len(citations)} citations, {has_abstract_title} embeddable citations and {found} references with {n_mentions} mentions found in markdown.")
    return pd.DataFrame(
        extracted_citations,
        columns=["cited_in_doi", "citation_id", "reference_marker", "reference_target",
"mention_start", "mention_end", "left_context", "mention", "right_context"])