Christof Bless committed on
Commit b23f8b6 · unverified · 1 Parent(s): db2b5ef

first working mvp

Files changed (4)
  1. app.py +91 -9
  2. extract_embeddings.py +412 -0
  3. extract_mentions.py +200 -0
  4. requirements.txt +1 -0
app.py CHANGED
@@ -1,17 +1,27 @@
  import gradio as gr
- import requests
+ import numpy as np
+ import pymupdf4llm
+ import spacy

- from extract_citations import *
- # Set your GROBID server URL
- GROBID_URL = "http://localhost:8070/api/processReferences"
+ from transformers import AutoTokenizer, AutoModel
+ from adapters import AutoAdapterModel
+
+
+ from extract_citations import fetch_citations_for_dois
+ from extract_embeddings import (
+     prune_contexts,
+     embed_abstracts,
+     embed_contexts,
+     restore_inverted_abstract,
+     calculate_distances
+ )
+ from extract_mentions import extract_citation_contexts

  def extract_text(pdf_file):
      if not pdf_file:
          return "Please upload a PDF file."
      try:
-         # Send PDF to GROBID for citation extraction
-         with open(pdf_file.name, 'rb') as f:
-             print("processing PDF ...")
+         return pymupdf4llm.to_markdown(pdf_file)
      except Exception as e:
          return f"Error when processing PDF. {e}"

@@ -23,7 +33,79 @@ def extract_citations(doi):

      return citations_data

-
+ def get_cite_context_distance(pdf, doi):
+     # Load models
+     tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
+     model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
+     nlp = spacy.load("en_core_web_sm")
+
+     # fetch cited papers from OpenAlex
+     citations_data = fetch_citations_for_dois([doi])
+     # get markdown text from PDF file
+     text = extract_text(pdf.name)
+     # get the context around citation markers
+     citations = extract_citation_contexts(citations_data, text)
+     citations["pruned_contexts"], citations["known_tokens_fraction"] = prune_contexts(citations, nlp, tokenizer)
+
+     # embed the contexts
+     citation_context_embedding = embed_contexts(
+         citations[
+             (citations["known_tokens_fraction"] >= 0.7) &
+             (~citations["pruned_contexts"].isna())
+         ]["pruned_contexts"].to_list(),
+         model,
+         tokenizer,
+     ).detach().numpy()
+
+     citations_data = {entry["id"]:entry for cite in citations_data.values() for entry in cite}
+     # embed the abstract
+     citation_abstract_embedding = embed_abstracts(
+         [
+             {
+                 "title":citations_data[cite]["title"],
+                 "abstract": (
+                     restore_inverted_abstract(
+                         citations_data[cite]["abstract_inverted_index"]
+                     )
+                     if citations_data[cite]["abstract_inverted_index"] is not None
+                     else None
+                 )
+             }
+             for cite in citations["citation_id"].unique()
+         ],
+         model,
+         tokenizer,
+         batch_size=4,
+     ).detach().numpy()
+     print(citation_abstract_embedding.shape)
+
+     # calculate the distances
+     index_left = citations.index[
+         (citations["known_tokens_fraction"] >= 0.7) &
+         (~citations["pruned_contexts"].isna())
+     ].tolist()
+
+     index_right = citations["citation_id"].unique().tolist()
+
+     indices = [
+         (index_left.index(i), index_right.index(cite_id))
+         if i in index_left else (None, None)
+         for i, cite_id in enumerate(citations["citation_id"])
+     ]
+     distances = np.array(calculate_distances(citation_context_embedding, citation_abstract_embedding, indices))
+     results = []
+     for i, dist in enumerate(distances):
+         if not np.isnan(dist):
+             obj = {}
+             left_context = citations.left_context[i][-50:].replace('\n', '')
+             right_context = citations.right_context[i][:50].replace('\n', '')
+             obj["cite_context_short"] = f"...{left_context}{citations.mention[i]}{right_context}..."
+             obj["cited_paper"] = citations_data[citations.citation_id[i]]["title"]
+             obj["cited_paper_id"] = citations.citation_id[i]
+             obj["distance"] = dist
+             results.append(obj)
+     return {"score": np.nanmean(distances), "individual_citations": results}
+
  # Gradio UI
  with gr.Blocks() as demo:
      gr.Markdown("## Citation Integrity Score")
@@ -33,6 +115,6 @@ with gr.Blocks() as demo:
      output = gr.Textbox(label="Extracted Citations", lines=20)

      submit_btn = gr.Button("Submit")
-     submit_btn.click(fn=extract_citations, inputs=[doi_input], outputs=output)
+     submit_btn.click(fn=get_cite_context_distance, inputs=[pdf_input, doi_input], outputs=output)

  demo.launch()
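
For orientation, the object returned by get_cite_context_distance (and rendered as text in the Gradio Textbox) has roughly the shape sketched below; the values are invented for illustration, only the keys mirror the code above.

    # Illustrative shape of the result of get_cite_context_distance.
    # All values here are made up; only the keys follow the code above.
    example_result = {
        "score": 7.3,  # np.nanmean over the per-citation L2 distances
        "individual_citations": [
            {
                "cite_context_short": "...as shown in earlier work [12], transformer models...",
                "cited_paper": "Some cited paper title",
                "cited_paper_id": "https://openalex.org/W0000000000",
                "distance": 6.8,  # L2 distance between context and cited-abstract embeddings
            },
        ],
    }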
extract_embeddings.py ADDED
@@ -0,0 +1,412 @@
1
+ import json
2
+ import string
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+ import torch
8
+ import spacy
9
+ from transformers import AutoTokenizer, AutoModel
10
+ from adapters import AutoAdapterModel
11
+
12
+
13
+ def restore_inverted_abstract(inverted_abstr):
14
+ all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
15
+ if len(all_indexes) > 0:
16
+ length = max(all_indexes) + 1
17
+ else:
18
+ return None
19
+ abstract_words = ["" for _ in range(length)]
20
+ for word, indexes in inverted_abstr.items():
21
+ for index in indexes:
22
+ abstract_words[index] = word
23
+ return " ".join(abstract_words)
24
+
25
+ def extract_title_abstract(oa_object):
26
+ abstract = oa_object["abstract_inverted_index"]
27
+ title_abstract_obj = {
28
+ "title": oa_object["title"],
29
+ "abstract": (None if abstract is None else restore_inverted_abstract(abstract))
30
+ }
31
+ return title_abstract_obj
32
+
33
+ def preprocess_batch(batch, tokenizer, input_is_context=False):
34
+ # papers = [{'title': 'BERT', 'abstract': 'We introduce a new language representation model called BERT'},
35
+ # {'title': 'Attention is all you need', 'abstract': ' The dominant sequence transduction models are based on complex recurrent or convolutional neural networks'}]
36
+
37
+ # concatenate title and abstract
38
+ if not input_is_context:
39
+ batch = [(d['title'] or '') + tokenizer.sep_token + (d.get('abstract') or '') for d in batch]
40
+
41
+ tokenized_batch = tokenizer(batch, padding=True, truncation=True,
42
+ return_tensors="pt", return_token_type_ids=False, max_length=512)
43
+ return tokenized_batch
44
+ def sent_is_mostly_known_tokens(tokens, tokenizer, threshold=0.7):
45
+ return get_fraction_of_known_tokens(tokens, tokenizer) >= threshold
46
+
47
+ def get_fraction_of_known_tokens(tokens, tokenizer):
48
+ total_tokens = len(tokens)
49
+ if total_tokens == 0:
50
+ return 0.0 # Avoid division by zero when there are no tokens
51
+
52
+ # Clean tokens and check if they exist in the tokenizer's vocab
53
+ known_tokens = sum(1 for token in tokens if token.text.lower().strip(string.punctuation) in tokenizer.vocab)
54
+ return known_tokens / total_tokens
55
+
56
+ def prune_contexts(contexts, spacy_model, tokenizer):
57
+ chosen_sents = []
58
+ fractions = []
59
+ for _, context in tqdm(contexts.iterrows(), total=len(contexts)):
60
+ text = (context["left_context"] + context["mention"] + context["right_context"]).replace("\n", " ")
61
+ citation_start = len(context["left_context"]) + 1
62
+ spacied = spacy_model(text)
63
+ chosen_sent = None
64
+ previous_sent = ""
65
+ kt_fraction = None
66
+ for sent in spacied.sents:
67
+ if citation_start < sent.end_char and citation_start >= sent.start_char:
68
+ chosen_sent = previous_sent + sent.text
69
+ kt_fraction = get_fraction_of_known_tokens(sent, tokenizer)
70
+ break
71
+ previous_sent = sent.text
72
+
73
+ if chosen_sent is None or len(chosen_sent.split()) < 5:
74
+ print(f" - no context found: {spacied.text}")
75
+ chosen_sent = None
76
+ # if chosen_sent is not None:
77
+ chosen_sents.append(chosen_sent)
78
+ fractions.append(kt_fraction)
79
+ return chosen_sents, fractions
80
+
81
+ def embed_contexts(contexts, model, tokenizer, batch_size = 16):
82
+ embeddings = []
83
+ # Process in batches
84
+ with torch.no_grad(): # Disable gradient tracking to save memory
85
+ for i in tqdm(range(0, len(contexts), batch_size)):
86
+ batch = contexts[i:i + batch_size]
87
+ try:
88
+ inputs = preprocess_batch(batch, tokenizer, input_is_context=True)
89
+ except Exception as e:
90
+ print(e)
91
+ breakpoint()
92
+ batch_embeddings = embed_batch(inputs, model)
93
+ embeddings.append(batch_embeddings)
94
+
95
+ # Concatenate all batches back together
96
+ return torch.cat(embeddings, dim=0)
97
+
98
+ def embed_batch(tokenized_batch, model):
99
+ output = model(**tokenized_batch)
100
+ # take the first token in the batch as the embedding
101
+ embeddings = output.last_hidden_state[:, 0, :]
102
+ return embeddings
103
+
104
+
105
+ def embed_abstracts(abstract_title_list, model, tokenizer, batch_size=16):
106
+ print("Loaded specter2 model:")
107
+ embeddings = []
108
+
109
+ # Process in batches
110
+ with torch.no_grad(): # Disable gradient tracking to save memory
111
+ for i in tqdm(range(0, len(abstract_title_list), batch_size)):
112
+ batch = abstract_title_list[i:i + batch_size]
113
+ inputs = preprocess_batch(batch, tokenizer)
114
+ batch_embeddings = embed_batch(inputs, model)
115
+ embeddings.append(batch_embeddings)
116
+
117
+ # Concatenate all batches back together
118
+ return torch.cat(embeddings, dim=0)
119
+
120
+ def calculate_distances(embeddings_a, embeddings_b, indices, batch_size=512):
121
+ # Initialize a list to store the results
122
+ all_distances = [None] * len(indices)
123
+
124
+ # Loop over the embeddings in batches
125
+ num_batches = len(indices) // batch_size + (1 if len(indices) % batch_size != 0 else 0)
126
+ for i in range(num_batches):
127
+ # Get the current batch
128
+ start_idx = i * batch_size
129
+ end_idx = min((i + 1) * batch_size, len(indices))
130
+
131
+ batch_a, batch_b, batch_positions = [], [], []
132
+ for idx, (a, b) in enumerate(indices[start_idx:end_idx]):
133
+ if a is None or b is None:
134
+ all_distances[start_idx + idx] = np.nan # Assign NaN directly in place
135
+ else:
136
+ batch_a.append(embeddings_a[a])
137
+ batch_b.append(embeddings_b[b])
138
+ batch_positions.append(start_idx + idx)
139
+
140
+ if batch_a and batch_b:
141
+ batch_a = torch.from_numpy(np.array(batch_a)).float()
142
+ batch_b = torch.from_numpy(np.array(batch_b)).float()
143
+
144
+ # Compute L2 (Euclidean) distance for the batch
145
+ distances_batch = torch.norm(batch_a - batch_b, p=2, dim=1).numpy().astype(float)
146
+
147
+ # Assign computed distances in the correct positions
148
+ for pos, dist in zip(batch_positions, distances_batch):
149
+ all_distances[pos] = dist
150
+
151
+ return all_distances
152
+
153
+ def add_distances_to_df(df, index_left, index_right, embeddings, column_name):
154
+ if column_name == "abstract_abstract_l2_distance":
155
+ indices = [(index_left.index(doi), index_right.index(cite_id)) for doi, cite_id in zip(df["cited_in_doi"], df["citation_id"])]
156
+ print("calculate distances...")
157
+ distances = calculate_distances(embeddings["original_abstract"], embeddings["citation_abstract"], indices)
158
+ df[column_name] = distances
159
+ elif column_name == "context_abstract_l2_distance":
160
+ indices = [
161
+ (index_left.index(i), index_right.index(cite_id))
162
+ if i in index_left else (None, None)
163
+ for i, cite_id in enumerate(df["citation_id"])
164
+ ]
165
+ print("calculate distances...")
166
+ distances = calculate_distances(embeddings["citation_context_base"], embeddings["citation_abstract"], indices)
167
+ df[column_name] = distances
168
+ return df
169
+
170
+ def add_pruned_contexts_to_df(df, df_name):
171
+ tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
172
+ nlp = spacy.load("en_core_web_lg")
173
+
174
+ df["pruned_contexts"], df["known_tokens_fraction"] = prune_contexts(df, nlp, tokenizer)
175
+ df.to_parquet(df_name, compression='gzip')
176
+
177
+ def main_specter(retracted, overwrite=True):
178
+ tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
179
+ model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
180
+ # model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)
181
+
182
+ if not overwrite:
183
+ embeddings_from_disk = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')
184
+
185
+ ## Paper abstracts
186
+ if retracted:
187
+ data = pd.read_parquet("25_02_05_retractions_abstracts_cleaned.gzip")
188
+ else:
189
+ # data = pd.read_parquet("24_11_30_reference_articles.gzip")
190
+ data = pd.read_parquet("24_12_31_reference_articles_most_cited.gzip")
191
+ print("embedding original abstracts...")
192
+ if not overwrite and "original_abstract" in embeddings_from_disk:
193
+ paper_abstract_embedding = embeddings_from_disk["original_abstract"]
194
+ else:
195
+ paper_abstract_embedding = embed_abstracts(
196
+ [
197
+ {"title":r["Title"], "abstract": r["Abstract"]}
198
+ for _,r in data.iterrows()
199
+ ],
200
+ model,
201
+ tokenizer,
202
+ batch_size=4
203
+ ).detach().numpy()
204
+
205
+
206
+ ## Cited papers abstracts
207
+ if retracted:
208
+ citations_df_name = "retraction_citation_mentions.gzip"
209
+ with open("retractions_citations.json") as jsonfile:
210
+ cite_data = json.load(jsonfile)
211
+ citations = pd.read_parquet(citations_df_name)
212
+ else:
213
+ citations_df_name = "reference_mc_citation_mentions.gzip"
214
+ # with open("reference_citations.json") as jsonfile:
215
+ with open("reference_most_cited_citations.json") as jsonfile:
216
+ cite_data = json.load(jsonfile)
217
+ citations = pd.read_parquet(citations_df_name)
218
+
219
+ cite_data = {entry["id"]:entry for cite in cite_data.values() for entry in cite}
220
+
221
+ print("embedding cited abstracts...")
222
+ if not overwrite and "citation_abstract" in embeddings_from_disk:
223
+ citation_abstract_embedding = embeddings_from_disk["citation_abstract"]
224
+ else:
225
+ citation_abstract_embedding = embed_abstracts(
226
+ [
227
+ {
228
+ "title":cite_data[cite]["title"],
229
+ "abstract": (restore_inverted_abstract(cite_data[cite]["abstract_inverted_index"]) if cite_data[cite]["abstract_inverted_index"] is not None else None)
230
+ }
231
+ for cite in citations["citation_id"].unique()
232
+ ],
233
+ model,
234
+ tokenizer,
235
+ batch_size=4,
236
+ ).detach().numpy()
237
+
238
+ print("embedding citation contexts base...")
239
+ if not overwrite and "citation_context_base" in embeddings_from_disk:
240
+ citation_context_embedding_base = embeddings_from_disk["citation_context_base"]
241
+ else:
242
+
243
+ citation_context_embedding_base = embed_contexts(
244
+ citations[
245
+ (citations["known_tokens_fraction"] >= 0.7) &
246
+ (~citations["pruned_contexts"].isna())
247
+ ]["pruned_contexts"].to_list(),
248
+ model,
249
+ tokenizer,
250
+ ).detach().numpy()
251
+
252
+ print("embedding citation contexts...")
253
+ if not overwrite and "citation_context" in embeddings_from_disk:
254
+ citation_context_embedding = embeddings_from_disk["citation_context"]
255
+ else:
256
+ model.load_adapter("allenai/specter2_adhoc_query", source="hf", load_as="adhoc", set_active=True)
257
+
258
+ citation_context_embedding = embed_contexts(
259
+ citations[
260
+ (citations["known_tokens_fraction"] >= 0.7) &
261
+ (~citations["pruned_contexts"].isna())
262
+ ]["pruned_contexts"].to_list(),
263
+ model,
264
+ tokenizer,
265
+ ).detach().numpy()
266
+ # Save
267
+ np.savez(
268
+ f'{("retractions" if retracted else "reference")}_embeddings_specter.npz',
269
+ original_abstract=paper_abstract_embedding,
270
+ citation_context=citation_context_embedding,
271
+ citation_abstract=citation_abstract_embedding,
272
+ citation_context_base=citation_context_embedding_base,
273
+ )
274
+
275
+ # Load
276
+ data = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')
277
+ print(data["original_abstract"].shape) # (768,) or (1536,) depending on the model
278
+ print(data["citation_context"].shape) # (768,) or (1536,) depending on the model
279
+ print(data["citation_context_base"].shape) # (768,) or (1536,) depending on the model
280
+ print(data["citation_abstract"].shape) # (768,) or (1536,) depending on the model
281
+
282
+ def main_scibert(retracted, overwrite=True):
283
+ tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
284
+ model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
285
+ # model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)
286
+
287
+ if not overwrite:
288
+ embeddings_from_disk = np.load(f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz')
289
+
290
+ ## Paper abstracts
291
+ if retracted:
292
+ data = pd.read_parquet("25_02_05_retractions_abstracts_cleaned.gzip")
293
+ else:
294
+ # data = pd.read_parquet("24_11_30_reference_articles.gzip")
295
+ data = pd.read_parquet("24_12_31_reference_articles_most_cited.gzip")
296
+ print("embedding original abstracts...")
297
+ if not overwrite and "original_abstract" in embeddings_from_disk:
298
+ paper_abstract_embedding = embeddings_from_disk["original_abstract"]
299
+ else:
300
+ paper_abstract_embedding = embed_abstracts(
301
+ [
302
+ {"title":r["Title"], "abstract": r["Abstract"]}
303
+ for _,r in data.iterrows()
304
+ ],
305
+ model,
306
+ tokenizer,
307
+ batch_size=4
308
+ ).detach().numpy()
309
+
310
+
311
+ ## Cited papers abstracts
312
+ if retracted:
313
+ citations_df_name = "retraction_citation_mentions.gzip"
314
+ with open("retractions_citations.json") as jsonfile:
315
+ cite_data = json.load(jsonfile)
316
+ citations = pd.read_parquet(citations_df_name)
317
+ else:
318
+ citations_df_name = "reference_mc_citation_mentions.gzip"
319
+ # with open("reference_citations.json") as jsonfile:
320
+ with open("reference_most_cited_citations.json") as jsonfile:
321
+ cite_data = json.load(jsonfile)
322
+ citations = pd.read_parquet(citations_df_name)
323
+
324
+ cite_data = {entry["id"]:entry for cite in cite_data.values() for entry in cite}
325
+
326
+ print("embedding cited abstracts...")
327
+ if not overwrite and "citation_abstract" in embeddings_from_disk:
328
+ citation_abstract_embedding = embeddings_from_disk["citation_abstract"]
329
+ else:
330
+ citation_abstract_embedding = embed_abstracts(
331
+ [
332
+ {
333
+ "title":cite_data[cite]["title"],
334
+ "abstract": (restore_inverted_abstract(cite_data[cite]["abstract_inverted_index"]) if cite_data[cite]["abstract_inverted_index"] is not None else None)
335
+ }
336
+ for cite in citations["citation_id"].unique()
337
+ ],
338
+ model,
339
+ tokenizer,
340
+ batch_size=4,
341
+ ).detach().numpy()
342
+
343
+ print("embedding citation contexts...")
344
+ if not overwrite and "citation_context" in embeddings_from_disk:
345
+ citation_context_embedding = embeddings_from_disk["citation_context"]
346
+ else:
347
+ citation_context_embedding = embed_contexts(
348
+ citations[
349
+ (citations["known_tokens_fraction"] >= 0.7) &
350
+ (~citations["pruned_contexts"].isna())
351
+ ]["pruned_contexts"].to_list(),
352
+ model,
353
+ tokenizer,
354
+ ).detach().numpy()
355
+ # Save
356
+ np.savez(
357
+ f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz',
358
+ original_abstract=paper_abstract_embedding,
359
+ citation_context=citation_context_embedding,
360
+ citation_abstract=citation_abstract_embedding,
361
+ )
362
+
363
+ # Load
364
+ data = np.load(f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz')
365
+ print(data["original_abstract"].shape) # (768,) or (1536,) depending on the model
366
+ print(data["citation_context"].shape) # (768,) or (1536,) depending on the model
367
+ # print(data["citation_context_base"].shape) # not saved by main_scibert, would raise a KeyError
368
+ print(data["citation_abstract"].shape) # (768,) or (1536,) depending on the model
369
+
370
+
371
+ if __name__=="__main__":
372
+ import sys
373
+ retracted=(sys.argv[1] == "retracted")
374
+ if retracted:
375
+ print("Running embedding pipeline for retractions.")
376
+ else:
377
+ print("Running embedding pipeline for reference.")
378
+
379
+ df = pd.read_parquet(f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip')
380
+
381
+ # add_pruned_contexts_to_df(df, f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip')
382
+
383
+ main_scibert(retracted, overwrite=False)
384
+ # main_specter(retracted, overwrite=False)
385
+
386
+ embeddings = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')
387
+ print(embeddings["original_abstract"].shape) # (768,) or (1536,) depending on the model
388
+ print(embeddings["citation_context"].shape) # (768,) or (1536,) depending on the model
389
+ print(embeddings["citation_abstract"].shape) # (768,) or (1536,) depending on the model
390
+ # original_dois = pd.read_parquet(f'{("25_02_05_retractions_abstracts_cleaned" if retracted else "24_11_30_reference_articles")}.gzip', columns = ["OriginalPaperDOI"])["OriginalPaperDOI"].tolist()
391
+ original_dois = pd.read_parquet(f'{("25_02_05_retractions_abstracts_cleaned" if retracted else "24_12_31_reference_articles_most_cited")}.gzip', columns = ["OriginalPaperDOI"])["OriginalPaperDOI"].tolist()
392
+
393
+ # df = add_distances_to_df(
394
+ # df,
395
+ # [doi.replace("https://doi.org/", "") for doi in original_dois],
396
+ # df["citation_id"].unique().tolist(),
397
+ # embeddings,
398
+ # "abstract_abstract_l2_distance"
399
+ # )
400
+
401
+ df = add_distances_to_df(
402
+ df,
403
+ df.index[
404
+ (df["known_tokens_fraction"] >= 0.7) &
405
+ (~df["pruned_contexts"].isna())
406
+ ].tolist(),
407
+ df["citation_id"].unique().tolist(),
408
+ embeddings,
409
+ "context_abstract_l2_distance"
410
+ )
411
+
412
+ df.to_parquet(f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip', compression='gzip')
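
As a quick illustration of the OpenAlex handling above: restore_inverted_abstract rebuilds a plain-text abstract from an abstract_inverted_index mapping of words to positions. The toy input below is invented.

    from extract_embeddings import restore_inverted_abstract

    # Toy OpenAlex-style inverted index: word -> list of positions in the abstract.
    inverted = {"We": [0], "introduce": [1], "BERT": [2, 5], "and": [3], "evaluate": [4]}
    print(restore_inverted_abstract(inverted))
    # -> "We introduce BERT and evaluate BERT"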
extract_mentions.py ADDED
@@ -0,0 +1,200 @@
1
+ import json
2
+ import re
3
+ import sys
4
+ import numpy as np
5
+ from pathlib import Path
6
+ from typing import NamedTuple
7
+
8
+ import pandas as pd
9
+
10
+ TITLE_NORMALIZE = [
11
+ "alpha", "beta", "gamma", "delta", "epsilon", "kappa", "lambda"
12
+ ]
13
+
14
+ class Context(NamedTuple):
15
+ left: str
16
+ right: str
17
+
18
+ split_right_pattern = re.compile(r"(?:#+)|(?:\[(?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\])")
19
+ split_left_pattern = re.compile(r"(?:#+)|(?:\](?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\[)")
20
+ ieee_style_pattern = re.compile(r"(?>\[(?>[^A-Za-z0-9\[\]\.]*(\d{1,3})[^A-Za-z0-9\[\]\.]*)+\][^A-Za-z0-9\[\]]*)+")
21
+ auth_year_style_pattern = re.compile(r"(?>\((?>[^()]+?[,\s][1-2][0-9]{3})+\)[^()A-Za-z0-9]*)+")
22
+
23
+ def filter_page_breaks(content):
24
+ find_page_breaks = re.compile(
25
+ r"""
26
+ \n*
27
+ \n # empty line
28
+ -----\n # 5 dashes
29
+ \n # empty line
30
+ (?:.*?\n)? # Capture the footer/header
31
+ \n*
32
+ """,
33
+ re.VERBOSE | re.M
34
+ )
35
+ return re.sub(find_page_breaks, " ", content)
36
+
37
+ def get_author_title_year_patterns_from_citation(cite):
38
+ title = cite['title']
39
+ for w in TITLE_NORMALIZE:
40
+ title = title.replace(w, "$")
41
+ title = re.sub(r"[^a-zA-Z0-9]+", "_", title) # collapse runs of non-alphanumeric characters into underscores
42
+ # title = title.replace(" ", r"[^a-zA-Z0-9]+?")
43
+ year = str(cite['publication_year'])
44
+ try:
45
+ first_author = cite['authorships'][0]['author']['display_name']
46
+ ## only lastname
47
+ first_author = re.sub(r"[^a-zA-Z0-9]+", "_", first_author.split(" ")[-1])
48
+ except (IndexError, TypeError):
49
+ first_author = None
50
+ return first_author, title, year
51
+
52
+ def extract_potential_citations(paper):
53
+ ieee_style = ieee_style_pattern.finditer(paper)
54
+ ieee_style_buckets = []
55
+ for match in ieee_style:
56
+ possible = set([int(n) for n in re.findall(r"\d{1,3}", match.group(0))]) # group(0): the whole bracket group, not just the last captured number
57
+ ## expand ranges
58
+ ranges = re.findall(r"(\d{1,3})[–——-]+(\d{1,3})", match.group(0))
59
+ if len(ranges)>0:
60
+ for start, end in ranges:
61
+ possible |= set(range(int(start),int(end)+1))
62
+ ieee_style_buckets.append((match.start(), match.end(), match.group(0), possible))
63
+
64
+ auth_year_style = auth_year_style_pattern.finditer(paper)
65
+ auth_year_style_buckets = []
66
+ for match in auth_year_style:
67
+ possible = set(re.split(r"(\b[1-2]\d{3}\b)\W*", match.group(0)))
68
+ auth_year_style_buckets.append((match.start(), match.end(), match.group(0), possible))
69
+
70
+ return ieee_style_buckets, auth_year_style_buckets
71
+
72
+ def find_reference_in_reference_section(paper, cite, references):
73
+ """
74
+ Searches for reference section entry matching citation paper title, year, first author, and journal in a markdown file
75
+ using fuzzy matching.
76
+ """
77
+ patterns = get_author_title_year_patterns_from_citation(cite)
78
+ if any([p is None for p in patterns]):
79
+ return paper, None
80
+ author, title, year = patterns
81
+ patterns = [author, title, year]
82
+ # Try finding all the patterns between two enumeration items starting from the back of the string
83
+ # for i,s in enumerate(references):
84
+ for full_ref, enum, ref_body in references:
85
+ for w in TITLE_NORMALIZE:
86
+ normalized = ref_body.replace(w, "$")
87
+ fuzzy_ref = re.sub(r"[^a-zA-Z0-9]+", "_", normalized)
88
+ if all([re.search(pattern, fuzzy_ref, re.IGNORECASE | re.MULTILINE | re.DOTALL) for pattern in patterns]):
89
+ match = (cite["id"], author, title, year, enum, ref_body)
90
+ # remove the reference from the paper so it can't be matched again
91
+ paper = paper.replace(full_ref, "")
92
+ return paper, match
93
+
94
+ return paper, (cite["id"], author, title, year, None, None)
95
+
96
+
97
+ def find_mentions_by_pointer(doi, ref, paper, ieee_possible):
98
+ """
99
+ Match the links mentioning that reference in the text and extract context.
100
+ """
101
+ mentions = []
102
+ (oa_id, _, _, _, ref_num, r) = ref
103
+ for start, end, match, possible_numbers in ieee_possible:
104
+ if int(ref_num) in possible_numbers:
105
+ context = create_context(start, end, paper)
106
+ mentions.append((doi, oa_id, ref_num, r, start, end, context.left, match, context.right))
107
+ return mentions
108
+
109
+ def find_mentions_direct(doi, ref, paper, auth_style_possible):
110
+ """
111
+ Match the links mentioning that reference in the text and extract context.
112
+ """
113
+ mentions = []
114
+ (oa_id, a, _, y, _, _) = ref
115
+ for start, end, match, possibilities in auth_style_possible:
116
+ for possibility in possibilities:
117
+ if y in possibility and a in possibility:
118
+ context = create_context(start, end, paper)
119
+ mentions.append((doi, oa_id, None, None, start, end, context.left, match, context.right))
120
+ return mentions
121
+
122
+ def create_context(start, end, paper):
123
+ left = paper[max(0, start - 500):start]
124
+ right = paper[end:end + min(len(paper) - end, 500)]
125
+ ## only take context until a next section begins or another citation appears
126
+ splitleft = split_left_pattern.search(left[::-1])
127
+ if splitleft is not None:
128
+ left = left[len(left) - splitleft.start():]
129
+ splitright = split_right_pattern.search(right)
130
+ if splitright is not None:
131
+ right = right[:splitright.start()]
132
+ return Context(left=left, right=right)
133
+
134
+ def restore_inverted_abstract(inverted_abstr):
135
+ all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
136
+ if len(all_indexes) > 0:
137
+ length = max(all_indexes) + 1
138
+ else:
139
+ return None
140
+ abstract_words = ["" for _ in range(length)]
141
+ for word, indexes in inverted_abstr.items():
142
+ for index in indexes:
143
+ abstract_words[index] = word
144
+ return " ".join(abstract_words)
145
+
146
+ def extract_title_abstract(oa_object):
147
+ abstract = oa_object["abstract_inverted_index"]
148
+ title_abstract_obj = {
149
+ "title": oa_object["title"],
150
+ "abstract": (None if abstract is None else restore_inverted_abstract(abstract))
151
+ }
152
+ return title_abstract_obj
153
+
154
+ def extract_citation_contexts(cites, paper):
155
+ counter=0
156
+ extracted_citations = []
157
+ references_pattern = re.compile(r'(\n\W*(\d{1,3})\W(.+?)(?=(?:\n\n)|(?:\n\W*\d{1,3}\W)|\Z))', re.VERBOSE | re.I | re.M | re.S)
158
+ for doi in cites:
159
+ # for doi in ["10.1155/2021/4883509"]:
160
+ counter+=1
161
+ paper = filter_page_breaks(paper)
162
+ # print(paper)
163
+ if paper is None:
164
+ continue
165
+ # remove title and authors from beginning of paper
166
+ paper = paper[750:]
167
+ citations = cites[doi]
168
+ # references = re.findall(r'\n\s*(\d+)\.(.*?)(?=(?:\n\s*\d+\.)|\Z)', paper, re.VERBOSE | re.I | re.M | re.S)
169
+ references = references_pattern.findall(paper)
170
+ found = 0
171
+ n_mentions = 0
172
+ has_abstract_title = 0
173
+ in_ref_section_refs = []
174
+ for cite in citations:
175
+ embedding_input = extract_title_abstract(cite)
176
+ if embedding_input["abstract"] is None or embedding_input["title"] is None:
177
+ in_ref_section_refs.append(None)
178
+ continue
179
+ has_abstract_title+=1
180
+ paper, in_ref_section_ref = find_reference_in_reference_section(paper, cite, references)
181
+ in_ref_section_refs.append(in_ref_section_ref)
182
+ ieee, auth_year = extract_potential_citations(paper)
183
+
184
+ for ref in in_ref_section_refs:
185
+ if ref is not None:
186
+ if ref[4] is not None:
187
+ mentions = find_mentions_by_pointer(doi, ref, paper, ieee)
188
+ else: mentions = []
189
+ mentions += find_mentions_direct(doi, ref, paper, auth_year)
190
+ extracted_citations+=mentions
191
+
192
+ if len(mentions)>0:
193
+ n_mentions+=len(mentions)
194
+ found+=1
195
+
196
+ print(f"{counter}/{len(cites)} - {doi}: {len(citations)} citations, {has_abstract_title} embeddable citations and {found} references with {n_mentions} mentions found in markdown.")
197
+
198
+ return pd.DataFrame(extracted_citations, columns = ["cited_in_doi", "citation_id", "reference_marker", "reference_target", "mention_start", "mention_end", "left_context", "mention", "right_context"])
199
+
200
+
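
Note that the citation-marker regexes above rely on atomic groups ((?>...)), which Python's re module only supports from Python 3.11 onwards. A minimal sketch of how extract_potential_citations buckets numeric (IEEE-style) and author-year markers, run on an invented snippet of markdown:

    from extract_mentions import extract_potential_citations

    # Invented text with one numeric citation group and one author-year citation.
    text = "Transformer models dominate sequence tasks [1], [3-5] and are widely used (Devlin et al., 2019)."
    ieee, auth_year = extract_potential_citations(text)
    for start, end, marker, numbers in ieee:
        # marker is the matched span, numbers the candidate reference numbers (ranges expanded)
        print(marker, sorted(numbers))
    for start, end, marker, fragments in auth_year:
        print(marker, fragments)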
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+ gradio
  requests
  pandas
  tqdm