Christof Bless
committed on
add citation lookup function
- app.py +27 -9
- extract_citations.py +108 -0
- requirements.txt +5 -0
app.py
CHANGED
@@ -1,20 +1,38 @@
 import gradio as gr
+import requests
 
-
-
+from extract_citations import *
+# Set your GROBID server URL
+GROBID_URL = "http://localhost:8070/api/processReferences"
 
-
+def extract_text(pdf_file):
+    if not pdf_file:
+        return "Please upload a PDF file."
+    try:
+        # Send PDF to GROBID for citation extraction
+        with open(pdf_file.name, 'rb') as f:
+            print("processing PDF ...")
+    except Exception as e:
+        return f"Error when processing PDF. {e}"
+
+def extract_citations(doi, pdf_file=None):  # accepts both Gradio inputs; only the DOI is used for now
+    try:
+        citations_data = fetch_citations_for_dois([doi])
+    except Exception as e:
+        return f"Please submit a valid DOI. {e}"
+
+    return citations_data
+
+
+# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## Citation Integrity Score")
 
-    doi_input = gr.Textbox(label="Enter DOI")
+    doi_input = gr.Textbox(label="Enter DOI (optional)")
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-    output = gr.Textbox(label="
+    output = gr.Textbox(label="Extracted Citations", lines=20)
 
     submit_btn = gr.Button("Submit")
-
-
-    demo.launch()
+    submit_btn.click(fn=extract_citations, inputs=[doi_input, pdf_input], outputs=output)
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()
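Note that the GROBID call in extract_text is still a stub in this commit: the PDF is opened and a message printed, but nothing is sent to the server and no result is returned. A minimal sketch of how the request could be completed, assuming a GROBID instance is running at GROBID_URL (GROBID's processReferences endpoint accepts the PDF as a multipart form field named "input" and responds with TEI XML); this is not part of the commit:

def extract_text(pdf_file):
    if not pdf_file:
        return "Please upload a PDF file."
    try:
        # Post the PDF to GROBID's processReferences endpoint; the
        # response body is TEI XML describing the extracted references.
        with open(pdf_file.name, "rb") as f:
            response = requests.post(GROBID_URL, files={"input": f}, timeout=60)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error when processing PDF. {e}"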
extract_citations.py
ADDED
@@ -0,0 +1,108 @@
+import requests
+import time
+import json
+import sys
+from pathlib import Path
+
+from tqdm import tqdm
+import pandas as pd
+
+MAIL_TO = "[email protected]"
+
+def get_openalex_ids(dois, batch_size=50):
+    """Retrieve the OpenAlex IDs for a list of DOIs."""
+    results = {}
+    for i in range(0, len(dois), batch_size):
+        batch = dois[i:i+batch_size]
+        pipe_separated_dois = "|".join(batch)
+        url = f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}&per-page={batch_size}&select=id,doi&mailto={MAIL_TO}"
+        response = requests.get(url)
+        time.sleep(0.1)  # Respect API rate limits
+
+        if response.status_code == 200:
+            data = response.json().get("results", [])
+            for a in data:
+                results[a.get("doi").replace("https://doi.org/", "")] = a.get("id")
+        else:
+            print(f"response failed with code: {response.status_code}")
+    return results
+
+def get_outgoing_citations(openalex_id):
+    """Retrieve the list of outgoing citations for an article given its OpenAlex ID."""
+
+    url = (
+        f"https://api.openalex.org/works?filter=cited_by:{openalex_id}"
+        f"&select=id,doi,title,keywords,authorships,abstract_inverted_index,publication_year,primary_location,language"
+        f"&per-page=200"
+        f"&mailto={MAIL_TO}"
+    )
+    response = requests.get(url)
+
+    if response.status_code == 200:
+        results = response.json().get("results", [])
+        return results
+    else:
+        print(f"response failed with code: {response.status_code}")
+        return []
+
+def extract_citation_data(citing_articles):
+    """Extracts relevant metadata from the citing articles."""
+    citations = []
+    for article in citing_articles:
+        citations.append({
+            "id": article.get("id"),
+            "doi": article.get("doi"),
+            "title": article.get("title"),
+            "authors": [
+                {"name": author.get("author", {}).get("display_name"), "id": author.get("author", {}).get("id")}
+                for author in article.get("authorships", [])
+            ],
+            "abstract": article.get("abstract_inverted_index"),
+            "year": article.get("publication_year"),
+            "venue": article.get("primary_location", {}).get("source", {}).get("display_name"),
+            "language": article.get("language")
+        })
+    return citations
+
+def fetch_citations_for_dois(doi_list):
+    """Main function to fetch outgoing citations for a list of DOIs."""
+    all_citations = {}
+    openalex_ids = get_openalex_ids(doi_list)
+    print(len(openalex_ids))
+    for doi, oa_id in tqdm(openalex_ids.items()):
+        all_citations[doi] = get_outgoing_citations(oa_id)
+        if len(all_citations[doi]) == 200:
+            print(">= 200 citations:", doi, oa_id)
+        time.sleep(0.1)  # Respect API rate limits
+    return all_citations
+
+def save_to_file(citations, fn):
+    # Save to a JSON file
+    with open(fn, "w") as f:
+        json.dump(citations, f)
+
+if __name__ == "__main__":
+    # Example usage
+    data = pd.read_parquet(sys.argv[1])
+    doi_list = data["OriginalPaperDOI"]
+    dois_w_fulltext = []
+    for doi in doi_list:
+        md_fn = doi.replace("https://doi.org/", "").replace("/", "|") + ".md"
+        if "retraction" in sys.argv[1]:
+            dir_up = Path("/mnt/data1/retraction_data/pdf_articles_unpaywall_md")
+            dir_oa = Path("/mnt/data1/retraction_data/pdf_articles_md")
+            dir_man = Path("/mnt/data1/retraction_data/pdf_articles_manual_md")
+            if (dir_up/md_fn).exists() or (dir_oa/md_fn).exists() or (dir_man/md_fn).exists():
+                dois_w_fulltext.append(doi)
+        elif "reference" in sys.argv[1]:
+            dir = Path("/mnt/data1/retraction_data/pdf_articles_reference_md")
+            if (dir/md_fn).exists():
+                dois_w_fulltext.append(doi)
+        else:
+            print("Can't find any markdown files for these DOIs.")
+    # dois_w_fulltext = dois_w_fulltext[:101]
+    print(f"Fetching outgoing citations for {len(dois_w_fulltext)} articles.")
+    out_fn = sys.argv[2]
+    citations_data = fetch_citations_for_dois(dois_w_fulltext)
+    save_to_file(citations_data, out_fn)
+    print(f"Citations data saved to {out_fn}")
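extract_citation_data stores the abstract exactly as OpenAlex returns it: an abstract_inverted_index that maps each word to the list of positions where it occurs, not plain text. A small helper (hypothetical, not part of this commit) that rebuilds a readable abstract from that structure:

def decode_abstract(inverted_index):
    """Rebuild a plain-text abstract from an OpenAlex abstract_inverted_index."""
    if not inverted_index:
        return None
    # Invert the mapping to position -> word, then join the words in order.
    words_by_position = {}
    for word, positions in inverted_index.items():
        for pos in positions:
            words_by_position[pos] = word
    return " ".join(words_by_position[pos] for pos in sorted(words_by_position))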
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+requests
+pandas
+tqdm
+torch
+transformers
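For a quick end-to-end check of the lookup path, the new module can be exercised on its own (hypothetical usage; the DOI below is an arbitrary example, and any DOI indexed by OpenAlex should work):

from extract_citations import fetch_citations_for_dois

# Fetch outgoing citations for one example DOI and report the counts.
citations = fetch_citations_for_dois(["10.1038/s41586-020-2649-2"])
for doi, refs in citations.items():
    print(doi, "->", len(refs), "outgoing citations")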