Christof Bless
committed on
add citation lookup function
- app.py +27 -9
- extract_citations.py +108 -0
- requirements.txt +5 -0
app.py
CHANGED
@@ -1,20 +1,38 @@
 import gradio as gr
+import requests
 
-
-
+from extract_citations import *
+# Set your GROBID server URL
+GROBID_URL = "http://localhost:8070/api/processReferences"
 
-
+def extract_text(pdf_file):
+    if not pdf_file:
+        return "Please upload a PDF file."
+    try:
+        # Send PDF to GROBID for citation extraction
+        with open(pdf_file.name, 'rb') as f:
+            print("processing PDF ...")
+    except Exception as e:
+        return f"Error when processing PDF. {e}"
+
+def extract_citations(doi, pdf_file=None):  # accepts both Gradio inputs; only the DOI is used for now
+    try:
+        citations_data = fetch_citations_for_dois([doi])
+    except Exception as e:
+        return f"Please submit a valid DOI. {e}"
+
+    return citations_data
+
+
+# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## Citation Integrity Score")
 
-    doi_input = gr.Textbox(label="Enter DOI")
+    doi_input = gr.Textbox(label="Enter DOI (optional)")
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-    output = gr.Textbox(label="
+    output = gr.Textbox(label="Extracted Citations", lines=20)
 
     submit_btn = gr.Button("Submit")
-
-
-    demo.launch()
+    submit_btn.click(fn=extract_citations, inputs=[doi_input, pdf_input], outputs=output)
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()
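Note that the GROBID call in extract_text is still a stub in this commit: the PDF is opened and a message printed, but nothing is sent to the server and no result is returned. A minimal sketch of how the request could be completed, assuming a GROBID instance is running at GROBID_URL (GROBID's processReferences endpoint accepts the PDF as a multipart form field named "input" and responds with TEI XML); this is not part of the commit:

def extract_text(pdf_file):
    if not pdf_file:
        return "Please upload a PDF file."
    try:
        # Post the PDF to GROBID's processReferences endpoint; the
        # response body is TEI XML describing the extracted references.
        with open(pdf_file.name, "rb") as f:
            response = requests.post(GROBID_URL, files={"input": f}, timeout=60)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error when processing PDF. {e}"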
extract_citations.py
ADDED
@@ -0,0 +1,108 @@
+import requests
+import time
+import json
+import sys
+from pathlib import Path
+
+from tqdm import tqdm
+import pandas as pd
+
+MAIL_TO = "[email protected]"
+
+def get_openalex_ids(dois, batch_size=50):
+    """Retrieve the OpenAlex IDs for a list of DOIs."""
+    results = {}
+    for i in range(0, len(dois), batch_size):
+        batch = dois[i:i+batch_size]
+        pipe_separated_dois = "|".join(batch)
+        url = f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}&per-page={batch_size}&select=id,doi&mailto={MAIL_TO}"
+        response = requests.get(url)
+        time.sleep(0.1)  # Respect API rate limits
+
+        if response.status_code == 200:
+            data = response.json().get("results", [])
+            for a in data:
+                results[a.get("doi").replace("https://doi.org/", "")] = a.get("id")
+        else:
+            print(f"response failed with code: {response.status_code}")
+    return results
+
+def get_outgoing_citations(openalex_id):
+    """Retrieve the list of outgoing citations for an article given its OpenAlex ID."""
+
+    url = (
+        f"https://api.openalex.org/works?filter=cited_by:{openalex_id}"
+        f"&select=id,doi,title,keywords,authorships,abstract_inverted_index,publication_year,primary_location,language"
+        f"&per-page=200"
+        f"&mailto={MAIL_TO}"
+    )
+    response = requests.get(url)
+
+    if response.status_code == 200:
+        results = response.json().get("results", [])
+        return results
+    else:
+        print(f"response failed with code: {response.status_code}")
+        return []
+
+def extract_citation_data(citing_articles):
+    """Extracts relevant metadata from the citing articles."""
+    citations = []
+    for article in citing_articles:
+        citations.append({
+            "id": article.get("id"),
+            "doi": article.get("doi"),
+            "title": article.get("title"),
+            "authors": [
+                {"name": author.get("author", {}).get("display_name"), "id": author.get("author", {}).get("id")}
+                for author in article.get("authorships", [])
+            ],
+            "abstract": article.get("abstract_inverted_index"),
+            "year": article.get("publication_year"),
+            "venue": article.get("primary_location", {}).get("source", {}).get("display_name"),
+            "language": article.get("language")
+        })
+    return citations
+
+def fetch_citations_for_dois(doi_list):
+    """Main function to fetch outgoing citations for a list of DOIs."""
+    all_citations = {}
+    openalex_ids = get_openalex_ids(doi_list)
+    print(len(openalex_ids))
+    for doi, oa_id in tqdm(openalex_ids.items()):
+        all_citations[doi] = get_outgoing_citations(oa_id)
+        if len(all_citations[doi]) == 200:
+            print(">= 200 citations:", doi, oa_id)
+        time.sleep(0.1)  # Respect API rate limits
+    return all_citations
+
+def save_to_file(citations, fn):
+    # Save to a JSON file
+    with open(fn, "w") as f:
+        json.dump(citations, f)
+
+if __name__ == "__main__":
+    # Example usage
+    data = pd.read_parquet(sys.argv[1])
+    doi_list = data["OriginalPaperDOI"]
+    dois_w_fulltext = []
+    for doi in doi_list:
+        md_fn = doi.replace("https://doi.org/", "").replace("/", "|") + ".md"
+        if "retraction" in sys.argv[1]:
+            dir_up = Path("/mnt/data1/retraction_data/pdf_articles_unpaywall_md")
+            dir_oa = Path("/mnt/data1/retraction_data/pdf_articles_md")
+            dir_man = Path("/mnt/data1/retraction_data/pdf_articles_manual_md")
+            if (dir_up/md_fn).exists() or (dir_oa/md_fn).exists() or (dir_man/md_fn).exists():
+                dois_w_fulltext.append(doi)
+        elif "reference" in sys.argv[1]:
+            dir = Path("/mnt/data1/retraction_data/pdf_articles_reference_md")
+            if (dir/md_fn).exists():
+                dois_w_fulltext.append(doi)
+        else:
+            print("Can't find any markdown files for these DOIs.")
+    # dois_w_fulltext = dois_w_fulltext[:101]
+    print(f"Fetching outgoing citations for {len(dois_w_fulltext)} articles.")
+    out_fn = sys.argv[2]
+    citations_data = fetch_citations_for_dois(dois_w_fulltext)
+    save_to_file(citations_data, out_fn)
+    print(f"Citations data saved to {out_fn}")
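extract_citation_data stores the abstract exactly as OpenAlex returns it: an abstract_inverted_index that maps each word to the list of positions where it occurs, not plain text. A small helper (hypothetical, not part of this commit) that rebuilds a readable abstract from that structure:

def decode_abstract(inverted_index):
    """Rebuild a plain-text abstract from an OpenAlex abstract_inverted_index."""
    if not inverted_index:
        return None
    # Invert the mapping to position -> word, then join the words in order.
    words_by_position = {}
    for word, positions in inverted_index.items():
        for pos in positions:
            words_by_position[pos] = word
    return " ".join(words_by_position[pos] for pos in sorted(words_by_position))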
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+requests
+pandas
+tqdm
+torch
+transformers
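For a quick end-to-end check of the lookup path, the new module can be exercised on its own (hypothetical usage; the DOI below is an arbitrary example, and any DOI indexed by OpenAlex should work):

from extract_citations import fetch_citations_for_dois

# Fetch outgoing citations for one example DOI and report the counts.
citations = fetch_citations_for_dois(["10.1038/s41586-020-2649-2"])
for doi, refs in citations.items():
    print(doi, "->", len(refs), "outgoing citations")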