Christof Bless committed on
Commit
64d8f8c
Β·
unverified Β·
1 Parent(s): ee35a4d

add citation lookup function

Browse files
Files changed (3) hide show
  1. app.py +27 -9
  2. extract_citations.py +108 -0
  3. requirements.txt +5 -0
app.py CHANGED
@@ -1,20 +1,38 @@
1
  import gradio as gr
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
5
 
6
- # Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  with gr.Blocks() as demo:
8
  gr.Markdown("## Citation Integrity Score")
9
 
10
- doi_input = gr.Textbox(label="Enter DOI")
11
  pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
12
- output = gr.Textbox(label="Output")
13
 
14
  submit_btn = gr.Button("Submit")
15
- # submit_btn.click(fn=process_input, inputs=[doi_input, pdf_input], outputs=output)
16
-
17
- demo.launch()
18
 
19
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
20
  demo.launch()
 
1
  import gradio as gr
2
+ import requests
3
 
4
+ from extract_citations import *
5
+ # Set your GROBID server URL
6
+ GROBID_URL = "http://localhost:8070/api/processReferences"
7
 
8
def extract_text(pdf_file):
    """Send an uploaded PDF to the GROBID server for reference extraction.

    Parameters
    ----------
    pdf_file : gradio file object or None
        The uploaded PDF; ``None`` when the user submitted without a file.

    Returns
    -------
    str
        The TEI XML produced by GROBID's ``processReferences`` endpoint,
        or a human-readable error message on failure.
    """
    if not pdf_file:
        return "Please upload a PDF file."
    try:
        # Send PDF to GROBID for citation extraction
        with open(pdf_file.name, 'rb') as f:
            print("processing PDF ...")
            # Reference extraction on large PDFs is slow; a generous timeout
            # still prevents the UI from hanging forever on a dead server.
            response = requests.post(GROBID_URL, files={"input": f}, timeout=120)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error when processing PDF. {e}"
17
+
18
def extract_citations(doi, pdf_file=None):
    """Look up the outgoing citations of an article given its DOI.

    Parameters
    ----------
    doi : str
        DOI entered in the UI textbox.
    pdf_file : optional
        Uploaded PDF component value. Accepted (and currently unused) so the
        function can be wired to ``inputs=[doi_input, pdf_input]`` in the UI
        without raising ``TypeError`` on click.

    Returns
    -------
    dict or str
        Mapping of DOI -> citation records on success, otherwise an error
        message string.
    """
    try:
        citations_data = fetch_citations_for_dois([doi])
    except Exception as e:
        return f"Please submit a valid DOI. {e}"

    return citations_data
25
+
26
+
27
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## Citation Integrity Score")

    doi_input = gr.Textbox(label="Enter DOI (optional)")
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    output = gr.Textbox(label="Extracted Citations", lines=20)

    submit_btn = gr.Button("Submit")
    # extract_citations consumes only the DOI; passing the PDF component as a
    # second input would call it with two positional args and raise TypeError
    # on every click.
    submit_btn.click(fn=extract_citations, inputs=[doi_input], outputs=output)

demo.launch()
extract_citations.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import time
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from tqdm import tqdm
8
+ import pandas as pd
9
+
10
+ MAIL_TO = "[email protected]"
11
+
12
def get_openalex_ids(dois, batch_size=50):
    """Retrieve the OpenAlex IDs for a list of DOIs.

    Parameters
    ----------
    dois : list[str]
        Plain DOIs (without the ``https://doi.org/`` prefix).
    batch_size : int, optional
        DOIs resolved per API request.

    Returns
    -------
    dict
        Mapping of DOI -> OpenAlex work ID. DOIs the API could not resolve
        are absent from the result.
    """
    results = {}
    for i in range(0, len(dois), batch_size):
        batch = dois[i:i + batch_size]
        # OpenAlex OR-combines filter values separated by a pipe.
        pipe_separated_dois = "|".join(batch)
        url = (
            f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}"
            f"&per-page={batch_size}&select=id,doi&mailto={MAIL_TO}"
        )
        # Timeout so one stuck request cannot hang the whole pipeline.
        response = requests.get(url, timeout=30)
        time.sleep(0.1)  # Respect API rate limits

        if response.status_code == 200:
            for work in response.json().get("results", []):
                doi = work.get("doi")
                # Guard: records can come back without a DOI; .replace on
                # None would raise AttributeError.
                if doi:
                    results[doi.replace("https://doi.org/", "")] = work.get("id")
        else:
            print(f"response failed with code: {response.status_code}")
    return results
29
+
30
def get_outgoing_citations(openalex_id):
    """Retrieve the works that cite the article with the given OpenAlex ID.

    Parameters
    ----------
    openalex_id : str
        An OpenAlex work ID (e.g. ``https://openalex.org/W...``).

    Returns
    -------
    list[dict]
        Up to 200 citing-work records (a single API page); empty list on a
        non-200 response.
    """
    url = (
        f"https://api.openalex.org/works?filter=cited_by:{openalex_id}"
        f"&select=id,doi,title,keywords,authorships,abstract_inverted_index,publication_year,primary_location,language"
        f"&per-page=200"
        f"&mailto={MAIL_TO}"
    )
    # Timeout so one stuck request cannot hang the whole pipeline.
    response = requests.get(url, timeout=30)

    if response.status_code == 200:
        return response.json().get("results", [])
    else:
        print(f"response failed with code: {response.status_code}")
        return []
47
+
48
def extract_citation_data(citing_articles):
    """Extracts relevant metadata from the citing articles.

    Parameters
    ----------
    citing_articles : list[dict]
        Work records as returned by the OpenAlex works endpoint.

    Returns
    -------
    list[dict]
        One flattened record per article with id, doi, title, authors,
        abstract (inverted index), year, venue and language.
    """
    citations = []
    for article in citing_articles:
        # OpenAlex frequently returns "primary_location": null or
        # "source": null; dict.get's default only applies when the key is
        # *missing*, so chain through `or {}` to avoid None.get crashes.
        location = article.get("primary_location") or {}
        source = location.get("source") or {}
        citations.append({
            "id": article.get("id"),
            "doi": article.get("doi"),
            "title": article.get("title"),
            "authors": [
                {"name": author.get("author", {}).get("display_name"), "id": author.get("author", {}).get("id")}
                for author in article.get("authorships", [])
            ],
            "abstract": article.get("abstract_inverted_index"),
            "year": article.get("publication_year"),
            "venue": source.get("display_name"),
            "language": article.get("language")
        })
    return citations
66
+
67
def fetch_citations_for_dois(doi_list):
    """Fetch the outgoing citations for every DOI in *doi_list*.

    Resolves the DOIs to OpenAlex IDs first, then pulls one page of citing
    works per article. Returns a dict mapping DOI -> list of citing works.
    """
    openalex_ids = get_openalex_ids(doi_list)
    print(len(openalex_ids))
    all_citations = {}
    for doi, oa_id in tqdm(openalex_ids.items()):
        citing = get_outgoing_citations(oa_id)
        all_citations[doi] = citing
        # A full page (200 results) may mean the citation list was truncated.
        if len(citing) == 200:
            print(">= 200 citations:", doi, oa_id)
        time.sleep(0.1)  # Respect API rate limits
    return all_citations
78
+
79
def save_to_file(citations, fn):
    """Serialize *citations* to *fn* as JSON.

    Parameters
    ----------
    citations : dict
        JSON-serializable citation data.
    fn : str or Path
        Destination file path (overwritten if it exists).
    """
    # Explicit encoding: the platform default ("w" without encoding) is not
    # UTF-8 everywhere, which corrupts non-ASCII titles/author names.
    with open(fn, "w", encoding="utf-8") as f:
        json.dump(citations, f)
83
+
84
if __name__ == "__main__":
    # Example usage: python extract_citations.py <input.parquet> <output.json>
    data = pd.read_parquet(sys.argv[1])
    doi_list = data["OriginalPaperDOI"]
    dois_w_fulltext = []
    for doi in doi_list:
        # Markdown files are named after the DOI with '/' mapped to '|'.
        md_fn = doi.replace("https://doi.org/", "").replace("/", "|") + ".md"
        if "retraction" in sys.argv[1]:
            candidate_dirs = (
                Path("/mnt/data1/retraction_data/pdf_articles_unpaywall_md"),
                Path("/mnt/data1/retraction_data/pdf_articles_md"),
                Path("/mnt/data1/retraction_data/pdf_articles_manual_md"),
            )
            if any((d / md_fn).exists() for d in candidate_dirs):
                dois_w_fulltext.append(doi)
        elif "reference" in sys.argv[1]:
            ref_dir = Path("/mnt/data1/retraction_data/pdf_articles_reference_md")
            if (ref_dir / md_fn).exists():
                dois_w_fulltext.append(doi)
        else:
            print("Can't find any markdown files for these DOI's.")
    print(f"Fetching outgoing citations for {len(dois_w_fulltext)} articles.")
    out_fn = sys.argv[2]
    citations_data = fetch_citations_for_dois(dois_w_fulltext)
    save_to_file(citations_data, out_fn)
    print(f"Citations data saved to {out_fn}")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ requests
2
+ pandas
3
+ tqdm
4
+ torch
5
+ transformers