import requests
import time
import json
import sys
from pathlib import Path

import pandas as pd
from tqdm import tqdm

MAIL_TO = "christof.bless@hslu.ch"


def get_openalex_ids(dois, batch_size=50):
    """Retrieve the OpenAlex IDs for a list of DOIs."""
    results = {}
    for i in range(0, len(dois), batch_size):
        batch = dois[i:i + batch_size]
        # OpenAlex accepts up to 50 pipe-separated values per filter.
        pipe_separated_dois = "|".join(batch)
        url = (
            f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}"
            f"&per-page={batch_size}&select=id,doi&mailto={MAIL_TO}"
        )
        response = requests.get(url)
        time.sleep(0.1)  # Respect API rate limits
        if response.status_code == 200:
            data = response.json().get("results", [])
            for a in data:
                results[a.get("doi").replace("https://doi.org/", "")] = a.get("id")
        else:
            print(f"Request failed with code: {response.status_code}")
    return results


def get_outgoing_citations(openalex_id):
    """Retrieve the outgoing citations (referenced works) of an article given its OpenAlex ID."""
    url = (
        f"https://api.openalex.org/works?filter=cited_by:{openalex_id}"
        f"&select=id,doi,title,keywords,authorships,abstract_inverted_index,publication_year,primary_location,language"
        f"&per-page=200"
        f"&mailto={MAIL_TO}"
    )
    response = requests.get(url)
    if response.status_code == 200:
        return response.json().get("results", [])
    print(f"Request failed with code: {response.status_code}")
    return []


def extract_citation_data(citing_articles):
    """Extract relevant metadata from the citing articles."""
    citations = []
    for article in citing_articles:
        citations.append({
            "id": article.get("id"),
            "doi": article.get("doi"),
            "title": article.get("title"),
            "authors": [
                {
                    "name": author.get("author", {}).get("display_name"),
                    "id": author.get("author", {}).get("id"),
                }
                for author in article.get("authorships", [])
            ],
            "abstract": article.get("abstract_inverted_index"),
            "year": article.get("publication_year"),
            # primary_location and source can be None, so fall back to empty dicts.
            "venue": ((article.get("primary_location") or {}).get("source") or {}).get("display_name"),
            "language": article.get("language"),
        })
    return citations


def fetch_citations_for_dois(doi_list):
    """Fetch the outgoing citations for a list of DOIs."""
    all_citations = {}
    openalex_ids = get_openalex_ids(doi_list)
    print(f"Resolved {len(openalex_ids)} OpenAlex IDs.")
    for doi, oa_id in tqdm(openalex_ids.items()):
        all_citations[doi] = get_outgoing_citations(oa_id)
        if len(all_citations[doi]) == 200:
            # per-page is capped at 200, so a full page means the list may be truncated.
            print(">= 200 citations:", doi, oa_id)
        time.sleep(0.1)  # Respect API rate limits
    return all_citations


def save_to_file(citations, fn):
    """Save the citation data to a JSON file."""
    with open(fn, "w") as f:
        json.dump(citations, f)


if __name__ == "__main__":
    # Load the input parquet and keep only DOIs for which a markdown full text exists.
    data = pd.read_parquet(sys.argv[1])
    doi_list = data["OriginalPaperDOI"]
    dois_w_fulltext = []
    for doi in doi_list:
        md_fn = doi.replace("https://doi.org/", "").replace("/", "|") + ".md"
        if "retraction" in sys.argv[1]:
            dir_up = Path("/mnt/data1/retraction_data/pdf_articles_unpaywall_md")
            dir_oa = Path("/mnt/data1/retraction_data/pdf_articles_md")
            dir_man = Path("/mnt/data1/retraction_data/pdf_articles_manual_md")
            if (dir_up / md_fn).exists() or (dir_oa / md_fn).exists() or (dir_man / md_fn).exists():
                dois_w_fulltext.append(doi)
        elif "reference" in sys.argv[1]:
            dir_ref = Path("/mnt/data1/retraction_data/pdf_articles_reference_md")
            if (dir_ref / md_fn).exists():
                dois_w_fulltext.append(doi)
        else:
            print("Can't find any markdown files for these DOIs.")

    # dois_w_fulltext = dois_w_fulltext[:101]
    print(f"Fetching outgoing citations for {len(dois_w_fulltext)} articles.")
    out_fn = sys.argv[2]
    citations_data = fetch_citations_for_dois(dois_w_fulltext)
    save_to_file(citations_data, out_fn)
    print(f"Citations data saved to {out_fn}")