Christof Bless committed on
Commit b23f8b6 · unverified · 1 Parent(s): db2b5ef

first working mvp

Files changed (4)
  1. app.py +91 -9
  2. extract_embeddings.py +412 -0
  3. extract_mentions.py +200 -0
  4. requirements.txt +1 -0
app.py CHANGED
@@ -1,17 +1,27 @@
  import gradio as gr
- import requests
+ import numpy as np
+ import pymupdf4llm
+ import spacy

- from extract_citations import *
- # Set your GROBID server URL
- GROBID_URL = "http://localhost:8070/api/processReferences"
+ from transformers import AutoTokenizer, AutoModel
+ from adapters import AutoAdapterModel
+
+
+ from extract_citations import fetch_citations_for_dois
+ from extract_embeddings import (
+     prune_contexts,
+     embed_abstracts,
+     embed_contexts,
+     restore_inverted_abstract,
+     calculate_distances
+ )
+ from extract_mentions import extract_citation_contexts

  def extract_text(pdf_file):
      if not pdf_file:
          return "Please upload a PDF file."
      try:
-         # Send PDF to GROBID for citation extraction
-         with open(pdf_file.name, 'rb') as f:
-             print("processing PDF ...")
+         return pymupdf4llm.to_markdown(pdf_file)
      except Exception as e:
          return f"Error when processing PDF. {e}"

@@ -23,7 +33,79 @@ def extract_citations(doi):

      return citations_data

-
+ def get_cite_context_distance(pdf, doi):
+     # Load models
+     tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
+     model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
+     nlp = spacy.load("en_core_web_sm")
+
+     # fetch cited papers from OpenAlex
+     citations_data = fetch_citations_for_dois([doi])
+     # get markdown text from PDF file
+     text = extract_text(pdf.name)
+     # get the context around citation markers
+     citations = extract_citation_contexts(citations_data, text)
+     citations["pruned_contexts"], citations["known_tokens_fraction"] = prune_contexts(citations, nlp, tokenizer)
+
+     # embed the contexts
+     citation_context_embedding = embed_contexts(
+         citations[
+             (citations["known_tokens_fraction"] >= 0.7) &
+             (~citations["pruned_contexts"].isna())
+         ]["pruned_contexts"].to_list(),
+         model,
+         tokenizer,
+     ).detach().numpy()
+
+     citations_data = {entry["id"]:entry for cite in citations_data.values() for entry in cite}
+     # embed the abstract
+     citation_abstract_embedding = embed_abstracts(
+         [
+             {
+                 "title":citations_data[cite]["title"],
+                 "abstract": (
+                     restore_inverted_abstract(
+                         citations_data[cite]["abstract_inverted_index"]
+                     )
+                     if citations_data[cite]["abstract_inverted_index"] is not None
+                     else None
+                 )
+             }
+             for cite in citations["citation_id"].unique()
+         ],
+         model,
+         tokenizer,
+         batch_size=4,
+     ).detach().numpy()
+     print(citation_abstract_embedding.shape)
+
+     # calculate the distances
+     index_left = citations.index[
+         (citations["known_tokens_fraction"] >= 0.7) &
+         (~citations["pruned_contexts"].isna())
+     ].tolist()
+
+     index_right = citations["citation_id"].unique().tolist()
+
+     indices = [
+         (index_left.index(i), index_right.index(cite_id))
+         if i in index_left else (None, None)
+         for i, cite_id in enumerate(citations["citation_id"])
+     ]
+     distances = np.array(calculate_distances(citation_context_embedding, citation_abstract_embedding, indices))
+     results = []
+     for i, dist in enumerate(distances):
+         if not np.isnan(dist):
+             obj = {}
+             left_context = citations.left_context[i][-50:].replace('\n', '')
+             right_context = citations.right_context[i][:50].replace('\n', '')
+             obj["cite_context_short"] = f"...{left_context}{citations.mention[i]}{right_context}..."
+             obj["cited_paper"] = citations_data[citations.citation_id[i]]["title"]
+             obj["cited_paper_id"] = citations.citation_id[i]
+             obj["distance"] = dist
+             results.append(obj)
+     return {"score": np.nanmean(distances), "individual_citations": results}
+
  # Gradio UI
  with gr.Blocks() as demo:
      gr.Markdown("## Citation Integrity Score")
@@ -33,6 +115,6 @@ with gr.Blocks() as demo:
      output = gr.Textbox(label="Extracted Citations", lines=20)

      submit_btn = gr.Button("Submit")
-     submit_btn.click(fn=extract_citations, inputs=[doi_input], outputs=output)
+     submit_btn.click(fn=get_cite_context_distance, inputs=[pdf_input, doi_input], outputs=output)

  demo.launch()
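
For orientation, the object returned by get_cite_context_distance (and rendered as text in the Gradio Textbox) has roughly the shape sketched below; the values are invented for illustration, only the keys mirror the code above.

    # Illustrative shape of the result of get_cite_context_distance.
    # All values here are made up; only the keys follow the code above.
    example_result = {
        "score": 7.3,  # np.nanmean over the per-citation L2 distances
        "individual_citations": [
            {
                "cite_context_short": "...as shown in earlier work [12], transformer models...",
                "cited_paper": "Some cited paper title",
                "cited_paper_id": "https://openalex.org/W0000000000",
                "distance": 6.8,  # L2 distance between context and cited-abstract embeddings
            },
        ],
    }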
extract_embeddings.py ADDED
@@ -0,0 +1,412 @@
1
+ import json
2
+ import string
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+ import torch
8
+ import spacy
9
+ from transformers import AutoTokenizer, AutoModel
10
+ from adapters import AutoAdapterModel
11
+
12
+
13
+ def restore_inverted_abstract(inverted_abstr):
14
+ all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
15
+ if len(all_indexes) > 0:
16
+ length = max(all_indexes) + 1
17
+ else:
18
+ return None
19
+ abstract_words = ["" for _ in range(length)]
20
+ for word, indexes in inverted_abstr.items():
21
+ for index in indexes:
22
+ abstract_words[index] = word
23
+ return " ".join(abstract_words)
24
+
25
+ def extract_title_abstract(oa_object):
26
+ abstract = oa_object["abstract_inverted_index"]
27
+ title_abstract_obj = {
28
+ "title": oa_object["title"],
29
+ "abstract": (None if abstract is None else restore_inverted_abstract(abstract))
30
+ }
31
+ return title_abstract_obj
32
+
33
+ def preprocess_batch(batch, tokenizer, input_is_context=False):
34
+ # papers = [{'title': 'BERT', 'abstract': 'We introduce a new language representation model called BERT'},
35
+ # {'title': 'Attention is all you need', 'abstract': ' The dominant sequence transduction models are based on complex recurrent or convolutional neural networks'}]
36
+
37
+ # concatenate title and abstract
38
+ if not input_is_context:
39
+ batch = [(d['title'] or '') + tokenizer.sep_token + (d.get('abstract') or '') for d in batch]
40
+
41
+ tokenized_batch = tokenizer(batch, padding=True, truncation=True,
42
+ return_tensors="pt", return_token_type_ids=False, max_length=512)
43
+ return tokenized_batch
44
+ def sent_is_mostly_known_tokens(tokens, tokenizer, threshold=0.7):
45
+ return get_fraction_of_known_tokens(tokens, tokenizer) >= threshold
46
+
47
+ def get_fraction_of_known_tokens(tokens, tokenizer):
48
+ total_tokens = len(tokens)
49
+ if total_tokens == 0:
50
+ return 0.0 # Avoid division by zero when there are no tokens
51
+
52
+ # Clean tokens and check if they exist in the tokenizer's vocab
53
+ known_tokens = sum(1 for token in tokens if token.text.lower().strip(string.punctuation) in tokenizer.vocab)
54
+ return known_tokens / total_tokens
55
+
56
+ def prune_contexts(contexts, spacy_model, tokenizer):
57
+ chosen_sents = []
58
+ fractions = []
59
+ for _, context in tqdm(contexts.iterrows(), total=len(contexts)):
60
+ text = (context["left_context"] + context["mention"] + context["right_context"]).replace("\n", " ")
61
+ citation_start = len(context["left_context"]) + 1
62
+ spacied = spacy_model(text)
63
+ chosen_sent = None
64
+ previous_sent = ""
65
+ kt_fraction = None
66
+ for sent in spacied.sents:
67
+ if citation_start < sent.end_char and citation_start >= sent.start_char:
68
+ chosen_sent = previous_sent + sent.text
69
+ kt_fraction = get_fraction_of_known_tokens(sent, tokenizer)
70
+ break
71
+ previous_sent = sent.text
72
+
73
+ if chosen_sent is None or len(chosen_sent.split()) < 5:
74
+ print(f" - no context found: {spacied.text}")
75
+ chosen_sent = None
76
+ # if chosen_sent is not None:
77
+ chosen_sents.append(chosen_sent)
78
+ fractions.append(kt_fraction)
79
+ return chosen_sents, fractions
80
+
81
+ def embed_contexts(contexts, model, tokenizer, batch_size = 16):
82
+ embeddings = []
83
+ # Process in batches
84
+ with torch.no_grad(): # Disable gradient tracking to save memory
85
+ for i in tqdm(range(0, len(contexts), batch_size)):
86
+ batch = contexts[i:i + batch_size]
87
+ try:
88
+ inputs = preprocess_batch(batch, tokenizer, input_is_context=True)
89
+ except Exception as e:
90
+ print(e)
91
+ breakpoint()
92
+ batch_embeddings = embed_batch(inputs, model)
93
+ embeddings.append(batch_embeddings)
94
+
95
+ # Concatenate all batches back together
96
+ return torch.cat(embeddings, dim=0)
97
+
98
+ def embed_batch(tokenized_batch, model):
99
+ output = model(**tokenized_batch)
100
+ # take the first token in the batch as the embedding
101
+ embeddings = output.last_hidden_state[:, 0, :]
102
+ return embeddings
103
+
104
+
105
+ def embed_abstracts(abstract_title_list, model, tokenizer, batch_size=16):
106
+ print("Loaded specter2 model:")
107
+ embeddings = []
108
+
109
+ # Process in batches
110
+ with torch.no_grad(): # Disable gradient tracking to save memory
111
+ for i in tqdm(range(0, len(abstract_title_list), batch_size)):
112
+ batch = abstract_title_list[i:i + batch_size]
113
+ inputs = preprocess_batch(batch, tokenizer)
114
+ batch_embeddings = embed_batch(inputs, model)
115
+ embeddings.append(batch_embeddings)
116
+
117
+ # Concatenate all batches back together
118
+ return torch.cat(embeddings, dim=0)
119
+
120
+ def calculate_distances(embeddings_a, embeddings_b, indices, batch_size=512):
121
+ # Initialize a list to store the results
122
+ all_distances = [None] * len(indices)
123
+
124
+ # Loop over the embeddings in batches
125
+ num_batches = len(indices) // batch_size + (1 if len(indices) % batch_size != 0 else 0)
126
+ for i in range(num_batches):
127
+ # Get the current batch
128
+ start_idx = i * batch_size
129
+ end_idx = min((i + 1) * batch_size, len(indices))
130
+
131
+ batch_a, batch_b, batch_positions = [], [], []
132
+ for idx, (a, b) in enumerate(indices[start_idx:end_idx]):
133
+ if a is None or b is None:
134
+ all_distances[start_idx + idx] = np.nan # Assign NaN directly in place
135
+ else:
136
+ batch_a.append(embeddings_a[a])
137
+ batch_b.append(embeddings_b[b])
138
+ batch_positions.append(start_idx + idx)
139
+
140
+ if batch_a and batch_b:
141
+ batch_a = torch.from_numpy(np.array(batch_a)).float()
142
+ batch_b = torch.from_numpy(np.array(batch_b)).float()
143
+
144
+ # Compute L2 (Euclidean) distance for the batch
145
+ distances_batch = torch.norm(batch_a - batch_b, p=2, dim=1).numpy().astype(float)
146
+
147
+ # Assign computed distances in the correct positions
148
+ for pos, dist in zip(batch_positions, distances_batch):
149
+ all_distances[pos] = dist
150
+
151
+ return all_distances
152
+
153
+ def add_distances_to_df(df, index_left, index_right, embeddings, column_name):
154
+ if column_name == "abstract_abstract_l2_distance":
155
+ indices = [(index_left.index(doi), index_right.index(cite_id)) for doi, cite_id in zip(df["cited_in_doi"], df["citation_id"])]
156
+ print("calculate distances...")
157
+ distances = calculate_distances(embeddings["original_abstract"], embeddings["citation_abstract"], indices)
158
+ df[column_name] = distances
159
+ elif column_name == "context_abstract_l2_distance":
160
+ indices = [
161
+ (index_left.index(i), index_right.index(cite_id))
162
+ if i in index_left else (None, None)
163
+ for i, cite_id in enumerate(df["citation_id"])
164
+ ]
165
+ print("calculate distances...")
166
+ distances = calculate_distances(embeddings["citation_context_base"], embeddings["citation_abstract"], indices)
167
+ df[column_name] = distances
168
+ return df
169
+
170
+ def add_pruned_contexts_to_df(df, df_name):
171
+ tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
172
+ nlp = spacy.load("en_core_web_lg")
173
+
174
+ df["pruned_contexts"], df["known_tokens_fraction"] = prune_contexts(df, nlp, tokenizer)
175
+ df.to_parquet(df_name, compression='gzip')
176
+
177
+ def main_specter(retracted, overwrite=True):
178
+ tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
179
+ model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
180
+ # model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)
181
+
182
+ if not overwrite:
183
+ embeddings_from_disk = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')
184
+
185
+ ## Paper abstracts
186
+ if retracted:
187
+ data = pd.read_parquet("25_02_05_retractions_abstracts_cleaned.gzip")
188
+ else:
189
+ # data = pd.read_parquet("24_11_30_reference_articles.gzip")
190
+ data = pd.read_parquet("24_12_31_reference_articles_most_cited.gzip")
191
+ print("embedding original abstracts...")
192
+ if not overwrite and "original_abstract" in embeddings_from_disk:
193
+ paper_abstract_embedding = embeddings_from_disk["original_abstract"]
194
+ else:
195
+ paper_abstract_embedding = embed_abstracts(
196
+ [
197
+ {"title":r["Title"], "abstract": r["Abstract"]}
198
+ for _,r in data.iterrows()
199
+ ],
200
+ model,
201
+ tokenizer,
202
+ batch_size=4
203
+ ).detach().numpy()
204
+
205
+
206
+ ## Cited papers abstracts
207
+ if retracted:
208
+ citations_df_name = "retraction_citation_mentions.gzip"
209
+ with open("retractions_citations.json") as jsonfile:
210
+ cite_data = json.load(jsonfile)
211
+ citations = pd.read_parquet(citations_df_name)
212
+ else:
213
+ citations_df_name = "reference_mc_citation_mentions.gzip"
214
+ # with open("reference_citations.json") as jsonfile:
215
+ with open("reference_most_cited_citations.json") as jsonfile:
216
+ cite_data = json.load(jsonfile)
217
+ citations = pd.read_parquet(citations_df_name)
218
+
219
+ cite_data = {entry["id"]:entry for cite in cite_data.values() for entry in cite}
220
+
221
+ print("embedding cited abstracts...")
222
+ if not overwrite and "citation_abstract" in embeddings_from_disk:
223
+ citation_abstract_embedding = embeddings_from_disk["citation_abstract"]
224
+ else:
225
+ citation_abstract_embedding = embed_abstracts(
226
+ [
227
+ {
228
+ "title":cite_data[cite]["title"],
229
+ "abstract": (restore_inverted_abstract(cite_data[cite]["abstract_inverted_index"]) if cite_data[cite]["abstract_inverted_index"] is not None else None)
230
+ }
231
+ for cite in citations["citation_id"].unique()
232
+ ],
233
+ model,
234
+ tokenizer,
235
+ batch_size=4,
236
+ ).detach().numpy()
237
+
238
+ print("embedding citation contexts base...")
239
+ if not overwrite and "citation_context_base" in embeddings_from_disk:
240
+ citation_context_embedding_base = embeddings_from_disk["citation_context_base"]
241
+ else:
242
+
243
+ citation_context_embedding_base = embed_contexts(
244
+ citations[
245
+ (citations["known_tokens_fraction"] >= 0.7) &
246
+ (~citations["pruned_contexts"].isna())
247
+ ]["pruned_contexts"].to_list(),
248
+ model,
249
+ tokenizer,
250
+ ).detach().numpy()
251
+
252
+ print("embedding citation contexts...")
253
+ if not overwrite and "citation_context" in embeddings_from_disk:
254
+ citation_context_embedding = embeddings_from_disk["citation_context"]
255
+ else:
256
+ model.load_adapter("allenai/specter2_adhoc_query", source="hf", load_as="adhoc", set_active=True)
257
+
258
+ citation_context_embedding = embed_contexts(
259
+ citations[
260
+ (citations["known_tokens_fraction"] >= 0.7) &
261
+ (~citations["pruned_contexts"].isna())
262
+ ]["pruned_contexts"].to_list(),
263
+ model,
264
+ tokenizer,
265
+ ).detach().numpy()
266
+ # Save
267
+ np.savez(
268
+ f'{("retractions" if retracted else "reference")}_embeddings_specter.npz',
269
+ original_abstract=paper_abstract_embedding,
270
+ citation_context=citation_context_embedding,
271
+ citation_abstract=citation_abstract_embedding,
272
+ citation_context_base=citation_context_embedding_base,
273
+ )
274
+
275
+ # Load
276
+ data = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')
277
+ print(data["original_abstract"].shape) # (768,) or (1536,) depending on the model
278
+ print(data["citation_context"].shape) # (768,) or (1536,) depending on the model
279
+ print(data["citation_context_base"].shape) # (768,) or (1536,) depending on the model
280
+ print(data["citation_abstract"].shape) # (768,) or (1536,) depending on the model
281
+
282
+ def main_scibert(retracted, overwrite=True):
283
+ tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
284
+ model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
285
+ # model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)
286
+
287
+ if not overwrite:
288
+ embeddings_from_disk = np.load(f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz')
289
+
290
+ ## Paper abstracts
291
+ if retracted:
292
+ data = pd.read_parquet("25_02_05_retractions_abstracts_cleaned.gzip")
293
+ else:
294
+ # data = pd.read_parquet("24_11_30_reference_articles.gzip")
295
+ data = pd.read_parquet("24_12_31_reference_articles_most_cited.gzip")
296
+ print("embedding original abstracts...")
297
+ if not overwrite and "original_abstract" in embeddings_from_disk:
298
+ paper_abstract_embedding = embeddings_from_disk["original_abstract"]
299
+ else:
300
+ paper_abstract_embedding = embed_abstracts(
301
+ [
302
+ {"title":r["Title"], "abstract": r["Abstract"]}
303
+ for _,r in data.iterrows()
304
+ ],
305
+ model,
306
+ tokenizer,
307
+ batch_size=4
308
+ ).detach().numpy()
309
+
310
+
311
+ ## Cited papers abstracts
312
+ if retracted:
313
+ citations_df_name = "retraction_citation_mentions.gzip"
314
+ with open("retractions_citations.json") as jsonfile:
315
+ cite_data = json.load(jsonfile)
316
+ citations = pd.read_parquet(citations_df_name)
317
+ else:
318
+ citations_df_name = "reference_mc_citation_mentions.gzip"
319
+ # with open("reference_citations.json") as jsonfile:
320
+ with open("reference_most_cited_citations.json") as jsonfile:
321
+ cite_data = json.load(jsonfile)
322
+ citations = pd.read_parquet(citations_df_name)
323
+
324
+ cite_data = {entry["id"]:entry for cite in cite_data.values() for entry in cite}
325
+
326
+ print("embedding cited abstracts...")
327
+ if not overwrite and "citation_abstract" in embeddings_from_disk:
328
+ citation_abstract_embedding = embeddings_from_disk["citation_abstract"]
329
+ else:
330
+ citation_abstract_embedding = embed_abstracts(
331
+ [
332
+ {
333
+ "title":cite_data[cite]["title"],
334
+ "abstract": (restore_inverted_abstract(cite_data[cite]["abstract_inverted_index"]) if cite_data[cite]["abstract_inverted_index"] is not None else None)
335
+ }
336
+ for cite in citations["citation_id"].unique()
337
+ ],
338
+ model,
339
+ tokenizer,
340
+ batch_size=4,
341
+ ).detach().numpy()
342
+
343
+ print("embedding citation contexts...")
344
+ if not overwrite and "citation_context" in embeddings_from_disk:
345
+ citation_context_embedding = embeddings_from_disk["citation_context"]
346
+ else:
347
+ citation_context_embedding = embed_contexts(
348
+ citations[
349
+ (citations["known_tokens_fraction"] >= 0.7) &
350
+ (~citations["pruned_contexts"].isna())
351
+ ]["pruned_contexts"].to_list(),
352
+ model,
353
+ tokenizer,
354
+ ).detach().numpy()
355
+ # Save
356
+ np.savez(
357
+ f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz',
358
+ original_abstract=paper_abstract_embedding,
359
+ citation_context=citation_context_embedding,
360
+ citation_abstract=citation_abstract_embedding,
361
+ )
362
+
363
+ # Load
364
+ data = np.load(f'{("retractions" if retracted else "reference")}_embeddings_scibert.npz')
365
+ print(data["original_abstract"].shape) # (768,) or (1536,) depending on the model
366
+ print(data["citation_context"].shape) # (768,) or (1536,) depending on the model
367
+ # print(data["citation_context_base"].shape) # not saved by main_scibert, would raise a KeyError
368
+ print(data["citation_abstract"].shape) # (768,) or (1536,) depending on the model
369
+
370
+
371
+ if __name__=="__main__":
372
+ import sys
373
+ retracted=(sys.argv[1] == "retracted")
374
+ if retracted:
375
+ print("Running embedding pipeline for retractions.")
376
+ else:
377
+ print("Running embedding pipeline for reference.")
378
+
379
+ df = pd.read_parquet(f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip')
380
+
381
+ # add_pruned_contexts_to_df(df, f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip')
382
+
383
+ main_scibert(retracted, overwrite=False)
384
+ # main_specter(retracted, overwrite=False)
385
+
386
+ embeddings = np.load(f'{("retractions" if retracted else "reference")}_embeddings_specter.npz')
387
+ print(embeddings["original_abstract"].shape) # (768,) or (1536,) depending on the model
388
+ print(embeddings["citation_context"].shape) # (768,) or (1536,) depending on the model
389
+ print(embeddings["citation_abstract"].shape) # (768,) or (1536,) depending on the model
390
+ # original_dois = pd.read_parquet(f'{("25_02_05_retractions_abstracts_cleaned" if retracted else "24_11_30_reference_articles")}.gzip', columns = ["OriginalPaperDOI"])["OriginalPaperDOI"].tolist()
391
+ original_dois = pd.read_parquet(f'{("25_02_05_retractions_abstracts_cleaned" if retracted else "24_12_31_reference_articles_most_cited")}.gzip', columns = ["OriginalPaperDOI"])["OriginalPaperDOI"].tolist()
392
+
393
+ # df = add_distances_to_df(
394
+ # df,
395
+ # [doi.replace("https://doi.org/", "") for doi in original_dois],
396
+ # df["citation_id"].unique().tolist(),
397
+ # embeddings,
398
+ # "abstract_abstract_l2_distance"
399
+ # )
400
+
401
+ df = add_distances_to_df(
402
+ df,
403
+ df.index[
404
+ (df["known_tokens_fraction"] >= 0.7) &
405
+ (~df["pruned_contexts"].isna())
406
+ ].tolist(),
407
+ df["citation_id"].unique().tolist(),
408
+ embeddings,
409
+ "context_abstract_l2_distance"
410
+ )
411
+
412
+ df.to_parquet(f'{("retraction" if retracted else "reference_mc")}_citation_mentions.gzip', compression='gzip')
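
As a quick illustration of the OpenAlex handling above: restore_inverted_abstract rebuilds a plain-text abstract from an abstract_inverted_index mapping of words to positions. The toy input below is invented.

    from extract_embeddings import restore_inverted_abstract

    # Toy OpenAlex-style inverted index: word -> list of positions in the abstract.
    inverted = {"We": [0], "introduce": [1], "BERT": [2, 5], "and": [3], "evaluate": [4]}
    print(restore_inverted_abstract(inverted))
    # -> "We introduce BERT and evaluate BERT"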
extract_mentions.py ADDED
@@ -0,0 +1,200 @@
1
+ import json
2
+ import re
3
+ import sys
4
+ import numpy as np
5
+ from pathlib import Path
6
+ from typing import NamedTuple
7
+
8
+ import pandas as pd
9
+
10
+ TITLE_NORMALIZE = [
11
+ "alpha", "beta", "gamma", "delta", "epsilon", "kappa", "lambda"
12
+ ]
13
+
14
+ class Context(NamedTuple):
15
+ left: str
16
+ right: str
17
+
18
+ split_right_pattern = re.compile(r"(?:#+)|(?:\[(?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\])")
19
+ split_left_pattern = re.compile(r"(?:#+)|(?:\](?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\[)")
20
+ ieee_style_pattern = re.compile(r"(?>\[(?>[^A-Za-z0-9\[\]\.]*(\d{1,3})[^A-Za-z0-9\[\]\.]*)+\][^A-Za-z0-9\[\]]*)+")
21
+ auth_year_style_pattern = re.compile(r"(?>\((?>[^()]+?[,\s][1-2][0-9]{3})+\)[^()A-Za-z0-9]*)+")
22
+
23
+ def filter_page_breaks(content):
24
+ find_page_breaks = re.compile(
25
+ r"""
26
+ \n*
27
+ \n # empty line
28
+ -----\n # 5 dashes
29
+ \n # empty line
30
+ (?:.*?\n)? # Capture the footer/header
31
+ \n*
32
+ """,
33
+ re.VERBOSE | re.M
34
+ )
35
+ return re.sub(find_page_breaks, " ", content)
36
+
37
+ def get_author_title_year_patterns_from_citation(cite):
38
+ title = cite['title']
39
+ for w in TITLE_NORMALIZE:
40
+ title = title.replace(w, "$")
41
+ title = re.sub(r"[^a-zA-Z0-9]+", "_", title) # collapse runs of non-alphanumeric characters into underscores
42
+ # title = title.replace(" ", r"[^a-zA-Z0-9]+?")
43
+ year = str(cite['publication_year'])
44
+ try:
45
+ first_author = cite['authorships'][0]['author']['display_name']
46
+ ## only lastname
47
+ first_author = re.sub(r"[^a-zA-Z0-9]+", "_", first_author.split(" ")[-1])
48
+ except (IndexError, TypeError):
49
+ first_author = None
50
+ return first_author, title, year
51
+
52
+ def extract_potential_citations(paper):
53
+ ieee_style = ieee_style_pattern.finditer(paper)
54
+ ieee_style_buckets = []
55
+ for match in ieee_style:
56
+ possible = set([int(n) for n in re.findall(r"\d{1,3}", match.group(0))]) # group(0): the whole bracket group, not just the last captured number
57
+ ## expand ranges
58
+ ranges = re.findall(r"(\d{1,3})[–——-]+(\d{1,3})", match.group(0))
59
+ if len(ranges)>0:
60
+ for start, end in ranges:
61
+ possible |= set(range(int(start),int(end)+1))
62
+ ieee_style_buckets.append((match.start(), match.end(), match.group(0), possible))
63
+
64
+ auth_year_style = auth_year_style_pattern.finditer(paper)
65
+ auth_year_style_buckets = []
66
+ for match in auth_year_style:
67
+ possible = set(re.split(r"(\b[1-2]\d{3}\b)\W*", match.group(0)))
68
+ auth_year_style_buckets.append((match.start(), match.end(), match.group(0), possible))
69
+
70
+ return ieee_style_buckets, auth_year_style_buckets
71
+
72
+ def find_reference_in_reference_section(paper, cite, references):
73
+ """
74
+ Searches for reference section entry matching citation paper title, year, first author, and journal in a markdown file
75
+ using fuzzy matching.
76
+ """
77
+ patterns = get_author_title_year_patterns_from_citation(cite)
78
+ if any([p is None for p in patterns]):
79
+ return paper, None
80
+ author, title, year = patterns
81
+ patterns = [author, title, year]
82
+ # Try finding all the patterns between two enumeration items starting from the back of the string
83
+ # for i,s in enumerate(references):
84
+ for full_ref, enum, ref_body in references:
85
+ for w in TITLE_NORMALIZE:
86
+ normalized = ref_body.replace(w, "$")
87
+ fuzzy_ref = re.sub(r"[^a-zA-Z0-9]+", "_", normalized)
88
+ if all([re.search(pattern, fuzzy_ref, re.IGNORECASE | re.MULTILINE | re.DOTALL) for pattern in patterns]):
89
+ match = (cite["id"], author, title, year, enum, ref_body)
90
+ # remove the reference from the paper so it can't be matched again
91
+ paper = paper.replace(full_ref, "")
92
+ return paper, match
93
+
94
+ return paper, (cite["id"], author, title, year, None, None)
95
+
96
+
97
+ def find_mentions_by_pointer(doi, ref, paper, ieee_possible):
98
+ """
99
+ Match the links mentioning that reference in the text and extract context.
100
+ """
101
+ mentions = []
102
+ (oa_id, _, _, _, ref_num, r) = ref
103
+ for start, end, match, possible_numbers in ieee_possible:
104
+ if int(ref_num) in possible_numbers:
105
+ context = create_context(start, end, paper)
106
+ mentions.append((doi, oa_id, ref_num, r, start, end, context.left, match, context.right))
107
+ return mentions
108
+
109
+ def find_mentions_direct(doi, ref, paper, auth_style_possible):
110
+ """
111
+ Match the links mentioning that reference in the text and extract context.
112
+ """
113
+ mentions = []
114
+ (oa_id, a, _, y, _, _) = ref
115
+ for start, end, match, possibilities in auth_style_possible:
116
+ for possibility in possibilities:
117
+ if y in possibility and a in possibility:
118
+ context = create_context(start, end, paper)
119
+ mentions.append((doi, oa_id, None, None, start, end, context.left, match, context.right))
120
+ return mentions
121
+
122
+ def create_context(start, end, paper):
123
+ left = paper[max(0, start - 500):start]
124
+ right = paper[end:end + min(len(paper) - end, 500)]
125
+ ## only take context until a next section begins or another citation appears
126
+ splitleft = split_left_pattern.search(left[::-1])
127
+ if splitleft is not None:
128
+ left = left[len(left) - splitleft.start():]
129
+ splitright = split_right_pattern.search(right)
130
+ if splitright is not None:
131
+ right = right[:splitright.start()]
132
+ return Context(left=left, right=right)
133
+
134
+ def restore_inverted_abstract(inverted_abstr):
135
+ all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
136
+ if len(all_indexes) > 0:
137
+ length = max(all_indexes) + 1
138
+ else:
139
+ return None
140
+ abstract_words = ["" for _ in range(length)]
141
+ for word, indexes in inverted_abstr.items():
142
+ for index in indexes:
143
+ abstract_words[index] = word
144
+ return " ".join(abstract_words)
145
+
146
+ def extract_title_abstract(oa_object):
147
+ abstract = oa_object["abstract_inverted_index"]
148
+ title_abstract_obj = {
149
+ "title": oa_object["title"],
150
+ "abstract": (None if abstract is None else restore_inverted_abstract(abstract))
151
+ }
152
+ return title_abstract_obj
153
+
154
+ def extract_citation_contexts(cites, paper):
155
+ counter=0
156
+ extracted_citations = []
157
+ references_pattern = re.compile(r'(\n\W*(\d{1,3})\W(.+?)(?=(?:\n\n)|(?:\n\W*\d{1,3}\W)|\Z))', re.VERBOSE | re.I | re.M | re.S)
158
+ for doi in cites:
159
+ # for doi in ["10.1155/2021/4883509"]:
160
+ counter+=1
161
+ paper = filter_page_breaks(paper)
162
+ # print(paper)
163
+ if paper is None:
164
+ continue
165
+ # remove title and authors from beginning of paper
166
+ paper = paper[750:]
167
+ citations = cites[doi]
168
+ # references = re.findall(r'\n\s*(\d+)\.(.*?)(?=(?:\n\s*\d+\.)|\Z)', paper, re.VERBOSE | re.I | re.M | re.S)
169
+ references = references_pattern.findall(paper)
170
+ found = 0
171
+ n_mentions = 0
172
+ has_abstract_title = 0
173
+ in_ref_section_refs = []
174
+ for cite in citations:
175
+ embedding_input = extract_title_abstract(cite)
176
+ if embedding_input["abstract"] is None or embedding_input["title"] is None:
177
+ in_ref_section_refs.append(None)
178
+ continue
179
+ has_abstract_title+=1
180
+ paper, in_ref_section_ref = find_reference_in_reference_section(paper, cite, references)
181
+ in_ref_section_refs.append(in_ref_section_ref)
182
+ ieee, auth_year = extract_potential_citations(paper)
183
+
184
+ for ref in in_ref_section_refs:
185
+ if ref is not None:
186
+ if ref[4] is not None:
187
+ mentions = find_mentions_by_pointer(doi, ref, paper, ieee)
188
+ else: mentions = []
189
+ mentions += find_mentions_direct(doi, ref, paper, auth_year)
190
+ extracted_citations+=mentions
191
+
192
+ if len(mentions)>0:
193
+ n_mentions+=len(mentions)
194
+ found+=1
195
+
196
+ print(f"{counter}/{len(cites)} - {doi}: {len(citations)} citations, {has_abstract_title} embeddable citations and {found} references with {n_mentions} mentions found in markdown.")
197
+
198
+ return pd.DataFrame(extracted_citations, columns = ["cited_in_doi", "citation_id", "reference_marker", "reference_target", "mention_start", "mention_end", "left_context", "mention", "right_context"])
199
+
200
+
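
Note that the citation-marker regexes above rely on atomic groups ((?>...)), which Python's re module only supports from Python 3.11 onwards. A minimal sketch of how extract_potential_citations buckets numeric (IEEE-style) and author-year markers, run on an invented snippet of markdown:

    from extract_mentions import extract_potential_citations

    # Invented text with one numeric citation group and one author-year citation.
    text = "Transformer models dominate sequence tasks [1], [3-5] and are widely used (Devlin et al., 2019)."
    ieee, auth_year = extract_potential_citations(text)
    for start, end, marker, numbers in ieee:
        # marker is the matched span, numbers the candidate reference numbers (ranges expanded)
        print(marker, sorted(numbers))
    for start, end, marker, fragments in auth_year:
        print(marker, fragments)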
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+ gradio
  requests
  pandas
  tqdm