Spaces: Running on Zero

jedick committed · Commit a95b710
Parent(s): 6d38f09

Add app files

Files changed:
- app.py +281 -0
- examples/accurate/log.csv +7 -0
- examples/inaccurate/log.csv +2 -0
- examples/retrieval/log.csv +2 -0
- requirements.txt +6 -0
- retrieval.py +51 -0
app.py
ADDED
@@ -0,0 +1,281 @@
import pandas as pd
import gradio as gr
from retrieval import retrieve_from_pdf

if gr.NO_RELOAD:
    from transformers import pipeline

    # Keep track of the model name in a global variable so the correct model is shown after a page refresh
    # https://github.com/gradio-app/gradio/issues/3173
    MODEL_NAME = "jedick/DeBERTa-v3-base-mnli-fever-anli-scifact-citint"
    pipe = pipeline(
        "text-classification",
        model=MODEL_NAME,
    )


def query_model(claim, evidence):
    """
    Get prediction for a pair of claim and evidence
    """
    prediction = {
        # Send a dictionary containing {"text", "text_pair"} keys; use top_k=3 to get results for all classes
        # https://huggingface.co/docs/transformers/v4.51.3/en/main_classes/pipelines#transformers.TextClassificationPipeline.__call__.inputs
        # Put evidence before claim
        # https://github.com/jedick/ML-capstone-project
        # Output {label: confidence} dictionary format as expected by gr.Label()
        # https://github.com/gradio-app/gradio/issues/11170
        d["label"]: d["score"]
        for d in pipe({"text": evidence, "text_pair": claim}, top_k=3)
    }
    # Return two instances of the prediction to send to different Gradio components
    return prediction, prediction


def query_model_for_examples(claim, evidence):
    """
    A duplicate of the previous function, used to keep the API names clean
    """
    prediction = {
        d["label"]: d["score"]
        for d in pipe({"text": evidence, "text_pair": claim}, top_k=3)
    }
    return prediction, prediction


# Function to select the model
def select_model(model_name):
    global pipe, MODEL_NAME
    MODEL_NAME = model_name
    pipe = pipeline(
        "text-classification",
        model=MODEL_NAME,
    )


def prediction_to_df(prediction=None):
    """
    Convert prediction text to DataFrame for barplot
    """
    if prediction is None or prediction == "":
        # Show an empty plot for app initialization or auto-reload
        prediction = {"SUPPORT": 0, "NEI": 0, "REFUTE": 0}
    elif "Model" in prediction:
        # Show full-height bars when the model is changed
        prediction = {"SUPPORT": 1, "NEI": 1, "REFUTE": 1}
    else:
        # Convert predictions text to dictionary
        prediction = eval(prediction)
        # Rename dictionary keys to use consistent labels across models
        prediction = {
            ("SUPPORT" if k == "entailment" else k): v for k, v in prediction.items()
        }
        prediction = {
            ("NEI" if k == "neutral" else k): v for k, v in prediction.items()
        }
        prediction = {
            ("REFUTE" if k == "contradiction" else k): v for k, v in prediction.items()
        }
        # Use custom order for labels (pipe() returns labels in descending order of softmax score)
        labels = ["SUPPORT", "NEI", "REFUTE"]
        prediction = {k: prediction[k] for k in labels}
    # Convert dictionary to DataFrame with one column (Probability)
    df = pd.DataFrame.from_dict(prediction, orient="index", columns=["Probability"])
    # Move the index to the Class column
    return df.reset_index(names="Class")


def change_visualization(choice):
    if choice == "barplot":
        barplot = gr.update(visible=True)
        label = gr.update(visible=False)
    elif choice == "label":
        barplot = gr.update(visible=False)
        label = gr.update(visible=True)
    return barplot, label


# Set up theme without background image
my_theme = gr.Theme.from_hub("NoCrypt/miku")
my_theme.set(body_background_fill="#FFFFFF", body_background_fill_dark="#000000")

# Gradio interface setup
with gr.Blocks(theme=my_theme) as demo:

    # Layout
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                gr.Markdown(
                    """
                    # AI4citations
                    ## Scientific citation verification

                    *Press Enter in a textbox or click Submit to run the model.*
                    """
                )
                gr.Markdown(
                    """
                    ### Three ways to use this app

                    1. **Claim verification**: Input a claim and evidence
                    2. **Evidence retrieval**: Input a claim to get evidence from a PDF
                    3. **Claim extraction**: Input text to extract a claim from it
                    """
                )
            # Create dropdown menu to select the model
            dropdown = gr.Dropdown(
                choices=[
                    # TODO: For bert-base-uncased, how can we set num_labels = 2 in HF pipeline?
                    # (num_labels is available in AutoModelForSequenceClassification.from_pretrained)
                    # "bert-base-uncased",
                    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
                    "jedick/DeBERTa-v3-base-mnli-fever-anli-scifact-citint",
                ],
                value=MODEL_NAME,
                label="Model",
            )
            claim = gr.Textbox(
                label="Claim",
                info="aka hypothesis",
                placeholder="Input claim or use Get Claim from Text",
            )
            evidence = gr.TextArea(
                label="Evidence",
                info="aka premise",
                placeholder="Input evidence or use Get Evidence from PDF",
            )
            with gr.Row():
                with gr.Accordion("Get Claim from Text", open=False):
                    text = gr.TextArea(
                        label="Text",
                        placeholder="Under construction!",
                        interactive=False,
                    )
                with gr.Accordion("Get Evidence from PDF", open=False):
                    pdf_file = gr.File(label="Upload PDF", type="filepath")
                    get_evidence = gr.Button(value="Get Evidence")
                    top_k = gr.Slider(
                        1,
                        10,
                        value=5,
                        step=1,
                        interactive=True,
                        label="Top k sentences",
                    )
            submit = gr.Button("Submit")

        with gr.Column(scale=2):
            radio = gr.Radio(["barplot", "label"], value="barplot", label="Results")
            # Keep the prediction textbox hidden
            with gr.Accordion(visible=False):
                prediction = gr.Textbox(label="Prediction")
            barplot = gr.BarPlot(
                prediction_to_df,
                x="Class",
                y="Probability",
                color="Class",
                color_map={"SUPPORT": "green", "NEI": "#888888", "REFUTE": "#FF8888"},
                inputs=prediction,
                y_lim=[0, 1],
            )
            label = gr.Label(visible=False)
            with gr.Accordion("Examples", open=False):
                gr.Markdown(
                    "*Prediction performance with jedick/DeBERTa-v3-base-mnli-fever-anli-scifact-citint:*"
                )
                with gr.Row():
                    gr.Examples(
                        examples="examples/accurate",
                        inputs=[claim, evidence],
                        outputs=[prediction, label],
                        fn=query_model_for_examples,
                        label="Accurate",
                        run_on_click=True,
                        example_labels=pd.read_csv("examples/accurate/log.csv")[
                            "label"
                        ].tolist(),
                    )
                    gr.Examples(
                        examples="examples/inaccurate",
                        inputs=[claim, evidence],
                        outputs=[prediction, label],
                        fn=query_model_for_examples,
                        label="Inaccurate",
                        run_on_click=True,
                        example_labels=pd.read_csv("examples/inaccurate/log.csv")[
                            "label"
                        ].tolist(),
                    )
                    gr.Examples(
                        examples="examples/retrieval",
                        inputs=[pdf_file, claim],
                        outputs=evidence,
                        fn=retrieve_from_pdf,
                        label="Retrieval",
                        run_on_click=False,
                        example_labels=pd.read_csv("examples/retrieval/log.csv")[
                            "label"
                        ].tolist(),
                    )
            gr.Markdown(
                """
                ### Sources
                - ML project: [jedick/ML-capstone-project](https://github.com/jedick/ML-capstone-project)
                - App repository: [jedick/AI4citations](https://github.com/jedick/AI4citations)
                - Fine-tuned model: [jedick/DeBERTa-v3-base-mnli-fever-anli-scifact-citint](https://huggingface.co/jedick/DeBERTa-v3-base-mnli-fever-anli-scifact-citint)
                - Datasets used for fine-tuning
                    - SciFact: [allenai/SciFact](https://github.com/allenai/scifact)
                    - Citation-Integrity (CitInt): [ScienceNLP-Lab/Citation-Integrity](https://github.com/ScienceNLP-Lab/Citation-Integrity)
                - Base model: [MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli](https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli)
                - Evidence retrieval: [xhluca/bm25s](https://github.com/xhluca/bm25s)
                - Gradio theme: [NoCrypt/miku](https://huggingface.co/spaces/NoCrypt/miku)
                """
            )

    # Event listeners

    # Click the submit button or press Enter to submit
    gr.on(
        triggers=[claim.submit, evidence.submit, submit.click],
        fn=query_model,
        inputs=[claim, evidence],
        outputs=[prediction, label],
    )

    # Clear the previous predictions when the model is changed
    gr.on(
        triggers=[dropdown.select],
        fn=lambda: "Model changed! Waiting for updated predictions...",
        outputs=[prediction],
        api_name=False,
    )

    # Update the predictions after changing the model
    dropdown.change(
        fn=select_model,
        inputs=dropdown,
    ).then(
        fn=query_model,
        inputs=[claim, evidence],
        outputs=[prediction, label],
        api_name=False,
    )

    # Get evidence from PDF
    gr.on(
        triggers=[pdf_file.upload, get_evidence.click],
        fn=retrieve_from_pdf,
        inputs=[pdf_file, claim, top_k],
        outputs=evidence,
    )

    # Change visualization
    radio.change(
        fn=change_visualization,
        inputs=radio,
        outputs=[barplot, label],
        api_name=False,
    )

demo.launch()
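For reference (not part of the commit): a minimal sketch of the pipeline call that query_model() wraps, usable outside Gradio. It assumes the transformers package is installed and the fine-tuned checkpoint above can be downloaded from the Hugging Face Hub; the claim and evidence snippet are taken from examples/accurate/log.csv.

# Minimal sketch, not part of the commit: call the classifier the same way
# query_model() does. Assumes `transformers` is installed and the checkpoint
# below is reachable on the Hugging Face Hub.
from transformers import pipeline

pipe = pipeline(
    "text-classification",
    model="jedick/DeBERTa-v3-base-mnli-fever-anli-scifact-citint",
)
claim = "0-dimensional biomaterials lack inductive properties."
evidence = (
    "This review examines the use of nanotechnologies for stem cell tracking, "
    "differentiation, and transplantation."
)
# Evidence goes in "text" and the claim in "text_pair"; top_k=3 returns a
# {label, score} dict for each of the three classes
for d in pipe({"text": evidence, "text_pair": claim}, top_k=3):
    print(d["label"], round(d["score"], 3))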
examples/accurate/log.csv
ADDED
@@ -0,0 +1,7 @@
claim,evidence,label
"Poirot was now back and I was sorry that he would take over what I now considered my own investigation.","Poirot, I exclaimed, with relief, and seizing him by both hands, I dragged him into the room.","REFUTE (MNLI)"
"0-dimensional biomaterials lack inductive properties.","Nanotechnologies are emerging platforms that could be useful in measuring, understanding, and manipulating stem cells. Examples include magnetic nanoparticles and quantum dots for stem cell labeling and in vivo tracking; nanoparticles, carbon nanotubes, and polyplexes for the intracellular delivery of genes/oligonucleotides and protein/peptides; and engineered nanometer-scale scaffolds for stem cell differentiation and transplantation. This review examines the use of nanotechnologies for stem cell tracking, differentiation, and transplantation. We further discuss their utility and the potential concerns regarding their cytotoxicity.","NEI (SciFact)"
"1 in 5 million in UK have abnormal PrP positivity.","OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevalence of 493 per million population (95% confidence interval 282 to 801 per million). The prevalence in those born in 1941-60 (733 per million, 269 to 1596 per million) did not differ significantly from those born between 1961 and 1985 (412 per million, 198 to 758 per million) and was similar in both sexes and across the three broad geographical areas sampled. Genetic testing of the positive specimens for the genotype at PRNP codon 129 revealed a high proportion that were valine homozygous compared with the frequency in the normal population, and in stark contrast with confirmed clinical cases of vCJD, all of which were methionine homozygous at PRNP codon 129. CONCLUSIONS This study corroborates previous studies and suggests a high prevalence of infection with abnormal PrP, indicating vCJD carrier status in the population compared with the 177 vCJD cases to date. These findings have important implications for the management of blood and blood products and for the handling of surgical instruments.","REFUTE (SciFact)"
"32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.","ContextChronic hepatitis C is the leading cause for liver transplantation in the United States. Intravenous drug use, the major risk factor, accounts for approximately 60% of hepatitis C virus transmission. Information from the United Network of Organ Sharing (UNOS) does not address substance use among liver transplantation patients. ObjectiveTo identify addiction-related criteria for admission to the UNOS liver transplantation waiting list and posttransplantation problems experienced by patients who are prescribed maintenance methadone. Design, Setting, and ParticipantsMail survey of all 97 adult US liver transplantation programs (belonging to UNOS) in March 2000 with telephone follow-up conducted in May and June 2000.Main Outcome MeasuresPrograms' acceptance and management of patients with past or present substance use disorder. ResultsOf the 97 programs surveyed, 87 (90%) responded. All accept applicants with a history of alcoholism or other addictions, including heroin dependence. Eighty-eight percent of the responding programs require at least 6 months of abstinence from alcohol; 83% from illicit drugs. Ninety-four percent have addiction treatment requirements. Consultations from substance abuse specialists are obtained by 86%. Patients receiving methadone maintenance are accepted by 56% of the responding programs. Approximately 180 patients receiving methadone maintenance are reported to have undergone liver transplantation. ConclusionsMost liver transplantation programs have established policies for patients with substance use disorders. Opiate-dependent patients receiving opiate replacement therapy seem underrepresented in transplantation programs. Little anecdotal evidence for negative impact of opiate replacement therapy on liver transplantation outcome was found. Policies requiring discontinuation of methadone in 32% of all programs contradict the evidence base for efficacy of long-term replacement therapies and potentially result in relapse of previously stable patients.","SUPPORT (SciFact)"
"Several studies have also shown the association of non-coding RNAs in colorectal carcinogenesis through the stimulation or inhibition of apoptosis, cell proliferation, differentiation, invasion and metastasis","Accumulating evidence indicates that lncRNAs could play a critical role in regulation of cellular processes such as cell growth and apoptosis as well as cancer progression and metastasis. In colon cancer, a recent report indicated that miR-211 promotes cell proliferation, tumor growth and cell migration of HCT-116 cells. Although they are less well characterized compared with small non- coding microRNAs (1–5), increasing evidence suggests that lncRNAs could play a critical role in regulation of diverse cellular processes such as stem cell pluripotency, development, cell growth and apoptosis and cancer metastasis (6–13). For example, miR-211 enhances the proliferation, migration and anchorage-independent colony formation of oral carcinoma cells (35). Alterations in the primary structure, secondary structure and expression levels of lncRNAs as well as their cognate RNA-binding proteins are often associated with human diseases, in particular cancer (36).","SUPPORT (CitInt)"
"This high AMP/ATP ratio activates the phosphorylation of AMPK, a master energy sensor within cell, and then pAMPK inhibits mTOR signaling by activating TSC2 and subsequently inhibiting Rheb","When electron transport function is inhibited, the ATP synthase can function in reverse such that it uses ATP generated by glycolysis to pump protons across the inner mitochondrial membrane, maintaining membrane potential (Appleby et al., 1999). This latter hypothesis has been questioned as cancer cells have the ability to survive on ATP produced exclusively by glycolysis. The ATP synthase inhibitor, Oligomycin A, diminished TMRE fluorescence in Control-HCT 116 p53−/− cells treated with metformin suggesting that in the presence of metformin, intact cells maintain their mitochondrial membrane potential by reversal of the ATP synthase (Figure 4E). Metformin inhibits cellular proliferation and pro- proliferative signaling via complex I inhibition. It is likely that metformin acts upstream of this site, inhibiting complex I activity while also inhibiting ROS generation.","NEI (CitInt)"
examples/inaccurate/log.csv
ADDED
@@ -0,0 +1,2 @@
claim,evidence,label
"In this way, loc285194 functions as a downstream p53 effector that exerts its anti-proliferative role by binding miR-211 in CRC and miR-23b in GC","We present evidence that loc285194 is a direct target for p53 and functions as a tumor suppressor in part through negative regulation of miR-211. Together, these results suggest that loc285194 is a p53-regulated tumor suppressor, which acts in part through repression of miR-211. Therefore, loc285194 suppresses tumor cell growth not only in vitro but also in vivo, further suggesting that loc285194 is a p53 downstream effector, functioning as a tumor suppressor. Finally, we demonstrate that loc285194 negatively regulates miR-211, which may in part account for loc285194-mediated cell growth inhibition. Together, these results suggest that both loc285194 and miR-211 are associated with the RISC complex through which loc285194 is able to reduce the miR-211 level and vice versa.","REFUTE (CitInt)"
examples/retrieval/log.csv
ADDED
@@ -0,0 +1,2 @@
pdf_file,claim,top_k,label
https://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0030197&type=printable,"Falsified artemisinin family drugs with no active ingredient can be life-threatening.",5,SciFact
requirements.txt
ADDED
@@ -0,0 +1,6 @@
pandas
gradio
pymupdf
unidecode
nltk
bm25s
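One possible setup note (an assumption, not stated in the commit): sent_tokenize, used in retrieval.py below, also needs the NLTK "punkt" tokenizer data, which the nltk package does not ship by itself. A one-time download sketch:

# One-time download of the tokenizer data used by sent_tokenize in retrieval.py.
# Assumption: the Space does not already provide this data by other means.
import nltk

nltk.download("punkt")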
retrieval.py
ADDED
@@ -0,0 +1,51 @@
import re
import fitz  # pip install pymupdf
from unidecode import unidecode
from nltk.tokenize import sent_tokenize
import bm25s


def retrieve_from_pdf(pdf_file, query, k=10):

    # Get PDF file as binary
    with open(pdf_file, mode="rb") as f:
        pdf_file_bytes = f.read()

    # Extract text from the PDF
    pdf_doc = fitz.open(stream=pdf_file_bytes, filetype="pdf")
    pdf_text = ""
    for page_num in range(pdf_doc.page_count):
        page = pdf_doc.load_page(page_num)
        pdf_text += page.get_text("text")

    # Clean text
    # Example: pdf_text   = 'In §3.1, we find\nthat dis-\ntractor abstracts.'
    #          clean_text = 'In SS3.1, we find that distractor abstracts.'
    # Remove hyphens at end of lines
    clean_text = re.sub("-\n", "", pdf_text)
    # Replace remaining newline characters with space
    clean_text = re.sub("\n", " ", clean_text)
    # Replace unicode with ascii
    clean_text = unidecode(clean_text)

    # Parse text into sentences to build the corpus
    corpus = sent_tokenize(clean_text)
    # Tokenize the corpus
    corpus_tokens = bm25s.tokenize(corpus, stopwords="en")
    # Initialize the BM25 model
    retriever = bm25s.BM25()
    retriever.index(corpus_tokens, show_progress=False)
    # Tokenize the query
    query_tokens = bm25s.tokenize(query)

    # Get top-k results
    # Use int(k) in case we get a str value (as in the retrieval example)
    results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=int(k))
    ## Print results
    # for i in range(results.shape[1]):
    #     doc, score = results[0, i], scores[0, i]
    #     print(f"Rank {i+1} (score: {score:.2f}): {doc}")

    # Join sentences and return results
    results = " ".join(results[0])
    return results
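A minimal usage sketch for retrieve_from_pdf (not part of the commit): the PDF path is a hypothetical local file, and the claim is the one from examples/retrieval/log.csv.

# Minimal sketch, not part of the commit: retrieve the top-5 sentences from a
# local PDF that best match a claim. "paper.pdf" is a hypothetical path.
from retrieval import retrieve_from_pdf

evidence = retrieve_from_pdf(
    "paper.pdf",
    "Falsified artemisinin family drugs with no active ingredient can be life-threatening.",
    k=5,
)
print(evidence)  # space-joined top-k sentences, ready to paste into the Evidence box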