Spaces:
Sleeping
Sleeping
use paragraphs instead of sentences
Browse files- document_qa/document_qa_engine.py +1 -1
- document_qa/grobid_processors.py +86 -37
- requirements.txt +1 -1
- streamlit_app.py +4 -3
- tests/__init__.py +0 -0
- tests/conftest.py +37 -0
- tests/resources/2312.07559.paragraphs.tei.xml +0 -0
- tests/resources/2312.07559.sentences.tei.xml +0 -0
- tests/test_grobid_processors.py +20 -0
document_qa/document_qa_engine.py
CHANGED
|
@@ -56,7 +56,7 @@ class DocumentQAEngine:
|
|
| 56 |
grobid_client = GrobidClient(
|
| 57 |
grobid_server=self.grobid_url,
|
| 58 |
batch_size=1000,
|
| 59 |
-
coordinates=["
|
| 60 |
sleep_time=5,
|
| 61 |
timeout=60,
|
| 62 |
check_server=True
|
|
|
|
| 56 |
grobid_client = GrobidClient(
|
| 57 |
grobid_server=self.grobid_url,
|
| 58 |
batch_size=1000,
|
| 59 |
+
coordinates=["p"],
|
| 60 |
sleep_time=5,
|
| 61 |
timeout=60,
|
| 62 |
check_server=True
|
document_qa/grobid_processors.py
CHANGED
|
@@ -136,7 +136,7 @@ class GrobidProcessor(BaseProcessor):
|
|
| 136 |
input_path,
|
| 137 |
consolidate_header=True,
|
| 138 |
consolidate_citations=False,
|
| 139 |
-
segment_sentences=
|
| 140 |
tei_coordinates=coordinates,
|
| 141 |
include_raw_citations=False,
|
| 142 |
include_raw_affiliations=False,
|
|
@@ -188,7 +188,7 @@ class GrobidProcessor(BaseProcessor):
|
|
| 188 |
# "passage_id": "title0"
|
| 189 |
# })
|
| 190 |
|
| 191 |
-
passage_type = "
|
| 192 |
|
| 193 |
if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
|
| 194 |
passages.append({
|
|
@@ -201,42 +201,74 @@ class GrobidProcessor(BaseProcessor):
|
|
| 201 |
})
|
| 202 |
|
| 203 |
soup = BeautifulSoup(text, 'xml')
|
| 204 |
-
text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
text_blocks_figures = get_children_figures(soup, verbose=False)
|
| 224 |
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
return output_data
|
| 242 |
|
|
@@ -532,6 +564,21 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
|
|
| 532 |
def extract_materials(self, text):
|
| 533 |
return self.gmp.extract_materials(text)
|
| 534 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
@staticmethod
|
| 536 |
def prune_overlapping_annotations(entities: list) -> list:
|
| 537 |
# Sorting by offsets
|
|
@@ -742,7 +789,8 @@ def get_children_body(soup: object, use_paragraphs: object = True, verbose: obje
|
|
| 742 |
child_name = "p" if use_paragraphs else "s"
|
| 743 |
for child in soup.TEI.children:
|
| 744 |
if child.name == 'text':
|
| 745 |
-
children.extend(
|
|
|
|
| 746 |
|
| 747 |
if verbose:
|
| 748 |
print(str(children))
|
|
@@ -755,7 +803,8 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: o
|
|
| 755 |
child_name = "p" if use_paragraphs else "s"
|
| 756 |
for child in soup.TEI.children:
|
| 757 |
if child.name == 'text':
|
| 758 |
-
children.extend(
|
|
|
|
| 759 |
|
| 760 |
if verbose:
|
| 761 |
print(str(children))
|
|
|
|
| 136 |
input_path,
|
| 137 |
consolidate_header=True,
|
| 138 |
consolidate_citations=False,
|
| 139 |
+
segment_sentences=False,
|
| 140 |
tei_coordinates=coordinates,
|
| 141 |
include_raw_citations=False,
|
| 142 |
include_raw_affiliations=False,
|
|
|
|
| 188 |
# "passage_id": "title0"
|
| 189 |
# })
|
| 190 |
|
| 191 |
+
passage_type = "paragraph"
|
| 192 |
|
| 193 |
if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
|
| 194 |
passages.append({
|
|
|
|
| 201 |
})
|
| 202 |
|
| 203 |
soup = BeautifulSoup(text, 'xml')
|
| 204 |
+
text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=True)
|
| 205 |
+
|
| 206 |
+
use_paragraphs = True
|
| 207 |
+
if not use_paragraphs:
|
| 208 |
+
passages.extend([
|
| 209 |
+
{
|
| 210 |
+
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
|
| 211 |
+
text.parent.name != "ref" or (
|
| 212 |
+
text.parent.name == "ref" and text.parent.attrs[
|
| 213 |
+
'type'] != 'bibr'))),
|
| 214 |
+
"type": passage_type,
|
| 215 |
+
"section": "<body>",
|
| 216 |
+
"subSection": "<paragraph>",
|
| 217 |
+
"passage_id": str(paragraph_id),
|
| 218 |
+
"coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
|
| 219 |
+
}
|
| 220 |
+
for paragraph_id, paragraph in enumerate(text_blocks_body) for
|
| 221 |
+
sentence_id, sentence in enumerate(paragraph)
|
| 222 |
+
])
|
| 223 |
+
else:
|
| 224 |
+
passages.extend([
|
| 225 |
+
{
|
| 226 |
+
"text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
|
| 227 |
+
text.parent.name != "ref" or (
|
| 228 |
+
text.parent.name == "ref" and text.parent.attrs[
|
| 229 |
+
'type'] != 'bibr'))),
|
| 230 |
+
"type": passage_type,
|
| 231 |
+
"section": "<body>",
|
| 232 |
+
"subSection": "<paragraph>",
|
| 233 |
+
"passage_id": str(paragraph_id),
|
| 234 |
+
"coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
|
| 235 |
+
}
|
| 236 |
+
for paragraph_id, paragraph in enumerate(text_blocks_body)
|
| 237 |
+
])
|
| 238 |
|
| 239 |
text_blocks_figures = get_children_figures(soup, verbose=False)
|
| 240 |
|
| 241 |
+
if not use_paragraphs:
|
| 242 |
+
passages.extend([
|
| 243 |
+
{
|
| 244 |
+
"text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
|
| 245 |
+
text.parent.name != "ref" or (
|
| 246 |
+
text.parent.name == "ref" and text.parent.attrs[
|
| 247 |
+
'type'] != 'bibr'))),
|
| 248 |
+
"type": passage_type,
|
| 249 |
+
"section": "<body>",
|
| 250 |
+
"subSection": "<figure>",
|
| 251 |
+
"passage_id": str(paragraph_id) + str(sentence_id),
|
| 252 |
+
"coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
|
| 253 |
+
}
|
| 254 |
+
for paragraph_id, paragraph in enumerate(text_blocks_figures) for
|
| 255 |
+
sentence_id, sentence in enumerate(paragraph)
|
| 256 |
+
])
|
| 257 |
+
else:
|
| 258 |
+
passages.extend([
|
| 259 |
+
{
|
| 260 |
+
"text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
|
| 261 |
+
text.parent.name != "ref" or (
|
| 262 |
+
text.parent.name == "ref" and text.parent.attrs[
|
| 263 |
+
'type'] != 'bibr'))),
|
| 264 |
+
"type": passage_type,
|
| 265 |
+
"section": "<body>",
|
| 266 |
+
"subSection": "<figure>",
|
| 267 |
+
"passage_id": str(paragraph_id),
|
| 268 |
+
"coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
|
| 269 |
+
}
|
| 270 |
+
for paragraph_id, paragraph in enumerate(text_blocks_figures)
|
| 271 |
+
])
|
| 272 |
|
| 273 |
return output_data
|
| 274 |
|
|
|
|
| 564 |
def extract_materials(self, text):
|
| 565 |
return self.gmp.extract_materials(text)
|
| 566 |
|
| 567 |
+
@staticmethod
|
| 568 |
+
def box_to_dict(box, color=None, type=None):
|
| 569 |
+
|
| 570 |
+
if box is None or box == "" or len(box) < 5:
|
| 571 |
+
return {}
|
| 572 |
+
|
| 573 |
+
item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
|
| 574 |
+
if color is not None:
|
| 575 |
+
item['color'] = color
|
| 576 |
+
|
| 577 |
+
if type:
|
| 578 |
+
item['type'] = type
|
| 579 |
+
|
| 580 |
+
return item
|
| 581 |
+
|
| 582 |
@staticmethod
|
| 583 |
def prune_overlapping_annotations(entities: list) -> list:
|
| 584 |
# Sorting by offsets
|
|
|
|
| 789 |
child_name = "p" if use_paragraphs else "s"
|
| 790 |
for child in soup.TEI.children:
|
| 791 |
if child.name == 'text':
|
| 792 |
+
children.extend(
|
| 793 |
+
[subchild for subchild in child.find_all("body") for subchild in subchild.find_all(child_name)])
|
| 794 |
|
| 795 |
if verbose:
|
| 796 |
print(str(children))
|
|
|
|
| 803 |
child_name = "p" if use_paragraphs else "s"
|
| 804 |
for child in soup.TEI.children:
|
| 805 |
if child.name == 'text':
|
| 806 |
+
children.extend(
|
| 807 |
+
[subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])
|
| 808 |
|
| 809 |
if verbose:
|
| 810 |
print(str(children))
|
requirements.txt
CHANGED
|
@@ -7,7 +7,7 @@ grobid_tei_xml==0.1.3
|
|
| 7 |
tqdm
|
| 8 |
pyyaml==6.0
|
| 9 |
pytest
|
| 10 |
-
streamlit==1.
|
| 11 |
lxml
|
| 12 |
Beautifulsoup4
|
| 13 |
python-dotenv
|
|
|
|
| 7 |
tqdm
|
| 8 |
pyyaml==6.0
|
| 9 |
pytest
|
| 10 |
+
streamlit==1.29.0
|
| 11 |
lxml
|
| 12 |
Beautifulsoup4
|
| 13 |
python-dotenv
|
streamlit_app.py
CHANGED
|
@@ -296,7 +296,7 @@ with st.sidebar:
|
|
| 296 |
mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
|
| 297 |
help="LLM will respond the question, Embedding will show the "
|
| 298 |
"paragraphs relevant to the question in the paper.")
|
| 299 |
-
chunk_size = st.slider("Chunks size", -1, 2000, value
|
| 300 |
help="Size of chunks in which the document is partitioned",
|
| 301 |
disabled=uploaded_file is not None)
|
| 302 |
context_size = st.slider("Context size", 3, 10, value=4,
|
|
@@ -410,8 +410,9 @@ with right_column:
|
|
| 410 |
st.session_state.doc_id,
|
| 411 |
context_size=context_size)
|
| 412 |
annotations = [
|
| 413 |
-
|
| 414 |
-
coordinates for c in coord]
|
|
|
|
| 415 |
gradients = generate_color_gradient(len(annotations))
|
| 416 |
for i, color in enumerate(gradients):
|
| 417 |
annotations[i]['color'] = color
|
|
|
|
| 296 |
mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
|
| 297 |
help="LLM will respond the question, Embedding will show the "
|
| 298 |
"paragraphs relevant to the question in the paper.")
|
| 299 |
+
chunk_size = st.slider("Chunks size", -1, 2000, value=-1,
|
| 300 |
help="Size of chunks in which the document is partitioned",
|
| 301 |
disabled=uploaded_file is not None)
|
| 302 |
context_size = st.slider("Context size", 3, 10, value=4,
|
|
|
|
| 410 |
st.session_state.doc_id,
|
| 411 |
context_size=context_size)
|
| 412 |
annotations = [
|
| 413 |
+
GrobidAggregationProcessor.box_to_dict(coo) for coo in [c.split(",") for coord in
|
| 414 |
+
coordinates for c in coord]
|
| 415 |
+
]
|
| 416 |
gradients = generate_color_gradient(len(annotations))
|
| 417 |
for i, color in enumerate(gradients):
|
| 418 |
annotations[i]['color'] = color
|
tests/__init__.py
ADDED
|
File without changes
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from unittest.mock import MagicMock
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from _pytest._py.path import LocalPath
|
| 7 |
+
|
| 8 |
+
# derived from https://github.com/elifesciences/sciencebeam-trainer-delft/tree/develop/tests
|
| 9 |
+
|
| 10 |
+
LOGGER = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@pytest.fixture(scope='session', autouse=True)
|
| 14 |
+
def setup_logging():
|
| 15 |
+
logging.root.handlers = []
|
| 16 |
+
logging.basicConfig(level='INFO')
|
| 17 |
+
logging.getLogger('tests').setLevel('DEBUG')
|
| 18 |
+
# logging.getLogger('sciencebeam_trainer_delft').setLevel('DEBUG')
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _backport_assert_called(mock: MagicMock):
|
| 22 |
+
assert mock.called
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@pytest.fixture(scope='session', autouse=True)
|
| 26 |
+
def patch_magicmock():
|
| 27 |
+
try:
|
| 28 |
+
MagicMock.assert_called
|
| 29 |
+
except AttributeError:
|
| 30 |
+
MagicMock.assert_called = _backport_assert_called
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@pytest.fixture
|
| 34 |
+
def temp_dir(tmpdir: LocalPath):
|
| 35 |
+
# convert to standard Path
|
| 36 |
+
return Path(str(tmpdir))
|
| 37 |
+
|
tests/resources/2312.07559.paragraphs.tei.xml
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/resources/2312.07559.sentences.tei.xml
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/test_grobid_processors.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
from document_qa.grobid_processors import get_children_body
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def test_get_children_paragraphs():
|
| 6 |
+
with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
|
| 7 |
+
soup = BeautifulSoup(fo, 'xml')
|
| 8 |
+
|
| 9 |
+
children = get_children_body(soup, use_paragraphs=True)
|
| 10 |
+
|
| 11 |
+
assert len(children) == 70
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_get_children_sentences():
|
| 15 |
+
with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
|
| 16 |
+
soup = BeautifulSoup(fo, 'xml')
|
| 17 |
+
|
| 18 |
+
children = get_children_body(soup, use_paragraphs=False)
|
| 19 |
+
|
| 20 |
+
assert len(children) == 327
|