include only bibliographic data that potentially have coordinates
- document_qa/document_qa_engine.py +4 -16
- document_qa/grobid_processors.py +60 -31
- tests/test_grobid_processors.py +32 -6
document_qa/document_qa_engine.py
CHANGED

@@ -57,7 +57,7 @@ class DocumentQAEngine:
         grobid_client = GrobidClient(
             grobid_server=self.grobid_url,
             batch_size=1000,
-            coordinates=["p"],
+            coordinates=["p", "title", "persName"],
             sleep_time=5,
             timeout=60,
             check_server=True

@@ -189,7 +189,7 @@ class DocumentQAEngine:
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents

-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, …
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
         """
         Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
         """

@@ -233,25 +233,13 @@ class DocumentQAEngine:
         metadatas = [biblio for _ in range(len(texts))]
         ids = [id for id, t in enumerate(texts)]

-        if "biblio" in include:
-            biblio_metadata = copy.copy(biblio)
-            biblio_metadata['type'] = "biblio"
-            biblio_metadata['section'] = "header"
-            for key in ['title', 'authors', 'publication_year']:
-                if key in biblio_metadata:
-                    texts.append("{}: {}".format(key, biblio_metadata[key]))
-                    metadatas.append(biblio_metadata)
-                    ids.append(key)
-
         return texts, metadatas, ids

-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, …
-        include = ["biblio"] if include_biblio else []
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
         texts, metadata, ids = self.get_text_from_document(
             pdf_path,
             chunk_size=chunk_size,
-            perc_overlap=perc_overlap,
-            include=include)
+            perc_overlap=perc_overlap)
         if doc_id:
             hash = doc_id
         else:
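The widened coordinates list above asks GROBID to attach layout coordinates not only to paragraphs ("p") but also to the document title and author names ("persName"), which the header passages added in grobid_processors.py below rely on. A minimal sketch (not part of the commit; the TEI fragment and coordinate values are illustrative only) of how those coordinates surface in the TEI and how they can be read back with BeautifulSoup:

# Illustrative sketch: GROBID annotates requested TEI elements with a `coords`
# attribute; each entry is typically "page,x,y,width,height", with ";" between boxes.
from bs4 import BeautifulSoup

tei = """
<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title coords="1,72.0,90.0,450.0,14.0">A sample title</title>
      </titleStmt>
    </fileDesc>
  </teiHeader>
</TEI>
"""

soup = BeautifulSoup(tei, 'xml')
title = soup.teiHeader.fileDesc.title
# Elements without the attribute fall back to "" in the processor, same as here.
print(title['coords'] if title.has_attr('coords') else "")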
document_qa/grobid_processors.py
CHANGED

@@ -176,32 +176,48 @@ class GrobidProcessor(BaseProcessor):
             pass

         output_data['biblio'] = biblio
-
         passages = []
         output_data['passages'] = passages
-        # if biblio['title'] is not None and len(biblio['title']) > 0:
-        #     passages.append({
-        #         "text": self.post_process(biblio['title']),
-        #         "type": "paragraph",
-        #         "section": "<header>",
-        #         "subSection": "<title>",
-        #         "passage_id": "title0"
-        #     })
-
         passage_type = "paragraph"

-        if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
-            passages.append({
-                "text": self.post_process(doc_biblio.abstract),
-                "type": passage_type,
-                "section": "<header>",
-                "subSection": "<abstract>",
-                "passage_id": "abstract0",
-                "coordinates": ""
-            })
-
         soup = BeautifulSoup(text, 'xml')
-
+        blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
+
+        passages.append({
+            "text": f"authors: {biblio['authors']}",
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<title>",
+            "passage_id": "htitle",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['authors']])
+        })
+
+        passages.append({
+            "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<title>",
+            "passage_id": "htitle",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['title']])
+        })
+
+        passages.append({
+            "text": self.post_process(
+                ''.join(node.text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
+                        text.parent.name != "ref" or (
+                                text.parent.name == "ref" and text.parent.attrs['type'] != 'bibr'))),
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<abstract>",
+            "passage_id": "habstract",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['abstract']])
+        })
+
+        text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)

         use_paragraphs = True
         if not use_paragraphs:

@@ -236,7 +252,7 @@ class GrobidProcessor(BaseProcessor):
             for paragraph_id, paragraph in enumerate(text_blocks_body)
         ])

-        text_blocks_figures = …
+        text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)

         if not use_paragraphs:
             passages.extend([

@@ -784,23 +800,36 @@ def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose…
     return children


-def …
-
-
+def get_xml_nodes_header(soup: object, use_paragraphs: bool = True) -> list:
+    sub_tag = "p" if use_paragraphs else "s"
+
+    header_elements = {
+        "authors": [persNameNode for persNameNode in soup.teiHeader.find_all("persName")],
+        "abstract": [p_in_abstract for abstractNodes in soup.teiHeader.find_all("abstract") for p_in_abstract in
+                     abstractNodes.find_all(sub_tag)],
+        "title": [soup.teiHeader.fileDesc.title]
+    }
+
+    return header_elements
+
+
+def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
+    nodes = []
+    tag_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            …
-            …
+            # nodes.extend([subchild.find_all(tag_name) for subchild in child.find_all("body")])
+            nodes.extend(
+                [subsubchild for subchild in child.find_all("body") for subsubchild in subchild.find_all(tag_name)])

     if verbose:
-        print(str(…
+        print(str(nodes))

-    return …
+    return nodes


-def …
+def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
     children = []
-    child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
             children.extend(
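Taken together, the new helpers split a GROBID TEI document into header, body, and figure nodes. A minimal usage sketch, assuming a TEI file like the one referenced in the tests below (the relative path is borrowed from the test resources):

from bs4 import BeautifulSoup

from document_qa.grobid_processors import (get_xml_nodes_body,
                                           get_xml_nodes_figures,
                                           get_xml_nodes_header)

# Parse a GROBID TEI document produced with coordinates enabled.
with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
    soup = BeautifulSoup(fo, 'xml')

header = get_xml_nodes_header(soup, use_paragraphs=True)  # dict with "title", "authors", "abstract" nodes
body = get_xml_nodes_body(soup, use_paragraphs=True)      # <p> (or <s>) nodes from <body>
figures = get_xml_nodes_figures(soup)                     # figure-related nodes from <body>

# When GROBID was called with the coordinates option, each node may carry a
# "coords" attribute, which the processor joins with ";" per passage.
for node in body[:3]:
    print(node.get('coords', ""), node.text[:60])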
tests/test_grobid_processors.py
CHANGED

@@ -1,20 +1,46 @@
 from bs4 import BeautifulSoup
-from document_qa.grobid_processors import …
+from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header


-def …
+def test_get_xml_nodes_body_paragraphs():
     with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
         soup = BeautifulSoup(fo, 'xml')

-    …
+    nodes = get_xml_nodes_body(soup, use_paragraphs=True)

-    assert len(…
+    assert len(nodes) == 70


-def …
+def test_get_xml_nodes_body_sentences():
     with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
         soup = BeautifulSoup(fo, 'xml')

-    children = …
+    children = get_xml_nodes_body(soup, use_paragraphs=False)

     assert len(children) == 327
+
+
+def test_get_xml_nodes_figures():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_figures(soup)
+
+    assert len(children) == 13
+
+
+def test_get_xml_nodes_header_paragraphs():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_header(soup)
+
+    assert len(children) == 8
+
+
+def test_get_xml_nodes_header_sentences():
+    with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_header(soup, use_paragraphs=False)
+
+    assert len(children) == 15