Spaces:
Sleeping
Sleeping
Merge branch 'main' into question-coefficient
Browse files
document_qa/document_qa_engine.py
CHANGED
|
@@ -494,7 +494,7 @@ class DocumentQAEngine:
|
|
| 494 |
print("File", pdf_file_path)
|
| 495 |
filename = Path(pdf_file_path).stem
|
| 496 |
coordinates = True # if chunk_size == -1 else False
|
| 497 |
-
structure = self.grobid_processor.
|
| 498 |
|
| 499 |
biblio = structure['biblio']
|
| 500 |
biblio['filename'] = filename.replace(" ", "_")
|
|
|
|
| 494 |
print("File", pdf_file_path)
|
| 495 |
filename = Path(pdf_file_path).stem
|
| 496 |
coordinates = True # if chunk_size == -1 else False
|
| 497 |
+
structure = self.grobid_processor.process(pdf_file_path, coordinates=coordinates)
|
| 498 |
|
| 499 |
biblio = structure['biblio']
|
| 500 |
biblio['filename'] = filename.replace(" ", "_")
|
document_qa/grobid_processors.py
CHANGED
|
@@ -110,10 +110,10 @@ class GrobidProcessor(BaseProcessor):
|
|
| 110 |
if status != 200:
|
| 111 |
return
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
| 115 |
|
| 116 |
-
return
|
| 117 |
|
| 118 |
def process_single(self, input_file):
|
| 119 |
doc = self.process_structure(input_file)
|
|
@@ -152,8 +152,8 @@ class GrobidProcessor(BaseProcessor):
|
|
| 152 |
"text": f"authors: {biblio['authors']}",
|
| 153 |
"type": passage_type,
|
| 154 |
"section": "<header>",
|
| 155 |
-
"subSection": "<
|
| 156 |
-
"passage_id": "
|
| 157 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
| 158 |
blocks_header['authors']])
|
| 159 |
})
|
|
@@ -258,7 +258,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
|
|
| 258 |
def __init__(self, grobid_quantities_client):
|
| 259 |
self.grobid_quantities_client = grobid_quantities_client
|
| 260 |
|
| 261 |
-
def
|
| 262 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
| 263 |
|
| 264 |
if status != 200:
|
|
@@ -430,7 +430,7 @@ class GrobidMaterialsProcessor(BaseProcessor):
|
|
| 430 |
def __init__(self, grobid_superconductors_client):
|
| 431 |
self.grobid_superconductors_client = grobid_superconductors_client
|
| 432 |
|
| 433 |
-
def
|
| 434 |
preprocessed_text = text.strip()
|
| 435 |
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
| 436 |
"processText_disable_linking")
|
|
@@ -534,22 +534,21 @@ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProce
|
|
| 534 |
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
| 535 |
|
| 536 |
def process_single_text(self, text):
|
| 537 |
-
extracted_quantities_spans = self.
|
| 538 |
-
extracted_materials_spans = self.
|
| 539 |
all_entities = extracted_quantities_spans + extracted_materials_spans
|
| 540 |
entities = self.prune_overlapping_annotations(all_entities)
|
| 541 |
return entities
|
| 542 |
|
| 543 |
-
def
|
| 544 |
if self.gqp:
|
| 545 |
-
return self.gqp.
|
| 546 |
else:
|
| 547 |
return []
|
| 548 |
|
| 549 |
-
|
| 550 |
-
def extract_materials(self, text):
|
| 551 |
if self.gmp:
|
| 552 |
-
return self.gmp.
|
| 553 |
else:
|
| 554 |
return []
|
| 555 |
|
|
@@ -688,8 +687,8 @@ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProce
|
|
| 688 |
|
| 689 |
|
| 690 |
class XmlProcessor(BaseProcessor):
|
| 691 |
-
def __init__(self
|
| 692 |
-
super().__init__(
|
| 693 |
|
| 694 |
def process_structure(self, input_file):
|
| 695 |
text = ""
|
|
@@ -701,16 +700,16 @@ class XmlProcessor(BaseProcessor):
|
|
| 701 |
|
| 702 |
return output_data
|
| 703 |
|
| 704 |
-
def process_single(self, input_file):
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
|
| 713 |
-
def
|
| 714 |
output_data = OrderedDict()
|
| 715 |
soup = BeautifulSoup(text, 'xml')
|
| 716 |
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|
|
|
|
| 110 |
if status != 200:
|
| 111 |
return
|
| 112 |
|
| 113 |
+
document_object = self.parse_grobid_xml(text, coordinates=coordinates)
|
| 114 |
+
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
|
| 115 |
|
| 116 |
+
return document_object
|
| 117 |
|
| 118 |
def process_single(self, input_file):
|
| 119 |
doc = self.process_structure(input_file)
|
|
|
|
| 152 |
"text": f"authors: {biblio['authors']}",
|
| 153 |
"type": passage_type,
|
| 154 |
"section": "<header>",
|
| 155 |
+
"subSection": "<authors>",
|
| 156 |
+
"passage_id": "hauthors",
|
| 157 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
| 158 |
blocks_header['authors']])
|
| 159 |
})
|
|
|
|
| 258 |
def __init__(self, grobid_quantities_client):
|
| 259 |
self.grobid_quantities_client = grobid_quantities_client
|
| 260 |
|
| 261 |
+
def process(self, text) -> list:
|
| 262 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
| 263 |
|
| 264 |
if status != 200:
|
|
|
|
| 430 |
def __init__(self, grobid_superconductors_client):
|
| 431 |
self.grobid_superconductors_client = grobid_superconductors_client
|
| 432 |
|
| 433 |
+
def process(self, text):
|
| 434 |
preprocessed_text = text.strip()
|
| 435 |
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
| 436 |
"processText_disable_linking")
|
|
|
|
| 534 |
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
| 535 |
|
| 536 |
def process_single_text(self, text):
|
| 537 |
+
extracted_quantities_spans = self.process_properties(text)
|
| 538 |
+
extracted_materials_spans = self.process_materials(text)
|
| 539 |
all_entities = extracted_quantities_spans + extracted_materials_spans
|
| 540 |
entities = self.prune_overlapping_annotations(all_entities)
|
| 541 |
return entities
|
| 542 |
|
| 543 |
+
def process_properties(self, text):
|
| 544 |
if self.gqp:
|
| 545 |
+
return self.gqp.process(text)
|
| 546 |
else:
|
| 547 |
return []
|
| 548 |
|
| 549 |
+
def process_materials(self, text):
|
|
|
|
| 550 |
if self.gmp:
|
| 551 |
+
return self.gmp.process(text)
|
| 552 |
else:
|
| 553 |
return []
|
| 554 |
|
|
|
|
| 687 |
|
| 688 |
|
| 689 |
class XmlProcessor(BaseProcessor):
|
| 690 |
+
def __init__(self):
|
| 691 |
+
super().__init__()
|
| 692 |
|
| 693 |
def process_structure(self, input_file):
|
| 694 |
text = ""
|
|
|
|
| 700 |
|
| 701 |
return output_data
|
| 702 |
|
| 703 |
+
# def process_single(self, input_file):
|
| 704 |
+
# doc = self.process_structure(input_file)
|
| 705 |
+
#
|
| 706 |
+
# for paragraph in doc['passages']:
|
| 707 |
+
# entities = self.process_single_text(paragraph['text'])
|
| 708 |
+
# paragraph['spans'] = entities
|
| 709 |
+
#
|
| 710 |
+
# return doc
|
| 711 |
|
| 712 |
+
def process(self, text):
|
| 713 |
output_data = OrderedDict()
|
| 714 |
soup = BeautifulSoup(text, 'xml')
|
| 715 |
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|