Spaces:
Runtime error
Runtime error
File size: 7,491 Bytes
d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 57da257 d9b7d2f 5f21add d9b7d2f 5f21add d9b7d2f 5f21add d9b7d2f 57da257 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
from operator import itemgetter
from collections import OrderedDict
from typing import Dict, List, Iterator, Union, Tuple
import re
class TextExtractor:
    """Extract tagged text from a fitz PDF document and group it into slides.

    The methods are meant to be chained:
    ``get_font_info`` -> ``get_font_tags`` -> ``assign_tags`` -> ``get_slides``.
    All of them are static; the class only serves as a namespace.
    """

    # Tag name between the first "<" and the following ">" (e.g. "h1", "p").
    _TAG_PATTERN = re.compile(r'(?<=<)(.*?)(?=>)')
    # Anything that is markup rather than content: "<...>" tags and pipes.
    _MARKUP_PATTERN = re.compile(r'<.*?>|\|')
    # Two or more consecutive pipes separate paragraphs inside one text.
    _PARAGRAPH_BREAK_PATTERN = re.compile(r'\|{2,}')
    # Every character that is neither a word character nor whitespace.
    _PUNCTUATION_PATTERN = re.compile(r"[^\w\s]")
    _MULTI_SPACE_PATTERN = re.compile(' +')

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(
        doc: Iterator, granularity=False
    ) -> Tuple[List[Tuple[str, int]], Dict[str, Dict[str, Union[str, int, float]]]]:
        """
        Count how often each font style occurs in the document.

        Only spans inside blocks whose 'type' is 0 and whose text is not pure
        whitespace are counted.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
                Any iterable of pages exposing ``get_text('dict')`` works.
            granularity (bool, optional): When True the identifier is
                "<size>_<flags>_<font>" and the stored style also records the
                colour; when False the identifier is the size only.
                Defaults to False.

        Raises:
            ValueError: If no span with text was found.

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, dict]]:
                ``font_counts``: (identifier, count) pairs, most frequent first, e.g.
                [('12.0', 266), ('16.020000457763672', 18), ('7.019999980926514', 2)]
                ``styles``: identifier -> style description of the last span
                seen with that identifier.
        """
        styles = {}
        font_counts = {}
        for page in doc:
            for block in page.get_text('dict')['blocks']:
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        font_counts[identifier] = font_counts.get(identifier, 0) + 1

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        # Stable sort: identifiers with equal counts keep first-seen order.
        ranked_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)
        return ranked_counts, styles

    @staticmethod
    def get_font_tags(font_counts, styles) -> Dict[float, str]:
        """
        Map every font size to an element tag.

        The size of the most frequent identifier becomes "<p>". Sizes are then
        ranked from largest to smallest (1-based); larger sizes get
        "<h{rank}>" and smaller sizes get "<s{rank}>", so the rank keeps
        counting across the paragraph size.

        Args:
            font_counts (List[Tuple[str, int]]): (identifier, count) pairs, most
                frequent first, as returned by ``get_font_info``.
            styles (Dict[str, Dict]): Style description per identifier; each
                entry must provide a 'size'.

        Raises:
            IndexError: If ``font_counts`` is empty.

        Returns:
            Dict[float, str]: Font sizes as keys and their tags as values.
            Example: {12.0: '<p>', 16.020000457763672: '<h1>', 13.979999542236328: '<h2>', 7.019999980926514: '<s4>'}
        """
        def size_of(identifier):
            # Granular identifiers ("12.0_0_Arial") cannot be parsed as a
            # float, so the size is taken from the style entry. Parsing the
            # identifier is kept as a fallback for size-only identifiers
            # that have no style entry.
            style = styles.get(identifier)
            if style is not None:
                return style['size']
            return float(identifier)

        p_size = styles[font_counts[0][0]]['size']
        # Largest first, so the rank appended to each tag follows the size order.
        font_sizes = sorted({size_of(identifier) for identifier, _ in font_counts}, reverse=True)

        size_tag = {p_size: "<p>"}
        for rank, size in enumerate(font_sizes, start=1):
            if size > p_size:
                size_tag[size] = f"<h{rank}>"
            elif size < p_size:
                size_tag[size] = f"<s{rank}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag) -> List[str]:
        """
        Scrape headers & paragraphs from the PDF and return texts with element tags.

        Spans are concatenated while their size stays the same inside a block.
        A pipe is added after every line once the current text is non-empty,
        so consecutive pipes mark lines without usable text.

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tag for each font size.

        Raises:
            KeyError: If a span has a size that is missing from ``size_tag``.

        Returns:
            list: Texts with prepended element tags. One entry is produced per
            block with 'type' 0 plus one whenever the size changes, so the
            list may contain empty strings.
            Example: ['<h1>Group Members: |', '<p>1. Stella Shania Mintara | 2. David Samuel ||']
        """
        texts = []
        previous_size = None  # size of the last span that carried text
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Spans made only of punctuation/whitespace are skipped.
                        if not TextExtractor._PUNCTUATION_PATTERN.sub('', span["text"]).strip():
                            continue
                        size = span['size']
                        if previous_size is not None and size != previous_size:
                            # Size changed: flush what was collected and start over.
                            texts.append(block_string)
                            block_string = size_tag[size] + span['text']
                        elif not block_string or all(c == "|" for c in block_string):
                            # First text of this block: open it with its tag.
                            block_string = size_tag[size] + span['text']
                        else:
                            # Same size inside the same block: keep concatenating.
                            block_string += f" {span['text']}"
                        previous_size = size
                    if block_string:
                        block_string += "|"  # end-of-line marker
                # Appended even when empty; get_slides ignores untagged texts.
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts):
        """
        Group tagged texts into slides, starting a new slide at every h1 header.

        Headers are stored as ``(tag, text)`` tuples and paragraphs as
        ``[tag, text]`` lists. A paragraph text is split on runs of two or
        more pipes. A piece starting with a lower-case character is treated as
        the continuation of the previous paragraph of the slide; if the
        previous element is a header (or there is none) it starts a new
        paragraph instead. Texts without a tag and tags other than "h*" and
        "p*" are ignored, and so is everything that precedes the first h1.

        Args:
            texts (List[str]): PDF text with element tags.

        Returns:
            Dict: The text of the PDF separated by the header 1 tags.
            Example: {'Page 1': [('h1', 'Group Members:'),
                                 ['p', '1. Stella Shania Mintara 2. David Samuel']],
                      'Page 2': [('h1', 'Case Problem'),
                                 ['p', 'FreshMart is an established large-scale supermarket']]}
        """
        slides = {}
        section = []
        page = 1
        for text in texts:
            tag_match = TextExtractor._TAG_PATTERN.search(text)
            if tag_match is None:
                continue
            tag = tag_match.group()

            if tag.startswith('h'):
                if tag == 'h1':
                    # A title opens a new slide; later elements are added to
                    # the same list object that is stored in `slides`.
                    section = []
                    slides[f"Page {page}"] = section
                    page += 1
                section.append((tag, TextExtractor._MARKUP_PATTERN.sub('', text).strip()))
            elif tag.startswith('p'):
                for chunk in TextExtractor._PARAGRAPH_BREAK_PATTERN.split(text):
                    paragraph = TextExtractor._MARKUP_PATTERN.sub('', chunk).strip()
                    paragraph = TextExtractor._MULTI_SPACE_PATTERN.sub(' ', paragraph)
                    if not paragraph:
                        continue
                    continues_previous = (
                        paragraph[0].islower()
                        and bool(section)
                        and isinstance(section[-1], list)  # header tuples are immutable
                    )
                    if continues_previous:
                        section[-1][1] += f" {paragraph}"
                    else:
                        section.append([tag, paragraph])
        return slides
|