Spaces:
Runtime error
Runtime error
File size: 5,945 Bytes
d9b7d2f 5f21add d9b7d2f 5f21add d9b7d2f 5f21add d9b7d2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
from operator import itemgetter
from collections import OrderedDict
from typing import Dict, List, Iterator, Union, Tuple
import re
class TextExtractor:
    """Tag the text of a PDF by font size and group it into slides.

    All methods are static. The intended pipeline is::

        font_counts, styles = TextExtractor.get_font_info(doc)
        size_tag = TextExtractor.get_font_tags(font_counts, styles)
        texts = TextExtractor.assign_tags(doc, size_tag)
        slides = TextExtractor.get_slides(texts)

    ``doc`` is iterated more than once, so it must be re-iterable and each
    page must provide ``get_text('dict')`` returning a mapping with a
    ``'blocks'`` list (the docstrings below name ``fitz.Document``).
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
        """
        Collect the font styles used in the document and how often each occurs.

        Only blocks with ``type == 0`` are inspected, and spans whose text is
        empty or whitespace-only are ignored.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
            granularity (bool, optional): When True, a style is identified by
                size, flags and font name (``"size_flags_font"``) and the
                recorded style also carries 'flags' and 'color'. When False,
                a style is identified by size alone. Note that 'color' is
                recorded but is not part of the identifier. Defaults to False.

        Raises:
            ValueError: If no text span (and therefore no font) is found.

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: ``font_counts``, a
            list of ``(identifier, span_count)`` sorted by count, most used
            first, and ``styles``, a mapping from identifier to the style
            attributes of the last span seen with that identifier.
        """
        styles = {}
        counts = {}
        for page in doc:
            for block in page.get_text('dict')['blocks']:
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = f"{span['size']}_{span['flags']}_{span['font']}"
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = f"{span['size']}"
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        counts[identifier] = counts.get(identifier, 0) + 1

        if not counts:
            raise ValueError("Zero discriminating fonts found!")

        # sorted() is stable, so identifiers with equal counts keep first-seen order.
        font_counts = sorted(counts.items(), key=itemgetter(1), reverse=True)
        return font_counts, styles

    @staticmethod
    def get_font_tags(font_counts: List[Tuple[str, int]], styles: Dict[str, Dict]) -> Dict[float, str]:
        """
        Map every font size to an element tag relative to the paragraph size.

        The size of the most frequent style (``font_counts[0]``) is treated as
        the paragraph size and tagged ``<p>``. Larger sizes become ``<hN>`` and
        smaller sizes ``<sN>``, where N is the 1-based rank of the size among
        all distinct sizes sorted from largest to smallest (so the numbering
        of ``<sN>`` continues after the headers and the paragraph size).

        Args:
            font_counts (List[Tuple[str, int]]): ``(identifier, count)`` pairs,
                most frequent first, as returned by ``get_font_info``.
            styles (Dict[str, Dict]): Identifier -> style mapping; each style
                must have a 'size' entry.

        Raises:
            IndexError: If ``font_counts`` is empty.

        Returns:
            Dict[float, str]: Font size -> tag string such as ``"<h1>"``.
        """
        p_size = styles[font_counts[0][0]]['size']

        # Read the size from the style record rather than parsing the identifier:
        # with granularity enabled the identifier is "size_flags_font" and is not
        # a number. Fall back to the identifier itself when no style is recorded.
        sizes = set()
        for identifier, _ in font_counts:
            style = styles.get(identifier)
            sizes.add(float(style['size'] if style is not None else identifier))

        # sorting the font sizes high to low, so that we can append the right integer to each tag
        font_sizes = sorted(sizes, reverse=True)

        size_tag = {p_size: "<p>"}
        for i, size in enumerate(font_sizes):
            if size > p_size:
                size_tag[size] = f"<h{i+1}>"
            elif size < p_size:
                size_tag[size] = f"<s{i+1}>"
        return size_tag

    @staticmethod
    def assign_tags(doc: Iterator, size_tag: Dict[float, str]) -> List[str]:
        """
        Scrapes headers & paragraphs from PDF and return texts with element tags.

        Consecutive spans of the same font size inside one block are joined
        into a single string prefixed with that size's tag. A "|" is appended
        after every line once the current string is non-empty, so pipes mark
        line breaks. A change of font size flushes the current string and
        starts a new one; the current string is also flushed at the end of
        every block. Flushed strings may be empty.

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tags for each size.

        Raises:
            KeyError: If a span uses a size that is missing from ``size_tag``.

        Returns:
            list: Texts with pre-prended element tags
        """
        texts = []
        previous_span = {}
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Skip spans that hold nothing but punctuation/whitespace.
                        if not re.sub(r"[^\w\s]", '', span["text"]).strip():
                            continue
                        if not previous_span:  # First span of the document
                            block_string = size_tag[span['size']] + span['text']
                        elif span['size'] == previous_span['size']:
                            if not block_string or all(c == "|" for c in block_string):
                                # New block: start a fresh tagged string
                                block_string = size_tag[span['size']] + span['text']
                            else:
                                # Same block and size: concatenate
                                block_string += f" {span['text']}"
                        else:
                            # Font size changed: flush what was collected so far
                            texts.append(block_string)
                            block_string = size_tag[span['size']] + span['text']
                        previous_span = span
                    # Mark the end of the line
                    if block_string:
                        block_string += "|"
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts: List[str]) -> Dict[str, List]:
        """
        Group tagged texts into pages, starting a new page at every ``<h1>``.

        Each page is a list of entries: headers are ``(tag, text)`` tuples and
        paragraphs are ``[tag, text]`` lists. A ``<p>`` text is split into
        separate paragraphs wherever two or more consecutive pipes occur; a
        paragraph that starts with a lowercase letter is treated as the
        continuation of the preceding paragraph and merged into it. Texts
        tagged with anything other than ``h*`` or ``p*`` are ignored, and
        anything that appears before the first ``<h1>`` is not part of any page.

        Args:
            texts (List[str]): Tagged strings as produced by ``assign_tags``.

        Returns:
            Dict[str, List]: ``"Page N"`` -> entries of that page, in document
            order, with N counting from 1.
        """
        slides = {}
        section = []
        page = 1
        for text in texts:
            tag_match = re.search(r'(?<=<)(.*?)(?=>)', text)
            if not tag_match:
                continue
            tag = tag_match.group()

            if tag == 'h1':
                # A title opens a new page; later entries are appended to the
                # same list object, so they show up under this page.
                section = [('h1', re.sub(r'<.*?>|\|', '', text).strip())]
                slides[f"Page {page}"] = section
                page += 1
            elif tag.startswith('h'):  # non h1 headers
                # Remove tag and pipes from the text
                section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
            elif tag.startswith('p'):
                # Two or more pipes in a row separate distinct paragraphs
                for paragraph in re.split(r'\|{2,}', text):
                    paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip()
                    paragraph = re.sub(' +', ' ', paragraph)
                    if not paragraph:
                        continue
                    # Only a paragraph entry (a list) can be extended; headers
                    # are immutable tuples and the section may still be empty.
                    follows_paragraph = bool(section) and isinstance(section[-1], list)
                    if paragraph[0].islower() and follows_paragraph:
                        section[-1][1] += f" {paragraph}"
                    else:
                        section.append([tag, paragraph])
        return slides