from operator import itemgetter
from collections import OrderedDict
from typing import Any, Dict, Iterable, Iterator, List, Tuple, Union
import re

# Text between the first "<" and the following ">" (i.e. the element tag name).
_TAG_RE = re.compile(r'(?<=<)(.*?)(?=>)')
# Any "<...>" tag or pipe marker; removed to obtain the plain text.
_MARKUP_RE = re.compile(r'<.*?>|\|')
# Two or more consecutive pipes separate paragraphs inside one tagged string.
_PARAGRAPH_BREAK_RE = re.compile(r'\|{2,}')
# Everything that is neither a word character nor whitespace.
_NON_WORD_RE = re.compile(r"[^\w\s]")
_MULTI_SPACE_RE = re.compile(' +')


def _iter_text_blocks(doc: Iterable) -> Iterator[Dict[str, Any]]:
    """Yield every block of type 0 from ``page.get_text("dict")`` of each page."""
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] == 0:
                yield block


def _iter_text_spans(doc: Iterable) -> Iterator[Dict[str, Any]]:
    """Yield every span of every line of every type-0 block in ``doc``."""
    for block in _iter_text_blocks(doc):
        for line in block['lines']:
            yield from line['spans']


def _strip_markup(text: str) -> str:
    """Remove ``<...>`` tags and pipe markers, then trim surrounding whitespace."""
    return _MARKUP_RE.sub('', text).strip()


class TextExtractor:
    """Static helpers that turn a paged document into tagged text and slides.

    Each page of the document must provide ``get_text("dict")`` returning a
    mapping with a ``"blocks"`` list; blocks of type 0 carry ``"lines"``, and
    each line carries ``"spans"`` with at least ``"text"`` and ``"size"``.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterable, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
        """Count how often each font style occurs in the document.

        Spans whose text is empty or whitespace-only are ignored.

        Args:
            doc: Iterable of pages (see the class docstring for the shape).
            granularity (bool, optional): When True, spans are discriminated
                by ``size``, ``flags`` and ``font`` and the recorded style
                also carries ``color``. When False, only ``size``
                discriminates. Defaults to False.

        Raises:
            ValueError: If no span with visible text was found.

        Returns:
            A tuple ``(font_counts, styles)``. ``font_counts`` is a list of
            ``(identifier, occurrences)`` sorted by occurrences, most frequent
            first. ``styles`` maps each identifier to the style dict of the
            last span seen with that identifier.
        """
        styles = {}
        font_counts = {}
        for span in _iter_text_spans(doc):
            if not span['text'].strip():
                continue
            if granularity:
                identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                styles[identifier] = {
                    'size': span['size'],
                    'flags': span['flags'],
                    'font': span['font'],
                    'color': span['color'],
                }
            else:
                identifier = "{0}".format(span['size'])
                styles[identifier] = {'size': span['size'], 'font': span['font']}
            font_counts[identifier] = font_counts.get(identifier, 0) + 1

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        ranked = sorted(font_counts.items(), key=itemgetter(1), reverse=True)
        return ranked, styles

    @staticmethod
    def get_font_tags(font_counts, styles):
        """Map every font size to an element tag.

        The size of the most frequent style (``font_counts[0]``) is treated
        as the paragraph size and tagged ``<p>``. Larger sizes become
        ``<h{n}>`` and smaller sizes ``<s{n}>``, where ``n`` is the 1-based
        rank of the size when all sizes are sorted from largest to smallest.

        Args:
            font_counts (list): ``(identifier, occurrences)`` pairs, most
                frequent first, as returned by ``get_font_info``.
            styles (dict): Identifier -> style dict containing ``'size'``.

        Returns:
            dict: Font size -> tag string.
        """
        p_size = styles[font_counts[0][0]]['size']

        def size_of(identifier):
            # Granular identifiers ("size_flags_font") are not parseable as a
            # float, so prefer the recorded style; fall back to parsing for
            # identifiers that have no style entry.
            style = styles.get(identifier)
            return style['size'] if style is not None else float(identifier)

        # Largest first, so that the rank can be appended to each tag.
        font_sizes = sorted({size_of(identifier) for identifier, _ in font_counts}, reverse=True)

        # NOTE(review): the tag literals are reconstructed from what
        # get_slides() parses ('h1', 'h…', 'p…'); confirm the exact numbering
        # scheme of the header/small tags against the intended output.
        size_tag = {p_size: "<p>"}
        for rank, size in enumerate(font_sizes, start=1):
            if size > p_size:
                size_tag[size] = f"<h{rank}>"
            elif size < p_size:
                size_tag[size] = f"<s{rank}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag):
        """Scrape headers and paragraphs and return them with element tags.

        Consecutive spans of the same size inside a block are joined with a
        space. A pipe is appended after every line once the block has text,
        so runs of pipes mark empty lines. One string is emitted per text
        block, plus one whenever the font size changes; emitted strings may
        be empty.

        Args:
            doc: Iterable of pages (see the class docstring for the shape).
            size_tag (dict): Tag string for each font size.

        Raises:
            KeyError: If a span's size has no entry in ``size_tag``.

        Returns:
            list: Texts with prepended element tags.
        """
        texts = []
        previous_s = {}
        for block in _iter_text_blocks(doc):
            block_string = ""
            for line in block["lines"]:
                for span in line["spans"]:
                    # Skip spans that hold nothing but punctuation/whitespace.
                    if not _NON_WORD_RE.sub('', span["text"]).strip():
                        continue

                    if not previous_s:
                        # Very first span of the document.
                        block_string = size_tag[span['size']] + span['text']
                    elif span['size'] == previous_s['size']:
                        # all() is True for "" as well, so this covers both an
                        # empty string and a string made only of pipes.
                        if all(c == "|" for c in block_string):
                            block_string = size_tag[span['size']] + span['text']
                        else:
                            block_string += f" {span['text']}"
                    else:
                        # Size changed: flush what was collected so far.
                        texts.append(block_string)
                        block_string = size_tag[span['size']] + span['text']
                    previous_s = span

                if block_string:
                    block_string += "|"

            texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts):
        """Group tagged texts into slides, starting a new slide at every h1.

        Header texts are stored as ``(tag, text)``. Paragraph texts are split
        on runs of two or more pipes; a piece that starts with a lowercase
        letter is appended to the previous entry of the current section,
        otherwise it becomes a new ``(tag, text)`` entry. Texts with any other
        tag, or without a tag, contribute nothing. The current section is
        stored as ``"Page {n}"`` when the following text is an ``h1`` or when
        the input ends.

        Args:
            texts (list): Tagged strings as produced by ``assign_tags``.

        Returns:
            dict: ``"Page {n}"`` -> list of ``(tag, text)`` tuples.
        """
        texts = list(texts)
        slides = {}
        section = []
        page = 1
        for text, next_text in zip(texts, texts[1:] + [None]):
            tag_match = _TAG_RE.search(text)
            if tag_match:
                tag = tag_match.group()
                if tag == 'h1':
                    section = [('h1', _strip_markup(text))]
                elif tag.startswith('h'):
                    section.append((tag, _strip_markup(text)))
                elif tag.startswith('p'):
                    for piece in _PARAGRAPH_BREAK_RE.split(text):
                        paragraph = _strip_markup(piece)
                        if not paragraph:
                            continue
                        if paragraph[0].islower() and section:
                            # Continuation of the previous entry.
                            last_tag, last_text = section[-1]
                            section[-1] = (last_tag, f"{last_text} {paragraph}")
                        else:
                            section.append((tag, _MULTI_SPACE_RE.sub(' ', paragraph)))

            if next_text is None:
                closes_slide = True
            else:
                next_tag = _TAG_RE.search(next_text)
                closes_slide = next_tag is not None and next_tag.group() == 'h1'
            if closes_slide:
                slides[f"Page {page}"] = section
                page += 1
        return slides