Spaces:

Davidsamuel101
/

PPTGenerator

Runtime error

File size: 7,491 Bytes

from operator import itemgetter
from collections import OrderedDict
from typing import Dict, List, Iterator, Union, Tuple

import re

class TextExtractor:
    """Turn the text spans of a PDF into tagged strings and slide sections.

    Typical pipeline: ``get_font_info`` -> ``get_font_tags`` -> ``assign_tags``
    -> ``get_slides``. Every method is static; the class only groups them.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, dict]]:
        """
        Count how often each font style occurs and describe every style.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file
                (any iterable of pages exposing ``get_text('dict')`` works).
            granularity (bool, optional): When True, spans are discriminated by
                size, flags and font (the colour is recorded in the style but is
                not part of the identifier). When False only the size is used.
                Defaults to False.

        Raises:
            ValueError: Raises Value Error if there are no font detected

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, dict]]:
                Font Counts, most frequent first:
                    [('12.0', 266), ('16.020000457763672', 18), ('13.979999542236328', 7), ('7.019999980926514', 2)]
                Styles: identifier -> description of the last span seen with it,
                    e.g. {'12.0': {'size': 12.0, 'font': 'Arial'}}
        """
        styles = {}
        font_counts = {}

        for page in doc:
            for block in page.get_text('dict')['blocks']:
                # Only blocks of type 0 are processed; other block types are
                # skipped before their (possibly absent) 'lines' are touched.
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue  # whitespace-only spans carry no style info
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        font_counts[identifier] = font_counts.get(identifier, 0) + 1

        # Stable sort: identifiers with equal counts keep first-seen order.
        font_counts = sorted(font_counts.items(), key=lambda item: item[1], reverse=True)

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        return font_counts, styles

    @staticmethod
    def get_font_tags(font_counts, styles) -> Dict[float, str]:
        """
        Return a dictionary of font sizes and their corresponding tags.

        The most frequent style is taken as the paragraph size. Larger sizes
        become headers and smaller sizes become ``<s..>`` tags; the number in a
        tag is the 1-based rank of the size among all sizes (largest first).

        Args:
            font_counts (List[Tuple[str, int]]): Style identifiers and their counts,
                most frequent first (as returned by ``get_font_info``).
            styles (Dict[str, Dict]): A style description for every identifier.

        Returns:
            Dict[float, str]: Dictionary of the font sizes as keys and their tags as values.
            Example: {12.0: '<p>', 16.020000457763672: '<h1>', 13.979999542236328: '<h2>', 7.019999980926514: '<s4>'}
        """
        def size_of(identifier):
            # Granular identifiers ("size_flags_font") cannot be parsed as a
            # float, so prefer the recorded style; fall back to parsing the
            # identifier for callers whose styles lack that entry.
            style = styles.get(identifier)
            return float(style['size'] if style is not None else identifier)

        p_size = styles[font_counts[0][0]]['size']
        # sorting the font sizes high to low, so that we can append the right integer to each tag
        font_sizes = sorted({size_of(identifier) for identifier, _ in font_counts}, reverse=True)
        size_tag = {p_size: "<p>"}
        for rank, size in enumerate(font_sizes, start=1):
            if size > p_size:
                size_tag[size] = f"<h{rank}>"
            elif size < p_size:
                size_tag[size] = f"<s{rank}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag) -> List[str]:
        """
        Scrapes headers & paragraphs from PDF and return texts with element tags.

        Consecutive spans of the same size inside one block are joined with a
        space, every line end is marked with a pipe, and a change of font size
        starts a new string. One entry is appended per text block (and one per
        size change), so the result may contain empty strings.

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tags for each size.

        Raises:
            KeyError: If a span's size has no entry in ``size_tag``.

        Returns:
            list: Texts with pre-prended element tags
            Examples: ['<h1>Group Members: |', '<p>1. Stella Shania Mintara - 2301860596
            | 2. David Samuel - 2301850304 | 3.   Egivenia - 2301850134 | 4. Aurelius Va
            nnes Leander - 2301862102 | 5. Juanrico Alvaro - 2301847316 ||']
        """
        texts = []
        previous_size = None  # size of the last span that contained text
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Ignore spans that hold nothing but punctuation/whitespace.
                        if not re.sub(r"[^\w\s]", '', span["text"]).strip():
                            continue
                        size = span['size']
                        if previous_size is not None and size != previous_size:
                            # Font size changed: flush what was collected so far.
                            texts.append(block_string)
                            block_string = ""
                        if block_string:
                            # in the same block, so concatenate strings
                            block_string += f" {span['text']}"
                        else:
                            block_string = size_tag[size] + span['text']
                        previous_size = size
                    if block_string:
                        block_string += "|"  # line-end marker
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts):
        """
        Returns the tagged texts into a slide format dictionary where the page is the
        key and the value is a list contaning the component of that page.

        A new page starts at every ``<h1>`` text. Other headers are stored as
        ``(tag, text)`` tuples and paragraphs as ``[tag, text]`` lists. A
        paragraph text is split at runs of two or more pipes; a piece starting
        with a lower-case letter is treated as the continuation of the
        preceding paragraph. Texts before the first ``<h1>`` and texts with any
        other tag (e.g. ``<s4>``) are not included in the result.

        Args:
            texts (List[str]): PDF text with element tags.

        Returns:
            Dict: The text of the PDF seperated by the header 1 tags.
            Examples: {'Page 1': [('h1', 'Group Members:'),
                    ['p', '1. Stella Shania Mintara - 2301860596 2. David Samuel -
                    2301850304 3. Egivenia - 2301850134 4. Aurelius Vannes Leander -
                    2301862102 5.
                    Juanrico Alvaro - 2301847316']],
                    'Page 2': [('h1', 'Case Problem'),
                    ['p', FreshMart is an established large-scale supermarket with branc
                    hes in popular areas across Jakarta and big cities]]}
        """
        slides = {}
        section = []
        page = 1

        for text in texts:
            tag_match = re.search(r'(?<=<)(.*?)(?=>)', text)
            if not tag_match:
                continue  # untagged (e.g. empty) entries carry no content
            tag = tag_match.group()

            if tag == 'h1':
                # Create new page when current text is a type 1 header or title.
                # The list is shared with the dict, so later appends show up there.
                section = [('h1', re.sub(r'<.*?>|\|', '', text).strip())]
                slides[f"Page {page}"] = section
                page += 1
            elif tag.startswith('h'):  # non h1 headers
                # Remove tag and pipes from the text
                section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
            elif tag.startswith('p'):
                # Two or more pipes in a row separate different paragraphs.
                for paragraph in re.split(r"\|{2,}", text):
                    paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip()  # Remove tag and pipes
                    paragraph = re.sub(' +', ' ', paragraph)  # Collapse runs of spaces
                    if not paragraph:
                        continue
                    # Only a preceding paragraph (a list) can be continued; header
                    # entries are immutable tuples and an empty section has no
                    # predecessor, so those cases start a new paragraph instead.
                    continues_previous = (
                        paragraph[0].islower()
                        and bool(section)
                        and isinstance(section[-1], list)
                    )
                    if continues_previous:
                        section[-1][1] += f" {paragraph}"
                    else:
                        section.append([tag, paragraph])
        return slides