File size: 7,491 Bytes
d9b7d2f
 
 
 
 
 
 
 
 
 
 
57da257
d9b7d2f
57da257
d9b7d2f
 
 
 
 
 
 
 
 
57da257
 
d9b7d2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57da257
d9b7d2f
57da257
d9b7d2f
 
57da257
 
d9b7d2f
 
57da257
 
d9b7d2f
 
 
 
 
 
 
 
 
 
 
 
 
57da257
d9b7d2f
 
 
 
 
 
 
 
57da257
 
 
d9b7d2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57da257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9b7d2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f21add
d9b7d2f
5f21add
 
 
 
 
 
d9b7d2f
5f21add
d9b7d2f
 
 
 
57da257
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from operator import itemgetter
from collections import OrderedDict
from typing import Dict, List, Iterator, Union, Tuple

import re

class TextExtractor:
    """Font-size based structure extraction for PDF documents.

    The helpers are meant to be chained:

    1. ``get_font_info`` counts how often each font style occurs.
    2. ``get_font_tags`` maps every font size to a ``<p>``/``<hN>``/``<sN>`` tag.
    3. ``assign_tags`` prefixes the document text with those tags.
    4. ``get_slides`` groups the tagged text into pages, one per ``<h1>``.

    ``doc`` arguments are expected to behave like a PyMuPDF (``fitz``)
    document: iterating yields pages whose ``get_text('dict')`` returns a
    ``{'blocks': [...]}`` mapping.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity: bool = False) -> Tuple[List[Tuple[str, int]], Dict[str, dict]]:
        """
        Count the font styles used by the non-blank text spans of a document.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
            granularity (bool, optional): When True, spans are discriminated by
                'size', 'flags' and 'font' (identifier "size_flags_font") and the
                recorded style additionally contains 'flags' and 'color'.
                When False only the size is used. Defaults to False.

        Raises:
            ValueError: If no span containing text was found.

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, dict]]:
                font_counts: (identifier, count) pairs, most frequent first, e.g.
                    [('12.0', 266), ('16.020000457763672', 18), ('7.019999980926514', 2)]
                styles: identifier -> style description of the last span seen
                    with that identifier, e.g. {'12.0': {'size': 12.0, 'font': 'Arial'}}
        """
        styles = {}
        font_counts = {}

        for page in doc:
            for block in page.get_text('dict')['blocks']:
                # Only blocks of type 0 are read; other block types are skipped
                # before 'lines' is accessed.
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            style = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            style = {'size': span['size'], 'font': span['font']}
                        styles[identifier] = style
                        font_counts[identifier] = font_counts.get(identifier, 0) + 1

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        # Stable sort: identifiers with equal counts keep first-seen order.
        ranked_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)
        return ranked_counts, styles

    @staticmethod
    def get_font_tags(font_counts, styles) -> Dict[float, str]:
        """
        Return a dictionary of font sizes and their corresponding tags.

        The size of the most frequent identifier (``font_counts[0]``) is treated
        as the paragraph size. Sizes are ranked from largest to smallest and the
        1-based rank becomes the tag number, so larger sizes get ``<h{rank}>``
        and smaller sizes get ``<s{rank}>``.

        Args:
            font_counts (List[Tuple[str, int]]): (identifier, count) pairs sorted by
                count, most frequent first, as returned by ``get_font_info``.
            styles (Dict[str, Dict]): Style description for every identifier; each
                must provide a 'size' entry.

        Returns:
            Dict[float, str]: Dictionary of the font sizes as keys and their tags as values.
            Example: {12.0: '<p>', 16.020000457763672: '<h1>', 13.979999542236328: '<h2>', 7.019999980926514: '<s4>'}
        """
        def size_of(identifier):
            # Granular identifiers look like "size_flags_font" and cannot be
            # parsed with float(), so the size is taken from the style record.
            # Plain size identifiers missing from `styles` are still parsed.
            style = styles.get(identifier)
            return style['size'] if style is not None else float(identifier)

        p_size = styles[font_counts[0][0]]['size']
        # Largest size first so that the rank can be used as the tag number.
        font_sizes = sorted({size_of(identifier) for identifier, _ in font_counts}, reverse=True)

        size_tag = {p_size: "<p>"}
        for rank, size in enumerate(font_sizes, start=1):
            if size > p_size:
                size_tag[size] = f"<h{rank}>"
            elif size < p_size:
                size_tag[size] = f"<s{rank}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag) -> List[str]:
        """
        Scrape headers & paragraphs from a PDF and return the text with element tags.

        Spans are concatenated while the font size stays the same. Every line
        that ends with content pending adds a "|" marker, so consecutive pipes
        in the output mark larger gaps. One entry is emitted per block
        of type 0, plus one whenever the font size changes between spans.
        Entries may be empty strings (e.g. when a size change happens at the
        very start of a block); ``get_slides`` ignores those.

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tag for each font size.

        Returns:
            list: Texts with pre-pended element tags.
            Example: ['<h1>Group Members: |', '<p>1. Stella Shania Mintara - 2301860596 | 2. David Samuel - 2301850304 ||']
        """
        texts = []
        previous_span = None  # last span that contained text, across blocks and pages

        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Skip spans that hold nothing but punctuation/whitespace.
                        if not re.sub(r"[^\w\s]", '', span["text"]).strip():
                            continue
                        if previous_span is None:
                            # Very first span of the document.
                            block_string = size_tag[span['size']] + span['text']
                        elif span['size'] != previous_span['size']:
                            # Font size changed: flush what was collected so far.
                            texts.append(block_string)
                            block_string = size_tag[span['size']] + span['text']
                        elif not block_string.strip("|"):
                            # Nothing but separators collected yet: start a new string.
                            block_string = size_tag[span['size']] + span['text']
                        else:
                            # Same size inside the same block: keep concatenating.
                            block_string += f" {span['text']}"
                        previous_span = span
                    if block_string:
                        block_string += "|"
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts: List[str]) -> Dict[str, list]:
        """
        Arrange tagged texts into a slide-like dictionary keyed by page.

        A new page starts at every ``<h1>`` entry. Other headers are stored as
        ``(tag, text)`` tuples and paragraphs as ``[tag, text]`` lists. A
        paragraph entry is split wherever two or more consecutive pipes occur.
        A piece that starts with a lowercase letter is treated as the
        continuation of the preceding paragraph and appended to it; if there is
        no preceding paragraph it becomes a paragraph of its own. Entries
        without a tag and entries with other tags (e.g. ``<s4>``) are ignored.
        Content that appears before the first ``<h1>`` is not part of any page.

        Args:
            texts (List[str]): PDF text with element tags.

        Returns:
            Dict: The text of the PDF separated by the header 1 tags.
            Example: {'Page 1': [('h1', 'Group Members:'),
                                 ['p', '1. Stella Shania Mintara - 2301860596 2. David Samuel - 2301850304']],
                      'Page 2': [('h1', 'Case Problem'),
                                 ['p', 'FreshMart is an established large-scale supermarket']]}
        """
        slides = {}
        section = []
        page = 1

        for text in texts:
            tag_match = re.search(r'(?<=<)(.*?)(?=>)', text)
            if not tag_match:
                continue
            tag = tag_match.group()

            if tag.startswith('h'):
                # Remove tag and pipes from the header text.
                heading = (tag, re.sub(r'<.*?>|\|', '', text).strip())
                if tag == 'h1':
                    # A level-1 header opens a new page; later entries are
                    # appended to this same list object.
                    section = [heading]
                    slides[f"Page {page}"] = section
                    page += 1
                else:
                    section.append(heading)
            elif tag.startswith('p'):
                # Two or more consecutive pipes separate distinct paragraphs.
                for piece in re.split(r'\|{2,}', text):
                    paragraph = re.sub(r'<.*?>|\|', '', piece).strip()
                    paragraph = re.sub(' +', ' ', paragraph)  # collapse repeated spaces
                    if not paragraph:
                        continue
                    # Only paragraph entries are lists; header tuples are
                    # immutable and must not be extended.
                    continues_previous = (
                        paragraph[0].islower()
                        and bool(section)
                        and isinstance(section[-1], list)
                    )
                    if continues_previous:
                        section[-1][1] += f" {paragraph}"
                    else:
                        section.append([tag, paragraph])
        return slides