Davidsamuel101 committed on
Commit
d9b7d2f
·
1 Parent(s): f0a8738

Add text extractor

Browse files
Files changed (1) hide show
  1. text_extractor.py +140 -0
text_extractor.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from operator import itemgetter
2
+ from collections import OrderedDict
3
+ from typing import Dict, List, Iterator, Union, Tuple
4
+
5
+
6
+ import re
7
+
8
class TextExtractor:
    """Turn the text spans of a PDF into size-tagged strings and group them into slides.

    The methods are stateless static helpers meant to be chained:
    ``get_font_info`` -> ``get_font_tags`` -> ``assign_tags`` -> ``get_slides``.
    ``doc`` arguments are expected to iterate over pages exposing
    ``get_text('dict')`` (the original docstrings name ``fitz.fitz.Document``).
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
        """
        Collect the font styles used in the document and how often each occurs.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
            granularity (bool, optional): Also use 'flags' and 'font' to discriminate
                styles (and record 'color' in the style). Defaults to False, in which
                case spans are discriminated by size only.

        Raises:
            ValueError: If no non-blank text span is found.

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: ``(font_counts, styles)``.
            ``font_counts`` is a list of ``(identifier, span_count)`` sorted by count,
            most frequent first. ``styles`` maps each identifier to the attributes of
            the last span seen with that identifier.
        """
        styles = {}
        font_counts = {}

        for page in doc:
            for block in page.get_text('dict')['blocks']:
                # Only blocks of type 0 carry 'lines'/'spans' here; other types are skipped.
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue  # whitespace-only spans do not count as a style
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        font_counts[identifier] = font_counts.get(identifier, 0) + 1

        font_counts = sorted(font_counts.items(), key=lambda item: item[1], reverse=True)

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        return font_counts, styles

    @staticmethod
    def get_font_tags(font_counts, styles):
        """
        Map every font size to an element tag relative to the most common size.

        The size of the most frequent style becomes ``<p>``. Sizes are ranked from
        largest to smallest; a larger size at rank ``i`` (1-based) becomes ``<h{i}>``
        and a smaller one becomes ``<s{i}>``.

        Args:
            font_counts (list): ``(identifier, count)`` pairs, most frequent first,
                as returned by ``get_font_info``.
            styles (dict): Identifier -> style dict containing at least ``'size'``.

        Raises:
            IndexError: If ``font_counts`` is empty.
            KeyError: If the most frequent identifier is missing from ``styles``.

        Returns:
            dict: Font size -> tag string such as ``"<p>"``, ``"<h1>"`` or ``"<s3>"``.
        """
        p_size = styles[font_counts[0][0]]['size']

        # Read the size from the style record rather than parsing the identifier:
        # granular identifiers look like "12.0_0_Arial" and cannot be fed to float().
        # Identifiers without a style record fall back to the size-only parsing.
        sizes = set()
        for identifier, _ in font_counts:
            style = styles.get(identifier)
            sizes.add(float(style['size']) if style is not None else float(identifier))

        # Sorting the font sizes high to low, so that the rank can be appended to each tag.
        size_tag = {p_size: "<p>"}
        for i, size in enumerate(sorted(sizes, reverse=True)):
            if size > p_size:
                size_tag[size] = f"<h{i+1}>"
            elif size < p_size:
                size_tag[size] = f"<s{i+1}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag):
        """
        Scrapes headers & paragraphs from PDF and return texts with element tags.

        Consecutive spans of the same size inside a block are joined with a space,
        a ``"|"`` is appended at the end of every line once the block has text, and
        a change of font size closes the current string and starts a new one.

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tags for each size.

        Raises:
            KeyError: If a span size is not present in ``size_tag``.

        Returns:
            list: Texts with pre-pended element tags. Entries can be empty strings
            (a block without usable text, or a size change at the start of a block).
        """
        texts = []
        previous_size = None  # size of the last span that contained text

        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Ignore spans that hold nothing but punctuation/whitespace.
                        if not re.sub(r"[^\w\s]", '', span["text"]).strip():
                            continue
                        size = span['size']
                        if previous_size is None:  # first span of the document
                            block_string = size_tag[size] + span['text']
                        elif size == previous_size:
                            if not block_string or all(c == "|" for c in block_string):
                                # new block: start with the size tag
                                block_string = size_tag[size] + span['text']
                            else:
                                # in the same block, so concatenate strings
                                block_string += f" {span['text']}"
                        else:
                            # Size changed: close the running string and start a new one.
                            texts.append(block_string)
                            block_string = size_tag[size] + span['text']
                        previous_size = size
                    if block_string:
                        block_string += "|"  # end-of-line marker
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts):
        """
        Group tagged texts into pages, starting a new page at every ``<h1>``.

        Args:
            texts (list): Tagged strings as produced by ``assign_tags``.

        Returns:
            dict: ``"Page N"`` -> list of ``(tag, text)`` tuples. ``h*`` entries are
            kept as-is; ``p`` entries are split on runs of two or more pipes, and a
            piece starting with a lowercase letter is appended to the previous
            entry. Strings with any other tag (e.g. ``s*``) or no tag are ignored.
        """
        tag_pattern = r'(?<=<)(.*?)(?=>)'
        markup_pattern = r'<.*?>|\|'  # element tags and pipe markers

        texts = list(texts)
        slides = {}
        section = []
        page = 1

        for text, next_text in zip(texts, texts[1:] + [None]):
            tag_match = re.search(tag_pattern, text)
            if tag_match:
                tag = tag_match.group()
                if tag == 'h1':
                    # A top-level header opens a fresh section.
                    section = [('h1', re.sub(markup_pattern, '', text).strip())]
                elif tag.startswith('h'):  # non h1 headers
                    section.append((tag, re.sub(markup_pattern, '', text).strip()))
                elif tag.startswith('p'):
                    # Non-capturing split so separator fragments are not returned.
                    for paragraph in re.split(r"\|{2,}", text):
                        paragraph = re.sub(markup_pattern, '', paragraph).strip()
                        if not paragraph:
                            continue
                        if paragraph[0].islower() and section:
                            # Lowercase start: treat as a continuation of the previous entry.
                            last_tag, last_text = section[-1]
                            section[-1] = (last_tag, f"{last_text} {paragraph}")
                        elif paragraph[0].islower():
                            # Nothing to continue yet, so keep the text as its own entry.
                            section.append((tag, paragraph))
                        else:
                            section.append((tag, re.sub(' +', ' ', paragraph)))

            # Close the page at the end of input or right before the next <h1>.
            if next_text is None:
                closes_page = True
            else:
                next_match = re.search(tag_pattern, next_text)
                closes_page = next_match is not None and next_match.group() == 'h1'
            if closes_page:
                slides[f"Page {page}"] = section
                page += 1
        return slides