Spaces:

Davidsamuel101
/

PPTGenerator

Runtime error

App Files Files Community

Davidsamuel101 committed on Apr 28, 2023

Commit

d9b7d2f

1 Parent(s): f0a8738

Add text extractor

Browse files

Files changed (1) hide show

text_extractor.py +140 -0

text_extractor.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from operator import itemgetter
+from collections import OrderedDict
+from typing import Dict, List, Iterator, Union, Tuple
+import re
class TextExtractor:
    """Turn the text spans of a PDF document into tagged strings and slides.

    The methods are stateless static helpers meant to be chained:
    ``get_font_info`` -> ``get_font_tags`` -> ``assign_tags`` -> ``get_slides``.
    The ``doc`` arguments are iterated page by page and every page must offer
    ``get_text('dict')`` returning a mapping with a ``'blocks'`` list.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
        """
        Count how often each font style occurs among the non-blank text spans.
        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
            granularity (bool, optional): When True a style is identified by
                '<size>_<flags>_<font>' and its entry in ``styles`` also stores
                'flags' and 'color'; when False a style is identified by its
                size only. Defaults to False.
        Raises:
            ValueError: Raises Value Error if there are no font detected
        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: ``(font_counts, styles)``
                where ``font_counts`` is a list of ``(identifier, occurrences)``
                sorted by occurrences, most frequent first, and ``styles`` maps
                each identifier to the attributes of the last span seen with it.
        """
        styles = {}
        font_counts = {}
        for page in doc:
            for block in page.get_text('dict')['blocks']:
                if block['type'] != 0:  # only blocks of type 0 are read for text
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        font_counts[identifier] = font_counts.get(identifier, 0) + 1
        # Stable sort: styles with equal counts keep first-seen order.
        font_counts = sorted(font_counts.items(), key=lambda item: item[1], reverse=True)
        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")
        return font_counts, styles

    @staticmethod
    def get_font_tags(font_counts, styles):
        """
        Map every font size to an element tag relative to the paragraph size.

        The size of the most frequent style (``font_counts[0]``) is tagged
        ``<p>``. Larger sizes get ``<hN>`` and smaller sizes ``<sN>``, where N
        is the 1-based rank of the size among all sizes sorted from largest
        to smallest (so numbering continues across the paragraph size).
        Args:
            font_counts (List[Tuple[str, int]]): ``(identifier, occurrences)``
                pairs as returned by ``get_font_info``, most frequent first.
            styles (Dict[str, Dict]): Style attributes per identifier; each
                entry must provide a 'size'.
        Returns:
            dict: Mapping of font size to its tag string.
        """

        def _size_of(identifier):
            # Prefer the recorded size: granular identifiers look like
            # '<size>_<flags>_<font>' and cannot be passed to float() directly.
            style = styles.get(identifier)
            if style is not None:
                return float(style['size'])
            return float(str(identifier).split('_', 1)[0])

        p_size = styles[font_counts[0][0]]['size']
        # sorting the font sizes high to low, so that we can append the right integer to each tag
        font_sizes = sorted({_size_of(identifier) for identifier, _ in font_counts}, reverse=True)
        size_tag = {p_size: "<p>"}
        for i, size in enumerate(font_sizes):
            if size > p_size:
                size_tag[size] = f"<h{i+1}>"
            elif size < p_size:
                size_tag[size] = f"<s{i+1}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag):
        """
        Scrapes headers & paragraphs from PDF and return texts with element tags.

        Consecutive spans of the same size inside one block are joined with a
        space; a '|' is appended after every line that leaves the running
        string non-empty. A size change flushes the running string and starts
        a new one with the new size's tag.
        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tags for each size.
        Returns:
            list: Texts with pre-pended element tags. The list may contain
                empty strings (e.g. for blocks without usable text).
        """
        punctuation = re.compile(r"[^\w\s]")
        texts = []
        previous_span = None
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Spans holding only punctuation/whitespace are ignored.
                        if not punctuation.sub('', span["text"]).strip():
                            continue
                        if previous_span is None:  # First span of the document
                            block_string = size_tag[span['size']] + span['text']
                        elif span['size'] == previous_span['size']:
                            if not block_string.strip("|"):  # New block: nothing but pipes (or nothing) so far
                                block_string = size_tag[span['size']] + span['text']
                            else:  # in the same block, so concatenate strings
                                block_string += f" {span['text']}"
                        else:
                            # Size changed: flush what was collected and start over.
                            texts.append(block_string)
                            block_string = size_tag[span['size']] + span['text']
                        previous_span = span
                    if block_string:
                        block_string += "|"
                # Appended even when empty; get_slides skips untagged entries.
                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts):
        """
        Group tagged texts into slides, starting a new slide at every <h1>.

        Entries without a '<tag>' (such as empty strings) are ignored. Header
        entries are stored as ``(tag, text)``; paragraph entries are split on
        runs of two or more '|' and a paragraph starting with a lowercase
        character is appended to the previous entry of the current section.
        Entries with other tags add nothing to the section.
        Args:
            texts (list): Tagged strings as produced by ``assign_tags``.
        Returns:
            dict: Mapping of "Page N" to the list of ``(tag, text)`` tuples of
                that slide, numbered from 1 in order of appearance.
        """
        tag_pattern = re.compile(r'(?<=<)(.*?)(?=>)')
        markup_pattern = re.compile(r'<.*?>|\|')
        paragraph_break = re.compile(r"\|{2,}")

        def _strip_markup(raw):
            # Remove tag and pipes from the text
            return markup_pattern.sub('', raw).strip()

        # Keep only entries that carry a tag, so that "what comes next" is
        # always a real element and untagged entries cannot hide a slide break.
        tagged = []
        for text in texts:
            match = tag_pattern.search(text) if text else None
            if match:
                tagged.append((match.group(), text))

        slides = {}
        section = []
        page = 1
        last_index = len(tagged) - 1
        for index, (tag, text) in enumerate(tagged):
            if tag == 'h1':
                section = [('h1', _strip_markup(text))]
            elif tag.startswith('h'):  # non h1 headers
                section.append((tag, _strip_markup(text)))
            elif tag.startswith('p'):
                for paragraph in paragraph_break.split(text):
                    paragraph = _strip_markup(paragraph)
                    if not paragraph:
                        continue
                    if paragraph[0].islower() and section:
                        # A paragraph continuing in lowercase is treated as the
                        # tail of the previous entry and concatenated to it.
                        previous_tag, previous_text = section[-1]
                        section[-1] = (previous_tag, f"{previous_text} {paragraph}")
                    else:
                        section.append((tag, re.sub(' +', ' ', paragraph)))
            # Close the slide at the end of input or right before the next <h1>.
            if index == last_index or tagged[index + 1][0] == 'h1':
                slides[f"Page {page}"] = section
                page += 1
        return slides