Spaces:
Runtime error
Runtime error
Commit
·
5f21add
1
Parent(s):
189657b
Fix get_slides in text_extractor.py
Browse files
__pycache__/app.cpython-38.pyc
CHANGED
Binary files a/__pycache__/app.cpython-38.pyc and b/__pycache__/app.cpython-38.pyc differ
|
|
__pycache__/text_extractor.cpython-38.pyc
CHANGED
Binary files a/__pycache__/text_extractor.cpython-38.pyc and b/__pycache__/text_extractor.cpython-38.pyc differ
|
|
text_extractor.py
CHANGED
@@ -117,19 +117,16 @@ class TextExtractor:
|
|
117 |
# Remove tag and pipes from the text
|
118 |
section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
|
119 |
elif tag.startswith('p'):
|
120 |
-
text = re.split("((\|){2,})", text)
|
121 |
for paragraph in text:
|
122 |
-
paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip()
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
section[
|
128 |
-
elif paragraph:
|
129 |
-
paragraph = re.sub(' +', ' ', paragraph) # Replace any double space in the paragraph
|
130 |
-
section.append((tag, paragraph))
|
131 |
try:
|
132 |
-
if
|
133 |
slides[f"Page {page}"] = section
|
134 |
page += 1
|
135 |
except:
|
|
|
117 |
# Remove tag and pipes from the text
|
118 |
section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
|
119 |
elif tag.startswith('p'):
|
120 |
+
text = re.split("((\|){2,})", text) # If two or more consecutive pipes are encountered, split the text into separate paragraphs
|
121 |
for paragraph in text:
|
122 |
+
paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip() # Remove any tags and pipes, then trim surrounding whitespace
|
123 |
+
paragraph = re.sub(' +', ' ', paragraph) # Collapse any run of two or more spaces into a single space
|
124 |
+
if paragraph and paragraph[0].islower(): # If a paragraph from a different block starts with a lowercase character, concatenate it with the last paragraph
|
125 |
+
section[-1][1] += f" {paragraph}"
|
126 |
+
elif paragraph:
|
127 |
+
section.append([tag, paragraph])
|
|
|
|
|
|
|
128 |
try:
|
129 |
+
if tag_match.group() == 'h1': # Create a new page when the current text is a level-1 header (title)
|
130 |
slides[f"Page {page}"] = section
|
131 |
page += 1
|
132 |
except:
|