Spaces:
Running
Running
Update get_paper_from_pdf.py
Browse files- get_paper_from_pdf.py +73 -61
get_paper_from_pdf.py
CHANGED
@@ -5,50 +5,38 @@ import json
|
|
5 |
import re
|
6 |
|
7 |
|
8 |
-
|
9 |
class Paper:
|
10 |
def __init__(self, path, title='', url='', abs='', authors=[]):
|
11 |
# 初始化函数,根据pdf路径初始化Paper对象
|
12 |
-
self.url =
|
13 |
-
self.path = path
|
14 |
-
self.section_names = []
|
15 |
-
self.section_texts = {}
|
16 |
self.abs = abs
|
17 |
self.title_page = 0
|
18 |
if title == '':
|
19 |
-
self.pdf = fitz.open(self.path)
|
20 |
self.title = self.get_title()
|
21 |
-
self.parse_pdf()
|
22 |
else:
|
23 |
self.title = title
|
24 |
self.authors = authors
|
25 |
self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
|
26 |
-
self.digit_num = [str(d+1) for d in range(10)]
|
27 |
self.first_image = ''
|
28 |
-
|
29 |
def parse_pdf(self):
|
30 |
-
self.pdf = fitz.open(self.path)
|
31 |
self.text_list = [page.get_text() for page in self.pdf]
|
32 |
self.all_text = ' '.join(self.text_list)
|
33 |
self.extract_section_infomation()
|
34 |
self.section_texts.update({"title": self.title})
|
35 |
-
self.
|
36 |
-
|
37 |
-
|
38 |
-
def get_paper_info(self):
    """Return the title page's text with abstract and introduction removed.

    Reads the page at index ``self.title_page`` from ``self.pdf`` and strips
    the text of the "Abstract" section (falling back to ``self.abs`` when no
    such section was extracted) and of the "Introduction" section, leaving
    whatever else is printed on that page.

    Returns:
        str: the remaining text of the title page.
    """
    first_page_text = self.pdf[self.title_page].get_text()
    abstract_text = self.section_texts.get('Abstract', self.abs)
    # A paper without a detected "Introduction" section must not raise
    # KeyError; there is simply nothing to strip in that case.
    introduction_text = self.section_texts.get('Introduction', '')
    for section_text in (abstract_text, introduction_text):
        if section_text:
            first_page_text = first_page_text.replace(section_text, "")
    return first_page_text
|
47 |
-
|
48 |
# 定义一个函数,根据字体的大小,识别每个章节名称,并返回一个列表
|
49 |
-
def get_chapter_names(self,):
|
50 |
# # 打开一个pdf文件
|
51 |
-
doc = fitz.open(self.path)
|
52 |
text_list = [page.get_text() for page in doc]
|
53 |
all_text = ''
|
54 |
for text in text_list:
|
@@ -61,52 +49,53 @@ class Paper:
|
|
61 |
point_split_list = line.split('.')
|
62 |
space_split_list = line.split(' ')
|
63 |
if 1 < len(space_split_list) < 5:
|
64 |
-
if 1 < len(point_split_list) < 5 and (
|
|
|
65 |
# print("line:", line)
|
66 |
-
chapter_names.append(line)
|
67 |
-
|
68 |
return chapter_names
|
69 |
-
|
70 |
def get_title(self):
    """Guess the paper title from the largest fonts used in ``self.pdf``.

    Only the first span of the first line of each text block is inspected.
    Spans whose font size is within 0.3 of the largest or second-largest
    collected size are joined (space separated, in document order) to form
    the title; strings of four characters or fewer and strings containing
    "arXiv" are ignored.

    Side effects:
        Sets ``self.title_page`` to the index of the last page that
        contributed a piece of the title.

    Returns:
        str: the assembled title with newlines replaced by spaces; empty
        when nothing qualified.
    """
    doc = self.pdf
    # Collect (page index, leading span) once so that each page's text
    # dictionary is extracted a single time instead of twice.
    leading_spans = []
    for page_index, page in enumerate(doc):
        for block in page.get_text("dict")["blocks"]:
            # type 0 marks a text block; other blocks carry no usable lines
            if block["type"] == 0 and len(block['lines']):
                if len(block["lines"][0]["spans"]):
                    leading_spans.append((page_index, block["lines"][0]["spans"][0]))

    # The seed 0 guarantees that index -2 exists whenever a span was found.
    max_font_sizes = sorted([0] + [span["size"] for _, span in leading_spans])

    title_parts = []
    for page_index, span in leading_spans:
        cur_string = span["text"]
        font_size = span["size"]
        near_largest = abs(font_size - max_font_sizes[-1]) < 0.3
        near_second = abs(font_size - max_font_sizes[-2]) < 0.3
        if not (near_largest or near_second):
            continue
        # Skip very short fragments and the arXiv identifier stamp.
        if len(cur_string) > 4 and "arXiv" not in cur_string:
            title_parts.append(cur_string)
            self.title_page = page_index
    return ' '.join(title_parts).replace('\n', ' ')
|
111 |
|
112 |
def extract_section_infomation(self):
|
@@ -134,21 +123,43 @@ class Paper:
|
|
134 |
heading_font = -1
|
135 |
# 遍历每一页并查找子标题
|
136 |
found_abstract = False
|
|
|
|
|
137 |
for page in doc:
|
138 |
blocks = page.get_text("dict")["blocks"]
|
139 |
for block in blocks:
|
140 |
if not found_abstract:
|
141 |
-
|
|
|
|
|
|
|
142 |
if re.search(r"\bAbstract\b", text, re.IGNORECASE):
|
143 |
found_abstract = True
|
|
|
|
|
144 |
if found_abstract:
|
145 |
if 'lines' not in block:
|
146 |
continue
|
147 |
lines = block["lines"]
|
148 |
for line in lines:
|
149 |
for span in line["spans"]:
|
150 |
-
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
if heading_font == -1:
|
153 |
heading_font = span["size"]
|
154 |
elif heading_font != span["size"]:
|
@@ -169,14 +180,15 @@ class Paper:
|
|
169 |
self.section_names = subheadings
|
170 |
self.section_texts = section_dict
|
171 |
|
|
|
172 |
def main():
    """Smoke test: build a Paper from the demo PDF in the working directory."""
    path = r'demo.pdf'
    # Constructing Paper without a title already extracts the title and
    # parses every section, so an extra parse_pdf() call would only reopen
    # and re-read the same file.
    paper = Paper(path=path)
    # To inspect the result, iterate over paper.section_texts.items().


if __name__ == '__main__':
    main()
|
|
|
5 |
import re
|
6 |
|
7 |
|
|
|
8 |
class Paper:
|
9 |
def __init__(self, path, title='', url='', abs='', authors=[]):
|
10 |
# 初始化函数,根据pdf路径初始化Paper对象
|
11 |
+
self.url = url # 文章链接
|
12 |
+
self.path = path # pdf路径
|
13 |
+
self.section_names = [] # 段落标题
|
14 |
+
self.section_texts = {} # 段落内容
|
15 |
self.abs = abs
|
16 |
self.title_page = 0
|
17 |
if title == '':
|
18 |
+
self.pdf = fitz.open(self.path) # pdf文档
|
19 |
self.title = self.get_title()
|
20 |
+
self.parse_pdf()
|
21 |
else:
|
22 |
self.title = title
|
23 |
self.authors = authors
|
24 |
self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
|
25 |
+
self.digit_num = [str(d + 1) for d in range(10)]
|
26 |
self.first_image = ''
|
27 |
+
|
28 |
def parse_pdf(self):
    """Open the PDF at ``self.path`` and extract its text and sections.

    Populates ``self.text_list`` (one string per page), ``self.all_text``
    (pages joined by a space) and, through ``extract_section_infomation``,
    the section attributes; the title is then stored in
    ``self.section_texts`` under the key "title".  The document handle is
    closed afterwards, also when extraction fails.
    """
    self.pdf = fitz.open(self.path)  # pdf document
    try:
        self.text_list = [page.get_text() for page in self.pdf]
        self.all_text = ' '.join(self.text_list)
        self.extract_section_infomation()
        self.section_texts.update({"title": self.title})
    finally:
        # Release the file handle even if section extraction raised.
        self.pdf.close()
|
35 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
# 定义一个函数,根据字体的大小,识别每个章节名称,并返回一个列表
|
37 |
+
def get_chapter_names(self, ):
|
38 |
# # 打开一个pdf文件
|
39 |
+
doc = fitz.open(self.path) # pdf文档
|
40 |
text_list = [page.get_text() for page in doc]
|
41 |
all_text = ''
|
42 |
for text in text_list:
|
|
|
49 |
point_split_list = line.split('.')
|
50 |
space_split_list = line.split(' ')
|
51 |
if 1 < len(space_split_list) < 5:
|
52 |
+
if 1 < len(point_split_list) < 5 and (
|
53 |
+
point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
|
54 |
# print("line:", line)
|
55 |
+
chapter_names.append(line)
|
56 |
+
|
57 |
return chapter_names
|
58 |
+
|
59 |
def get_title(self):
    """Guess the paper title from the largest fonts used in ``self.pdf``.

    Only the first span of the first line of each text block is inspected.
    Spans whose font size is within 0.3 of the largest or second-largest
    collected size are joined (space separated, in document order) to form
    the title; strings of four characters or fewer and strings containing
    "arXiv" are ignored.

    Side effects:
        Sets ``self.title_page`` to the index of the last page that
        contributed a piece of the title.

    Returns:
        str: the assembled title with newlines replaced by spaces; empty
        when nothing qualified.
    """
    doc = self.pdf  # already opened pdf document
    # Collect (page index, leading span) once so that each page's text
    # dictionary is extracted a single time instead of twice.
    leading_spans = []
    for page_index, page in enumerate(doc):  # walk every page
        for block in page.get_text("dict")["blocks"]:  # walk every block
            # type 0 marks a text block; other blocks carry no usable lines
            if block["type"] == 0 and len(block['lines']):
                if len(block["lines"][0]["spans"]):
                    leading_spans.append((page_index, block["lines"][0]["spans"][0]))

    # The seed 0 guarantees that index -2 exists whenever a span was found.
    max_font_sizes = sorted([0] + [span["size"] for _, span in leading_spans])

    title_parts = []
    for page_index, span in leading_spans:
        cur_string = span["text"]
        font_size = span["size"]
        near_largest = abs(font_size - max_font_sizes[-1]) < 0.3
        near_second = abs(font_size - max_font_sizes[-2]) < 0.3
        if not (near_largest or near_second):
            continue
        # Skip very short fragments and the arXiv identifier stamp.
        if len(cur_string) > 4 and "arXiv" not in cur_string:
            title_parts.append(cur_string)
            self.title_page = page_index
    return ' '.join(title_parts).replace('\n', ' ')
|
100 |
|
101 |
def extract_section_infomation(self):
|
|
|
123 |
heading_font = -1
|
124 |
# 遍历每一页并查找子标题
|
125 |
found_abstract = False
|
126 |
+
upper_heading = False
|
127 |
+
font_heading = False
|
128 |
for page in doc:
|
129 |
blocks = page.get_text("dict")["blocks"]
|
130 |
for block in blocks:
|
131 |
if not found_abstract:
|
132 |
+
try:
|
133 |
+
text = json.dumps(block)
|
134 |
+
except:
|
135 |
+
continue
|
136 |
if re.search(r"\bAbstract\b", text, re.IGNORECASE):
|
137 |
found_abstract = True
|
138 |
+
last_heading = "Abstract"
|
139 |
+
section_dict["Abstract"] = ""
|
140 |
if found_abstract:
|
141 |
if 'lines' not in block:
|
142 |
continue
|
143 |
lines = block["lines"]
|
144 |
for line in lines:
|
145 |
for span in line["spans"]:
|
146 |
+
# 如果当前文本是子标题
|
147 |
+
if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <='Z')) > 4: # 针对一些标题大小一样,但是全大写的论文
|
148 |
+
upper_heading = True
|
149 |
+
heading = span["text"].strip()
|
150 |
+
if "References" in heading: # reference 以后的内容不考虑
|
151 |
+
self.section_names = subheadings
|
152 |
+
self.section_texts = section_dict
|
153 |
+
return
|
154 |
+
subheadings.append(heading)
|
155 |
+
if last_heading is not None:
|
156 |
+
section_dict[last_heading] = section_dict[last_heading].strip()
|
157 |
+
section_dict[heading] = ""
|
158 |
+
last_heading = heading
|
159 |
+
if not upper_heading and span["size"] > threshold and re.match( # 正常情况下,通过字体大小判断
|
160 |
+
r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
|
161 |
+
span["text"].strip()):
|
162 |
+
font_heading = True
|
163 |
if heading_font == -1:
|
164 |
heading_font = span["size"]
|
165 |
elif heading_font != span["size"]:
|
|
|
180 |
self.section_names = subheadings
|
181 |
self.section_texts = section_dict
|
182 |
|
183 |
+
|
184 |
def main():
    """Smoke test: build a Paper from the demo PDF in the working directory."""
    path = r'demo.pdf'
    # Constructing Paper without a title already extracts the title and
    # parses every section, so an extra parse_pdf() call would only reopen
    # and re-read the same file.
    paper = Paper(path=path)
    # To inspect the result, iterate over paper.section_texts.items().


if __name__ == '__main__':
    main()
|