Spaces:

ShiwenNi
/

ChatReviewer

Running

App Files Files Community

ShiwenNi committed on Mar 21, 2023

Commit

78b3d2e

1 Parent(s): 8ddfa3b

Update get_paper_from_pdf.py

Browse files

Files changed (1) hide show

get_paper_from_pdf.py +73 -61

get_paper_from_pdf.py CHANGED Viewed

@@ -5,50 +5,38 @@ import json
 import re
 class Paper:
     def __init__(self, path, title='', url='', abs='', authors=[]):  # NOTE(review): authors=[] is a mutable default shared between calls
         # Initializer: builds a Paper object from the PDF path
-        self.url =  url           # article link
-        self.path = path          # PDF path
-        self.section_names = []   # section headings
-        self.section_texts = {}   # section contents
         self.abs = abs
         self.title_page = 0
         if title == '':
-            self.pdf = fitz.open(self.path) # PDF document
             self.title = self.get_title()
-            self.parse_pdf()
         else:
             self.title = title
         self.authors = authors
         self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]  # NOTE(review): "IIX" is not a standard Roman numeral - confirm intent
-        self.digit_num = [str(d+1) for d in range(10)]
         self.first_image = ''
     def parse_pdf(self):  # reopen the PDF, collect page text, extract sections, record title and paper info, then close the PDF
-        self.pdf = fitz.open(self.path) # PDF document
         self.text_list = [page.get_text() for page in self.pdf]
         self.all_text = ' '.join(self.text_list)
         self.extract_section_infomation()
         self.section_texts.update({"title": self.title})
-        self.section_texts.update({"paper_info": self.get_paper_info()})
-        self.pdf.close()
-    def get_paper_info(self):  # return the title page's text with the abstract and introduction text removed
-        first_page_text = self.pdf[self.title_page].get_text()
-        if "Abstract" in self.section_texts.keys():
-            abstract_text = self.section_texts['Abstract']
-        else:
-            abstract_text = self.abs
-        introduction_text = self.section_texts['Introduction']  # NOTE(review): raises KeyError when no 'Introduction' section was recorded
-        first_page_text = first_page_text.replace(abstract_text, "").replace(introduction_text, "")
-        return first_page_text
     # Define a function that identifies each chapter name based on font size and returns a list
-    def get_chapter_names(self,):
         # # 打开一个pdf文件
-        doc = fitz.open(self.path) # pdf文档
         text_list = [page.get_text() for page in doc]
         all_text = ''
         for text in text_list:
@@ -61,52 +49,53 @@ class Paper:
                 point_split_list = line.split('.')
                 space_split_list = line.split(' ')
                 if 1 < len(space_split_list) < 5:
-                    if 1 < len(point_split_list) < 5 and (point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
                         # print("line:", line)
-                        chapter_names.append(line)
         return chapter_names
     def get_title(self):
-        doc = self.pdf # the PDF document opened by the caller
-        max_font_size = 0 # largest font size seen so far, starts at 0
-        max_string = "" # text belonging to the largest font size, starts empty
         max_font_sizes = [0]
-        for page_index, page in enumerate(doc): # iterate over every page
-            text = page.get_text("dict") # get the text information of the page
-            blocks = text["blocks"] # get the list of text blocks
-            for block in blocks: # iterate over every text block
-                if block["type"] == 0 and len(block['lines']): # if it is a text-type block
                     if len(block["lines"][0]["spans"]):
-                        font_size = block["lines"][0]["spans"][0]["size"] # font size of the first span on the first line
                         max_font_sizes.append(font_size)
-                        if font_size > max_font_size: # if the font size exceeds the current maximum
-                            max_font_size = font_size # update the maximum
-                            max_string = block["lines"][0]["spans"][0]["text"] # update the string belonging to the maximum
-        max_font_sizes.sort()
         # print("max_font_sizes", max_font_sizes[-10:])
         cur_title = ''
-        for page_index, page in enumerate(doc): # iterate over every page
-            text = page.get_text("dict") # get the text information of the page
-            blocks = text["blocks"] # get the list of text blocks
-            for block in blocks: # iterate over every text block
-                if block["type"] == 0 and len(block['lines']): # if it is a text-type block
                     if len(block["lines"][0]["spans"]):
-                        cur_string = block["lines"][0]["spans"][0]["text"] # text of the first span on the first line
-                        font_flags = block["lines"][0]["spans"][0]["flags"] # font flags of the first span on the first line
-                        font_size = block["lines"][0]["spans"][0]["size"] # font size of the first span on the first line
                         # print(font_size)
-                        if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
                             # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
-                            if len(cur_string) > 4 and "arXiv" not in cur_string:
                                 # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
                                 if cur_title == '':
-                                    cur_title += cur_string
                                 else:
-                                    cur_title += ' ' + cur_string
                                 self.title_page = page_index
                                 # break
-        title = cur_title.replace('\n', ' ')
         return title
     def extract_section_infomation(self):
@@ -134,21 +123,43 @@ class Paper:
         heading_font = -1
         # 遍历每一页并查找子标题
         found_abstract = False
         for page in doc:
             blocks = page.get_text("dict")["blocks"]
             for block in blocks:
                 if not found_abstract:
-                    text = json.dumps(block)
                     if re.search(r"\bAbstract\b", text, re.IGNORECASE):
                         found_abstract = True
                 if found_abstract:
                     if 'lines' not in block:
                         continue
                     lines = block["lines"]
                     for line in lines:
                         for span in line["spans"]:
-                            if span["size"] > threshold and re.match(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
-                                                                     span["text"].strip()):
                                 if heading_font == -1:
                                     heading_font = span["size"]
                                 elif heading_font != span["size"]:
@@ -169,14 +180,15 @@ class Paper:
         self.section_names = subheadings
         self.section_texts = section_dict
 def main():
     path = r'demo.pdf'
     paper = Paper(path=path)
     paper.parse_pdf()  # NOTE(review): Paper.__init__ already calls parse_pdf() when title is '' - this parses a second time
     # for key, value in paper.section_text_dict.items():
-        # print(key, value)
-        # print("*"*40)
 if __name__ == '__main__':
     main()

 import re
 class Paper:
     def __init__(self, path, title='', url='', abs='', authors=[]):  # NOTE(review): authors=[] is a mutable default shared between calls
         # Initializer: builds a Paper object from the PDF path
+        self.url = url  # article link
+        self.path = path  # PDF path
+        self.section_names = []  # section headings
+        self.section_texts = {}  # section contents
         self.abs = abs
         self.title_page = 0
         if title == '':
+            self.pdf = fitz.open(self.path)  # PDF document
             self.title = self.get_title()
+            self.parse_pdf()
         else:
             self.title = title
         self.authors = authors
         self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]  # NOTE(review): "IIX" is not a standard Roman numeral - confirm intent
+        self.digit_num = [str(d + 1) for d in range(10)]
         self.first_image = ''
     def parse_pdf(self):  # reopen the PDF, collect page text, extract sections, record the title, then close the PDF
+        self.pdf = fitz.open(self.path)  # PDF document
         self.text_list = [page.get_text() for page in self.pdf]
         self.all_text = ' '.join(self.text_list)
         self.extract_section_infomation()
         self.section_texts.update({"title": self.title})
+        self.pdf.close()
     # Define a function that identifies each chapter name based on font size and returns a list
+    def get_chapter_names(self, ):
         # # 打开一个pdf文件
+        doc = fitz.open(self.path)  # pdf文档
         text_list = [page.get_text() for page in doc]
         all_text = ''
         for text in text_list:
                 point_split_list = line.split('.')
                 space_split_list = line.split(' ')
                 if 1 < len(space_split_list) < 5:
+                    if 1 < len(point_split_list) < 5 and (
+                            point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
                         # print("line:", line)
+                        chapter_names.append(line)
         return chapter_names
     def get_title(self):
+        doc = self.pdf  # the PDF document opened by the caller
+        max_font_size = 0  # largest font size seen so far, starts at 0
+        max_string = ""  # text belonging to the largest font size, starts empty
         max_font_sizes = [0]
+        for page_index, page in enumerate(doc):  # iterate over every page
+            text = page.get_text("dict")  # get the text information of the page
+            blocks = text["blocks"]  # get the list of text blocks
+            for block in blocks:  # iterate over every text block
+                if block["type"] == 0 and len(block['lines']):  # if it is a text-type block
                     if len(block["lines"][0]["spans"]):
+                        font_size = block["lines"][0]["spans"][0]["size"]  # font size of the first span on the first line
                         max_font_sizes.append(font_size)
+                        if font_size > max_font_size:  # if the font size exceeds the current maximum
+                            max_font_size = font_size  # update the maximum
+                            max_string = block["lines"][0]["spans"][0]["text"]  # update the string belonging to the maximum
+        max_font_sizes.sort()
         # print("max_font_sizes", max_font_sizes[-10:])
         cur_title = ''
+        for page_index, page in enumerate(doc):  # iterate over every page
+            text = page.get_text("dict")  # get the text information of the page
+            blocks = text["blocks"]  # get the list of text blocks
+            for block in blocks:  # iterate over every text block
+                if block["type"] == 0 and len(block['lines']):  # if it is a text-type block
                     if len(block["lines"][0]["spans"]):
+                        cur_string = block["lines"][0]["spans"][0]["text"]  # text of the first span on the first line
+                        font_flags = block["lines"][0]["spans"][0]["flags"]  # font flags of the first span on the first line
+                        font_size = block["lines"][0]["spans"][0]["size"]  # font size of the first span on the first line
                         # print(font_size)
+                        if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
                             # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
+                            if len(cur_string) > 4 and "arXiv" not in cur_string:
                                 # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
                                 if cur_title == '':
+                                    cur_title += cur_string
                                 else:
+                                    cur_title += ' ' + cur_string
                                 self.title_page = page_index
                                 # break
+        title = cur_title.replace('\n', ' ')
         return title
     def extract_section_infomation(self):
         heading_font = -1
         # 遍历每一页并查找子标题
         found_abstract = False
+        upper_heading = False
+        font_heading = False
         for page in doc:
             blocks = page.get_text("dict")["blocks"]
             for block in blocks:
                 if not found_abstract:
+                    try:
+                        text = json.dumps(block)
+                    except:
+                        continue
                     if re.search(r"\bAbstract\b", text, re.IGNORECASE):
                         found_abstract = True
+                        last_heading = "Abstract"
+                        section_dict["Abstract"] = ""
                 if found_abstract:
                     if 'lines' not in block:
                         continue
                     lines = block["lines"]
                     for line in lines:
                         for span in line["spans"]:
+                            # 如果当前文本是子标题
+                            if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <='Z')) > 4:  # 针对一些标题大小一样,但是全大写的论文
+                                upper_heading = True
+                                heading = span["text"].strip()
+                                if "References" in heading:  # reference 以后的内容不考虑
+                                    self.section_names = subheadings
+                                    self.section_texts = section_dict
+                                    return
+                                subheadings.append(heading)
+                                if last_heading is not None:
+                                    section_dict[last_heading] = section_dict[last_heading].strip()
+                                section_dict[heading] = ""
+                                last_heading = heading
+                            if not upper_heading and span["size"] > threshold and re.match(  # 正常情况下,通过字体大小判断
+                                    r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
+                                    span["text"].strip()):
+                                font_heading = True
                                 if heading_font == -1:
                                     heading_font = span["size"]
                                 elif heading_font != span["size"]:
         self.section_names = subheadings
         self.section_texts = section_dict
 def main():
     path = r'demo.pdf'
     paper = Paper(path=path)
     paper.parse_pdf()  # NOTE(review): Paper.__init__ already calls parse_pdf() when title is '' - this parses a second time
     # for key, value in paper.section_text_dict.items():
+    # print(key, value)
+    # print("*"*40)
 if __name__ == '__main__':
     main()