Spaces:
Paused
Paused
Upload 2 files
Browse files- chat_reviewer.py +183 -0
- get_paper_from_pdf.py +182 -0
chat_reviewer.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import datetime
|
| 5 |
+
import time
|
| 6 |
+
import openai, tenacity
|
| 7 |
+
import argparse
|
| 8 |
+
import configparser
|
| 9 |
+
import json
|
| 10 |
+
import tiktoken
|
| 11 |
+
from get_paper_from_pdf import Paper
|
| 12 |
+
|
| 13 |
+
# 定义Reviewer类
|
| 14 |
+
class Reviewer:
|
| 15 |
+
# 初始化方法,设置属性
|
| 16 |
+
def __init__(self, args=None):
    """Set up prompt language, OpenAI API keys and tokenizer state.

    Args:
        args: parsed argparse namespace providing ``language`` and
            ``file_format``.  May be None; defaults are then used
            (the original dereferenced ``args`` unconditionally and
            crashed on the declared ``args=None`` default).
    """
    # Map the CLI language code onto the full name used in prompts.
    # Original behaviour: 'en' -> English, anything else -> Chinese.
    if getattr(args, 'language', None) == 'en':
        self.language = 'English'
    else:
        self.language = 'Chinese'
    # Read the key list from apikey.ini.  The stored value looks like
    # "[key1, 'key2', ...]" so strip the brackets/quotes before splitting.
    self.config = configparser.ConfigParser()
    self.config.read('apikey.ini')
    raw_keys = self.config.get('OpenAI', 'OPENAI_API_KEYS')
    self.chat_api_list = raw_keys[1:-1].replace('\'', '').split(',')
    # Drop obviously invalid entries (empty strings, stray separators).
    self.chat_api_list = [api.strip() for api in self.chat_api_list if len(api) > 5]
    self.cur_api = 0  # index of the next key to use (round-robin)
    self.file_format = getattr(args, 'file_format', 'txt')
    self.max_token_num = 4096  # context budget assumed for gpt-3.5-turbo
    self.encoding = tiktoken.get_encoding("gpt2")
|
| 34 |
+
|
| 35 |
+
def validateTitle(self, title):
    """Sanitise *title* for use inside a file name.

    Each character that is illegal in Windows paths
    (/ \\ : * ? " < > |) is replaced with an underscore.
    """
    forbidden = r"[\/\\\:\*\?\"\<\>\|]"
    return re.sub(forbidden, "_", title)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def review_by_chatgpt(self, paper_list):
    """Review every paper in *paper_list* with ChatGPT and export each
    review to its own file under ./output_file.

    For each paper: stage_1 picks up to two extra sections, the essential
    text (title/abstract/introduction/conclusion + chosen sections) is
    assembled, chat_review produces the review, and the result is saved as
    <date>-<title>.<file_format>.
    """
    htmls = []
    for paper_index, paper in enumerate(paper_list):
        # Stage 1: ask the model which extra sections it wants to read.
        sections_of_interest = self.stage_1(paper)
        # Assemble the essential parts of the paper.
        text = ''
        text += 'Title:' + paper.title + '. '
        text += 'Abstract: ' + paper.section_texts.get('Abstract', '')
        # Match both "Introduction" and "1. Introduction"-style headings.
        intro_title = next((item for item in paper.section_names if 'ntroduction' in item), None)
        if intro_title is not None:
            text += 'Introduction: ' + paper.section_texts[intro_title]
        conclusion_title = next((item for item in paper.section_names if 'onclusion' in item), None)
        if conclusion_title is not None:
            text += 'Conclusion: ' + paper.section_texts[conclusion_title]
        for heading in sections_of_interest:
            # The model's comma-separated reply may carry stray spaces;
            # strip so the membership test below can succeed.
            heading = heading.strip()
            if heading in paper.section_names:
                text += heading + ': ' + paper.section_texts[heading]
        chat_review_text = self.chat_review(text=text)
        htmls.append('## Paper:' + str(paper_index + 1))
        htmls.append('\n\n\n')
        htmls.append(chat_review_text)

        # Persist the review for this paper.
        date_str = str(datetime.datetime.now())[:13].replace(' ', '-')
        export_path = os.path.join('./', 'output_file')
        # exist_ok replaces the original bare try/except around makedirs.
        os.makedirs(export_path, exist_ok=True)
        mode = 'w' if paper_index == 0 else 'a'
        file_name = os.path.join(
            export_path,
            date_str + '-' + self.validateTitle(paper.title) + "." + self.file_format)
        self.export_to_markdown("\n".join(htmls), file_name=file_name, mode=mode)
        htmls = []
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def stage_1(self, paper):
    """Ask ChatGPT which extra sections (at most two) it wants to read.

    Only the title, abstract and candidate section headings are sent; the
    model is instructed to reply "{chosen section 1}, {chosen section 2}".

    Returns:
        list[str]: the chosen headings, whitespace-stripped so callers can
        match them against paper.section_names.
    """
    text = ''
    text += 'Title: ' + paper.title + '. '
    text += 'Abstract: ' + paper.section_texts['Abstract']
    # Round-robin over ALL configured keys.  The original comparison
    # `>= len(...) - 1` reset one slot too early and never used the last key.
    openai.api_key = self.chat_api_list[self.cur_api]
    self.cur_api = (self.cur_api + 1) % len(self.chat_api_list)
    # NOTE(review): `args` here is the module-level namespace created under
    # __main__, not an attribute of self — confirm before refactoring.
    messages = [
        {"role": "system",
         "content": f"You are a professional reviewer in the field of {args.research_fields}. "
                    f"I will give you a paper. You need to review this paper and discuss the novelty and originality of ideas, correctness, clarity, the significance of results, potential impact and quality of the presentation. "
                    f"Due to the length limitations, I am only allowed to provide you the abstract, introduction, conclusion and at most two sections of this paper."
                    f"Now I will give you the title and abstract and the headings of potential sections. "
                    f"You need to reply at most two headings. Then I will further provide you the full information, includes aforementioned sections and at most two sections you called for.\n\n"
                    f"Title: {paper.title}\n\n"
                    f"Abstract: {paper.section_texts['Abstract']}\n\n"
                    f"Potential Sections: {paper.section_names[2:-1]}\n\n"
                    f"Follow the following format to output your choice of sections:"
                    f"{{chosen section 1}}, {{chosen section 2}}\n\n"},
        {"role": "user", "content": text},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    result = ''
    for choice in response.choices:
        result += choice.message.content
    print(result)
    # Strip spaces so "Sec A, Sec B" yields headings that match exactly.
    return [heading.strip() for heading in result.split(',')]
|
| 109 |
+
|
| 110 |
+
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                stop=tenacity.stop_after_attempt(5),
                reraise=True)
def chat_review(self, text):
    """Send the assembled paper text to ChatGPT and return the review.

    The input is truncated so prompt + review fit in the model context;
    the required review structure is read from ReviewFormat.txt.  Retries
    with exponential backoff (up to 5 attempts) on API errors.
    """
    # Round-robin over ALL configured keys (the original `>= len(...) - 1`
    # comparison skipped the last key in the list).
    openai.api_key = self.chat_api_list[self.cur_api]
    self.cur_api = (self.cur_api + 1) % len(self.chat_api_list)
    review_prompt_token = 1000  # tokens reserved for the generated review
    # max(..., 1) guards against ZeroDivisionError on empty input text.
    text_token = max(len(self.encoding.encode(text)), 1)
    # Character budget estimated proportionally from the token count.
    input_text_index = int(len(text) * (self.max_token_num - review_prompt_token) / text_token)
    input_text = "This is the paper for your review:" + text[:input_text_index]
    # Load the required review format (explicit encoding for portability).
    with open('ReviewFormat.txt', 'r', encoding='utf-8') as file:
        review_format = file.read()
    messages = [
        {"role": "system", "content": "You are a professional reviewer in the field of "+args.research_fields+". Now I will give you a paper. You need to give a complete review opinion according to the following requirements and format:"+ review_format +" Please answer in {}.".format(self.language)},
        {"role": "user", "content": input_text},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    result = ''
    for choice in response.choices:
        result += choice.message.content
    print("********"*10)
    print(result)
    print("********"*10)
    print("prompt_token_used:", response.usage.prompt_tokens)
    print("completion_token_used:", response.usage.completion_tokens)
    print("total_token_used:", response.usage.total_tokens)
    # NOTE(review): response_ms exists only on the legacy openai SDK's
    # response object — confirm against the installed version.
    print("response_time:", response.response_ms/1000.0, 's')
    return result
|
| 143 |
+
|
| 144 |
+
def export_to_markdown(self, text, file_name, mode='w'):
    """Write *text* to *file_name* as UTF-8.

    Despite the name, no markdown conversion is performed — the text is
    written verbatim.  *mode* is passed straight to open() ('w' or 'a').
    """
    with open(file_name, mode, encoding="utf-8") as handle:
        handle.write(text)
|
| 151 |
+
|
| 152 |
+
def main(args):
    """Collect PDF papers from args.paper_path (a single .pdf file or a
    directory tree) and run the ChatGPT reviewer over them."""
    reviewer1 = Reviewer(args=args)
    paper_list = []
    if args.paper_path.endswith(".pdf"):
        # A single PDF path is reviewed directly.
        paper_list.append(Paper(path=args.paper_path))
    else:
        # Otherwise walk the directory tree and pick up every PDF.
        for root, dirs, files in os.walk(args.paper_path):
            print("root:", root, "dirs:", dirs, 'files:', files) #当前目录路径
            for filename in files:
                if filename.endswith(".pdf"):
                    paper_list.append(Paper(path=os.path.join(root, filename)))
    print("------------------paper_num: {}------------------".format(len(paper_list)))
    # os.path.basename is portable; the original split('\\') only worked on
    # Windows paths.  A plain loop replaces the side-effect comprehension.
    for paper_index, paper_name in enumerate(paper_list):
        print(paper_index, os.path.basename(paper_name.path))
    reviewer1.review_by_chatgpt(paper_list=paper_list)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
if __name__ == '__main__':
    # CLI entry point: parse arguments, run the reviewer, report wall time.
    parser = argparse.ArgumentParser()
    parser.add_argument("--paper_path", type=str, default='', help="path of papers")
    parser.add_argument("--file_format", type=str, default='txt', help="output file format")
    parser.add_argument("--research_fields", type=str,
                        default='computer science, artificial intelligence and reinforcement learning',
                        help="the research fields of paper")
    # Fixed typo in the help text: "lauguage" -> "language".
    parser.add_argument("--language", type=str, default='en', help="output language, en or zh")

    args = parser.parse_args()
    start_time = time.time()
    main(args=args)
    print("review time:", time.time() - start_time)
|
| 183 |
+
|
get_paper_from_pdf.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz, io, os
|
| 2 |
+
from PIL import Image
|
| 3 |
+
from collections import Counter
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Paper:
|
| 10 |
+
def __init__(self, path, title='', url='', abs='', authors=[]):
|
| 11 |
+
# 初始化函数,根据pdf路径初始化Paper对象
|
| 12 |
+
self.url = url # 文章链接
|
| 13 |
+
self.path = path # pdf路径
|
| 14 |
+
self.section_names = [] # 段落标题
|
| 15 |
+
self.section_texts = {} # 段落内容
|
| 16 |
+
self.abs = abs
|
| 17 |
+
self.title_page = 0
|
| 18 |
+
if title == '':
|
| 19 |
+
self.pdf = fitz.open(self.path) # pdf文档
|
| 20 |
+
self.title = self.get_title()
|
| 21 |
+
self.parse_pdf()
|
| 22 |
+
else:
|
| 23 |
+
self.title = title
|
| 24 |
+
self.authors = authors
|
| 25 |
+
self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
|
| 26 |
+
self.digit_num = [str(d+1) for d in range(10)]
|
| 27 |
+
self.first_image = ''
|
| 28 |
+
|
| 29 |
+
def parse_pdf(self):
    """Open the PDF, extract per-page text and section structure, then
    record the title and first-page info into section_texts."""
    self.pdf = fitz.open(self.path)
    self.text_list = [page.get_text() for page in self.pdf]
    self.all_text = ' '.join(self.text_list)
    # Note: the sibling method's name carries the original spelling.
    self.extract_section_infomation()
    self.section_texts["title"] = self.title
    self.section_texts["paper_info"] = self.get_paper_info()
    self.pdf.close()
|
| 37 |
+
|
| 38 |
+
def get_paper_info(self):
|
| 39 |
+
first_page_text = self.pdf[self.title_page].get_text()
|
| 40 |
+
if "Abstract" in self.section_texts.keys():
|
| 41 |
+
abstract_text = self.section_texts['Abstract']
|
| 42 |
+
else:
|
| 43 |
+
abstract_text = self.abs
|
| 44 |
+
introduction_text = self.section_texts['Introduction']
|
| 45 |
+
first_page_text = first_page_text.replace(abstract_text, "").replace(introduction_text, "")
|
| 46 |
+
return first_page_text
|
| 47 |
+
|
| 48 |
+
# 定义一个函数,根据字体的大小,识别每个章节名称,并返回一个列表
|
| 49 |
+
def get_chapter_names(self):
    """Heuristically collect chapter-heading lines from the PDF text.

    A line counts as a heading when it is short (2-4 space-separated
    tokens, 2-4 dot-separated parts) and starts with a roman or arabic
    section number followed by a dot, e.g. "1. Introduction" or
    "IV. Experiments".
    """
    doc = fitz.open(self.path)
    all_text = ''.join(page.get_text() for page in doc)
    # Close the document — the original leaked the file handle.
    doc.close()
    chapter_names = []
    for line in all_text.split('\n'):
        if '.' not in line:
            continue
        point_split_list = line.split('.')
        space_split_list = line.split(' ')
        if 1 < len(space_split_list) < 5 and 1 < len(point_split_list) < 5 \
                and (point_split_list[0] in self.roman_num
                     or point_split_list[0] in self.digit_num):
            chapter_names.append(line)
    return chapter_names
|
| 69 |
+
|
| 70 |
+
def get_title(self):
    """Guess the paper title from font sizes (two passes over self.pdf).

    Pass 1 records the first-span font size of every text block and tracks
    the largest.  Pass 2 concatenates every block's first span whose size
    is within 0.3pt of either of the two largest sizes seen — skipping
    strings of length <= 4 and anything containing "arXiv" — and records
    the page it was found on in self.title_page.
    """
    doc = self.pdf  # the already-open PDF document
    max_font_size = 0  # largest font size seen so far
    max_string = ""  # text of the span carrying that largest size
    max_font_sizes = [0]
    for page_index, page in enumerate(doc):  # pass 1: collect font sizes
        text = page.get_text("dict")  # structured text info for the page
        blocks = text["blocks"]  # list of text blocks
        for block in blocks:  # inspect each block
            if block["type"] == 0 and len(block['lines']):  # text block with content
                if len(block["lines"][0]["spans"]):
                    font_size = block["lines"][0]["spans"][0]["size"]  # size of the first span of the first line
                    max_font_sizes.append(font_size)
                    if font_size > max_font_size:  # track the running maximum
                        max_font_size = font_size
                        max_string = block["lines"][0]["spans"][0]["text"]  # remember its text
    max_font_sizes.sort()
    # print("max_font_sizes", max_font_sizes[-10:])
    cur_title = ''
    for page_index, page in enumerate(doc):  # pass 2: assemble the title
        text = page.get_text("dict")
        blocks = text["blocks"]
        for block in blocks:
            if block["type"] == 0 and len(block['lines']):  # text block with content
                if len(block["lines"][0]["spans"]):
                    cur_string = block["lines"][0]["spans"][0]["text"]  # candidate title fragment
                    font_flags = block["lines"][0]["spans"][0]["flags"]  # font style flags (collected, unused below)
                    font_size = block["lines"][0]["spans"][0]["size"]  # first-span font size
                    # print(font_size)
                    # Accept sizes within 0.3pt of the two largest seen in pass 1.
                    if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
                        if len(cur_string) > 4 and "arXiv" not in cur_string:
                            if cur_title == '':
                                cur_title += cur_string
                            else:
                                cur_title += ' ' + cur_string
                            self.title_page = page_index  # page where title text was found
                        # break
    title = cur_title.replace('\n', ' ')
    return title
|
| 111 |
+
|
| 112 |
+
def extract_section_infomation(self):
    """Split the PDF into sections by detecting headings from font size.

    Headings are spans that (a) appear at or after the first block
    containing the word "Abstract", (b) are larger than the document's
    most common font size, (c) match a Title-Case regex, and (d) share the
    font size of the first heading found.  Everything between two headings
    is accumulated as that section's text.  Scanning stops at a heading
    containing "References".  Results go into self.section_names and
    self.section_texts.
    """
    doc = fitz.open(self.path)

    # Collect every span's font size across the whole document.
    font_sizes = []
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if 'lines' not in block:
                continue
            lines = block["lines"]
            for line in lines:
                for span in line["spans"]:
                    font_sizes.append(span["size"])
    most_common_size, _ = Counter(font_sizes).most_common(1)[0]

    # Headings must be strictly larger than the body-text (modal) size.
    threshold = most_common_size * 1

    section_dict = {}
    last_heading = None
    subheadings = []
    heading_font = -1  # font size of the first accepted heading; -1 = unset
    # Scan each page for subheadings, starting once "Abstract" is seen.
    found_abstract = False
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if not found_abstract:
                text = json.dumps(block)
                if re.search(r"\bAbstract\b", text, re.IGNORECASE):
                    found_abstract = True
            if found_abstract:
                if 'lines' not in block:
                    continue
                lines = block["lines"]
                for line in lines:
                    for span in line["spans"]:
                        if span["size"] > threshold and re.match(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
                                                                 span["text"].strip()):
                            if heading_font == -1:
                                heading_font = span["size"]
                            elif heading_font != span["size"]:
                                # Oversized but wrong font size: skip the span
                                # entirely (not appended to any section).
                                continue
                            heading = span["text"].strip()
                            if "References" in heading:  # ignore everything after References
                                self.section_names = subheadings
                                self.section_texts = section_dict
                                return
                            subheadings.append(heading)
                            if last_heading is not None:
                                section_dict[last_heading] = section_dict[last_heading].strip()
                            section_dict[heading] = ""
                            last_heading = heading
                        # Otherwise append the current text to the previous heading's section.
                        elif last_heading is not None:
                            section_dict[last_heading] += " " + span["text"].strip()
    self.section_names = subheadings
    self.section_texts = section_dict
|
| 171 |
+
|
| 172 |
+
def main():
    """Demo entry point: build a Paper from demo.pdf and parse it."""
    demo_path = r'demo.pdf'
    paper = Paper(path=demo_path)
    # __init__ already parsed once (no title was supplied); this explicit
    # re-parse mirrors the original script's behaviour.
    paper.parse_pdf()


if __name__ == '__main__':
    main()
|