Spaces:
Paused
Paused
Upload 2 files
Browse files- chat_reviewer.py +183 -0
- get_paper_from_pdf.py +182 -0
chat_reviewer.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import datetime
|
| 5 |
+
import time
|
| 6 |
+
import openai, tenacity
|
| 7 |
+
import argparse
|
| 8 |
+
import configparser
|
| 9 |
+
import json
|
| 10 |
+
import tiktoken
|
| 11 |
+
from get_paper_from_pdf import Paper
|
| 12 |
+
|
| 13 |
+
# 定义Reviewer类
|
| 14 |
+
class Reviewer:
|
| 15 |
+
# 初始化方法,设置属性
|
| 16 |
+
def __init__(self, args=None):
    """Set up prompt language, OpenAI API keys and tokenizer state.

    Args:
        args: parsed argparse namespace providing ``language`` and
            ``file_format``.  May be None; defaults are then used
            (the original dereferenced ``args`` unconditionally and
            crashed on the declared ``args=None`` default).
    """
    # Map the CLI language code onto the full name used in prompts.
    # Original behaviour: 'en' -> English, anything else -> Chinese.
    if getattr(args, 'language', None) == 'en':
        self.language = 'English'
    else:
        self.language = 'Chinese'
    # Read the key list from apikey.ini.  The stored value looks like
    # "[key1, 'key2', ...]" so strip the brackets/quotes before splitting.
    self.config = configparser.ConfigParser()
    self.config.read('apikey.ini')
    raw_keys = self.config.get('OpenAI', 'OPENAI_API_KEYS')
    self.chat_api_list = raw_keys[1:-1].replace('\'', '').split(',')
    # Drop obviously invalid entries (empty strings, stray separators).
    self.chat_api_list = [api.strip() for api in self.chat_api_list if len(api) > 5]
    self.cur_api = 0  # index of the next key to use (round-robin)
    self.file_format = getattr(args, 'file_format', 'txt')
    self.max_token_num = 4096  # context budget assumed for gpt-3.5-turbo
    self.encoding = tiktoken.get_encoding("gpt2")
|
| 34 |
+
|
| 35 |
+
def validateTitle(self, title):
    """Sanitise *title* for use inside a file name.

    Each character that is illegal in Windows paths
    (/ \\ : * ? " < > |) is replaced with an underscore.
    """
    forbidden = r"[\/\\\:\*\?\"\<\>\|]"
    return re.sub(forbidden, "_", title)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def review_by_chatgpt(self, paper_list):
    """Review every paper in *paper_list* with ChatGPT and export each
    review to its own file under ./output_file.

    For each paper: stage_1 picks up to two extra sections, the essential
    text (title/abstract/introduction/conclusion + chosen sections) is
    assembled, chat_review produces the review, and the result is saved as
    <date>-<title>.<file_format>.
    """
    htmls = []
    for paper_index, paper in enumerate(paper_list):
        # Stage 1: ask the model which extra sections it wants to read.
        sections_of_interest = self.stage_1(paper)
        # Assemble the essential parts of the paper.
        text = ''
        text += 'Title:' + paper.title + '. '
        text += 'Abstract: ' + paper.section_texts.get('Abstract', '')
        # Match both "Introduction" and "1. Introduction"-style headings.
        intro_title = next((item for item in paper.section_names if 'ntroduction' in item), None)
        if intro_title is not None:
            text += 'Introduction: ' + paper.section_texts[intro_title]
        conclusion_title = next((item for item in paper.section_names if 'onclusion' in item), None)
        if conclusion_title is not None:
            text += 'Conclusion: ' + paper.section_texts[conclusion_title]
        for heading in sections_of_interest:
            # The model's comma-separated reply may carry stray spaces;
            # strip so the membership test below can succeed.
            heading = heading.strip()
            if heading in paper.section_names:
                text += heading + ': ' + paper.section_texts[heading]
        chat_review_text = self.chat_review(text=text)
        htmls.append('## Paper:' + str(paper_index + 1))
        htmls.append('\n\n\n')
        htmls.append(chat_review_text)

        # Persist the review for this paper.
        date_str = str(datetime.datetime.now())[:13].replace(' ', '-')
        export_path = os.path.join('./', 'output_file')
        # exist_ok replaces the original bare try/except around makedirs.
        os.makedirs(export_path, exist_ok=True)
        mode = 'w' if paper_index == 0 else 'a'
        file_name = os.path.join(
            export_path,
            date_str + '-' + self.validateTitle(paper.title) + "." + self.file_format)
        self.export_to_markdown("\n".join(htmls), file_name=file_name, mode=mode)
        htmls = []
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def stage_1(self, paper):
    """Ask ChatGPT which extra sections (at most two) it wants to read.

    Only the title, abstract and candidate section headings are sent; the
    model is instructed to reply "{chosen section 1}, {chosen section 2}".

    Returns:
        list[str]: the chosen headings, whitespace-stripped so callers can
        match them against paper.section_names.
    """
    text = ''
    text += 'Title: ' + paper.title + '. '
    text += 'Abstract: ' + paper.section_texts['Abstract']
    # Round-robin over ALL configured keys.  The original comparison
    # `>= len(...) - 1` reset one slot too early and never used the last key.
    openai.api_key = self.chat_api_list[self.cur_api]
    self.cur_api = (self.cur_api + 1) % len(self.chat_api_list)
    # NOTE(review): `args` here is the module-level namespace created under
    # __main__, not an attribute of self — confirm before refactoring.
    messages = [
        {"role": "system",
         "content": f"You are a professional reviewer in the field of {args.research_fields}. "
                    f"I will give you a paper. You need to review this paper and discuss the novelty and originality of ideas, correctness, clarity, the significance of results, potential impact and quality of the presentation. "
                    f"Due to the length limitations, I am only allowed to provide you the abstract, introduction, conclusion and at most two sections of this paper."
                    f"Now I will give you the title and abstract and the headings of potential sections. "
                    f"You need to reply at most two headings. Then I will further provide you the full information, includes aforementioned sections and at most two sections you called for.\n\n"
                    f"Title: {paper.title}\n\n"
                    f"Abstract: {paper.section_texts['Abstract']}\n\n"
                    f"Potential Sections: {paper.section_names[2:-1]}\n\n"
                    f"Follow the following format to output your choice of sections:"
                    f"{{chosen section 1}}, {{chosen section 2}}\n\n"},
        {"role": "user", "content": text},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    result = ''
    for choice in response.choices:
        result += choice.message.content
    print(result)
    # Strip spaces so "Sec A, Sec B" yields headings that match exactly.
    return [heading.strip() for heading in result.split(',')]
|
| 109 |
+
|
| 110 |
+
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                stop=tenacity.stop_after_attempt(5),
                reraise=True)
def chat_review(self, text):
    """Send the assembled paper text to ChatGPT and return the review.

    The input is truncated so prompt + review fit in the model context;
    the required review structure is read from ReviewFormat.txt.  Retries
    with exponential backoff (up to 5 attempts) on API errors.
    """
    # Round-robin over ALL configured keys (the original `>= len(...) - 1`
    # comparison skipped the last key in the list).
    openai.api_key = self.chat_api_list[self.cur_api]
    self.cur_api = (self.cur_api + 1) % len(self.chat_api_list)
    review_prompt_token = 1000  # tokens reserved for the generated review
    # max(..., 1) guards against ZeroDivisionError on empty input text.
    text_token = max(len(self.encoding.encode(text)), 1)
    # Character budget estimated proportionally from the token count.
    input_text_index = int(len(text) * (self.max_token_num - review_prompt_token) / text_token)
    input_text = "This is the paper for your review:" + text[:input_text_index]
    # Load the required review format (explicit encoding for portability).
    with open('ReviewFormat.txt', 'r', encoding='utf-8') as file:
        review_format = file.read()
    messages = [
        {"role": "system", "content": "You are a professional reviewer in the field of "+args.research_fields+". Now I will give you a paper. You need to give a complete review opinion according to the following requirements and format:"+ review_format +" Please answer in {}.".format(self.language)},
        {"role": "user", "content": input_text},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    result = ''
    for choice in response.choices:
        result += choice.message.content
    print("********"*10)
    print(result)
    print("********"*10)
    print("prompt_token_used:", response.usage.prompt_tokens)
    print("completion_token_used:", response.usage.completion_tokens)
    print("total_token_used:", response.usage.total_tokens)
    # NOTE(review): response_ms exists only on the legacy openai SDK's
    # response object — confirm against the installed version.
    print("response_time:", response.response_ms/1000.0, 's')
    return result
|
| 143 |
+
|
| 144 |
+
def export_to_markdown(self, text, file_name, mode='w'):
    """Write *text* to *file_name* as UTF-8.

    Despite the name, no markdown conversion is performed — the text is
    written verbatim.  *mode* is passed straight to open() ('w' or 'a').
    """
    with open(file_name, mode, encoding="utf-8") as handle:
        handle.write(text)
|
| 151 |
+
|
| 152 |
+
def main(args):
    """Collect PDF papers from args.paper_path (a single .pdf file or a
    directory tree) and run the ChatGPT reviewer over them."""
    reviewer1 = Reviewer(args=args)
    paper_list = []
    if args.paper_path.endswith(".pdf"):
        # A single PDF path is reviewed directly.
        paper_list.append(Paper(path=args.paper_path))
    else:
        # Otherwise walk the directory tree and pick up every PDF.
        for root, dirs, files in os.walk(args.paper_path):
            print("root:", root, "dirs:", dirs, 'files:', files) #当前目录路径
            for filename in files:
                if filename.endswith(".pdf"):
                    paper_list.append(Paper(path=os.path.join(root, filename)))
    print("------------------paper_num: {}------------------".format(len(paper_list)))
    # os.path.basename is portable; the original split('\\') only worked on
    # Windows paths.  A plain loop replaces the side-effect comprehension.
    for paper_index, paper_name in enumerate(paper_list):
        print(paper_index, os.path.basename(paper_name.path))
    reviewer1.review_by_chatgpt(paper_list=paper_list)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
if __name__ == '__main__':
    # CLI entry point: parse arguments, run the reviewer, report wall time.
    parser = argparse.ArgumentParser()
    parser.add_argument("--paper_path", type=str, default='', help="path of papers")
    parser.add_argument("--file_format", type=str, default='txt', help="output file format")
    parser.add_argument("--research_fields", type=str,
                        default='computer science, artificial intelligence and reinforcement learning',
                        help="the research fields of paper")
    # Fixed typo in the help text: "lauguage" -> "language".
    parser.add_argument("--language", type=str, default='en', help="output language, en or zh")

    args = parser.parse_args()
    start_time = time.time()
    main(args=args)
    print("review time:", time.time() - start_time)
|
| 183 |
+
|
get_paper_from_pdf.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz, io, os
|
| 2 |
+
from PIL import Image
|
| 3 |
+
from collections import Counter
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Paper:
|
| 10 |
+
def __init__(self, path, title='', url='', abs='', authors=[]):
|
| 11 |
+
# 初始化函数,根据pdf路径初始化Paper对象
|
| 12 |
+
self.url = url # 文章链接
|
| 13 |
+
self.path = path # pdf路径
|
| 14 |
+
self.section_names = [] # 段落标题
|
| 15 |
+
self.section_texts = {} # 段落内容
|
| 16 |
+
self.abs = abs
|
| 17 |
+
self.title_page = 0
|
| 18 |
+
if title == '':
|
| 19 |
+
self.pdf = fitz.open(self.path) # pdf文档
|
| 20 |
+
self.title = self.get_title()
|
| 21 |
+
self.parse_pdf()
|
| 22 |
+
else:
|
| 23 |
+
self.title = title
|
| 24 |
+
self.authors = authors
|
| 25 |
+
self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
|
| 26 |
+
self.digit_num = [str(d+1) for d in range(10)]
|
| 27 |
+
self.first_image = ''
|
| 28 |
+
|
| 29 |
+
def parse_pdf(self):
    """Open the PDF, extract per-page text and section structure, then
    record the title and first-page info into section_texts."""
    self.pdf = fitz.open(self.path)
    self.text_list = [page.get_text() for page in self.pdf]
    self.all_text = ' '.join(self.text_list)
    # Note: the sibling method's name carries the original spelling.
    self.extract_section_infomation()
    self.section_texts["title"] = self.title
    self.section_texts["paper_info"] = self.get_paper_info()
    self.pdf.close()
|
| 37 |
+
|
| 38 |
+
def get_paper_info(self):
|
| 39 |
+
first_page_text = self.pdf[self.title_page].get_text()
|
| 40 |
+
if "Abstract" in self.section_texts.keys():
|
| 41 |
+
abstract_text = self.section_texts['Abstract']
|
| 42 |
+
else:
|
| 43 |
+
abstract_text = self.abs
|
| 44 |
+
introduction_text = self.section_texts['Introduction']
|
| 45 |
+
first_page_text = first_page_text.replace(abstract_text, "").replace(introduction_text, "")
|
| 46 |
+
return first_page_text
|
| 47 |
+
|
| 48 |
+
# 定义一个函数,根据字体的大小,识别每个章节名称,并返回一个列表
|
| 49 |
+
def get_chapter_names(self):
    """Heuristically collect chapter-heading lines from the PDF text.

    A line counts as a heading when it is short (2-4 space-separated
    tokens, 2-4 dot-separated parts) and starts with a roman or arabic
    section number followed by a dot, e.g. "1. Introduction" or
    "IV. Experiments".
    """
    doc = fitz.open(self.path)
    all_text = ''.join(page.get_text() for page in doc)
    # Close the document — the original leaked the file handle.
    doc.close()
    chapter_names = []
    for line in all_text.split('\n'):
        if '.' not in line:
            continue
        point_split_list = line.split('.')
        space_split_list = line.split(' ')
        if 1 < len(space_split_list) < 5 and 1 < len(point_split_list) < 5 \
                and (point_split_list[0] in self.roman_num
                     or point_split_list[0] in self.digit_num):
            chapter_names.append(line)
    return chapter_names
|
| 69 |
+
|
| 70 |
+
def get_title(self):
    """Guess the paper title from font sizes (two passes over self.pdf).

    Pass 1 records the first-span font size of every text block and tracks
    the largest.  Pass 2 concatenates every block's first span whose size
    is within 0.3pt of either of the two largest sizes seen — skipping
    strings of length <= 4 and anything containing "arXiv" — and records
    the page it was found on in self.title_page.
    """
    doc = self.pdf  # the already-open PDF document
    max_font_size = 0  # largest font size seen so far
    max_string = ""  # text of the span carrying that largest size
    max_font_sizes = [0]
    for page_index, page in enumerate(doc):  # pass 1: collect font sizes
        text = page.get_text("dict")  # structured text info for the page
        blocks = text["blocks"]  # list of text blocks
        for block in blocks:  # inspect each block
            if block["type"] == 0 and len(block['lines']):  # text block with content
                if len(block["lines"][0]["spans"]):
                    font_size = block["lines"][0]["spans"][0]["size"]  # size of the first span of the first line
                    max_font_sizes.append(font_size)
                    if font_size > max_font_size:  # track the running maximum
                        max_font_size = font_size
                        max_string = block["lines"][0]["spans"][0]["text"]  # remember its text
    max_font_sizes.sort()
    # print("max_font_sizes", max_font_sizes[-10:])
    cur_title = ''
    for page_index, page in enumerate(doc):  # pass 2: assemble the title
        text = page.get_text("dict")
        blocks = text["blocks"]
        for block in blocks:
            if block["type"] == 0 and len(block['lines']):  # text block with content
                if len(block["lines"][0]["spans"]):
                    cur_string = block["lines"][0]["spans"][0]["text"]  # candidate title fragment
                    font_flags = block["lines"][0]["spans"][0]["flags"]  # font style flags (collected, unused below)
                    font_size = block["lines"][0]["spans"][0]["size"]  # first-span font size
                    # print(font_size)
                    # Accept sizes within 0.3pt of the two largest seen in pass 1.
                    if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
                        if len(cur_string) > 4 and "arXiv" not in cur_string:
                            if cur_title == '':
                                cur_title += cur_string
                            else:
                                cur_title += ' ' + cur_string
                            self.title_page = page_index  # page where title text was found
                        # break
    title = cur_title.replace('\n', ' ')
    return title
|
| 111 |
+
|
| 112 |
+
def extract_section_infomation(self):
    """Split the PDF into sections by detecting headings from font size.

    Headings are spans that (a) appear at or after the first block
    containing the word "Abstract", (b) are larger than the document's
    most common font size, (c) match a Title-Case regex, and (d) share the
    font size of the first heading found.  Everything between two headings
    is accumulated as that section's text.  Scanning stops at a heading
    containing "References".  Results go into self.section_names and
    self.section_texts.
    """
    doc = fitz.open(self.path)

    # Collect every span's font size across the whole document.
    font_sizes = []
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if 'lines' not in block:
                continue
            lines = block["lines"]
            for line in lines:
                for span in line["spans"]:
                    font_sizes.append(span["size"])
    most_common_size, _ = Counter(font_sizes).most_common(1)[0]

    # Headings must be strictly larger than the body-text (modal) size.
    threshold = most_common_size * 1

    section_dict = {}
    last_heading = None
    subheadings = []
    heading_font = -1  # font size of the first accepted heading; -1 = unset
    # Scan each page for subheadings, starting once "Abstract" is seen.
    found_abstract = False
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if not found_abstract:
                text = json.dumps(block)
                if re.search(r"\bAbstract\b", text, re.IGNORECASE):
                    found_abstract = True
            if found_abstract:
                if 'lines' not in block:
                    continue
                lines = block["lines"]
                for line in lines:
                    for span in line["spans"]:
                        if span["size"] > threshold and re.match(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
                                                                 span["text"].strip()):
                            if heading_font == -1:
                                heading_font = span["size"]
                            elif heading_font != span["size"]:
                                # Oversized but wrong font size: skip the span
                                # entirely (not appended to any section).
                                continue
                            heading = span["text"].strip()
                            if "References" in heading:  # ignore everything after References
                                self.section_names = subheadings
                                self.section_texts = section_dict
                                return
                            subheadings.append(heading)
                            if last_heading is not None:
                                section_dict[last_heading] = section_dict[last_heading].strip()
                            section_dict[heading] = ""
                            last_heading = heading
                        # Otherwise append the current text to the previous heading's section.
                        elif last_heading is not None:
                            section_dict[last_heading] += " " + span["text"].strip()
    self.section_names = subheadings
    self.section_texts = section_dict
|
| 171 |
+
|
| 172 |
+
def main():
    """Demo entry point: build a Paper from demo.pdf and parse it."""
    demo_path = r'demo.pdf'
    paper = Paper(path=demo_path)
    # __init__ already parsed once (no title was supplied); this explicit
    # re-parse mirrors the original script's behaviour.
    paper.parse_pdf()


if __name__ == '__main__':
    main()
|