Bostoncake committed on
Commit
f24bfa5
·
1 Parent(s): 6eef56f

Delete unused files.

Browse files
Files changed (2) hide show
  1. chat_assistant.py +0 -208
  2. get_paper_from_pdf.py +0 -194
chat_assistant.py DELETED
@@ -1,208 +0,0 @@
1
- import numpy as np
2
- import os
3
- import re
4
- import datetime
5
- import time
6
- import openai, tenacity
7
- import argparse
8
- import configparser
9
- import json
10
- import tiktoken
11
- from get_paper_from_pdf import Paper
12
-
13
- class Assistant:
14
- def __init__(self, args=None):
15
- if args.language == 'en':
16
- self.language = 'English'
17
- elif args.language == 'zh':
18
- self.language = 'Chinese'
19
- else:
20
- self.language = 'Chinese'
21
- self.config = configparser.ConfigParser()
22
- self.config.read('apikey.ini')
23
- self.chat_api_list = self.config.get('OpenAI', 'OPENAI_API_KEYS')[1:-1].replace('\'', '').split(',')
24
- self.chat_api_list = [api.strip() for api in self.chat_api_list if len(api) > 5]
25
- self.cur_api = 0
26
- self.file_format = args.file_format
27
- self.max_token_num = 4096
28
- self.encoding = tiktoken.get_encoding("gpt2")
29
- self.result_backup = ''
30
-
31
- def validateTitle(self, title):
32
- rstr = r"[\/\\\:\*\?\"\<\>\|]"
33
- new_title = re.sub(rstr, "_", title)
34
- return new_title
35
-
36
-
37
- def assist_reading_by_chatgpt(self, paper_list):
38
- htmls = []
39
- for paper_index, paper in enumerate(paper_list):
40
- sections_of_interest = self.extract_paper(paper)
41
- # extract the essential parts of the paper
42
- text = ''
43
- text += 'Title:' + paper.title + '. '
44
- text += 'Abstract: ' + paper.section_texts['Abstract']
45
- intro_title = next((item for item in paper.section_names if 'ntroduction' in item.lower()), None)
46
- if intro_title is not None:
47
- text += 'Introduction: ' + paper.section_texts[intro_title]
48
- # Similar for conclusion section
49
- conclusion_title = next((item for item in paper.section_names if 'onclusion' in item), None)
50
- if conclusion_title is not None:
51
- text += 'Conclusion: ' + paper.section_texts[conclusion_title]
52
- for heading in sections_of_interest:
53
- if heading in paper.section_names:
54
- text += heading + ': ' + paper.section_texts[heading]
55
- chat_review_text = self.chat_assist(text=text)
56
- htmls.append('## Paper:' + str(paper_index+1))
57
- htmls.append('\n\n\n')
58
- htmls.append(chat_review_text)
59
-
60
- # 将问题与回答保存起来
61
- date_str = str(datetime.datetime.now())[:19].replace(' ', '-').replace(':', '-')
62
- try:
63
- export_path = os.path.join('./', 'output_file')
64
- os.makedirs(export_path)
65
- except:
66
- pass
67
- mode = 'w' if paper_index == 0 else 'a'
68
- file_name = os.path.join(export_path, date_str+'-'+self.validateTitle(paper.title)+"."+self.file_format)
69
- self.export_to_markdown("\n".join(htmls), file_name=file_name, mode=mode)
70
- htmls = []
71
-
72
-
73
- def extract_paper(self, paper):
74
- htmls = []
75
- text = ''
76
- text += 'Title: ' + paper.title + '. '
77
- text += 'Abstract: ' + paper.section_texts['Abstract']
78
- text_token = len(self.encoding.encode(text))
79
- if text_token > self.max_token_num/2 - 800:
80
- input_text_index = int(len(text)*((self.max_token_num/2)-800)/text_token)
81
- text = text[:input_text_index]
82
- openai.api_key = self.chat_api_list[self.cur_api]
83
- self.cur_api += 1
84
- self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
85
- print("\n\n"+"********"*10)
86
- print("Extracting content from PDF.")
87
- print("********"*10)
88
- messages = [
89
- {"role": "system",
90
- "content": f"You are a professional researcher in the field of {args.research_fields}. You are the mentor of a student who is new to this field. "
91
- f"I will give you a paper. You need to help your student to read this paper by instructing him to read the important sections in this paper and answer his questions towards these sections."
92
- f"Due to the length limitations, I am only allowed to provide you the abstract, introduction, conclusion and at most two sections of this paper."
93
- f"Now I will give you the title and abstract and the headings of potential sections. "
94
- f"You need to reply at most two headings. Then I will further provide you the full information, includes aforementioned sections and at most two sections you called for.\n\n"
95
- f"Title: {paper.title}\n\n"
96
- f"Abstract: {paper.section_texts['Abstract']}\n\n"
97
- f"Potential Sections: {paper.section_names[2:-1]}\n\n"
98
- f"Follow the following format to output your choice of sections:"
99
- f"{{chosen section 1}}, {{chosen section 2}}\n\n"},
100
- {"role": "user", "content": text},
101
- ]
102
- response = openai.ChatCompletion.create(
103
- model="gpt-3.5-turbo",
104
- messages=messages,
105
- )
106
- result = ''
107
- for choice in response.choices:
108
- result += choice.message.content
109
- print("\n\n"+"********"*10)
110
- print("Important sections of this paper:")
111
- print(result)
112
- print("********"*10)
113
- print("prompt_token_used:", response.usage.prompt_tokens)
114
- print("completion_token_used:", response.usage.completion_tokens)
115
- print("total_token_used:", response.usage.total_tokens)
116
- print("response_time:", response.response_ms/1000.0, 's')
117
- return result.split(',')
118
-
119
- @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
120
- stop=tenacity.stop_after_attempt(5),
121
- reraise=True)
122
- def chat_assist(self, text):
123
- openai.api_key = self.chat_api_list[self.cur_api]
124
- self.cur_api += 1
125
- self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
126
- review_prompt_token = 1000
127
- text_token = len(self.encoding.encode(text))
128
- input_text_index = int(len(text)*(self.max_token_num-review_prompt_token)/text_token)
129
- input_text = "This is the paper for your review:" + text[:input_text_index] + "\n\n"
130
- input_text_backup = input_text
131
- while True:
132
- print("\n\n"+"********"*10)
133
- print("Ask ChatGPT questions of the important sections. Type \"quit\" to exit the program. To receive better responses, please describe why you ask the question.\nFor example, ask \"Why does the author use residual connections? I want to know how does the residual connections work in the model structure.\" instead of \"Why does the author use residual connections?\"")
134
- print("********"*10)
135
- student_question = input()
136
- if student_question == "quit":
137
- break
138
- input_text = input_text_backup
139
- input_text = input_text + "The question from your student is: " + student_question
140
- messages=[
141
- {"role": "system", "content": "You are a professional researcher in the field of "+args.research_fields+". You are the mentor of a student who is new to this field. Now I will give you a paper. You need to help your student to read this paper by instructing him to read the important sections in this paper and answer his questions towards these sections. Please answer in {}.".format(self.language)},
142
- {"role": "user", "content": input_text},
143
- ]
144
-
145
- response = openai.ChatCompletion.create(
146
- model="gpt-3.5-turbo",
147
- messages=messages,
148
- )
149
- result = ''
150
- for choice in response.choices:
151
- result += choice.message.content
152
- self.result_backup = self.result_backup + "\n\n" + student_question + "\n"
153
- self.result_backup += result
154
- print("\n\n"+"********"*10)
155
- print(result)
156
- print("********"*10)
157
- print("prompt_token_used:", response.usage.prompt_tokens)
158
- print("completion_token_used:", response.usage.completion_tokens)
159
- print("total_token_used:", response.usage.total_tokens)
160
- print("response_time:", response.response_ms/1000.0, 's')
161
- return self.result_backup
162
-
163
- def export_to_markdown(self, text, file_name, mode='w'):
164
- # 使用markdown模块的convert方法,将文本转换为html格式
165
- # html = markdown.markdown(text)
166
- # 打开一个文件,以写入模式
167
- with open(file_name, mode, encoding="utf-8") as f:
168
- # 将html格式的内容写入文件
169
- f.write(text)
170
-
171
- def main(args):
172
-
173
- # Paper reading assistant instructions
174
- print("********"*10)
175
- print("Extracting content from PDF.")
176
- print("********"*10)
177
-
178
-
179
- assistant1 = Assistant(args=args)
180
- # 开始判断是路径还是文件:
181
- paper_list = []
182
- if args.paper_path.endswith(".pdf"):
183
- paper_list.append(Paper(path=args.paper_path))
184
- else:
185
- for root, dirs, files in os.walk(args.paper_path):
186
- print("root:", root, "dirs:", dirs, 'files:', files) #当前目录路径
187
- for filename in files:
188
- # 如果找到PDF文件,则将其复制到目标文件夹中
189
- if filename.endswith(".pdf"):
190
- paper_list.append(Paper(path=os.path.join(root, filename)))
191
- print("------------------paper_num: {}------------------".format(len(paper_list)))
192
- [print(paper_index, paper_name.path.split('\\')[-1]) for paper_index, paper_name in enumerate(paper_list)]
193
- assistant1.assist_reading_by_chatgpt(paper_list=paper_list)
194
-
195
-
196
-
197
- if __name__ == '__main__':
198
- parser = argparse.ArgumentParser()
199
- parser.add_argument("--paper_path", type=str, default='', help="path of papers")
200
- parser.add_argument("--file_format", type=str, default='txt', help="output file format")
201
- parser.add_argument("--research_fields", type=str, default='computer science, artificial intelligence and transfer learning', help="the research fields of paper")
202
- parser.add_argument("--language", type=str, default='en', help="output lauguage, en or zh")
203
-
204
- args = parser.parse_args()
205
- start_time = time.time()
206
- main(args=args)
207
- print("total time:", time.time() - start_time)
208
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
get_paper_from_pdf.py DELETED
@@ -1,194 +0,0 @@
1
- import fitz, io, os
2
- from PIL import Image
3
- from collections import Counter
4
- import json
5
- import re
6
-
7
-
8
- class Paper:
9
- def __init__(self, path, title='', url='', abs='', authors=[]):
10
- # 初始化函数,根据pdf路径初始化Paper对象
11
- self.url = url # 文章链接
12
- self.path = path # pdf路径
13
- self.section_names = [] # 段落标题
14
- self.section_texts = {} # 段落内容
15
- self.abs = abs
16
- self.title_page = 0
17
- if title == '':
18
- self.pdf = fitz.open(self.path) # pdf文档
19
- self.title = self.get_title()
20
- self.parse_pdf()
21
- else:
22
- self.title = title
23
- self.authors = authors
24
- self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
25
- self.digit_num = [str(d + 1) for d in range(10)]
26
- self.first_image = ''
27
-
28
- def parse_pdf(self):
29
- self.pdf = fitz.open(self.path) # pdf文档
30
- self.text_list = [page.get_text() for page in self.pdf]
31
- self.all_text = ' '.join(self.text_list)
32
- self.extract_section_infomation()
33
- self.section_texts.update({"title": self.title})
34
- self.pdf.close()
35
-
36
- # 定义一个函数,根据字体的大小,识别每个章节名称,并返回一个列表
37
- def get_chapter_names(self, ):
38
- # # 打开一个pdf文件
39
- doc = fitz.open(self.path) # pdf文档
40
- text_list = [page.get_text() for page in doc]
41
- all_text = ''
42
- for text in text_list:
43
- all_text += text
44
- # # 创建一个空列表,用于存储章节名称
45
- chapter_names = []
46
- for line in all_text.split('\n'):
47
- line_list = line.split(' ')
48
- if '.' in line:
49
- point_split_list = line.split('.')
50
- space_split_list = line.split(' ')
51
- if 1 < len(space_split_list) < 5:
52
- if 1 < len(point_split_list) < 5 and (
53
- point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
54
- # print("line:", line)
55
- chapter_names.append(line)
56
-
57
- return chapter_names
58
-
59
- def get_title(self):
60
- doc = self.pdf # 打开pdf文件
61
- max_font_size = 0 # 初始化最大字体大小为0
62
- max_string = "" # 初始化最大字体大小对应的字符串为空
63
- max_font_sizes = [0]
64
- for page_index, page in enumerate(doc): # 遍历每一页
65
- text = page.get_text("dict") # 获取页面上的文本信息
66
- blocks = text["blocks"] # 获取文本块列表
67
- for block in blocks: # 遍历每个文本块
68
- if block["type"] == 0 and len(block['lines']): # 如果是文字类型
69
- if len(block["lines"][0]["spans"]):
70
- font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
71
- max_font_sizes.append(font_size)
72
- if font_size > max_font_size: # 如果字体大小大于当前最大值
73
- max_font_size = font_size # 更新最大值
74
- max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
75
- max_font_sizes.sort()
76
- # print("max_font_sizes", max_font_sizes[-10:])
77
- cur_title = ''
78
- for page_index, page in enumerate(doc): # 遍历每一页
79
- text = page.get_text("dict") # 获取页面上的文本信息
80
- blocks = text["blocks"] # 获取文本块列表
81
- for block in blocks: # 遍历每个文本块
82
- if block["type"] == 0 and len(block['lines']): # 如果是文字类型
83
- if len(block["lines"][0]["spans"]):
84
- cur_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
85
- font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征
86
- font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
87
- # print(font_size)
88
- if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
89
- # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
90
- if len(cur_string) > 4 and "arXiv" not in cur_string:
91
- # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
92
- if cur_title == '':
93
- cur_title += cur_string
94
- else:
95
- cur_title += ' ' + cur_string
96
- self.title_page = page_index
97
- # break
98
- title = cur_title.replace('\n', ' ')
99
- return title
100
-
101
- def extract_section_infomation(self):
102
- doc = fitz.open(self.path)
103
-
104
- # 获取文档中所有字体大小
105
- font_sizes = []
106
- for page in doc:
107
- blocks = page.get_text("dict")["blocks"]
108
- for block in blocks:
109
- if 'lines' not in block:
110
- continue
111
- lines = block["lines"]
112
- for line in lines:
113
- for span in line["spans"]:
114
- font_sizes.append(span["size"])
115
- most_common_size, _ = Counter(font_sizes).most_common(1)[0]
116
-
117
- # 按照最频繁的字体大小确定标题字体大小的阈值
118
- threshold = most_common_size * 1
119
-
120
- section_dict = {}
121
- section_dict["Abstract"] = ""
122
- last_heading = None
123
- subheadings = []
124
- heading_font = -1
125
- # 遍历每一页并查找子标题
126
- found_abstract = False
127
- upper_heading = False
128
- font_heading = False
129
- for page in doc:
130
- blocks = page.get_text("dict")["blocks"]
131
- for block in blocks:
132
- if not found_abstract:
133
- try:
134
- text = json.dumps(block)
135
- except:
136
- continue
137
- if re.search(r"\bAbstract\b", text, re.IGNORECASE):
138
- found_abstract = True
139
- last_heading = "Abstract"
140
- if found_abstract:
141
- if 'lines' not in block:
142
- continue
143
- lines = block["lines"]
144
- for line in lines:
145
- for span in line["spans"]:
146
- # 如果当前文本是子标题
147
- if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <='Z')) > 4: # 针对一些标题大小一样,但是全大写的论文
148
- upper_heading = True
149
- heading = span["text"].strip()
150
- if "References" in heading: # reference 以后的内容不考虑
151
- self.section_names = subheadings
152
- self.section_texts = section_dict
153
- return
154
- subheadings.append(heading)
155
- if last_heading is not None:
156
- section_dict[last_heading] = section_dict[last_heading].strip()
157
- section_dict[heading] = ""
158
- last_heading = heading
159
- if not upper_heading and span["size"] > threshold and re.match( # 正常情况下,通过字体大小判断
160
- r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
161
- span["text"].strip()):
162
- font_heading = True
163
- if heading_font == -1:
164
- heading_font = span["size"]
165
- elif heading_font != span["size"]:
166
- continue
167
- heading = span["text"].strip()
168
- if "References" in heading: # reference 以后的内容不考虑
169
- self.section_names = subheadings
170
- self.section_texts = section_dict
171
- return
172
- subheadings.append(heading)
173
- if last_heading is not None:
174
- section_dict[last_heading] = section_dict[last_heading].strip()
175
- section_dict[heading] = ""
176
- last_heading = heading
177
- # 否则将当前文本添加到上一个子标题的文本中
178
- elif last_heading is not None:
179
- section_dict[last_heading] += " " + span["text"].strip()
180
- self.section_names = subheadings
181
- self.section_texts = section_dict
182
-
183
-
184
- def main():
185
- path = r'demo.pdf'
186
- paper = Paper(path=path)
187
- paper.parse_pdf()
188
- # for key, value in paper.section_text_dict.items():
189
- # print(key, value)
190
- # print("*"*40)
191
-
192
-
193
- if __name__ == '__main__':
194
- main()