Davidsamuel101 committed on
Commit
9f2dd14
·
1 Parent(s): 0752d04

Tidy Up Code

Browse files
.gitignore CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ src/test.py
2
+ test
3
+ test.py
4
+ test.sh
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (142 Bytes). View file
 
src/__pycache__/app.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/app.cpython-38.pyc and b/src/__pycache__/app.cpython-38.pyc differ
 
src/__pycache__/summarizer.cpython-311.pyc ADDED
Binary file (5.34 kB). View file
 
src/__pycache__/summarizer.cpython-38.pyc ADDED
Binary file (2.8 kB). View file
 
src/__pycache__/summarizer.cpython-39.pyc ADDED
Binary file (2.81 kB). View file
 
src/__pycache__/test.cpython-38.pyc ADDED
Binary file (1.31 kB). View file
 
src/__pycache__/text_extractor.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/text_extractor.cpython-38.pyc and b/src/__pycache__/text_extractor.cpython-38.pyc differ
 
src/app.py CHANGED
@@ -1,72 +1,17 @@
1
- from src.text_extractor import TextExtractor
2
- from tqdm import tqdm
3
- from transformers import PegasusForConditionalGeneration, PegasusTokenizer
4
- from transformers import pipeline
5
- from mdutils.mdutils import MdUtils
6
- from pathlib import Path
7
-
8
  import gradio as gr
9
- import fitz
10
- import torch
11
- import copy
12
- import os
13
-
14
- FILENAME = ""
15
-
16
- preprocess = TextExtractor()
17
- model_name = "sshleifer/distill-pegasus-cnn-16-4"
18
- device = "cuda" if torch.cuda.is_available() else "cpu"
19
- tokenizer = PegasusTokenizer.from_pretrained(model_name)
20
- model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
21
-
22
- def summarize(slides):
23
- generated_slides = copy.deepcopy(slides)
24
- for page, contents in tqdm(generated_slides.items()):
25
- for idx, (tag, content) in enumerate(contents):
26
- if tag.startswith('p'):
27
- try:
28
- input = tokenizer(content, truncation=True, padding="longest", return_tensors="pt").to(device)
29
- tensor = model.generate(**input)
30
- summary = tokenizer.batch_decode(tensor, skip_special_tokens=True)[0]
31
- contents[idx] = (tag, summary)
32
- except Exception as e:
33
- print(f"Summarization Fails, Error: {e}")
34
- return generated_slides
35
-
36
- def convert2markdown(generated_slides):
37
- mdFile = MdUtils(file_name=FILENAME, title=f'{FILENAME} Presentation')
38
- for k, v in generated_slides.items():
39
- mdFile.new_line('---\n')
40
- for section in v:
41
- tag = section[0]
42
- content = section[1]
43
- if tag.startswith('h'):
44
- mdFile.new_header(level=int(tag[1]), title=content)
45
- if tag == 'p':
46
- contents = content.split('<n>')
47
- for content in contents:
48
- mdFile.new_line(f"{content}\n")
49
- mdFile.create_md_file()
50
- return f"{FILENAME}.md"
51
 
52
  def inference(document):
53
- global FILENAME
54
- doc = fitz.open(document)
55
- FILENAME = document.name.split('/')[-1].split('.')[0]
56
- font_counts, styles = preprocess.get_font_info(doc, granularity=False)
57
- size_tag = preprocess.get_font_tags(font_counts, styles)
58
- texts = preprocess.assign_tags(doc, size_tag)
59
- slides = preprocess.get_slides(texts)
60
- generated_slides = summarize(slides)
61
- markdown_name = convert2markdown(generated_slides)
62
- print(f"Markdown File Name: {markdown_name}")
63
- return markdown_name
64
-
65
 
66
  with gr.Blocks() as demo:
67
  inp = gr.File(file_types=['pdf'])
68
  out = gr.File(label="Markdown File")
69
- # out = gr.Textbox(label="Markdown Content")
70
  inference_btn = gr.Button("Summarized PDF")
71
  inference_btn.click(fn=inference, inputs=inp, outputs=out, show_progress=True, api_name="summarize")
72
 
 
1
+ from src.summarizer import Summarizer
 
 
 
 
 
 
2
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
def inference(document):
    """Summarize one uploaded document and return the generated Markdown file name.

    Builds a ``Summarizer`` for the ``sshleifer/distill-pegasus-cnn-16-4``
    checkpoint, extracts the slide text from ``document``, summarizes it,
    writes the Markdown file, then prints and returns the ``file_name``
    attribute of the object produced by ``convert2markdown``.
    """
    pipeline = Summarizer("sshleifer/distill-pegasus-cnn-16-4")
    condensed = pipeline(pipeline.extract_text(document))
    md_file = pipeline.convert2markdown(condensed)
    output_name = md_file.file_name
    print(f"Markdown File Name: {output_name}")
    return output_name
 
 
 
 
 
 
11
 
12
# Gradio UI: one file input, one file output, and a button that runs
# `inference` on the uploaded document.
with gr.Blocks() as demo:
    # Extensions must carry a leading dot to be treated as a file-extension
    # filter; a bare 'pdf' is neither an extension nor a known category.
    inp = gr.File(file_types=['.pdf'])
    out = gr.File(label="Markdown File")
    inference_btn = gr.Button("Summarized PDF")
    # api_name exposes the same handler under the "summarize" API route.
    inference_btn.click(fn=inference, inputs=inp, outputs=out, show_progress=True, api_name="summarize")
17
 
src/summarizer.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Tuple, Optional
2
+ from tqdm import tqdm
3
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
4
+ from src.text_extractor import TextExtractor
5
+ from mdutils.mdutils import MdUtils
6
+
7
+ import torch
8
+ import fitz
9
+ import copy
10
+
11
class Summarizer:
    """Summarize the paragraph text of a PDF and export the result as Markdown.

    Bundles a Pegasus tokenizer/model pair with the project's
    ``TextExtractor``. The intended call order is ``extract_text`` ->
    ``__call__`` -> ``convert2markdown``; ``extract_text`` records the
    document's base name, which ``convert2markdown`` needs for the title.
    """

    def __init__(self, model_name: str):
        """Load the tokenizer and model identified by ``model_name``.

        The model is moved to CUDA when ``torch.cuda.is_available()`` is
        true, otherwise to CPU.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.preprocess = TextExtractor()
        # Filled in by extract_text(); kept as None until then so that
        # convert2markdown() can report a clear error instead of an
        # AttributeError.
        self.filename: Optional[str] = None

    def extract_text(self, document: object) -> Dict[str, List[Tuple[str, str]]]:
        """Open ``document`` with PyMuPDF and return its tagged slide content.

        Side effect: stores the document's base name (text before the first
        ``.`` of the last ``/``-separated path component) in
        ``self.filename``.

        Returns:
            Whatever ``TextExtractor.get_slides`` produces; per the
            annotations, a mapping to lists of ``(tag, content)`` pairs.
        """
        # The context manager closes the PDF handle once extraction is done,
        # including when one of the extractor calls raises.
        with fitz.open(document) as doc:
            self.filename = doc.name.split('/')[-1].split('.')[0]
            font_counts, styles = self.preprocess.get_font_info(doc, granularity=False)
            size_tag = self.preprocess.get_font_tags(font_counts, styles)
            texts = self.preprocess.assign_tags(doc, size_tag)
            slide_content = self.preprocess.get_slides(texts)
        return slide_content

    def __call__(self, slides: Dict[str, List[Tuple[str, str]]]) -> Dict[str, List[Tuple[str, str]]]:
        """Return a deep copy of ``slides`` with paragraph entries summarized.

        Every ``(tag, content)`` pair whose tag starts with ``'p'`` has its
        content replaced by the model's summary. Other entries are left
        untouched, and ``slides`` itself is never modified.
        """
        summarized_slides = copy.deepcopy(slides)
        for contents in tqdm(summarized_slides.values()):
            for idx, (tag, content) in enumerate(contents):
                if not tag.startswith('p'):
                    continue
                try:
                    encoded = self.tokenizer(content, truncation=True, padding="longest", return_tensors="pt").to(self.device)
                    generated = self.model.generate(**encoded)
                    summary = self.tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
                except Exception as e:
                    # Best effort: a failed paragraph keeps its original text.
                    print(f"Summarization Fails, Error: {e}")
                else:
                    contents[idx] = (tag, summary)

        return summarized_slides

    def convert2markdown(self, summarized_slides: Dict[str, List[Tuple[str, str]]], target_path: Optional[str] = None):
        """Write ``summarized_slides`` to a Markdown file via ``MdUtils``.

        Args:
            summarized_slides: Mapping whose values are lists of
                ``(tag, content)`` pairs, as returned by ``__call__``.
            target_path: Output file name to use instead of the name
                recorded by ``extract_text``. The title always uses the
                recorded name.

        Returns:
            The object returned by ``MdUtils.create_md_file()``; the caller
            in ``src/app.py`` reads its ``file_name`` attribute.

        Raises:
            RuntimeError: If ``extract_text`` has not been called yet, so no
                document name is available.
        """
        if self.filename is None:
            raise RuntimeError("extract_text() must be called before convert2markdown()")

        filename = target_path if target_path else self.filename
        mdFile = MdUtils(file_name=filename, title=f'{self.filename} Presentation')
        for sections in summarized_slides.values():
            # Each slide starts with a horizontal-rule separator.
            mdFile.new_line('---\n')
            for tag, content in sections:
                if tag.startswith('h'):
                    # The character after 'h' is the header level, e.g. 'h2' -> 2.
                    mdFile.new_header(level=int(tag[1]), title=content)
                elif tag == 'p':
                    # NOTE(review): __call__ summarizes every tag starting
                    # with 'p', but only the exact tag 'p' is written here.
                    # Confirm whether other 'p*' tags should be emitted too.
                    # '<n>' is presumably the model's newline marker.
                    for paragraph in content.split('<n>'):
                        mdFile.new_line(f"{paragraph}\n")
        return mdFile.create_md_file()
60
+
61
+
62
+
src/text_extractor.py CHANGED
@@ -2,7 +2,6 @@ from operator import itemgetter
2
  from collections import OrderedDict
3
  from typing import Dict, List, Iterator, Union, Tuple
4
 
5
-
6
  import re
7
 
8
  class TextExtractor:
 
2
  from collections import OrderedDict
3
  from typing import Dict, List, Iterator, Union, Tuple
4
 
 
5
  import re
6
 
7
  class TextExtractor: