Update app.py
Browse files
app.py
CHANGED
|
@@ -41,13 +41,16 @@ def read_fn(path):
|
|
| 41 |
|
| 42 |
|
| 43 |
# @spaces.GPU
|
| 44 |
-
def parse_pdf(doc_path, output_dir, end_page_id):
|
| 45 |
os.makedirs(output_dir, exist_ok=True)
|
| 46 |
|
| 47 |
try:
|
| 48 |
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
|
| 49 |
pdf_data = read_fn(doc_path)
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
| 51 |
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
|
| 52 |
do_parse(
|
| 53 |
output_dir,
|
|
@@ -108,9 +111,9 @@ def replace_image_with_base64(markdown_text, image_dir_path):
|
|
| 108 |
return re.sub(pattern, replace, markdown_text)
|
| 109 |
|
| 110 |
|
| 111 |
-
def to_markdown(file_path, end_pages):
|
| 112 |
# Get the recognized Markdown output and the path of the zip archive
|
| 113 |
-
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1)
|
| 114 |
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
|
| 115 |
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
|
| 116 |
if zip_archive_success == 0:
|
|
@@ -177,6 +180,7 @@ if __name__ == "__main__":
|
|
| 177 |
pdf_show = gr.Markdown()
|
| 178 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
| 179 |
with gr.Row() as bu_flow:
|
|
|
|
| 180 |
change_bu = gr.Button("Convert")
|
| 181 |
clear_bu = gr.ClearButton([pdf_show], value="Clear")
|
| 182 |
# pdf_show = gr.HTML(label="PDF preview")
|
|
@@ -191,7 +195,7 @@ if __name__ == "__main__":
|
|
| 191 |
with gr.Tab("Markdown text"):
|
| 192 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
| 193 |
# file.upload(fn=show_pdf, inputs=file, outputs=pdf_show)
|
| 194 |
-
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
|
| 195 |
-
clear_bu.add([md, pdf_show, md_text, output_file])
|
| 196 |
|
| 197 |
demo.launch()
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
# @spaces.GPU
|
| 44 |
+
def parse_pdf(doc_path, output_dir, end_page_id, ocr):
|
| 45 |
os.makedirs(output_dir, exist_ok=True)
|
| 46 |
|
| 47 |
try:
|
| 48 |
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
|
| 49 |
pdf_data = read_fn(doc_path)
|
| 50 |
+
if ocr:
|
| 51 |
+
parse_method = "ocr"
|
| 52 |
+
else:
|
| 53 |
+
parse_method = "auto"
|
| 54 |
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
|
| 55 |
do_parse(
|
| 56 |
output_dir,
|
|
|
|
| 111 |
return re.sub(pattern, replace, markdown_text)
|
| 112 |
|
| 113 |
|
| 114 |
+
def to_markdown(file_path, end_pages, ocr):
|
| 115 |
# Get the recognized Markdown output and the path of the zip archive
|
| 116 |
+
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, ocr)
|
| 117 |
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
|
| 118 |
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
|
| 119 |
if zip_archive_success == 0:
|
|
|
|
| 180 |
pdf_show = gr.Markdown()
|
| 181 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
| 182 |
with gr.Row() as bu_flow:
|
| 183 |
+
is_ocr = gr.Checkbox(label="Force enable OCR")
|
| 184 |
change_bu = gr.Button("Convert")
|
| 185 |
clear_bu = gr.ClearButton([pdf_show], value="Clear")
|
| 186 |
# pdf_show = gr.HTML(label="PDF preview")
|
|
|
|
| 195 |
with gr.Tab("Markdown text"):
|
| 196 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
| 197 |
# file.upload(fn=show_pdf, inputs=file, outputs=pdf_show)
|
| 198 |
+
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
|
| 199 |
+
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
| 200 |
|
| 201 |
demo.launch()
|