Commit ab599b4 (parent: 6dd265b): Update with magic-pdf API implementation

Changed files:
- README.md +41 -30
- api.py +186 -0
- convert_pdf.py +92 -0
- deploy_to_hf.sh +72 -0
- download_models_hf.py +74 -0
- magic-pdf.json +62 -0
- start.sh +12 -0
- test_api.py +75 -0
README.md
CHANGED
@@ -1,17 +1,24 @@
 ---
-title: MinerU PDF
+title: MinerU PDF Processor
 emoji: 📄
 colorFrom: blue
 colorTo: indigo
 sdk: docker
-sdk_version: "latest"
-app_file: app.py
 pinned: false
+license: apache-2.0
+app_port: 7860
 ---

-# MinerU PDF
+# MinerU PDF API

-
+A simple API for extracting text and tables from PDF documents using MinerU's magic-pdf library.
+
+## Features
+
+- Extract text from PDF documents
+- Identify and extract tables from PDFs
+- Works with both regular and scanned PDFs
+- Simple JSON response format

 ## API Endpoints

@@ -21,7 +28,7 @@ This Hugging Face Space provides a FastAPI-based service that uses `magic-pdf` t
 GET /health
 ```

-Returns the service status and timestamp.
+Returns the current status of the service.

 ### Extract PDF Content

@@ -29,47 +36,51 @@ Returns the service status and timestamp.
 POST /extract
 ```

-Upload a PDF file to extract its text
+Upload a PDF file to extract its text and tables.

 #### Request

-
-- Body: PDF file in the 'file' field
+- `file`: The PDF file to process (multipart/form-data)

 #### Response

 JSON object containing:
-
-
-- Tables in Markdown format
+- `filename`: Original filename
+- `pages`: Array of pages with text and tables

-##
+## Deployment

-
+This application is deployed as a Hugging Face Space using Docker.

-
-
-
-  -F "file=@your_document.pdf" \
-  --output result.json
-```
+## Local Development
+
+To run this application locally:

-
+1. Install the requirements:
+```
+pip install -r requirements.txt
+```

-
-
+2. Run the application:
+```
+python app.py
+```

-
-files = {"file": open("your_document.pdf", "rb")}
+3. Access the API at `http://localhost:7860`

-
-data = response.json()
+## Docker

-
-
-
+You can also build and run with Docker:
+
+```bash
+docker build -t mineru-pdf-api .
+docker run -p 7860:7860 mineru-pdf-api
 ```

+## About
+
+This API is built on top of MinerU and magic-pdf, a powerful PDF extraction tool.
+
 ## API Documentation

 Once deployed, you can access the auto-generated Swagger documentation at:
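For a quick end-to-end check of the service, the sketch below targets the routes that api.py (further down in this commit) actually defines: GET /health and POST /convert. Note that the README above still documents a POST /extract route and a `pages` field that the implementation does not return. The base URL and sample.pdf are placeholders:

```python
import requests

BASE_URL = "http://localhost:7860"  # placeholder: local run or the deployed Space URL

# Health check
print(requests.get(f"{BASE_URL}/health").json())

# Convert a PDF; the multipart field name must be "file"
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/convert",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
resp.raise_for_status()
result = resp.json()
print(result["status"], list(result["output_files"].keys()))
```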
api.py
ADDED
@@ -0,0 +1,186 @@
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import tempfile
import os
import json
import traceback
from datetime import datetime
from typing import Dict, List, Any, Optional
import shutil
from convert_pdf import convert_pdf

# Create output directory if it doesn't exist
os.makedirs("output", exist_ok=True)
os.makedirs("output/images", exist_ok=True)

# Application metadata
app_description = """
# MinerU PDF Processor API

This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content, tables, and generates markdown from PDF documents.

## Features:
- PDF text extraction
- Markdown conversion
- Layout analysis
"""

app = FastAPI(
    title="MinerU PDF API",
    description=app_description,
    version="1.0.0",
    contact={
        "name": "PDF Converter Service",
    },
)

# Add CORS middleware to allow cross-origin requests
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Mount the output directory as static files
app.mount("/output", StaticFiles(directory="output"), name="output")

# Health check endpoint
@app.get("/health", tags=["Health"])
async def health_check() -> Dict[str, Any]:
    """
    Health check endpoint to verify the service is running.
    Returns the service status and current time.
    """
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "service": "mineru-pdf-processor"
    }

@app.post("/convert", tags=["PDF Processing"])
async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
    """
    Convert a PDF file to markdown using the magic-pdf library.

    Parameters:
        file: The PDF file to process

    Returns:
        A JSON object containing the conversion result and links to output files
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    temp_pdf_path = None

    try:
        # Save the uploaded PDF to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf.write(content)
            temp_pdf_path = temp_pdf.name

        # Clear previous output files
        for item in os.listdir("output/images"):
            os.remove(os.path.join("output/images", item))
        for item in os.listdir("output"):
            if os.path.isfile(os.path.join("output", item)):
                os.remove(os.path.join("output", item))

        # Process the PDF using convert_pdf function
        md_content = convert_pdf(temp_pdf_path)

        # Get the base name of the processed file
        filename_without_ext = os.path.splitext(os.path.basename(temp_pdf_path))[0]

        # Gather the output files
        output_files = {}

        # Markdown file
        md_path = os.path.join("output", f"{filename_without_ext}.md")
        if os.path.exists(md_path):
            output_files["markdown"] = f"/output/{filename_without_ext}.md"

        # Layout PDF
        layout_path = os.path.join("output", f"{filename_without_ext}_layout.pdf")
        if os.path.exists(layout_path):
            output_files["layout"] = f"/output/{filename_without_ext}_layout.pdf"

        # Spans PDF
        spans_path = os.path.join("output", f"{filename_without_ext}_spans.pdf")
        if os.path.exists(spans_path):
            output_files["spans"] = f"/output/{filename_without_ext}_spans.pdf"

        # Model PDF
        model_path = os.path.join("output", f"{filename_without_ext}_model.pdf")
        if os.path.exists(model_path):
            output_files["model"] = f"/output/{filename_without_ext}_model.pdf"

        # Content list JSON
        content_list_path = os.path.join("output", f"{filename_without_ext}_content_list.json")
        if os.path.exists(content_list_path):
            output_files["content_list"] = f"/output/{filename_without_ext}_content_list.json"

        # Middle JSON
        middle_json_path = os.path.join("output", f"{filename_without_ext}_middle.json")
        if os.path.exists(middle_json_path):
            output_files["middle_json"] = f"/output/{filename_without_ext}_middle.json"

        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content,
            "output_files": output_files
        }

    except Exception as e:
        error_detail = str(e)
        error_trace = traceback.format_exc()

        # Log the error
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)

        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename if file and hasattr(file, 'filename') else None
            }
        )

    finally:
        # Clean up the temporary file
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            try:
                os.unlink(temp_pdf_path)
            except Exception:
                pass

@app.get("/files/{filename}", tags=["Files"])
async def get_file(filename: str):
    """
    Get a file from the output directory.

    Parameters:
        filename: The name of the file to retrieve

    Returns:
        The requested file
    """
    file_path = os.path.join("output", filename)

    if not os.path.exists(file_path):
        raise HTTPException(status_code=404, detail=f"File {filename} not found")

    return FileResponse(path=file_path)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("api:app", host="0.0.0.0", port=7860, reload=False)
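The output_files map returned by /convert contains paths under the static /output mount; the same artifacts can also be fetched by bare filename through GET /files/{filename}. A minimal sketch for saving the generated artifacts locally, assuming a server at the placeholder URL and a successful conversion:

```python
import requests

BASE_URL = "http://localhost:7860"  # placeholder

with open("sample.pdf", "rb") as f:
    result = requests.post(f"{BASE_URL}/convert", files={"file": f}).json()

# Each value is a path like /output/<name>.md served by the StaticFiles mount
for name, path in result.get("output_files", {}).items():
    artifact = requests.get(f"{BASE_URL}{path}")
    artifact.raise_for_status()
    local_name = path.rsplit("/", 1)[-1]
    with open(local_name, "wb") as out:
        out.write(artifact.content)
    print(f"{name}: saved {local_name}")
```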
convert_pdf.py
ADDED
@@ -0,0 +1,92 @@
import os
import sys
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

def convert_pdf(pdf_file_path):
    # Get filename and prepare output paths
    pdf_file_name = os.path.basename(pdf_file_path)
    name_without_suff = os.path.splitext(pdf_file_name)[0]

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)
    os.makedirs(local_md_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # read bytes
    reader1 = FileBasedDataReader(os.path.dirname(pdf_file_path))
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    print(f"Processing PDF: {pdf_file_path}")

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)

    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

    ### get model inference result
    model_inference_result = infer_result.get_infer_res()

    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

    ### get markdown content
    md_content = pipe_result.get_markdown(image_dir)

    ### dump markdown
    md_file_path = f"{name_without_suff}.md"
    pipe_result.dump_md(md_writer, md_file_path, image_dir)
    print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")

    ### get content list content
    content_list_content = pipe_result.get_content_list(image_dir)

    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

    ### get middle json
    middle_json_content = pipe_result.get_middle_json()

    ### dump middle json
    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')

    return md_content

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python convert_pdf.py <pdf_path>")
        sys.exit(1)

    pdf_path = sys.argv[1]

    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        sys.exit(1)

    convert_pdf(pdf_path)
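convert_pdf() can also be used directly from Python, which is exactly what the /convert endpoint does; a small sketch assuming a local sample.pdf:

```python
from convert_pdf import convert_pdf

# Writes <name>.md, <name>_layout.pdf, <name>_spans.pdf, <name>_model.pdf,
# <name>_content_list.json and <name>_middle.json into ./output (images go to
# ./output/images) and returns the markdown text.
markdown = convert_pdf("sample.pdf")
print(markdown[:500])
```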
deploy_to_hf.sh
ADDED
@@ -0,0 +1,72 @@
#!/bin/bash

# Script to deploy the PDF processor to Hugging Face Spaces

# Check if huggingface_hub is installed
pip show huggingface_hub > /dev/null 2>&1
if [ $? -ne 0 ]; then
    echo "Installing huggingface_hub..."
    pip install huggingface_hub
fi

# Set up variables
if [ -z "$1" ]; then
    read -p "Enter your Hugging Face username: " HF_USERNAME
else
    HF_USERNAME=$1
fi

if [ -z "$2" ]; then
    read -p "Enter the name for your Space: " SPACE_NAME
else
    SPACE_NAME=$2
fi

SPACE_REPO="$HF_USERNAME/$SPACE_NAME"
SPACE_URL="https://huggingface.co/spaces/$SPACE_REPO"

# Check if the repo exists
echo "Checking if the Space already exists..."
huggingface-cli repo info spaces/$SPACE_REPO > /dev/null 2>&1
if [ $? -eq 0 ]; then
    echo "Space $SPACE_REPO already exists."
    read -p "Do you want to continue and update it? (y/n): " CONTINUE
    if [ "$CONTINUE" != "y" ] && [ "$CONTINUE" != "Y" ]; then
        echo "Deployment cancelled."
        exit 1
    fi
else
    echo "Creating new Space: $SPACE_REPO"
    huggingface-cli repo create spaces/$SPACE_NAME --type space --organization $HF_USERNAME
fi

# Create a temporary directory
TEMP_DIR=$(mktemp -d)
echo "Created temporary directory: $TEMP_DIR"

# Clone the repository
echo "Cloning repository..."
git clone https://huggingface.co/spaces/$SPACE_REPO $TEMP_DIR

# Copy files to the repository
echo "Copying files to the repository..."
cp -r api.py app.py convert_pdf.py download_models_hf.py requirements.txt Dockerfile README.md .gitattributes $TEMP_DIR/
mkdir -p $TEMP_DIR/output/images

# If magic-pdf.json exists, copy it
if [ -f "magic-pdf.json" ]; then
    cp magic-pdf.json $TEMP_DIR/
fi

# Configure Git LFS
cd $TEMP_DIR
git lfs install

# Add, commit, and push changes
echo "Committing changes..."
git add .
git commit -m "Update PDF processor application"
git push

echo "Deployment completed successfully!"
echo "Your Space is available at: $SPACE_URL"
download_models_hf.py
ADDED
@@ -0,0 +1,74 @@
import json
import os
import shutil

import requests
from huggingface_hub import snapshot_download


def download_json(url):
    # Download the JSON file
    response = requests.get(url)
    response.raise_for_status()  # Check whether the request succeeded
    return response.json()


def download_and_modify_json(url, local_filename, modifications):
    if os.path.exists(local_filename):
        data = json.load(open(local_filename))
        config_version = data.get('config_version', '0.0.0')
        if config_version < '1.2.0':
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the modifications
    for key, value in modifications.items():
        data[key] = value

    # Save the modified content
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':

    mineru_patterns = [
        # "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
        # "models/TabRec/TableMaster/*",
        # "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)

    model_dir = model_dir + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    # paddleocr_model_dir = model_dir + '/OCR/paddleocr'
    # user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
    # if os.path.exists(user_paddleocr_dir):
    #     shutil.rmtree(user_paddleocr_dir)
    # shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)

    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
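After running `python download_models_hf.py`, the generated magic-pdf.json in the home directory should point at the downloaded snapshots. A quick sketch to confirm the recorded model paths exist on disk:

```python
import json
import os

config_path = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
with open(config_path) as f:
    cfg = json.load(f)

for key in ("models-dir", "layoutreader-model-dir"):
    path = cfg.get(key, "")
    print(key, path, "exists" if os.path.isdir(path) else "MISSING")
```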
magic-pdf.json
ADDED
@@ -0,0 +1,62 @@
{
    "bucket_info": {
        "bucket-name-1": [
            "ak",
            "sk",
            "endpoint"
        ],
        "bucket-name-2": [
            "ak",
            "sk",
            "endpoint"
        ]
    },
    "models-dir": "/Users/marcos/.cache/huggingface/hub/models--opendatalab--PDF-Extract-Kit-1.0/snapshots/14efd64068741c8e1d79d635dd236a80a9db66ba/models",
    "layoutreader-model-dir": "/Users/marcos/.cache/huggingface/hub/models--hantian--layoutreader/snapshots/641226775a0878b1014a96ad01b9642915136853",
    "device-mode": "cuda",
    "layout-config": {
        "model": "doclayout_yolo"
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfr_model": "unimernet_small",
        "enable": true
    },
    "table-config": {
        "model": "rapid_table",
        "sub_model": "slanet_plus",
        "enable": true,
        "max_time": 400
    },
    "latex-delimiter-config": {
        "display": {
            "left": "$$",
            "right": "$$"
        },
        "inline": {
            "left": "$",
            "right": "$"
        }
    },
    "llm-aided-config": {
        "formula_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "text_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "title_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-32b-instruct",
            "enable": false
        }
    },
    "config_version": "1.2.1"
}
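Note that this checked-in config carries absolute macOS paths (`/Users/marcos/...`) and `"device-mode": "cuda"`; inside the Docker Space the model paths are regenerated by download_models_hf.py, and on CPU-only hardware magic-pdf needs a CPU device mode instead. A hedged sketch for patching the device mode before the API starts (the config path and target value are assumptions about the deployment hardware):

```python
import json

CONFIG_PATH = "magic-pdf.json"  # assumption: the config actually read at runtime

with open(CONFIG_PATH) as f:
    cfg = json.load(f)

cfg["device-mode"] = "cpu"  # assumption: no CUDA device on the target Space

with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    json.dump(cfg, f, ensure_ascii=False, indent=4)
```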
start.sh
ADDED
@@ -0,0 +1,12 @@
#!/bin/bash

# Start script for Hugging Face Spaces deployment

# Activate the virtual environment
. /opt/mineru_venv/bin/activate

# Set environment variables
export HF_SPACE_ID="${SPACE_ID:-default}"

# Start the FastAPI server
python -m uvicorn api:app --host 0.0.0.0 --port 7860
test_api.py
ADDED
@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""
Test script for the PDF processor API
"""
import requests
import argparse
import os
import json
from pathlib import Path

def test_api(api_url, pdf_path):
    """
    Test the PDF processor API by sending a PDF file and checking the response
    """
    print(f"Testing API at {api_url} with PDF file: {pdf_path}")

    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        return

    # Send the PDF file to the API
    with open(pdf_path, 'rb') as pdf_file:
        files = {'file': (os.path.basename(pdf_path), pdf_file, 'application/pdf')}

        try:
            print("Sending request to API...")
            response = requests.post(f"{api_url}/convert", files=files)

            if response.status_code == 200:
                print("Request successful!")
                result = response.json()

                # Print response summary
                print("\nResponse summary:")
                print(f"Filename: {result.get('filename', 'N/A')}")
                print(f"Status: {result.get('status', 'N/A')}")

                # Check output files
                output_files = result.get('output_files', {})
                print("\nOutput files:")
                for file_type, file_path in output_files.items():
                    print(f"- {file_type}: {file_path}")

                # Save the markdown content to a file
                md_content = result.get('markdown_content', '')
                output_dir = Path('test_output')
                output_dir.mkdir(exist_ok=True)

                output_file = output_dir / f"{Path(pdf_path).stem}_output.md"
                with open(output_file, 'w') as f:
                    f.write(md_content)

                print(f"\nMarkdown content saved to: {output_file}")

                # Save the full response as JSON
                response_file = output_dir / f"{Path(pdf_path).stem}_response.json"
                with open(response_file, 'w') as f:
                    json.dump(result, f, indent=2)

                print(f"Full response saved to: {response_file}")

            else:
                print(f"Request failed with status code: {response.status_code}")
                print(f"Response content: {response.text}")

        except Exception as e:
            print(f"Error during API test: {str(e)}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Test the PDF processor API")
    parser.add_argument("--api", default="http://localhost:7860", help="API URL (default: http://localhost:7860)")
    parser.add_argument("--pdf", required=True, help="Path to the PDF file to test")

    args = parser.parse_args()
    test_api(args.api, args.pdf)
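To exercise a running instance: `python test_api.py --pdf sample.pdf --api http://localhost:7860`. The extracted markdown and the raw JSON response are written under `test_output/`.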