Upload folder using huggingface_hub
Browse files- .gitattributes +41 -9
- LICENSE +27 -0
- README.md +201 -3
- README_CN.md +215 -0
- configuration.json +1 -0
- onnx/PH-OCRv1/rec/ch_rec_decoder_v1.onnx +3 -0
- onnx/PH-OCRv1/rec/ch_rec_encoder_v1.onnx +3 -0
- onnx/PH-OCRv1/rec/jp_rec_decoder_v1.onnx +3 -0
- onnx/PH-OCRv1/rec/jp_rec_encoder_v1.onnx +3 -0
- onnx/PH-OCRv1/rec/ko_rec_decoder_v1.onnx +3 -0
- onnx/PH-OCRv1/rec/ko_rec_encoder_v1.onnx +3 -0
- onnx/PH-OCRv1/rec/ru_rec_decoder_v1.onnx +3 -0
- onnx/PH-OCRv1/rec/ru_rec_encoder_v1.onnx +3 -0
- onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx +3 -0
- onnx/PP-OCRv4/det/Multilingual_PP-OCRv3_det_infer.onnx +3 -0
- onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx +3 -0
- onnx/PP-OCRv4/det/ch_PP-OCRv4_det_server_infer.onnx +3 -0
- onnx/PP-OCRv4/det/en_PP-OCRv3_det_infer.onnx +3 -0
- onnx/PP-OCRv5/det/ch_PP-OCRv5_mobile_det.onnx +3 -0
- onnx/PP-OCRv5/det/ch_PP-OCRv5_server_det.onnx +3 -0
- resources/fonts/FZYTK.TTF +3 -0
- resources/fonts/cyrillic.ttf +0 -0
- resources/fonts/japan.ttc +3 -0
- resources/fonts/korean.ttf +3 -0
- resources/fonts/方正宋黑.TTF +3 -0
.gitattributes
CHANGED
@@ -1,35 +1,67 @@
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
4 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
5 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
|
|
6 |
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
|
|
11 |
*.model filter=lfs diff=lfs merge=lfs -text
|
12 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
13 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
14 |
*.ot filter=lfs diff=lfs merge=lfs -text
|
15 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
16 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
17 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
18 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
19 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
|
|
20 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
21 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
22 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
23 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
|
|
24 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.db* filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.ark* filter=lfs diff=lfs merge=lfs -text
|
30 |
+
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
31 |
+
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
32 |
+
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.gguf* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.ggml filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.llamafile* filter=lfs diff=lfs merge=lfs -text
|
38 |
+
*.pt2 filter=lfs diff=lfs merge=lfs -text
|
39 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
40 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
41 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
42 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
43 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
44 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
45 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
46 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
47 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
48 |
+
|
49 |
+
resources/fonts/FZYTK.TTF filter=lfs diff=lfs merge=lfs -text
|
50 |
+
resources/fonts/japan.ttc filter=lfs diff=lfs merge=lfs -text
|
51 |
+
resources/fonts/korean.ttf filter=lfs diff=lfs merge=lfs -text
|
52 |
+
resources/fonts/方正宋黑.TTF filter=lfs diff=lfs merge=lfs -text
|
53 |
+
|
54 |
+
resources/fonts/FZYTK.TTF filter=lfs diff=lfs merge=lfs -text
|
55 |
+
resources/fonts/japan.ttc filter=lfs diff=lfs merge=lfs -text
|
56 |
+
resources/fonts/korean.ttf filter=lfs diff=lfs merge=lfs -text
|
57 |
+
resources/fonts/方正宋黑.TTF filter=lfs diff=lfs merge=lfs -text
|
58 |
+
|
59 |
+
resources/fonts/FZYTK.TTF filter=lfs diff=lfs merge=lfs -text
|
60 |
+
resources/fonts/japan.ttc filter=lfs diff=lfs merge=lfs -text
|
61 |
+
resources/fonts/korean.ttf filter=lfs diff=lfs merge=lfs -text
|
62 |
+
resources/fonts/方正宋黑.TTF filter=lfs diff=lfs merge=lfs -text
|
63 |
+
|
64 |
+
resources/fonts/japan.ttc filter=lfs diff=lfs merge=lfs -text
|
65 |
+
resources/fonts/FZYTK.TTF filter=lfs diff=lfs merge=lfs -text
|
66 |
+
resources/fonts/korean.ttf filter=lfs diff=lfs merge=lfs -text
|
67 |
+
resources/fonts/方正宋黑.TTF filter=lfs diff=lfs merge=lfs -text
|
LICENSE
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 PuHui Lab
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
22 |
+
|
23 |
+
Additional Terms
|
24 |
+
|
25 |
+
If you use the PHOCR models in commercial products or services,
|
26 |
+
you **must include clear attribution** to PHOCR in your product documentation,
|
27 |
+
user interface, or other appropriate locations, e.g. Powered By PHOCR from PuHui Lab.
|
README.md
CHANGED
@@ -1,3 +1,201 @@
|
|
1 |
-
---
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- ocr
|
4 |
+
- image-to-text
|
5 |
+
license: mit
|
6 |
+
library_name: transformers
|
7 |
+
---
|
8 |
+
|
9 |
+
# Model Card: PHOCR
|
10 |
+
|
11 |
+
An open high-performance Optical Character Recognition (OCR) toolkit: [PHOCR](https://github.com/puhuilab/phocr).
|
12 |
+
|
13 |
+
# PHOCR: High-Performance OCR Toolkit
|
14 |
+
|
15 |
+
[English](README.md) | [简体中文](README_CN.md)
|
16 |
+
|
17 |
+
PHOCR is an open high-performance Optical Character Recognition (OCR) toolkit designed for efficient text recognition across multiple languages including Chinese, Japanese, Korean, Russian, Vietnamese, and Thai. **PHOCR features a completely custom-developed recognition model (PH-OCRv1) that significantly outperforms existing solutions.**
|
18 |
+
|
19 |
+
## Motivation
|
20 |
+
|
21 |
+
Current token-prediction-based model architectures are highly sensitive to the accuracy of contextual tokens. Repetitive patterns, even as few as a thousand instances, can lead to persistent memorization by the model. While most open-source text recognition models currently achieve character error rates (CER) in the percent range, our goal is to push this further into the per-mille range. At that level, for a system processing 100 million characters, the total number of recognition errors would be reduced to under 1 million — an order of magnitude improvement.
|
22 |
+
|
23 |
+
## Features
|
24 |
+
|
25 |
+
- **Custom Recognition Model**: **PH-OCRv1** achieves sub-0.x% character error rate in document-style settings by leveraging open-source models. Even achieves 0.0x% character error rate in English.
|
26 |
+
- **Multi-language Support**: Chinese, English, Japanese, Korean, Russian, and more
|
27 |
+
- **Rich Vocabulary**: Comprehensive vocabulary for each language. Chinese: 15,316, Korean: 17,388, Japanese: 11,186, Russian: 292.
|
28 |
+
- **High Performance**: Optimized inference engine with ONNX Runtime support
|
29 |
+
- **Easy Integration**: Simple Python API for quick deployment
|
30 |
+
- **Cross-platform**: Support for CPU and CUDA
|
31 |
+
|
32 |
+
## Visualization
|
33 |
+
|
34 |
+

|
35 |
+
|
36 |
+
## Installation
|
37 |
+
|
38 |
+
```bash
|
39 |
+
pip install phocr
|
40 |
+
```
|
41 |
+
|
42 |
+
## Quick Start
|
43 |
+
|
44 |
+
```python
|
45 |
+
from phocr import PHOCR
|
46 |
+
|
47 |
+
# Initialize OCR engine
|
48 |
+
engine = PHOCR()
|
49 |
+
|
50 |
+
# Perform OCR on image
|
51 |
+
result = engine("path/to/image.jpg")
|
52 |
+
print(result)
|
53 |
+
|
54 |
+
# Visualize results
|
55 |
+
result.vis("output.jpg")
|
56 |
+
print(result.to_markdown())
|
57 |
+
```
|
58 |
+
|
59 |
+
## Benchmarks
|
60 |
+
|
61 |
+
We conducted comprehensive benchmarks comparing PHOCR with leading OCR solutions across multiple languages and scenarios. **Our custom-developed PH-OCRv1 model demonstrates significant improvements over existing solutions.**
|
62 |
+
|
63 |
+
### Overall Performance Comparison
|
64 |
+
|
65 |
+
<table style="width: 90%; margin: auto; border-collapse: collapse; font-size: small;">
|
66 |
+
<thead>
|
67 |
+
<tr>
|
68 |
+
<th rowspan="2">Model</th>
|
69 |
+
<th colspan="4">ZH & EN<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
|
70 |
+
<th colspan="2">JP<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
|
71 |
+
<th colspan="2">KO<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
|
72 |
+
<th colspan="1">RU<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
|
73 |
+
</tr>
|
74 |
+
<tr>
|
75 |
+
<th><i>English</i></th>
|
76 |
+
<th><i>Simplified Chinese</i></th>
|
77 |
+
<th><i>EN CH Mixed</i></th>
|
78 |
+
<th><i>Traditional Chinese</i></th>
|
79 |
+
<th><i>Document</i></th>
|
80 |
+
<th><i>Scene</i></th>
|
81 |
+
<th><i>Document</i></th>
|
82 |
+
<th><i>Scene</i></th>
|
83 |
+
<th><i>Document</i></th>
|
84 |
+
</tr>
|
85 |
+
</thead>
|
86 |
+
<tbody>
|
87 |
+
<tr>
|
88 |
+
<td>PHOCR</td>
|
89 |
+
<td><strong>0.0008</strong></td>
|
90 |
+
<td><strong>0.0057</strong></td>
|
91 |
+
<td><strong>0.0171</strong></td>
|
92 |
+
<td><strong>0.0145</strong></td>
|
93 |
+
<td><strong>0.0039</strong></td>
|
94 |
+
<td><strong>0.0197</strong></td>
|
95 |
+
<td><strong>0.0050</strong></td>
|
96 |
+
<td><strong>0.0255</strong></td>
|
97 |
+
<td><strong>0.0046</strong></td>
|
98 |
+
</tr>
|
99 |
+
<tr>
|
100 |
+
<td>Baidu</td>
|
101 |
+
<td>0.0014</td>
|
102 |
+
<td>0.0069</td>
|
103 |
+
<td>0.0354</td>
|
104 |
+
<td>0.0431</td>
|
105 |
+
<td>0.0222</td>
|
106 |
+
<td>0.0607</td>
|
107 |
+
<td>0.0238</td>
|
108 |
+
<td>0.212</td>
|
109 |
+
<td>0.0786</td>
|
110 |
+
</tr>
|
111 |
+
<tr>
|
112 |
+
<td>Ali</td>
|
113 |
+
<td>-</td>
|
114 |
+
<td>-</td>
|
115 |
+
<td>-</td>
|
116 |
+
<td>-</td>
|
117 |
+
<td>0.0272</td>
|
118 |
+
<td>0.0564</td>
|
119 |
+
<td>0.0159</td>
|
120 |
+
<td>0.102</td>
|
121 |
+
<td>0.0616</td>
|
122 |
+
</tr>
|
123 |
+
</tbody>
|
124 |
+
</table>
|
125 |
+
|
126 |
+
|
127 |
+
Notice:
|
128 |
+
|
129 |
+
- Baidu: [Baidu Accurate API](https://ai.baidu.com/tech/ocr/general)
|
130 |
+
- Ali: [Aliyun API](https://help.aliyun.com/zh/ocr/product-overview/recognition-of-characters-in-languages-except-for-chinese-and-english-1)
|
131 |
+
- CER: the total edit distance divided by the total number of characters in the ground truth.
|
132 |
+
|
133 |
+
|
134 |
+
## Advanced Usage
|
135 |
+
|
136 |
+
With global KV cache enabled, we implement a simple version using PyTorch (CUDA). When running with torch (CUDA), you can enable caching by setting `use_cache=True` in `ORTSeq2Seq(...)`, which also allows for larger batch sizes.
|
137 |
+
|
138 |
+
### Language-specific Configuration
|
139 |
+
|
140 |
+
See [demo.py](./demo.py) for more examples.
|
141 |
+
|
142 |
+
## Evaluation & Benchmarking
|
143 |
+
|
144 |
+
PHOCR provides comprehensive benchmarking tools to evaluate model performance across different languages and scenarios.
|
145 |
+
|
146 |
+
### Quick Benchmark
|
147 |
+
|
148 |
+
Run the complete benchmark pipeline:
|
149 |
+
```bash
|
150 |
+
sh benchmark/run_recognition.sh
|
151 |
+
```
|
152 |
+
|
153 |
+
Calculate Character Error Rate (CER) for model predictions:
|
154 |
+
```bash
|
155 |
+
sh benchmark/run_score.sh
|
156 |
+
```
|
157 |
+
|
158 |
+
### Benchmark Datasets
|
159 |
+
|
160 |
+
PHOCR uses standardized benchmark datasets for fair comparison:
|
161 |
+
|
162 |
+
- **zh_en_rec_bench** [Chinese & English mixed text recognition](https://huggingface.co/datasets/puhuilab/zh_en_rec_bench)
|
163 |
+
- **jp_rec_bench** [Japanese text recognition](https://huggingface.co/datasets/puhuilab/jp_rec_bench)
|
164 |
+
- **ko_rec_bench** [Korean text recognition](https://huggingface.co/datasets/puhuilab/ko_rec_bench)
|
165 |
+
- **ru_rec_bench** [Russian text recognition](https://huggingface.co/datasets/puhuilab/ru_rec_bench)
|
166 |
+
|
167 |
+
## Further Improvements
|
168 |
+
|
169 |
+
- Character error rate (CER), including punctuation, can be further reduced through additional normalization of the training corpus.
|
170 |
+
- Text detection accuracy can be further enhanced by employing a more advanced detection framework.
|
171 |
+
|
172 |
+
## Contributing
|
173 |
+
|
174 |
+
We welcome contributions! Please feel free to submit issues, feature requests, or pull requests.
|
175 |
+
|
176 |
+
## Support
|
177 |
+
|
178 |
+
For questions and support, please open an issue on GitHub or contact the maintainers.
|
179 |
+
|
180 |
+
## Acknowledgements
|
181 |
+
|
182 |
+
Many thanks to [RapidOCR](https://github.com/RapidAI/RapidOCR) for detection and main framework.
|
183 |
+
|
184 |
+
## License
|
185 |
+
|
186 |
+
- This project is released under the Apache 2.0 license
|
187 |
+
- The copyright of the OCR detection and classification model is held by Baidu
|
188 |
+
- The PHOCR recognition models are under the modified MIT License - see the [LICENSE](./LICENSE) file for details
|
189 |
+
|
190 |
+
## Citation
|
191 |
+
|
192 |
+
If you use PHOCR in your research, please cite:
|
193 |
+
|
194 |
+
```bibtex
|
195 |
+
@misc{phocr2025,
|
196 |
+
title={PHOCR: High-Performance OCR Toolkit},
|
197 |
+
author={PuHui Lab},
|
198 |
+
year={2025},
|
199 |
+
url={https://github.com/puhuilab/phocr}
|
200 |
+
}
|
201 |
+
```
|
README_CN.md
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- ocr
|
4 |
+
- image-to-text
|
5 |
+
license: mit
|
6 |
+
library_name: transformers
|
7 |
+
---
|
8 |
+
|
9 |
+
# Model Card: PHOCR
|
10 |
+
|
11 |
+
高性能文字识别工具包 [PHOCR](https://github.com/puhuilab/phocr).
|
12 |
+
|
13 |
+
|
14 |
+
# PHOCR:高性能 OCR 工具包
|
15 |
+
|
16 |
+
[English](README.md) | [简体中文](README_CN.md)
|
17 |
+
|
18 |
+
PHOCR 是一个高性能的开源光学字符识别(OCR)工具包,专为多语种文本识别任务设计,支持包括中文、日文、韩文、俄文、越南文和泰文在内的多种语言。**PHOCR 搭载了我们完全自研的识别模型 PH-OCRv1,在准确率上显著优于现有解决方案。**
|
19 |
+
|
20 |
+
## 背景动机
|
21 |
+
|
22 |
+
当前基于下一个 token 预测的模型结构对上下文 token 的准确性非常敏感。即使仅有千次重复的模式,也可能导致模型产生永久性记忆。虽然大多数开源文字识别模型目前的字符错误率(CER)仍处于百分位水平,我们的目标是将其进一步提升至千分位(0.x%)。在这一精度下,对于处理 1 亿字符的系统,总错误字符数将下降至100万内,达到数量级的准确率提升。
|
23 |
+
|
24 |
+
## 主要特性
|
25 |
+
|
26 |
+
- **自研识别模型**:**PH-OCRv1**,可在文档场景中实现千分位CER,英文场景下甚至可达0.0x%。
|
27 |
+
- **多语种支持**:支持中文、英文、日文、韩文、俄文等多种语言。
|
28 |
+
- **丰富词表覆盖**:中文 15316,韩文 17388,日文 11186,俄文 292。
|
29 |
+
- **高性能推理**:深度优化的模型结构,集成 ONNX Runtime 支持。
|
30 |
+
- **轻松集成**:提供简洁的 Python API。
|
31 |
+
- **跨平台**:支持 CPU 与 CUDA 环境。
|
32 |
+
|
33 |
+
## 可视化效果
|
34 |
+
|
35 |
+

|
36 |
+
|
37 |
+
## 安装方式
|
38 |
+
|
39 |
+
```bash
|
40 |
+
pip install phocr
|
41 |
+
```
|
42 |
+
|
43 |
+
## 快速开始
|
44 |
+
|
45 |
+
```python
|
46 |
+
from phocr import PHOCR
|
47 |
+
|
48 |
+
# 初始化 OCR 引擎
|
49 |
+
engine = PHOCR()
|
50 |
+
|
51 |
+
# 对图像进行 OCR 识别
|
52 |
+
result = engine("path/to/image.jpg")
|
53 |
+
print(result)
|
54 |
+
|
55 |
+
# 可视化结果
|
56 |
+
result.vis("output.jpg")
|
57 |
+
print(result.to_markdown())
|
58 |
+
```
|
59 |
+
|
60 |
+
## 性能基准测试
|
61 |
+
|
62 |
+
我们进行了全面的基准测试,将 PHOCR 与领先的 OCR 解决方案在多种语言和场景下进行比较。**我们自研的 PH-OCRv1 模型在现有解决方案基础上实现了显著改进。**
|
63 |
+
|
64 |
+
### 整体性能对比
|
65 |
+
|
66 |
+
<table style="width: 90%; margin: auto; border-collapse: collapse; font-size: small;">
|
67 |
+
<thead>
|
68 |
+
<tr>
|
69 |
+
<th rowspan="2">模型</th>
|
70 |
+
<th colspan="4">中文 & 英文<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
|
71 |
+
<th colspan="2">日文<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
|
72 |
+
<th colspan="2">韩文<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
|
73 |
+
<th colspan="1">俄文<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
|
74 |
+
</tr>
|
75 |
+
<tr>
|
76 |
+
<th><i>英文</i></th>
|
77 |
+
<th><i>简体中文</i></th>
|
78 |
+
<th><i>中英混合</i></th>
|
79 |
+
<th><i>繁体中文</i></th>
|
80 |
+
<th><i>文档</i></th>
|
81 |
+
<th><i>场景</i></th>
|
82 |
+
<th><i>文档</i></th>
|
83 |
+
<th><i>场景</i></th>
|
84 |
+
<th><i>文档</i></th>
|
85 |
+
</tr>
|
86 |
+
</thead>
|
87 |
+
<tbody>
|
88 |
+
<tr>
|
89 |
+
<td>PHOCR</td>
|
90 |
+
<td><strong>0.0008</strong></td>
|
91 |
+
<td><strong>0.0057</strong></td>
|
92 |
+
<td><strong>0.0171</strong></td>
|
93 |
+
<td><strong>0.0145</strong></td>
|
94 |
+
<td><strong>0.0039</strong></td>
|
95 |
+
<td><strong>0.0197</strong></td>
|
96 |
+
<td><strong>0.0050</strong></td>
|
97 |
+
<td><strong>0.0255</strong></td>
|
98 |
+
<td><strong>0.0046</strong></td>
|
99 |
+
</tr>
|
100 |
+
<tr>
|
101 |
+
<td>百度</td>
|
102 |
+
<td>0.0014</td>
|
103 |
+
<td>0.0069</td>
|
104 |
+
<td>0.0354</td>
|
105 |
+
<td>0.0431</td>
|
106 |
+
<td>0.0222</td>
|
107 |
+
<td>0.0607</td>
|
108 |
+
<td>0.0238</td>
|
109 |
+
<td>0.212</td>
|
110 |
+
<td>0.0786</td>
|
111 |
+
</tr>
|
112 |
+
<tr>
|
113 |
+
<td>阿里</td>
|
114 |
+
<td>-</td>
|
115 |
+
<td>-</td>
|
116 |
+
<td>-</td>
|
117 |
+
<td>-</td>
|
118 |
+
<td>0.0272</td>
|
119 |
+
<td>0.0564</td>
|
120 |
+
<td>0.0159</td>
|
121 |
+
<td>0.102</td>
|
122 |
+
<td>0.0616</td>
|
123 |
+
</tr>
|
124 |
+
<tr>
|
125 |
+
<td>PPOCR V5</td>
|
126 |
+
<td>0.011</td>
|
127 |
+
<td>0.060</td>
|
128 |
+
<td>0.032</td>
|
129 |
+
<td>0.061</td>
|
130 |
+
<td>-</td>
|
131 |
+
<td>-</td>
|
132 |
+
<td>-</td>
|
133 |
+
<td>-</td>
|
134 |
+
<td>-</td>
|
135 |
+
</tr>
|
136 |
+
</tbody>
|
137 |
+
</table>
|
138 |
+
|
139 |
+
说明:
|
140 |
+
|
141 |
+
- Baidu: [Baidu Accurate API](https://ai.baidu.com/tech/ocr/general)
|
142 |
+
- Ali: [Aliyun API](https://help.aliyun.com/zh/ocr/product-overview/recognition-of-characters-in-languages-except-for-chinese-and-english-1)
|
143 |
+
- 字符错误率(CER):总的编辑距离除以真实标签(ground truth)中字符的总数量。
|
144 |
+
|
145 |
+
## 高级用法
|
146 |
+
|
147 |
+
启用全局 KV 缓存后,我们使用 PyTorch (CUDA) 实现了一个简单版本。在使用 torch (CUDA) 运行时,您可以通过在 `ORTSeq2Seq(...)` 中设置 `use_cache=True` 来启用缓存,这也允许更大的批处理大小。
|
148 |
+
|
149 |
+
### 语言特定配置
|
150 |
+
|
151 |
+
更多示例请参见 [demo.py](./demo.py)。
|
152 |
+
|
153 |
+
## 评估与基准测试
|
154 |
+
|
155 |
+
PHOCR 提供全面的基准测试工具,用于评估模型在不同语言和场景下的性能。
|
156 |
+
|
157 |
+
### 快速基准测试
|
158 |
+
|
159 |
+
运行完整的基准测试流程:
|
160 |
+
```bash
|
161 |
+
sh benchmark/run_recognition.sh
|
162 |
+
```
|
163 |
+
|
164 |
+
计算模型预测的字符错误率 (CER):
|
165 |
+
```bash
|
166 |
+
sh benchmark/run_score.sh
|
167 |
+
```
|
168 |
+
|
169 |
+
### 基准测试数据集
|
170 |
+
|
171 |
+
PHOCR 使用标准化的基准测试数据集进行公平比较:
|
172 |
+
|
173 |
+
- **zh_en_rec_bench** [中英文混合文本识别](https://huggingface.co/datasets/puhuilab/zh_en_rec_bench)
|
174 |
+
- **jp_rec_bench** [日文文本识别](https://huggingface.co/datasets/puhuilab/jp_rec_bench)
|
175 |
+
- **ko_rec_bench** [韩文文本识别](https://huggingface.co/datasets/puhuilab/ko_rec_bench)
|
176 |
+
- **ru_rec_bench** [俄文文本识别](https://huggingface.co/datasets/puhuilab/ru_rec_bench)
|
177 |
+
|
178 |
+
中英文混合文本识别主要来自于[OmniDocBench](https://github.com/opendatalab/OmniDocBench)的随机采样。
|
179 |
+
其它数据由我们的团队手工采集完成。
|
180 |
+
|
181 |
+
## 后续优化方向
|
182 |
+
|
183 |
+
- 通过进一步归一化训练语料,字符错误率(CER,包括标点符号)可以得到进一步降低。
|
184 |
+
- 通过采用更先进的检测框架,文字检测的准确率可以进一步提升。
|
185 |
+
|
186 |
+
## 贡献指南
|
187 |
+
|
188 |
+
我们欢迎任何贡献!请随时提交 issue、功能请求或 pull request。
|
189 |
+
|
190 |
+
## 支持
|
191 |
+
|
192 |
+
如有问题或需要支持,请在 GitHub 上提交 issue 或联系维护者。
|
193 |
+
|
194 |
+
## 鸣谢
|
195 |
+
|
196 |
+
特别感谢 [RapidOCR](https://github.com/RapidAI/RapidOCR) 提供的检测模型及主框架支持。
|
197 |
+
|
198 |
+
## 许可证
|
199 |
+
|
200 |
+
- 本项目采用 Apache 2.0 开源许可证
|
201 |
+
- OCR 检测和分类模型的版权归百度所有
|
202 |
+
- PHOCR 识别模型采用修改版 MIT 许可证,详情请见 [LICENSE](./LICENSE) 文件
|
203 |
+
|
204 |
+
## 引用方式
|
205 |
+
|
206 |
+
如果您在研究中使用了 PHOCR,请引用:
|
207 |
+
|
208 |
+
```bibtex
|
209 |
+
@misc{phocr2025,
|
210 |
+
title={PHOCR: High-Performance OCR Toolkit},
|
211 |
+
author={PuHui Lab},
|
212 |
+
year={2025},
|
213 |
+
url={https://github.com/puhuilab/phocr}
|
214 |
+
}
|
215 |
+
```
|
configuration.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"framework":"other","task":"other"}
|
onnx/PH-OCRv1/rec/ch_rec_decoder_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b1a88ee6975b11b729c814b0fd4caac7b563ed3d4e68cc90b98a609222eb04ba
|
3 |
+
size 126086098
|
onnx/PH-OCRv1/rec/ch_rec_encoder_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb4140a6369b3be487d7c05e84a89e9c58e954bc2de9f5f0ee988d46f492499f
|
3 |
+
size 99945814
|
onnx/PH-OCRv1/rec/jp_rec_decoder_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b9c330f578606c625f449c49e8a993338e3e100ab0a67ef0d171360d410d3be9
|
3 |
+
size 113369603
|
onnx/PH-OCRv1/rec/jp_rec_encoder_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a8ceade124c37c6a6ce7157e30322859a257a2aedc4f1f886473ea10d8478578
|
3 |
+
size 99945814
|
onnx/PH-OCRv1/rec/ko_rec_decoder_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6798eb63098fa0342c340ee3d6f6c42b2cbff74e0572e4abe1b630462b7d039c
|
3 |
+
size 132465395
|
onnx/PH-OCRv1/rec/ko_rec_encoder_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:215c9a04af7a37f00cf1deef6a12ac3a5eb6eebbf45267ba35cdc6cb7f1e46e9
|
3 |
+
size 99945814
|
onnx/PH-OCRv1/rec/ru_rec_decoder_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8866fd683115abe2a2767192155e85b9b531da92927338e4bbeefa1fb78b69a
|
3 |
+
size 79826892
|
onnx/PH-OCRv1/rec/ru_rec_encoder_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f6237816d4374281884d75ab364e35b31dcbd7cb06a7808bde4531578fec960d
|
3 |
+
size 99945814
|
onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e47acedf663230f8863ff1ab0e64dd2d82b838fceb5957146dab185a89d6215c
|
3 |
+
size 585532
|
onnx/PP-OCRv4/det/Multilingual_PP-OCRv3_det_infer.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5475c6c7f4d84a6c4f32241b487435d59f126a40c023387af99732258844cdc3
|
3 |
+
size 2421639
|
onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2a7720d45a54257208b1e13e36a8479894cb74155a5efe29462512d42f49da9
|
3 |
+
size 4745517
|
onnx/PP-OCRv4/det/ch_PP-OCRv4_det_server_infer.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cfa39a3f298f6d3fc71789834d15da36d11a6c59b489fc16ea4733728012f786
|
3 |
+
size 113352104
|
onnx/PP-OCRv4/det/en_PP-OCRv3_det_infer.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ea07c15d38ac40cd69da3c493444ec75b44ff23840553ff8ba102c1219ed39c2
|
3 |
+
size 2421707
|
onnx/PP-OCRv5/det/ch_PP-OCRv5_mobile_det.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d97c44a20d30a81aad087d6a396b08f786c4635742afc391f6621f5c6ae78ae
|
3 |
+
size 4819576
|
onnx/PP-OCRv5/det/ch_PP-OCRv5_server_det.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0f8846b1d4bba223a2a2f9d9b44022fbc22cc019051a602b41a7fda9667e4cad
|
3 |
+
size 88118768
|
resources/fonts/FZYTK.TTF
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4065a23df6823c8e2b69a0e76d02f02a6470b8774a5e91086609701ad95cc33f
|
3 |
+
size 3241748
|
resources/fonts/cyrillic.ttf
ADDED
Binary file (56.2 kB). View file
|
|
resources/fonts/japan.ttc
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:11122490a5e3a862015c8894183750de59abf95c3936d63d5978293d92f23dba
|
3 |
+
size 3478068
|
resources/fonts/korean.ttf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0897316bdb2e308cea2841c54940f2ef5707856000aa07910c8bff39a47e40bd
|
3 |
+
size 1222780
|
resources/fonts/方正宋黑.TTF
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a2041695a15d341d8f344bac12e90e2228414a18531af6678c0e380d910e2c2
|
3 |
+
size 17546288
|