puhuilab committed on
Commit f75ac71 · verified · 1 Parent(s): 21514c8

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,67 @@
  *.7z filter=lfs diff=lfs merge=lfs -text
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *.tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.db* filter=lfs diff=lfs merge=lfs -text
+ *.ark* filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.gguf* filter=lfs diff=lfs merge=lfs -text
+ *.ggml filter=lfs diff=lfs merge=lfs -text
+ *.llamafile* filter=lfs diff=lfs merge=lfs -text
+ *.pt2 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+
+ resources/fonts/FZYTK.TTF filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/japan.ttc filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/korean.ttf filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/方正宋黑.TTF filter=lfs diff=lfs merge=lfs -text
+
+ resources/fonts/FZYTK.TTF filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/japan.ttc filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/korean.ttf filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/方正宋黑.TTF filter=lfs diff=lfs merge=lfs -text
+
+ resources/fonts/FZYTK.TTF filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/japan.ttc filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/korean.ttf filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/方正宋黑.TTF filter=lfs diff=lfs merge=lfs -text
+
+ resources/fonts/japan.ttc filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/FZYTK.TTF filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/korean.ttf filter=lfs diff=lfs merge=lfs -text
+ resources/fonts/方正宋黑.TTF filter=lfs diff=lfs merge=lfs -text
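
For reference, entries in this format are what `git lfs track` writes into `.gitattributes`; for example, the `*.onnx` rule above could be produced with (assuming Git LFS is installed):

```bash
# Appends '*.onnx filter=lfs diff=lfs merge=lfs -text' to .gitattributes
git lfs track "*.onnx"
git add .gitattributes
```
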
LICENSE ADDED
@@ -0,0 +1,27 @@
+ MIT License
+
+ Copyright (c) 2025 PuHui Lab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Additional Terms
+
+ If you use the PHOCR models in commercial products or services,
+ you **must include clear attribution** to PHOCR in your product documentation,
+ user interface, or other appropriate locations, e.g. Powered By PHOCR from PuHui Lab.
README.md CHANGED
@@ -1,3 +1,201 @@
- ---
- license: mit
- ---
+ ---
+ tags:
+ - ocr
+ - image-to-text
+ license: mit
+ library_name: transformers
+ ---
+
+ # Model Card: PHOCR
+
+ [PHOCR](https://github.com/puhuilab/phocr) is an open, high-performance Optical Character Recognition (OCR) toolkit.
+
+ # PHOCR: High-Performance OCR Toolkit
+
+ [English](README.md) | [简体中文](README_CN.md)
+
+ PHOCR is an open, high-performance Optical Character Recognition (OCR) toolkit designed for efficient text recognition across multiple languages, including Chinese, Japanese, Korean, Russian, Vietnamese, and Thai. **PHOCR features a completely custom-developed recognition model (PH-OCRv1) that significantly outperforms existing solutions.**
+
+ ## Motivation
+
+ Current token-prediction-based model architectures are highly sensitive to the accuracy of contextual tokens. Repetitive patterns, even as few as a thousand instances, can lead to persistent memorization by the model. While most open-source text recognition models currently achieve character error rates (CER) in the percent range, our goal is to push this further into the per-mille range. At that level, the total number of recognition errors for a system processing 100 million characters would drop below 1 million, an order-of-magnitude improvement.
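+
+ The arithmetic behind that claim, as a quick check:
+
+ ```python
+ # Back-of-the-envelope check of the error counts quoted above.
+ total_chars = 100_000_000
+ print(total_chars * 0.01)   # 1% CER   -> 1,000,000 errors
+ print(total_chars * 0.001)  # 0.1% CER ->   100,000 errors
+ ```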
+
+ ## Features
+
+ - **Custom Recognition Model**: **PH-OCRv1** achieves a sub-0.x% character error rate in document-style settings by leveraging open-source models, and even reaches a 0.0x% character error rate on English.
+ - **Multi-language Support**: Chinese, English, Japanese, Korean, Russian, and more
+ - **Rich Vocabulary**: Comprehensive vocabulary for each language: Chinese 15,316, Korean 17,388, Japanese 11,186, Russian 292.
+ - **High Performance**: Optimized inference engine with ONNX Runtime support
+ - **Easy Integration**: Simple Python API for quick deployment
+ - **Cross-platform**: Supports CPU and CUDA
+
+ ## Visualization
+
+ ![Visualization](./vis.gif)
+
+ ## Installation
+
+ ```bash
+ pip install phocr
+ ```
+
+ ## Quick Start
+
+ ```python
+ from phocr import PHOCR
+
+ # Initialize OCR engine
+ engine = PHOCR()
+
+ # Perform OCR on image
+ result = engine("path/to/image.jpg")
+ print(result)
+
+ # Visualize results
+ result.vis("output.jpg")
+ print(result.to_markdown())
+ ```
+
+ ## Benchmarks
+
+ We conducted comprehensive benchmarks comparing PHOCR with leading OCR solutions across multiple languages and scenarios. **Our custom-developed PH-OCRv1 model demonstrates significant improvements over existing solutions.**
+
+ ### Overall Performance Comparison
+
+ <table style="width: 90%; margin: auto; border-collapse: collapse; font-size: small;">
+   <thead>
+     <tr>
+       <th rowspan="2">Model</th>
+       <th colspan="4">ZH & EN<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
+       <th colspan="2">JP<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
+       <th colspan="2">KO<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
+       <th colspan="1">RU<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
+     </tr>
+     <tr>
+       <th><i>English</i></th>
+       <th><i>Simplified Chinese</i></th>
+       <th><i>EN CH Mixed</i></th>
+       <th><i>Traditional Chinese</i></th>
+       <th><i>Document</i></th>
+       <th><i>Scene</i></th>
+       <th><i>Document</i></th>
+       <th><i>Scene</i></th>
+       <th><i>Document</i></th>
+     </tr>
+   </thead>
+   <tbody>
+     <tr>
+       <td>PHOCR</td>
+       <td><strong>0.0008</strong></td>
+       <td><strong>0.0057</strong></td>
+       <td><strong>0.0171</strong></td>
+       <td><strong>0.0145</strong></td>
+       <td><strong>0.0039</strong></td>
+       <td><strong>0.0197</strong></td>
+       <td><strong>0.0050</strong></td>
+       <td><strong>0.0255</strong></td>
+       <td><strong>0.0046</strong></td>
+     </tr>
+     <tr>
+       <td>Baidu</td>
+       <td>0.0014</td>
+       <td>0.0069</td>
+       <td>0.0354</td>
+       <td>0.0431</td>
+       <td>0.0222</td>
+       <td>0.0607</td>
+       <td>0.0238</td>
+       <td>0.212</td>
+       <td>0.0786</td>
+     </tr>
+     <tr>
+       <td>Ali</td>
+       <td>-</td>
+       <td>-</td>
+       <td>-</td>
+       <td>-</td>
+       <td>0.0272</td>
+       <td>0.0564</td>
+       <td>0.0159</td>
+       <td>0.102</td>
+       <td>0.0616</td>
+     </tr>
+   </tbody>
+ </table>
+
+ Notes:
+
+ - Baidu: [Baidu Accurate API](https://ai.baidu.com/tech/ocr/general)
+ - Ali: [Aliyun API](https://help.aliyun.com/zh/ocr/product-overview/recognition-of-characters-in-languages-except-for-chinese-and-english-1)
+ - CER: the total edit distance divided by the total number of characters in the ground truth.
+
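+ As a rough illustration of this metric (a minimal sketch, not the benchmark scripts shipped in `benchmark/`), CER can be computed as follows:
+
+ ```python
+ # Minimal CER sketch: total edit distance / total ground-truth characters.
+ def edit_distance(a: str, b: str) -> int:
+     # Standard dynamic-programming Levenshtein distance.
+     prev = list(range(len(b) + 1))
+     for i, ca in enumerate(a, 1):
+         cur = [i]
+         for j, cb in enumerate(b, 1):
+             cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
+         prev = cur
+     return prev[-1]
+
+ def cer(predictions, references):
+     total_edits = sum(edit_distance(p, r) for p, r in zip(predictions, references))
+     total_chars = sum(len(r) for r in references)
+     return total_edits / total_chars
+
+ print(cer(["PH0CR"], ["PHOCR"]))  # 1 substitution / 5 characters = 0.2
+ ```
+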
+ ## Advanced Usage
+
+ We provide a simple PyTorch (CUDA) implementation with a global KV cache. When running with torch (CUDA), you can enable caching by setting `use_cache=True` in `ORTSeq2Seq(...)`, which also allows larger batch sizes.
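+
+ A rough sketch of what enabling the cache might look like; apart from `use_cache=True`, the import path and the other constructor argument below are assumptions rather than the documented API:
+
+ ```python
+ from phocr import ORTSeq2Seq  # assumed import path
+
+ # Hypothetical model-directory argument; only `use_cache=True` is described above.
+ recognizer = ORTSeq2Seq("onnx/PH-OCRv1/rec", use_cache=True)
+ ```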
+
+ ### Language-specific Configuration
+
+ See [demo.py](./demo.py) for more examples.
+
+ ## Evaluation & Benchmarking
+
+ PHOCR provides comprehensive benchmarking tools to evaluate model performance across different languages and scenarios.
+
+ ### Quick Benchmark
+
+ Run the complete benchmark pipeline:
+ ```bash
+ sh benchmark/run_recognition.sh
+ ```
+
+ Calculate the Character Error Rate (CER) for model predictions:
+ ```bash
+ sh benchmark/run_score.sh
+ ```
+
+ ### Benchmark Datasets
+
+ PHOCR uses standardized benchmark datasets for fair comparison (a loading sketch follows the list):
+
+ - **zh_en_rec_bench**: [Chinese & English mixed text recognition](https://huggingface.co/datasets/puhuilab/zh_en_rec_bench)
+ - **jp_rec_bench**: [Japanese text recognition](https://huggingface.co/datasets/puhuilab/jp_rec_bench)
+ - **ko_rec_bench**: [Korean text recognition](https://huggingface.co/datasets/puhuilab/ko_rec_bench)
+ - **ru_rec_bench**: [Russian text recognition](https://huggingface.co/datasets/puhuilab/ru_rec_bench)
+
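+ Each benchmark set is hosted as a Hugging Face dataset, so it can presumably be loaded with the `datasets` library; a minimal sketch (the split name is an assumption, check each dataset card):
+
+ ```python
+ from datasets import load_dataset
+
+ # "test" is an assumed split name; see the dataset card for the actual splits.
+ bench = load_dataset("puhuilab/zh_en_rec_bench", split="test")
+ print(bench[0])
+ ```
+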
+ ## Further Improvements
+
+ - Character error rate (CER), including punctuation, can be further reduced through additional normalization of the training corpus.
+ - Text detection accuracy can be further enhanced by employing a more advanced detection framework.
+
+ ## Contributing
+
+ We welcome contributions! Please feel free to submit issues, feature requests, or pull requests.
+
+ ## Support
+
+ For questions and support, please open an issue on GitHub or contact the maintainers.
+
+ ## Acknowledgements
+
+ Many thanks to [RapidOCR](https://github.com/RapidAI/RapidOCR) for the detection models and the main framework.
+
+ ## License
+
+ - This project is released under the Apache 2.0 license
+ - The copyright of the OCR detection and classification models is held by Baidu
+ - The PHOCR recognition models are under the modified MIT License; see the [LICENSE](./LICENSE) file for details
+
+ ## Citation
+
+ If you use PHOCR in your research, please cite:
+
+ ```bibtex
+ @misc{phocr2025,
+   title={PHOCR: High-Performance OCR Toolkit},
+   author={PuHui Lab},
+   year={2025},
+   url={https://github.com/puhuilab/phocr}
+ }
+ ```
README_CN.md ADDED
@@ -0,0 +1,215 @@
+ ---
+ tags:
+ - ocr
+ - image-to-text
+ license: mit
+ library_name: transformers
+ ---
+
+ # Model Card: PHOCR
+
+ 高性能文字识别工具包 [PHOCR](https://github.com/puhuilab/phocr)。
+
+ # PHOCR: 高性能 OCR 工具包
+
+ [English](README.md) | [简体中文](README_CN.md)
+
+ PHOCR 是一个高性能的开源光学字符识别(OCR)工具包,专为多语种文本识别任务设计,支持包括中文、日文、韩文、俄文、越南文和泰文在内的多种语言。**PHOCR 搭载了我们完全自研的识别模型 PH-OCRv1,在准确率上显著优于现有解决方案。**
+
+ ## 背景动机
+
+ 当前基于下一个 token 预测的模型结构对上下文 token 的准确性非常敏感。即使仅有千次重复的模式,也可能导致模型产生永久性记忆。虽然大多数开源文字识别模型目前的字符错误率(CER)仍处于百分位水平,我们的目标是将其进一步提升至千分位(0.x%)。在这一精度下,对于处理 1 亿字符的系统,总错误字符数将下降至100万内,达到数量级的准确率提升。
+
+ ## 主要特性
+
+ - **自研识别模型**:**PH-OCRv1**,可在文档场景中实现千分位CER,英文场景下甚至可达0.0x%。
+ - **多语种支持**:支持中文、英文、日文、韩文、俄文等多种语言。
+ - **丰富词表覆盖**:中文 15316,韩文 17388,日文 11186,俄文 292。
+ - **高性能推理**:深度优化的模型结构,集成 ONNX Runtime 支持。
+ - **轻松集成**:提供简洁的 Python API。
+ - **跨平台**:支持 CPU 与 CUDA 环境。
+
+ ## 可视化效果
+
+ ![可视化效果](./vis.gif)
+
+ ## 安装方式
+
+ ```bash
+ pip install phocr
+ ```
+
+ ## 快速开始
+
+ ```python
+ from phocr import PHOCR
+
+ # 初始化 OCR 引擎
+ engine = PHOCR()
+
+ # 对图像进行 OCR 识别
+ result = engine("path/to/image.jpg")
+ print(result)
+
+ # 可视化结果
+ result.vis("output.jpg")
+ print(result.to_markdown())
+ ```
+
+ ## 性能基准测试
+
+ 我们进行了全面的基准测试,将 PHOCR 与领先的 OCR 解决方案在多种语言和场景下进行比较。**我们自研的 PH-OCRv1 模型在现有解决方案基础上实现了显著改进。**
+
+ ### 整体性能对比
+
+ <table style="width: 90%; margin: auto; border-collapse: collapse; font-size: small;">
+   <thead>
+     <tr>
+       <th rowspan="2">模型</th>
+       <th colspan="4">中文 & 英文<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
+       <th colspan="2">日文<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
+       <th colspan="2">韩文<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
+       <th colspan="1">俄文<br><span style="font-weight: normal; font-size: x-small;">CER ↓</span></th>
+     </tr>
+     <tr>
+       <th><i>英文</i></th>
+       <th><i>简体中文</i></th>
+       <th><i>中英混合</i></th>
+       <th><i>繁体中文</i></th>
+       <th><i>文档</i></th>
+       <th><i>场景</i></th>
+       <th><i>文档</i></th>
+       <th><i>场景</i></th>
+       <th><i>文档</i></th>
+     </tr>
+   </thead>
+   <tbody>
+     <tr>
+       <td>PHOCR</td>
+       <td><strong>0.0008</strong></td>
+       <td><strong>0.0057</strong></td>
+       <td><strong>0.0171</strong></td>
+       <td><strong>0.0145</strong></td>
+       <td><strong>0.0039</strong></td>
+       <td><strong>0.0197</strong></td>
+       <td><strong>0.0050</strong></td>
+       <td><strong>0.0255</strong></td>
+       <td><strong>0.0046</strong></td>
+     </tr>
+     <tr>
+       <td>百度</td>
+       <td>0.0014</td>
+       <td>0.0069</td>
+       <td>0.0354</td>
+       <td>0.0431</td>
+       <td>0.0222</td>
+       <td>0.0607</td>
+       <td>0.0238</td>
+       <td>0.212</td>
+       <td>0.0786</td>
+     </tr>
+     <tr>
+       <td>阿里</td>
+       <td>-</td>
+       <td>-</td>
+       <td>-</td>
+       <td>-</td>
+       <td>0.0272</td>
+       <td>0.0564</td>
+       <td>0.0159</td>
+       <td>0.102</td>
+       <td>0.0616</td>
+     </tr>
+     <tr>
+       <td>PPOCR V5</td>
+       <td>0.011</td>
+       <td>0.060</td>
+       <td>0.032</td>
+       <td>0.061</td>
+       <td>-</td>
+       <td>-</td>
+       <td>-</td>
+       <td>-</td>
+       <td>-</td>
+     </tr>
+   </tbody>
+ </table>
+
+ 说明:
+
+ - Baidu: [Baidu Accurate API](https://ai.baidu.com/tech/ocr/general)
+ - Ali: [Aliyun API](https://help.aliyun.com/zh/ocr/product-overview/recognition-of-characters-in-languages-except-for-chinese-and-english-1)
+ - 字符错误率(CER):总的编辑距离除以真实标签(ground truth)中字符的总数量。
+
+ ## 高级用法
+
+ 启用全局 KV 缓存后,我们使用 PyTorch (CUDA) 实现了一个简单版本。在使用 torch (CUDA) 运行时,您可以通过在 `ORTSeq2Seq(...)` 中设置 `use_cache=True` 来启用缓存,这也允许更大的批处理大小。
+
+ ### 语言特定配置
+
+ 更多示例请参见 [demo.py](./demo.py)。
+
+ ## 评估与基准测试
+
+ PHOCR 提供全面的基准测试工具,用于评估模型在不同语言和场景下的性能。
+
+ ### 快速基准测试
+
+ 运行完整的基准测试流程:
+ ```bash
+ sh benchmark/run_recognition.sh
+ ```
+
+ 计算模型预测的字符错误率 (CER):
+ ```bash
+ sh benchmark/run_score.sh
+ ```
+
+ ### 基准测试数据集
+
+ PHOCR 使用标准化的基准测试数据集进行公平比较:
+
+ - **zh_en_rec_bench**: [中英文混合文本识别](https://huggingface.co/datasets/puhuilab/zh_en_rec_bench)
+ - **jp_rec_bench**: [日文文本识别](https://huggingface.co/datasets/puhuilab/jp_rec_bench)
+ - **ko_rec_bench**: [韩文文本识别](https://huggingface.co/datasets/puhuilab/ko_rec_bench)
+ - **ru_rec_bench**: [俄文文本识别](https://huggingface.co/datasets/puhuilab/ru_rec_bench)
+
+ 中英文混合文本识别主要来自于[OmniDocBench](https://github.com/opendatalab/OmniDocBench)的随机采样。
+ 其它数据由我们的团队手工采集完成。
+
+ ## 后续优化方向
+
+ - 通过进一步归一化训练语料,字符错误率(CER,包括标点符号)可以得到进一步降低。
+ - 通过采用更先进的检测框架,文字检测的准确率可以进一步提升。
+
+ ## 贡献指南
+
+ 我们欢迎任何贡献!请随时提交 issue、功能请求或 pull request。
+
+ ## 支持
+
+ 如有问题或需要支持,请在 GitHub 上提交 issue 或联系维护者。
+
+ ## 鸣谢
+
+ 特别感谢 [RapidOCR](https://github.com/RapidAI/RapidOCR) 提供的检测模型及主框架支持。
+
+ ## 许可证
+
+ - 本项目采用 Apache 2.0 开源许可证
+ - OCR 检测和分类模型的版权归百度所有
+ - PHOCR 识别模型采用修改版 MIT 许可证,详情请见 [LICENSE](./LICENSE) 文件
+
+ ## 引用方式
+
+ 如果您在研究中使用了 PHOCR,请引用:
+
+ ```bibtex
+ @misc{phocr2025,
+   title={PHOCR: High-Performance OCR Toolkit},
+   author={PuHui Lab},
+   year={2025},
+   url={https://github.com/puhuilab/phocr}
+ }
+ ```
configuration.json ADDED
@@ -0,0 +1 @@
+ {"framework":"other","task":"other"}
onnx/PH-OCRv1/rec/ch_rec_decoder_v1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1a88ee6975b11b729c814b0fd4caac7b563ed3d4e68cc90b98a609222eb04ba
+ size 126086098
onnx/PH-OCRv1/rec/ch_rec_encoder_v1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb4140a6369b3be487d7c05e84a89e9c58e954bc2de9f5f0ee988d46f492499f
+ size 99945814
onnx/PH-OCRv1/rec/jp_rec_decoder_v1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9c330f578606c625f449c49e8a993338e3e100ab0a67ef0d171360d410d3be9
+ size 113369603
onnx/PH-OCRv1/rec/jp_rec_encoder_v1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8ceade124c37c6a6ce7157e30322859a257a2aedc4f1f886473ea10d8478578
+ size 99945814
onnx/PH-OCRv1/rec/ko_rec_decoder_v1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6798eb63098fa0342c340ee3d6f6c42b2cbff74e0572e4abe1b630462b7d039c
+ size 132465395
onnx/PH-OCRv1/rec/ko_rec_encoder_v1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:215c9a04af7a37f00cf1deef6a12ac3a5eb6eebbf45267ba35cdc6cb7f1e46e9
+ size 99945814
onnx/PH-OCRv1/rec/ru_rec_decoder_v1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8866fd683115abe2a2767192155e85b9b531da92927338e4bbeefa1fb78b69a
+ size 79826892
onnx/PH-OCRv1/rec/ru_rec_encoder_v1.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6237816d4374281884d75ab364e35b31dcbd7cb06a7808bde4531578fec960d
+ size 99945814
onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e47acedf663230f8863ff1ab0e64dd2d82b838fceb5957146dab185a89d6215c
+ size 585532
onnx/PP-OCRv4/det/Multilingual_PP-OCRv3_det_infer.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5475c6c7f4d84a6c4f32241b487435d59f126a40c023387af99732258844cdc3
+ size 2421639
onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2a7720d45a54257208b1e13e36a8479894cb74155a5efe29462512d42f49da9
+ size 4745517
onnx/PP-OCRv4/det/ch_PP-OCRv4_det_server_infer.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfa39a3f298f6d3fc71789834d15da36d11a6c59b489fc16ea4733728012f786
+ size 113352104
onnx/PP-OCRv4/det/en_PP-OCRv3_det_infer.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea07c15d38ac40cd69da3c493444ec75b44ff23840553ff8ba102c1219ed39c2
+ size 2421707
onnx/PP-OCRv5/det/ch_PP-OCRv5_mobile_det.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d97c44a20d30a81aad087d6a396b08f786c4635742afc391f6621f5c6ae78ae
+ size 4819576
onnx/PP-OCRv5/det/ch_PP-OCRv5_server_det.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f8846b1d4bba223a2a2f9d9b44022fbc22cc019051a602b41a7fda9667e4cad
+ size 88118768
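
The `.onnx` entries above are Git LFS pointer files; after `git lfs pull`, the actual graphs can be inspected with ONNX Runtime. A minimal sketch (input names and shapes are whatever the exported graphs declare, not documented here):

```python
import onnxruntime as ort

# Load one PH-OCRv1 recognizer graph and list its declared inputs.
sess = ort.InferenceSession(
    "onnx/PH-OCRv1/rec/ch_rec_encoder_v1.onnx",
    providers=["CPUExecutionProvider"],
)
for inp in sess.get_inputs():
    print(inp.name, inp.shape, inp.type)
```
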
resources/fonts/FZYTK.TTF ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4065a23df6823c8e2b69a0e76d02f02a6470b8774a5e91086609701ad95cc33f
+ size 3241748
resources/fonts/cyrillic.ttf ADDED
Binary file (56.2 kB)
resources/fonts/japan.ttc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:11122490a5e3a862015c8894183750de59abf95c3936d63d5978293d92f23dba
+ size 3478068
resources/fonts/korean.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0897316bdb2e308cea2841c54940f2ef5707856000aa07910c8bff39a47e40bd
+ size 1222780
resources/fonts/方正宋黑.TTF ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a2041695a15d341d8f344bac12e90e2228414a18531af6678c0e380d910e2c2
+ size 17546288