Spaces:
Sleeping
Sleeping
Merge branch 'main' of https://huggingface.co/spaces/impresso-project/ocrqa-demo
Browse files
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
thumbnail.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
---
|
| 2 |
-
title: OCR Quality Assessment
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
short_description: OCR
|
| 9 |
---
|
| 10 |
|
|
|
|
| 1 |
---
|
| 2 |
+
title: OCR Quality Assessment Demo
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: gray
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
short_description: Measure the OCR output quality using word recognition ratios
|
| 9 |
---
|
| 10 |
|
app.py
CHANGED
|
@@ -59,9 +59,9 @@ def process_ocr_qa(text, lang_choice):
|
|
| 59 |
# Unknown tokens (potential OCR errors)
|
| 60 |
if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
|
| 61 |
unknown_tokens = diagnostics['unknown_tokens']
|
| 62 |
-
output_lines.append(f"β
|
| 63 |
elif 'unknown_tokens' in diagnostics:
|
| 64 |
-
output_lines.append("β¨
|
| 65 |
|
| 66 |
# Other fields
|
| 67 |
for key, value in result.items():
|
|
@@ -78,55 +78,82 @@ def process_ocr_qa(text, lang_choice):
|
|
| 78 |
|
| 79 |
# Create the interface with logo and improved description
|
| 80 |
with gr.Blocks(title="OCR QA Demo") as demo:
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
gr.Markdown(
|
| 85 |
"""
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
)
|
| 95 |
|
| 96 |
with gr.Row():
|
| 97 |
with gr.Column():
|
| 98 |
text_input = gr.Textbox(
|
| 99 |
-
label="
|
| 100 |
value=EXAMPLE_TEXT,
|
| 101 |
lines=8,
|
| 102 |
-
placeholder="
|
| 103 |
)
|
| 104 |
lang_dropdown = gr.Dropdown(
|
| 105 |
-
choices=
|
| 106 |
value="de",
|
| 107 |
-
label="Language"
|
| 108 |
)
|
| 109 |
-
submit_btn = gr.Button("π
|
|
|
|
| 110 |
|
| 111 |
with gr.Column():
|
| 112 |
with gr.Row():
|
| 113 |
output = gr.Textbox(
|
| 114 |
-
label="
|
| 115 |
lines=15,
|
| 116 |
-
placeholder="
|
| 117 |
scale=10
|
| 118 |
)
|
| 119 |
-
|
| 120 |
|
| 121 |
# Info modal/accordion for pipeline details
|
| 122 |
-
with gr.Accordion("π About the OCR QA
|
| 123 |
gr.Markdown(
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
submit_btn.click(
|
| 132 |
fn=process_ocr_qa,
|
|
|
|
| 59 |
# Unknown tokens (potential OCR errors)
|
| 60 |
if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
|
| 61 |
unknown_tokens = diagnostics['unknown_tokens']
|
| 62 |
+
output_lines.append(f"β Unrecognized tokens ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
|
| 63 |
elif 'unknown_tokens' in diagnostics:
|
| 64 |
+
output_lines.append("✨ All tokens matched known lexicons — no OCR errors detected.")
|
| 65 |
|
| 66 |
# Other fields
|
| 67 |
for key, value in result.items():
|
|
|
|
| 78 |
|
| 79 |
# Create the interface with logo and improved description
|
| 80 |
with gr.Blocks(title="OCR QA Demo") as demo:
|
| 81 |
+
gr.HTML(
|
| 82 |
+
"""
|
| 83 |
+
<a href="https://impresso-project.ch" target="_blank">
|
| 84 |
+
<img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg"
|
| 85 |
+
alt="Impresso Project Logo"
|
| 86 |
+
style="height: 42px; display: block; margin: 5px auto; background-color: white;">
|
| 87 |
+
</a>
|
| 88 |
+
"""
|
| 89 |
+
)
|
| 90 |
gr.Markdown(
|
| 91 |
"""
|
| 92 |
+
# π Optical Character Recognition (OCR) Quality Assessment Demo
|
| 93 |
+
|
| 94 |
+
The demo showcases how the [Impresso Project](https://impresso-project.ch) assesses the quality of OCR transcripts by estimating the proportion of (un)known words with respect to a large clean text corpus.
|
| 95 |
+
|
| 96 |
+
It returns:
|
| 97 |
+
- a **quality score** between **0.0 (poor)** and **1.0 (excellent)**, and
|
| 98 |
+
- a list of **potential OCR errors** (unrecognized tokens) as well as the known tokens.
|
| 99 |
+
|
| 100 |
+
You can try the example below (a German text containing typical OCR errors), or paste your own OCR-processed text to assess its quality.
|
| 101 |
+
"""
|
| 102 |
)
|
| 103 |
|
| 104 |
with gr.Row():
|
| 105 |
with gr.Column():
|
| 106 |
text_input = gr.Textbox(
|
| 107 |
+
label="OCR Text (from digitized sources)",
|
| 108 |
value=EXAMPLE_TEXT,
|
| 109 |
lines=8,
|
| 110 |
+
placeholder="Paste OCR-processed text from a historical document..."
|
| 111 |
)
|
| 112 |
lang_dropdown = gr.Dropdown(
|
| 113 |
+
choices=LANGUAGES,
|
| 114 |
value="de",
|
| 115 |
+
label="Language of the Text"
|
| 116 |
)
|
| 117 |
+
submit_btn = gr.Button("π Assess OCR Text Quality", variant="primary")
|
| 118 |
+
info_btn = gr.Button("Help", size="md", scale=1)
|
| 119 |
|
| 120 |
with gr.Column():
|
| 121 |
with gr.Row():
|
| 122 |
output = gr.Textbox(
|
| 123 |
+
label="OCR Quality Report",
|
| 124 |
lines=15,
|
| 125 |
+
placeholder="The quality assessment will appear here...",
|
| 126 |
scale=10
|
| 127 |
)
|
| 128 |
+
|
| 129 |
|
| 130 |
# Info modal/accordion for pipeline details
|
| 131 |
+
with gr.Accordion("π About the OCR QA Method", open=False, visible=False) as info_accordion:
|
| 132 |
gr.Markdown(
|
| 133 |
+
"""
|
| 134 |
+
This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.
|
| 135 |
+
|
| 136 |
+
#### How it works:
|
| 137 |
+
- **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
|
| 138 |
+
- **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
|
| 139 |
+
- **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
|
| 140 |
+
- **Diagnostics output**:
|
| 141 |
+
- ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
|
| 142 |
+
- ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
|
| 143 |
+
- Note: Non-alphabetic characters will be removed. For efficiency reasons, all digits are replaced by the digit 0.
|
| 144 |
+
|
| 145 |
+
#### ⚠️ Limitations:
|
| 146 |
+
- The wordlists are **not exhaustive**, particularly for **historical vocabulary**, **dialects**, or **named entities**.
|
| 147 |
+
- The method may fail to flag **short OCR artifacts** (e.g., 1–2 character noise) and **non-alphabetic symbols**.
|
| 148 |
+
|
| 149 |
+
As such, the score should be understood as a **heuristic indicator**, best used for:
|
| 150 |
+
- Comparative assessments between OCR outputs
|
| 151 |
+
- Filtering low-quality text from large corpora
|
| 152 |
+
- Supporting decisions in corpus preparation and annotation workflows
|
| 153 |
+
|
| 154 |
+
It is **not a substitute for manual inspection** or ground-truth evaluation.
|
| 155 |
+
"""
|
| 156 |
+
)
|
| 157 |
|
| 158 |
submit_btn.click(
|
| 159 |
fn=process_ocr_qa,
|
logo.jpeg
CHANGED
|
|