ocrqa-demo / app.py
simon-clmtd's picture
Update app.py
1fd6ee7 verified
raw
history blame
7.44 kB
import os
# Redirect cache to a writable path inside container.
# NOTE(review): this is set before the third-party imports below, presumably so
# that they pick up the cache location when they are imported — confirm before
# reordering these lines.
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
import gradio as gr
from impresso_pipelines.ocrqa import OCRQAPipeline
# Single pipeline instance, created once at import time and reused by every
# call to process_ocr_qa.
pipeline = OCRQAPipeline()
# Language codes offered in the "Language of the Text" dropdown.
# NOTE(review): process_ocr_qa treats the choice "Auto-detect" specially, but
# that value is not listed here, so it cannot be selected from the UI —
# confirm whether that is intended.
LANGUAGES = ["en", "de", "fr"]
# Example OCR text (German text with typical OCR errors), used as the default
# value of the input textbox. Garbled tokens such as "Kinjd", "nacb", "1reitet"
# and "d:m" are presumably the intentional OCR errors, so do not "correct"
# them.
EXAMPLE_TEXT = """Vieles Seltsame geschieht auf Erden :
Nichts Seltsameres sieht der Mond
Als das Glück, das im Knopfloch wohnt.
Zaubrisch faßt es den ernsten Mann.
Ohne nach Weib u. Kinjd zu fragen
Reitet er aus, nach dem Glück zu jagen,
Nur nacb ihm war stets sein Vegehr.
Aber neben ihm 1reitet der Dämon her
Des Ehrgeizes mit finsterer Tücke,
Und so jagt er zuletzt auf die Brücke,
Die über dem Abgrund, d:m nächtlich schwarzen
Jählings abbricht."""
def _format_ocr_report(result):
    """Render a pipeline result dict as a human-readable, emoji-tagged report.

    Only keys that are present in ``result`` produce a line; the lines are
    joined with blank lines in between.
    """
    output_lines = []

    # Language detection
    if 'language' in result:
        output_lines.append(f"🌍 Language: {result['language']}")

    # Quality score, with a traffic-light marker (>= 0.8 green, >= 0.5 yellow)
    if 'score' in result:
        score = result['score']
        score_emoji = "🟢" if score >= 0.8 else "🟡" if score >= 0.5 else "🔴"
        output_lines.append(f"{score_emoji} Quality Score: {score:.1f}")

    # Diagnostics section (skipped when missing or empty)
    diagnostics = result.get('diagnostics')
    if diagnostics:
        # Model information
        if 'model_id' in diagnostics:
            output_lines.append(f"🤖 Model: {diagnostics['model_id']}")

        # Known tokens
        known_tokens = diagnostics.get('known_tokens')
        if known_tokens:
            output_lines.append(
                f"✅ Known tokens ({len(known_tokens)}): {', '.join(known_tokens)}"
            )

        # Unknown tokens (potential OCR errors). An empty list that is present
        # is reported explicitly as "everything matched".
        if 'unknown_tokens' in diagnostics:
            unknown_tokens = diagnostics['unknown_tokens']
            if unknown_tokens:
                output_lines.append(
                    f"❌ Unrecognized tokens ({len(unknown_tokens)}): {', '.join(unknown_tokens)}"
                )
            else:
                output_lines.append(
                    "✨ All tokens matched known lexicons – no OCR errors detected."
                )

    # Any other top-level fields are shown generically
    for key, value in result.items():
        if key not in ('language', 'score', 'diagnostics'):
            output_lines.append(f"🔍 {key.replace('_', ' ').title()}: {value}")

    return "\n\n".join(output_lines)


def process_ocr_qa(text, lang_choice):
    """Run the OCR QA pipeline on ``text`` and return a formatted report.

    Args:
        text: The OCR text to assess.
        lang_choice: Language selected in the UI. The special value
            ``"Auto-detect"`` is passed to the pipeline as ``language=None``;
            any other value is passed through unchanged.

    Returns:
        A string for display: the formatted report, a hint when no text was
        supplied, or ``"Error: ..."`` if the pipeline or formatting raised.
    """
    # Nothing to assess: answer directly instead of invoking the pipeline on
    # empty, whitespace-only or missing input.
    if not isinstance(text, str) or not text.strip():
        return "⚠️ Please enter some text to assess."

    lang = None if lang_choice == "Auto-detect" else lang_choice
    try:
        result = pipeline(text, language=lang, diagnostics=True)
        # Format the output for better readability
        if not isinstance(result, dict):
            return f"✨ Processed Result:\n{result}"
        return _format_ocr_report(result)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # letting the exception propagate to the caller.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
# Create the interface with logo and improved description.
# Layout: header (logo + intro), then a two-column row (inputs on the left,
# report on the right), then an initially hidden "About" accordion.
with gr.Blocks(title="OCR QA Demo") as demo:
    gr.HTML(
        """
<a href="https://impresso-project.ch" target="_blank">
<img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg"
alt="Impresso Project Logo"
style="height: 42px; display: block; margin: 5px auto; background-color: white;">
</a>
"""
    )
    gr.Markdown(
        """
# 🔍 Optical Character Recognition (OCR) Quality Assessment Demo
The demo showcases how the [Impresso Project](https://impresso-project.ch) assesses the quality of OCR transcripts by estimating the proportion of (un)known words with respect to a large clean text corpus.
It returns:
- a **quality score** between **0.0 (poor)** and **1.0 (excellent)**, and
- a list of **potential OCR errors** (unrecognized tokens) as well as the known tokens.
You can try the example below (a German text containing typical OCR errors), or paste your own OCR-processed text to assess its quality.
"""
    )
    with gr.Row():
        # Left column: text input, language choice and action buttons
        with gr.Column():
            text_input = gr.Textbox(
                label="OCR Text (from digitized sources)",
                value=EXAMPLE_TEXT,
                lines=8,
                placeholder="Paste OCR-processed text from a historical document..."
            )
            lang_dropdown = gr.Dropdown(
                choices=LANGUAGES,
                value="de",
                label="Language of the Text"
            )
            submit_btn = gr.Button("🔍 Assess OCR Text Quality", variant="primary")
            info_btn = gr.Button("Help", size="md", scale=1)
        # Right column: the formatted quality report
        with gr.Column():
            with gr.Row():
                output = gr.Textbox(
                    label="OCR Quality Report",
                    lines=15,
                    placeholder="The quality assessment will appear here...",
                    scale=10
                )
    # Info modal/accordion for pipeline details (hidden until "Help" is clicked)
    with gr.Accordion("📝 About the OCR QA Method", open=False, visible=False) as info_accordion:
        gr.Markdown(
            """
This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.
#### How it works:
- **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
- **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
- **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
- **Diagnostics output**:
- ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
- ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
- Note: Non-alphabetic characters will be removed. For efficiency reasons, all digits are replaced by the digit 0.
#### ⚠️ Limitations:
- The wordlists are **not exhaustive**, particularly for **historical vocabulary**, **dialects**, or **named entities**.
- The method may fail to flag **short OCR artifacts** (e.g., 1–2 character noise) and **non-alphabetic symbols**.
As such, the score should be understood as a **heuristic indicator**, best used for:
- Comparative assessments between OCR outputs
- Filtering low-quality text from large corpora
- Supporting decisions in corpus preparation and annotation workflows
It is **not a substitute for manual inspection** or ground-truth evaluation.
"""
        )
    # Run the assessment and show the report when the main button is clicked
    submit_btn.click(
        fn=process_ocr_qa,
        inputs=[text_input, lang_dropdown],
        outputs=output
    )
    # Reveal and expand the info accordion when the Help button is clicked.
    # This is one-way: the handler always sets visible=True/open=True, so a
    # second click does not hide the accordion again.
    info_btn.click(
        fn=lambda: gr.Accordion(visible=True, open=True),
        outputs=info_accordion
    )
# Bind to all interfaces on port 7860
demo.launch(server_name="0.0.0.0", server_port=7860)