Spaces:
Running
Running
import os | |
# Redirect cache to a writable path inside container | |
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache" | |
import gradio as gr | |
from impresso_pipelines.ocrqa import OCRQAPipeline | |
# Build the OCR QA pipeline once at import time; process_ocr_qa() calls this
# shared module-level instance for every request.
pipeline: OCRQAPipeline = OCRQAPipeline()
# Language codes offered in the dropdown next to the "Auto-detect" entry.
# Kept as a list because the UI concatenates it with another list.
LANGUAGES = ["en", "de", "fr"]

# Sample input pre-filled in the text box: German verse containing typical OCR
# errors (e.g. "Kinjd", "nacb", "1reitet", "d:m").
EXAMPLE_TEXT = "\n".join(
    (
        "Vieles Seltsame geschieht auf Erden :",
        "Nichts Seltsameres sieht der Mond",
        "Als das Glück, das im Knopfloch wohnt.",
        "Zaubrisch faßt es den ernsten Mann.",
        "Ohne nach Weib u. Kinjd zu fragen",
        "Reitet er aus, nach dem Glück zu jagen,",
        "Nur nacb ihm war stets sein Vegehr.",
        "Aber neben ihm 1reitet der Dämon her",
        "Des Ehrgeizes mit finsterer Tücke,",
        "Und so jagt er zuletzt auf die Brücke,",
        "Die über dem Abgrund, d:m nächtlich schwarzen",
        "Jählings abbricht.",
    )
)
def _format_result(result):
    """Render a pipeline result as the text shown in the results box.

    Dict results are formatted field by field (language, score, diagnostics,
    then any remaining keys); any other result type is shown verbatim.
    """
    if not isinstance(result, dict):
        return f"✨ Processed Result:\n{result}"

    output_lines = []

    # Language detection
    if 'language' in result:
        output_lines.append(f"🌍 Language: {result['language']}")

    # Quality score, colour-coded by threshold
    if 'score' in result:
        score = result['score']
        if score >= 0.8:
            score_emoji = "🟢"
        elif score >= 0.5:
            score_emoji = "🟡"
        else:
            score_emoji = "🔴"
        output_lines.append(f"{score_emoji} Quality Score: {score:.1f}")

    # Diagnostics section (skipped when missing or empty)
    diagnostics = result.get('diagnostics')
    if diagnostics:
        # Model information
        if 'model_id' in diagnostics:
            output_lines.append(f"🤖 Model: {diagnostics['model_id']}")

        # Known tokens; str() keeps the join safe if a token is not a string
        known_tokens = diagnostics.get('known_tokens')
        if known_tokens:
            joined = ', '.join(map(str, known_tokens))
            output_lines.append(f"✅ Known tokens ({len(known_tokens)}): {joined}")

        # Unknown tokens (potential OCR errors); an empty collection means the
        # pipeline reported the field but found nothing suspicious
        if 'unknown_tokens' in diagnostics:
            unknown_tokens = diagnostics['unknown_tokens']
            if unknown_tokens:
                joined = ', '.join(map(str, unknown_tokens))
                output_lines.append(
                    f"❌ Potential OCR errors ({len(unknown_tokens)}): {joined}"
                )
            else:
                output_lines.append("✨ No potential OCR errors detected!")

    # Other fields: anything the pipeline returned beyond the handled keys
    for key, value in result.items():
        if key not in ('language', 'score', 'diagnostics'):
            output_lines.append(f"🔍 {key.replace('_', ' ').title()}: {value}")

    return "\n\n".join(output_lines)


def process_ocr_qa(text, lang_choice):
    """Run the OCR QA pipeline on *text* and return a human-readable report.

    Args:
        text: OCR text entered by the user.
        lang_choice: A language code from the dropdown, or "Auto-detect",
            in which case ``None`` is passed to the pipeline as the language.

    Returns:
        The formatted analysis, a prompt message when *text* is empty or
        whitespace-only, or ``"Error: ..."`` if the pipeline or the formatting
        raised. Exceptions are never propagated to the UI.
    """
    # Nothing to analyse: answer directly instead of invoking the pipeline.
    if not text or not text.strip():
        return "📝 Please enter some OCR text to analyze."

    lang = None if lang_choice == "Auto-detect" else lang_choice
    try:
        result = pipeline(text, language=lang, diagnostics=True)
        # Format the output for better readability
        return _format_result(result)
    except Exception as e:
        # UI boundary: log to the container output and show the message to the user.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
# Create the interface with logo and improved description.
# NOTE(review): the layout nesting below was reconstructed from a
# whitespace-stripped copy of this file — confirm it matches the intended layout.
with gr.Blocks(title="OCR QA Demo") as demo:
    gr.HTML(
        """
        <a href="https://impresso-project.ch" target="_blank">
        <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg" alt="Impresso Project Logo" style="height: 100px;">
        </a>
        """
    )
    gr.Markdown(
        """
        # 🔍 OCR Quality Assessment Demo
        This demo showcases the **OCR Quality Assessment (OCRQA)** pipeline developed as part of the [Impresso Project](https://impresso-project.ch). The pipeline evaluates the quality of text extracted via **Optical Character Recognition (OCR)** by estimating the proportion of recognizable words.
        It returns:
        - a **quality score** between **0.0 (poor)** and **1.0 (excellent)**, and
        - a list of **potential OCR errors** (unrecognized tokens).
        You can try the example below (a German text containing typical OCR errors), or paste your own OCR-processed text to assess its quality.
        """
    )

    with gr.Row():
        # Left column: input text, language choice and the submit button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter OCR Text",
                value=EXAMPLE_TEXT,
                lines=8,
                placeholder="Enter your OCR text here..."
            )
            lang_dropdown = gr.Dropdown(
                choices=["Auto-detect"] + LANGUAGES,
                value="de",
                label="Language"
            )
            submit_btn = gr.Button("🔍 Analyze OCR Quality", variant="primary")

        # Right column: results box with a small info button beside it.
        with gr.Column():
            with gr.Row():
                output = gr.Textbox(
                    label="Analysis Results",
                    lines=15,
                    placeholder="Results will appear here...",
                    scale=10
                )
                info_btn = gr.Button("Pipeline Info", size="sm", scale=1)

    # Info accordion for pipeline details; hidden until the info button is clicked.
    with gr.Accordion("📝 About the OCR QA Pipeline", open=False, visible=False) as info_accordion:
        gr.Markdown(
            """
            - **Quality Score**: Evaluates the overall quality of OCR text. From 0.0 (poor) to 1.0 (excellent)
            - **Known tokens**: Words recognized as valid in the selected language
            - **Potential OCR errors**: Identifies common OCR mistakes and artifacts
            """
        )

    # Per-session flag remembering whether the accordion is currently shown,
    # so a second click on the info button can hide it again.
    info_shown = gr.State(False)

    def _toggle_info(shown):
        """Flip the accordion between hidden and shown-and-expanded."""
        shown = not shown
        return gr.Accordion(visible=shown, open=shown), shown

    submit_btn.click(
        fn=process_ocr_qa,
        inputs=[text_input, lang_dropdown],
        outputs=output
    )

    # Toggle info visibility when info button is clicked
    info_btn.click(
        fn=_toggle_info,
        inputs=info_shown,
        outputs=[info_accordion, info_shown]
    )

# Bind to all interfaces on port 7860 so the app is reachable from outside the container.
demo.launch(server_name="0.0.0.0", server_port=7860)