Spaces:

impresso-project
/

ocrqa-demo

Sleeping

App Files Files Community

maslionok committed on Sep 24

Commit

b88cf51

2 Parent(s): 37fdf42 1fd6ee7

Merge branch 'main' of https://huggingface.co/spaces/impresso-project/ocrqa-demo

Browse files

Files changed (4) hide show

.gitattributes +1 -0
README.md +4 -4
app.py +55 -28
logo.jpeg +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+thumbnail.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
-title: OCR Quality Assessment Pipeline Demo
-emoji: 🔥
-colorFrom: blue
 colorTo: indigo
 sdk: docker
 pinned: false
-short_description: OCR Quality Assessment demo for Impresso project
 ---

 ---
+title: OCR Quality Assessment Demo
+emoji: 🔍
+colorFrom: gray
 colorTo: indigo
 sdk: docker
 pinned: false
+short_description: Measure the OCR output quality using word recognition ratios
 ---

app.py CHANGED Viewed

@@ -59,9 +59,9 @@ def process_ocr_qa(text, lang_choice):
                 # Unknown tokens (potential OCR errors)
                 if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
                     unknown_tokens = diagnostics['unknown_tokens']
-                    output_lines.append(f"❌ Potential OCR errors ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
                 elif 'unknown_tokens' in diagnostics:
-                    output_lines.append("✨ No potential OCR errors detected!")
             # Other fields
             for key, value in result.items():
@@ -78,55 +78,82 @@ def process_ocr_qa(text, lang_choice):
 # Create the interface with logo and improved description
 with gr.Blocks(title="OCR QA Demo") as demo:
-    # Add logo at the top
-    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
     gr.Markdown(
         """
-        # 🔍 OCR Quality Assessment Pipeline Demo
-        **OCR Quality Assessment** demonstrates how text extracted from OCR (Optical Character Recognition)
-        is analyzed in the **Impresso** project. This pipeline identifies OCR errors and
-        assesses text quality, returning a score from 0.0 to 1.0.
-        Try the example below (German text with typical OCR errors) or enter your own OCR text to see how it gets processed!
-        """
     )
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
-                label="Enter OCR Text",
                 value=EXAMPLE_TEXT,
                 lines=8,
-                placeholder="Enter your OCR text here..."
             )
             lang_dropdown = gr.Dropdown(
-                choices=["Auto-detect"] + LANGUAGES,
                 value="de",
-                label="Language"
             )
-            submit_btn = gr.Button("🔍 Analyze OCR Quality", variant="primary")
         with gr.Column():
             with gr.Row():
                 output = gr.Textbox(
-                    label="Analysis Results",
                     lines=15,
-                    placeholder="Results will appear here...",
                     scale=10
                 )
-                info_btn = gr.Button("Pipeline Info", size="sm", scale=1)
     # Info modal/accordion for pipeline details
-    with gr.Accordion("📝 About the OCR QA Pipeline", open=False, visible=False) as info_accordion:
         gr.Markdown(
-            """
-            - **Quality Score**: Evaluates the overall quality of OCR text. From 0.0 (poor) to 1.0 (excellent)
-            - **Known tokens**: Words recognized as valid in the selected language
-            - **Potential OCR errors**: Identifies common OCR mistakes and artifacts
-            """
-        )
     submit_btn.click(
         fn=process_ocr_qa,

                 # Unknown tokens (potential OCR errors)
                 if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
                     unknown_tokens = diagnostics['unknown_tokens']
+                    output_lines.append(f"❌ Unrecognized tokens ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
                 elif 'unknown_tokens' in diagnostics:
+                    output_lines.append("✨ All tokens matched known lexicons – no OCR errors detected.")
             # Other fields
             for key, value in result.items():
 # Create the interface with logo and improved description
 with gr.Blocks(title="OCR QA Demo") as demo:
+    gr.HTML(
+    """
+    <a href="https://impresso-project.ch" target="_blank">
+        <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg"
+             alt="Impresso Project Logo"
+             style="height: 42px; display: block; margin: 5px auto; background-color: white;">
+    </a>
+    """
+)
     gr.Markdown(
         """
+    # 🔍 Optical Character Recognition (OCR) Quality Assessment Demo
+    The demo showcases how the [Impresso Project](https://impresso-project.ch) assesses the quality of OCR transcripts by estimating the proportion of (un)known words with respect to a large clean text corpus.
+    It returns:
+    - a **quality score** between **0.0 (poor)** and **1.0 (excellent)**, and
+    - a list of **potential OCR errors** (unrecognized tokens) as well as the known tokens.
+    You can try the example below (a German text containing typical OCR errors), or paste your own OCR-processed text to assess its quality.
+    """
     )
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
+                label="OCR Text (from digitized sources)",
                 value=EXAMPLE_TEXT,
                 lines=8,
+                placeholder="Paste OCR-processed text from a historical document..."
             )
             lang_dropdown = gr.Dropdown(
+                choices=LANGUAGES,
                 value="de",
+                label="Language of the Text"
             )
+            submit_btn = gr.Button("🔍 Assess OCR Text Quality", variant="primary")
+            info_btn = gr.Button("Help", size="md", scale=1)
         with gr.Column():
             with gr.Row():
                 output = gr.Textbox(
+                    label="OCR Quality Report",
                     lines=15,
+                    placeholder="The quality assessment will appear here...",
                     scale=10
                 )
     # Info modal/accordion for pipeline details
+    with gr.Accordion("📝 About the OCR QA Method", open=False, visible=False) as info_accordion:
         gr.Markdown(
+    """
+    This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.
+    #### How it works:
+    - **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
+    - **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
+    - **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
+    - **Diagnostics output**:
+        - ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
+        - ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
+        - Note: Non-alphabetic characters will be removed. For efficiency reasons, all digits are replaced by the digit 0.
+    #### ⚠️ Limitations:
+    - The wordlists are **not exhaustive**, particularly for **historical vocabulary**, **dialects**, or **named entities**.
+    - The method may fail to flag **short OCR artifacts** (e.g., 1–2 character noise) and **non-alphabetic symbols**.
+    As such, the score should be understood as a **heuristic indicator**, best used for:
+    - Comparative assessments between OCR outputs
+    - Filtering low-quality text from large corpora
+    - Supporting decisions in corpus preparation and annotation workflows
+    It is **not a substitute for manual inspection** or ground-truth evaluation.
+    """
+)
     submit_btn.click(
         fn=process_ocr_qa,

logo.jpeg CHANGED Viewed