maslionok committed on
Commit
b88cf51
·
2 Parent(s): 37fdf42 1fd6ee7

Merge branch 'main' of https://huggingface.co/spaces/impresso-project/ocrqa-demo

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. README.md +4 -4
  3. app.py +55 -28
  4. logo.jpeg +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ thumbnail.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: OCR Quality Assessment Pipeline Demo
3
- emoji: 🔥
4
- colorFrom: blue
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
- short_description: OCR Quality Assessment demo for Impresso project
9
  ---
10
 
 
1
  ---
2
+ title: OCR Quality Assessment Demo
3
+ emoji: 🔍
4
+ colorFrom: gray
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
+ short_description: Measure the OCR output quality using word recognition ratios
9
  ---
10
 
app.py CHANGED
@@ -59,9 +59,9 @@ def process_ocr_qa(text, lang_choice):
59
  # Unknown tokens (potential OCR errors)
60
  if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
61
  unknown_tokens = diagnostics['unknown_tokens']
62
- output_lines.append(f"❌ Potential OCR errors ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
63
  elif 'unknown_tokens' in diagnostics:
64
- output_lines.append("✨ No potential OCR errors detected!")
65
 
66
  # Other fields
67
  for key, value in result.items():
@@ -78,55 +78,82 @@ def process_ocr_qa(text, lang_choice):
78
 
79
  # Create the interface with logo and improved description
80
  with gr.Blocks(title="OCR QA Demo") as demo:
81
- # Add logo at the top
82
- gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
83
-
 
 
 
 
 
 
84
  gr.Markdown(
85
  """
86
- # 🔍 OCR Quality Assessment Pipeline Demo
87
-
88
- **OCR Quality Assessment** demonstrates how text extracted from OCR (Optical Character Recognition)
89
- is analyzed in the **Impresso** project. This pipeline identifies OCR errors and
90
- assesses text quality, returning a score from 0.0 to 1.0.
91
-
92
- Try the example below (German text with typical OCR errors) or enter your own OCR text to see how it gets processed!
93
- """
 
 
94
  )
95
 
96
  with gr.Row():
97
  with gr.Column():
98
  text_input = gr.Textbox(
99
- label="Enter OCR Text",
100
  value=EXAMPLE_TEXT,
101
  lines=8,
102
- placeholder="Enter your OCR text here..."
103
  )
104
  lang_dropdown = gr.Dropdown(
105
- choices=["Auto-detect"] + LANGUAGES,
106
  value="de",
107
- label="Language"
108
  )
109
- submit_btn = gr.Button("🔍 Analyze OCR Quality", variant="primary")
 
110
 
111
  with gr.Column():
112
  with gr.Row():
113
  output = gr.Textbox(
114
- label="Analysis Results",
115
  lines=15,
116
- placeholder="Results will appear here...",
117
  scale=10
118
  )
119
- info_btn = gr.Button("Pipeline Info", size="sm", scale=1)
120
 
121
  # Info modal/accordion for pipeline details
122
- with gr.Accordion("📝 About the OCR QA Pipeline", open=False, visible=False) as info_accordion:
123
  gr.Markdown(
124
- """
125
- - **Quality Score**: Evaluates the overall quality of OCR text. From 0.0 (poor) to 1.0 (excellent)
126
- - **Known tokens**: Words recognized as valid in the selected language
127
- - **Potential OCR errors**: Identifies common OCR mistakes and artifacts
128
- """
129
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  submit_btn.click(
132
  fn=process_ocr_qa,
 
59
  # Unknown tokens (potential OCR errors)
60
  if 'unknown_tokens' in diagnostics and diagnostics['unknown_tokens']:
61
  unknown_tokens = diagnostics['unknown_tokens']
62
+ output_lines.append(f"❌ Unrecognized tokens ({len(unknown_tokens)}): {', '.join(unknown_tokens)}")
63
  elif 'unknown_tokens' in diagnostics:
64
+ output_lines.append("✨ All tokens matched known lexicons – no OCR errors detected.")
65
 
66
  # Other fields
67
  for key, value in result.items():
 
78
 
79
  # Create the interface with logo and improved description
80
  with gr.Blocks(title="OCR QA Demo") as demo:
81
+ gr.HTML(
82
+ """
83
+ <a href="https://impresso-project.ch" target="_blank">
84
+ <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg"
85
+ alt="Impresso Project Logo"
86
+ style="height: 42px; display: block; margin: 5px auto; background-color: white;">
87
+ </a>
88
+ """
89
+ )
90
  gr.Markdown(
91
  """
92
+ # 🔍 Optical Character Recognition (OCR) Quality Assessment Demo
93
+
94
+ The demo showcases how the [Impresso Project](https://impresso-project.ch) assesses the quality of OCR transcripts by estimating the proportion of (un)known words with respect to a large clean text corpus.
95
+
96
+ It returns:
97
+ - a **quality score** between **0.0 (poor)** and **1.0 (excellent)**, and
98
+ - a list of **potential OCR errors** (unrecognized tokens) as well as the known tokens.
99
+
100
+ You can try the example below (a German text containing typical OCR errors), or paste your own OCR-processed text to assess its quality.
101
+ """
102
  )
103
 
104
  with gr.Row():
105
  with gr.Column():
106
  text_input = gr.Textbox(
107
+ label="OCR Text (from digitized sources)",
108
  value=EXAMPLE_TEXT,
109
  lines=8,
110
+ placeholder="Paste OCR-processed text from a historical document..."
111
  )
112
  lang_dropdown = gr.Dropdown(
113
+ choices=LANGUAGES,
114
  value="de",
115
+ label="Language of the Text"
116
  )
117
+ submit_btn = gr.Button("🔍 Assess OCR Text Quality", variant="primary")
118
+ info_btn = gr.Button("Help", size="md", scale=1)
119
 
120
  with gr.Column():
121
  with gr.Row():
122
  output = gr.Textbox(
123
+ label="OCR Quality Report",
124
  lines=15,
125
+ placeholder="The quality assessment will appear here...",
126
  scale=10
127
  )
128
+
129
 
130
  # Info modal/accordion for pipeline details
131
+ with gr.Accordion("📝 About the OCR QA Method", open=False, visible=False) as info_accordion:
132
  gr.Markdown(
133
+ """
134
+ This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.
135
+
136
+ #### How it works:
137
+ - **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
138
+ - **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
139
+ - **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
140
+ - **Diagnostics output**:
141
+ - ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
142
+ - ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
143
+ - Note: Non-alphabetic characters will be removed. For efficiency reasons, all digits are replaced by the digit 0.
144
+
145
+ #### ⚠️ Limitations:
146
+ - The wordlists are **not exhaustive**, particularly for **historical vocabulary**, **dialects**, or **named entities**.
147
+ - The method may fail to flag **short OCR artifacts** (e.g., 1–2 character noise) and **non-alphabetic symbols**.
148
+
149
+ As such, the score should be understood as a **heuristic indicator**, best used for:
150
+ - Comparative assessments between OCR outputs
151
+ - Filtering low-quality text from large corpora
152
+ - Supporting decisions in corpus preparation and annotation workflows
153
+
154
+ It is **not a substitute for manual inspection** or ground-truth evaluation.
155
+ """
156
+ )
157
 
158
  submit_btn.click(
159
  fn=process_ocr_qa,
logo.jpeg CHANGED