Spaces:
Running
Running
maslionok
committed on
Commit
·
1ee396e
1
Parent(s):
2d73d72
first commit
Browse files
Dockerfile
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Container image for the OCR QA Gradio demo.
FROM python:3.9

# All application files live under /code.
WORKDIR /code

# Install pinned dependencies. The requirement with an extra is quoted so the
# shell does not treat "[ocrqa]" as a glob pattern.
RUN pip install --no-cache-dir --upgrade gradio==4.44.0 "impresso-pipelines[ocrqa]==0.4.6.6"

# Copy the application (app.py, logo.jpeg, ...) into the image.
COPY . /code

# Start the Gradio app; app.py binds to 0.0.0.0:7860.
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -1,11 +1,34 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# OCR Quality Assessment Pipeline Demo
|
2 |
+
|
3 |
+
This demo showcases the **OCR Quality Assessment Pipeline** from the Impresso project, which analyzes and improves text extracted from OCR (Optical Character Recognition).
|
4 |
+
|
5 |
+
## Features
|
6 |
+
|
7 |
+
- **OCR Error Detection**: Identifies common OCR mistakes and artifacts
|
8 |
+
- **Quality Assessment**: Evaluates the overall quality of OCR text
|
9 |
+
- **Text Correction**: Suggests improvements for detected errors
|
10 |
+
- **Interactive Interface**: User-friendly Gradio web interface
|
11 |
+
|
12 |
+
## Usage
|
13 |
+
|
14 |
+
The demo accepts OCR text input and provides:
|
15 |
+
- Quality assessment scores
|
16 |
+
- Detected OCR errors
|
17 |
+
- Suggested corrections
|
18 |
+
- Processed/improved text
|
19 |
+
|
20 |
+
## Example
|
21 |
+
|
22 |
+
Try the provided German text example that contains typical OCR errors like:
|
23 |
+
- Character misrecognition (e.g., "Zaubrisch" instead of "Zauberisch")
|
24 |
+
- Look-alike letter substitution (e.g., "nacb" instead of "nach")
|
25 |
+
- Punctuation errors (e.g., "d:m" instead of "dem")
|
26 |
+
|
27 |
+
## Installation
|
28 |
+
|
29 |
+
```bash
|
30 |
+
pip install gradio==4.44.0 "impresso-pipelines[ocrqa]==0.4.6.6"
|
31 |
+
python app.py
|
32 |
+
```
|
33 |
+
|
34 |
+
The demo will be available at `http://localhost:7860`
|
app.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
# Redirect cache to a writable path inside container
|
4 |
+
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
from impresso_pipelines.ocrqa import OCRQAPipeline
|
8 |
+
|
9 |
+
pipeline = OCRQAPipeline()
|
10 |
+
|
11 |
+
# Example OCR text (German text with typical OCR errors)
|
12 |
+
EXAMPLE_TEXT = """Vieles Seltsame geschieht auf Erden :
|
13 |
+
Nichts Seltsameres sieht der Mond
|
14 |
+
Als das Glück, das im Knopfloch wohnt.
|
15 |
+
Zaubrisch faßt es den ernsten Mann.
|
16 |
+
Ohne nach Weib u. Kind zu fragen
|
17 |
+
Reitet er aus, nach dem Glück zu jagen,
|
18 |
+
Nur nacb ihm war stets sein Vegehr.
|
19 |
+
Aber neben ihm reitet der Dämon her
|
20 |
+
Des Ehrgeizes mit finsterer Tücke,
|
21 |
+
Und so jagt er zuletzt auf die Brücke,
|
22 |
+
Die über dem Abgrund, d:m nächtlich schwarzen
|
23 |
+
Jählings abbricht."""
|
24 |
+
|
25 |
+
def _format_corrections(value):
    """Return the display lines for the value stored under ``corrections``.

    A non-empty list yields one bullet per item, a non-empty dict one bullet
    per ``key: value`` pair, any other truthy value a single bullet, and an
    empty/None value the "no corrections" notice.
    """
    if isinstance(value, list) and value:
        return [f" • {correction}" for correction in value]
    if isinstance(value, dict) and value:
        return [f" • {sub_key}: {sub_value}" for sub_key, sub_value in value.items()]
    if value:
        # A non-empty scalar (e.g. a string) is still a correction; show it
        # instead of claiming that nothing was found.
        return [f" • {value}"]
    return [" No corrections found"]


def _format_result(result):
    """Render a pipeline result as human-readable text.

    Dict results are rendered one section per key, sections separated by a
    blank line; any other result is rendered with ``str()`` under a generic
    heading.
    """
    if not isinstance(result, dict):
        return f"✨ **Processed Result:**\n{result}"

    output_lines = []
    for key, value in result.items():
        if key == 'corrections':
            output_lines.append(f"📝 **{key.replace('_', ' ').title()}:**")
            output_lines.extend(_format_corrections(value))
        elif key == 'quality_score':
            output_lines.append(f"⭐ **Quality Score:** {value}")
        elif key == 'processed_text':
            output_lines.append(f"✨ **Processed Text:**\n{value}")
        else:
            # Unknown keys are shown generically with a title-cased label.
            output_lines.append(f"🔍 **{key.replace('_', ' ').title()}:** {value}")

    return "\n\n".join(output_lines)


def process_ocr_qa(text):
    """Run the OCR QA pipeline on ``text`` and return a printable report.

    Args:
        text: The OCR text entered by the user.

    Returns:
        A string: a prompt when ``text`` is empty, whitespace-only or None;
        the formatted pipeline result on success; or ``"Error: <message>"``
        when the pipeline or the formatting raises.

    NOTE(review): the special-cased keys ('corrections', 'quality_score',
    'processed_text') are simply what this formatter looks for; which keys
    the pipeline actually returns is not visible here -- confirm against the
    impresso-pipelines documentation.
    """
    # Do not send empty input to the pipeline; give the user a clear hint
    # instead of an opaque pipeline error.
    if not text or not text.strip():
        return "Please enter some OCR text to analyze."

    try:
        result = pipeline(text)
        # Format the output for better readability
        return _format_result(result)
    except Exception as e:
        # UI boundary: report the failure in the output box rather than
        # crashing the request, and leave a trace on stdout.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
|
57 |
+
|
58 |
+
# Create the interface with logo and improved description
|
59 |
+
with gr.Blocks(title="OCR QA Demo") as demo:
|
60 |
+
# Add logo at the top
|
61 |
+
gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
|
62 |
+
|
63 |
+
gr.Markdown(
|
64 |
+
"""
|
65 |
+
# 🔍 OCR Quality Assessment Pipeline Demo
|
66 |
+
|
67 |
+
**OCR Quality Assessment** demonstrates how text extracted from OCR (Optical Character Recognition)
|
68 |
+
is analyzed and improved in the **Impresso** project. This pipeline identifies OCR errors,
|
69 |
+
assesses text quality, and provides corrections for better text processing.
|
70 |
+
|
71 |
+
Try the example below (German text with typical OCR errors) or enter your own OCR text to see how it gets processed!
|
72 |
+
"""
|
73 |
+
)
|
74 |
+
|
75 |
+
with gr.Row():
|
76 |
+
with gr.Column():
|
77 |
+
text_input = gr.Textbox(
|
78 |
+
label="Enter OCR Text",
|
79 |
+
value=EXAMPLE_TEXT,
|
80 |
+
lines=8,
|
81 |
+
placeholder="Enter your OCR text here..."
|
82 |
+
)
|
83 |
+
submit_btn = gr.Button("🔍 Analyze OCR Quality", variant="primary")
|
84 |
+
|
85 |
+
with gr.Column():
|
86 |
+
with gr.Row():
|
87 |
+
output = gr.Textbox(
|
88 |
+
label="Analysis Results",
|
89 |
+
lines=15,
|
90 |
+
placeholder="Results will appear here...",
|
91 |
+
scale=10
|
92 |
+
)
|
93 |
+
info_btn = gr.Button("Pipeline Info", size="sm", scale=1)
|
94 |
+
|
95 |
+
# Info modal/accordion for pipeline details
|
96 |
+
with gr.Accordion("📝 About the OCR QA Pipeline", open=False, visible=False) as info_accordion:
|
97 |
+
gr.Markdown(
|
98 |
+
"""
|
99 |
+
- **OCR Error Detection**: Identifies common OCR mistakes and artifacts
|
100 |
+
- **Quality Assessment**: Evaluates the overall quality of OCR text
|
101 |
+
- **Text Correction**: Suggests improvements for detected errors
|
102 |
+
- **Language Processing**: Handles multilingual OCR text processing
|
103 |
+
"""
|
104 |
+
)
|
105 |
+
|
106 |
+
submit_btn.click(
|
107 |
+
fn=process_ocr_qa,
|
108 |
+
inputs=[text_input],
|
109 |
+
outputs=output
|
110 |
+
)
|
111 |
+
|
112 |
+
# Reveal and expand the info accordion when the info button is clicked (it is never hidden again)
|
113 |
+
info_btn.click(
|
114 |
+
fn=lambda: gr.Accordion(visible=True, open=True),
|
115 |
+
outputs=info_accordion
|
116 |
+
)
|
117 |
+
|
118 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
logo.jpeg
ADDED
![]() |