maslionok committed on
Commit
1ee396e
·
1 Parent(s): 2d73d72

first commit

Browse files
Files changed (4) hide show
  1. Dockerfile +9 -0
  2. README.md +34 -11
  3. app.py +118 -0
  4. logo.jpeg +0 -0
Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ WORKDIR /code
4
+
5
+ RUN pip install --no-cache-dir --upgrade gradio==4.44.0 impresso-pipelines[ocrqa]==0.4.6.6
6
+
7
+ COPY . /code
8
+
9
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,11 +1,34 @@
1
- ---
2
- title: Ocrqa Demo
3
- emoji: 😻
4
- colorFrom: yellow
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- short_description: Demo of ocrqa pipeline from impresso pipelines
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OCR Quality Assessment Pipeline Demo
2
+
3
+ This demo showcases the **OCR Quality Assessment Pipeline** from the Impresso project, which analyzes and improves text extracted from OCR (Optical Character Recognition).
4
+
5
+ ## Features
6
+
7
+ - **OCR Error Detection**: Identifies common OCR mistakes and artifacts
8
+ - **Quality Assessment**: Evaluates the overall quality of OCR text
9
+ - **Text Correction**: Suggests improvements for detected errors
10
+ - **Interactive Interface**: User-friendly Gradio web interface
11
+
12
+ ## Usage
13
+
14
+ The demo accepts OCR text input and provides:
15
+ - Quality assessment scores
16
+ - Detected OCR errors
17
+ - Suggested corrections
18
+ - Processed/improved text
19
+
20
+ ## Example
21
+
22
+ Try the provided German text example that contains typical OCR errors like:
23
+ - Character misrecognition (e.g., "Zaubrisch" instead of "Zauberisch")
24
+ - Look-alike letter confusion (e.g., "nacb" instead of "nach")
25
+ - Punctuation errors (e.g., "d:m" instead of "dem")
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install gradio==4.44.0 "impresso-pipelines[ocrqa]==0.4.6.6"
31
+ python app.py
32
+ ```
33
+
34
+ The demo will be available at `http://localhost:7860`
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ # Redirect cache to a writable path inside container
4
+ os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
5
+
6
+ import gradio as gr
7
+ from impresso_pipelines.ocrqa import OCRQAPipeline
8
+
9
+ pipeline = OCRQAPipeline()
10
+
11
+ # Example OCR text (German text with typical OCR errors)
12
+ EXAMPLE_TEXT = """Vieles Seltsame geschieht auf Erden :
13
+ Nichts Seltsameres sieht der Mond
14
+ Als das Glück, das im Knopfloch wohnt.
15
+ Zaubrisch faßt es den ernsten Mann.
16
+ Ohne nach Weib u. Kind zu fragen
17
+ Reitet er aus, nach dem Glück zu jagen,
18
+ Nur nacb ihm war stets sein Vegehr.
19
+ Aber neben ihm reitet der Dämon her
20
+ Des Ehrgeizes mit finsterer Tücke,
21
+ Und so jagt er zuletzt auf die Brücke,
22
+ Die über dem Abgrund, d:m nächtlich schwarzen
23
+ Jählings abbricht."""
24
+
25
def _heading(key):
    """Turn a result key such as ``quality_score`` into ``Quality Score``."""
    return key.replace('_', ' ').title()


def _format_corrections(value):
    """Return the bullet lines for the ``corrections`` entry of a result.

    Lists yield one bullet per item and dicts one bullet per key/value pair.
    Any other non-empty value is shown as a single bullet instead of being
    reported as "No corrections found", so pipeline output is never dropped.
    Empty or falsy values produce the "No corrections found" line.
    """
    if isinstance(value, list) and value:
        return [f" • {correction}" for correction in value]
    if isinstance(value, dict) and value:
        return [
            f" • {sub_key}: {sub_value}"
            for sub_key, sub_value in value.items()
        ]
    if value:
        return [f" • {value}"]
    return [" No corrections found"]


def _format_result(result):
    """Render a pipeline result as the text shown in the results box.

    A dict is rendered one entry per key, with entries separated by a blank
    line; any other result is rendered as-is below a generic heading.
    """
    if not isinstance(result, dict):
        return f"✨ **Processed Result:**\n{result}"

    output_lines = []
    for key, value in result.items():
        if key == 'corrections':
            output_lines.append(f"📝 **{_heading(key)}:**")
            output_lines.extend(_format_corrections(value))
        elif key == 'quality_score':
            output_lines.append(f"⭐ **Quality Score:** {value}")
        elif key == 'processed_text':
            output_lines.append(f"✨ **Processed Text:**\n{value}")
        else:
            output_lines.append(f"🔍 **{_heading(key)}:** {value}")

    return "\n\n".join(output_lines)


def process_ocr_qa(text):
    """Run the module-level ``pipeline`` on *text* and format its result.

    Args:
        text: The OCR text entered in the input box.

    Returns:
        A display string. Dict results are formatted key by key; any other
        result is printed under a "Processed Result" heading. If the pipeline
        or the formatting raises, the error is printed and returned as
        ``"Error: <message>"`` rather than propagated.
    """
    try:
        # Formatting stays inside the try so that a failure while rendering
        # the result is reported to the user the same way as a pipeline error.
        return _format_result(pipeline(text))
    except Exception as e:
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
57
+
58
+ # Create the interface with logo and improved description
59
+ with gr.Blocks(title="OCR QA Demo") as demo:
60
+ # Add logo at the top
61
+ gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
62
+
63
+ gr.Markdown(
64
+ """
65
+ # 🔍 OCR Quality Assessment Pipeline Demo
66
+
67
+ **OCR Quality Assessment** demonstrates how text extracted from OCR (Optical Character Recognition)
68
+ is analyzed and improved in the **Impresso** project. This pipeline identifies OCR errors,
69
+ assesses text quality, and provides corrections for better text processing.
70
+
71
+ Try the example below (German text with typical OCR errors) or enter your own OCR text to see how it gets processed!
72
+ """
73
+ )
74
+
75
+ with gr.Row():
76
+ with gr.Column():
77
+ text_input = gr.Textbox(
78
+ label="Enter OCR Text",
79
+ value=EXAMPLE_TEXT,
80
+ lines=8,
81
+ placeholder="Enter your OCR text here..."
82
+ )
83
+ submit_btn = gr.Button("🔍 Analyze OCR Quality", variant="primary")
84
+
85
+ with gr.Column():
86
+ with gr.Row():
87
+ output = gr.Textbox(
88
+ label="Analysis Results",
89
+ lines=15,
90
+ placeholder="Results will appear here...",
91
+ scale=10
92
+ )
93
+ info_btn = gr.Button("Pipeline Info", size="sm", scale=1)
94
+
95
+ # Accordion with pipeline details; hidden until the "Pipeline Info" button is clicked
96
+ with gr.Accordion("📝 About the OCR QA Pipeline", open=False, visible=False) as info_accordion:
97
+ gr.Markdown(
98
+ """
99
+ - **OCR Error Detection**: Identifies common OCR mistakes and artifacts
100
+ - **Quality Assessment**: Evaluates the overall quality of OCR text
101
+ - **Text Correction**: Suggests improvements for detected errors
102
+ - **Language Processing**: Handles multilingual OCR text processing
103
+ """
104
+ )
105
+
106
+ submit_btn.click(
107
+ fn=process_ocr_qa,
108
+ inputs=[text_input],
109
+ outputs=output
110
+ )
111
+
112
+ # Show and expand the info accordion when the info button is clicked (it is not hidden again)
113
+ info_btn.click(
114
+ fn=lambda: gr.Accordion(visible=True, open=True),
115
+ outputs=info_accordion
116
+ )
117
+
118
+ demo.launch(server_name="0.0.0.0", server_port=7860)
logo.jpeg ADDED