Gabriel committed on
Commit
fb3185e
·
1 Parent(s): cfb37bf
Files changed (5) hide show
  1. .python-version +1 -0
  2. app.py +858 -4
  3. pyproject.toml +12 -0
  4. requirements.txt +4 -0
  5. uv.lock +0 -0
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
app.py CHANGED
@@ -1,7 +1,861 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import io
import json
import os
import tempfile
from datetime import datetime
from typing import Dict, List, Literal, Optional
from xml.sax.saxutils import escape

import gradio as gr
# `spaces` supplies the @spaces.GPU decorator applied to process_htr. It is
# imported ahead of htrflow because the ZeroGPU documentation recommends
# importing it before packages that may initialise CUDA.
import spaces
import yaml
from PIL import Image, ImageDraw, ImageFont

from htrflow.pipeline.pipeline import Pipeline
from htrflow.volume.volume import Collection
14
 
15
# Model identifiers shared by the pipeline templates below. Keeping each id in
# one place prevents the four templates from drifting apart.
_LINE_SEGMENTATION_MODEL = "Riksarkivet/yolov9-lines-within-regions-1"
_REGION_SEGMENTATION_MODEL = "Riksarkivet/yolov9-regions-1"
_TEXT_RECOGNITION_MODELS = {
    "english": "microsoft/trocr-base-handwritten",
    "swedish": "Riksarkivet/trocr-base-handwritten-hist-swe-2",
}


def _segmentation_step(model_id, batch_size):
    """Return a fresh YOLO "Segmentation" step dict for *model_id*."""
    return {
        "step": "Segmentation",
        "settings": {
            "model": "yolo",
            "model_settings": {"model": model_id},
            "generation_settings": {"batch_size": batch_size},
        },
    }


def _text_recognition_step(language):
    """Return a fresh TrOCR "TextRecognition" step dict for *language*."""
    return {
        "step": "TextRecognition",
        "settings": {
            "model": "TrOCR",
            "model_settings": {"model": _TEXT_RECOGNITION_MODELS[language]},
            "generation_settings": {"batch_size": 16},
        },
    }


def _letter_config(language):
    """Return the template for letters: line segmentation, recognition, line ordering."""
    return {
        "steps": [
            _segmentation_step(_LINE_SEGMENTATION_MODEL, 8),
            _text_recognition_step(language),
            {"step": "OrderLines"},
        ]
    }


def _spread_config(language):
    """Return the template for spreads: region + line segmentation, recognition, reading order."""
    return {
        "steps": [
            _segmentation_step(_REGION_SEGMENTATION_MODEL, 4),
            _segmentation_step(_LINE_SEGMENTATION_MODEL, 8),
            _text_recognition_step(language),
            {"step": "ReadingOrderMarginalia", "settings": {"two_page": True}},
        ]
    }


# Pipeline templates keyed by the `document_type` value accepted by process_htr.
# Every entry is built from fresh dicts, so templates never share mutable state.
PIPELINE_CONFIGS = {
    "letter_english": _letter_config("english"),
    "letter_swedish": _letter_config("swedish"),
    "spread_english": _spread_config("english"),
    "spread_swedish": _spread_config("swedish"),
}
129
+
130
@spaces.GPU
def process_htr(
    image: Image.Image,
    document_type: Literal[
        "letter_english", "letter_swedish", "spread_english", "spread_swedish"
    ] = "spread_swedish",
    confidence_threshold: float = 0.8,
    custom_settings: Optional[str] = None,
) -> Dict:
    """
    Process handwritten text recognition on uploaded images using HTRflow pipelines.

    The pipeline configuration is either one of the built-in templates selected by
    `document_type`, or the JSON document passed in `custom_settings`, which takes
    precedence when it is non-empty.

    Args:
        image (Image.Image): PIL Image object to process
        document_type (str): Type of document processing template to use
        confidence_threshold (float): Minimum confidence threshold for text recognition
        custom_settings (str, optional): JSON string with custom pipeline settings

    Returns:
        dict: On success, `success` (True), `results`, `processing_state` (a JSON string
        to pass to the visualization and export tools) and `metadata`. On failure,
        `success` (False), `error` and `results` (None).
    """
    temp_image_path = None
    try:
        if image is None:
            return {"success": False, "error": "No image provided", "results": None}

        # Resolve the configuration before touching the filesystem so that bad
        # input never creates a temporary file.
        if custom_settings:
            try:
                config = json.loads(custom_settings)
            except json.JSONDecodeError:
                return {
                    "success": False,
                    "error": "Invalid JSON in custom_settings parameter",
                    "results": None,
                }
        elif document_type in PIPELINE_CONFIGS:
            config = PIPELINE_CONFIGS[document_type]
        else:
            return {
                "success": False,
                "error": f"HTR processing failed: unknown document_type {document_type!r}",
                "results": None,
            }

        # Reserve a path, then close the handle before writing: reopening a
        # NamedTemporaryFile by name while it is still open is platform-dependent.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
            temp_image_path = temp_file.name
        image.save(temp_image_path, "PNG")

        collection = Collection([temp_image_path])
        processed_collection = Pipeline.from_config(config).run(collection)

        results = extract_processing_results(
            processed_collection, confidence_threshold
        )

        # Embed the source image so later tools can work from the state alone.
        img_buffer = io.BytesIO()
        image.save(img_buffer, format="PNG")
        image_base64 = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

        processing_state = {
            "collection": serialize_collection(processed_collection),
            "config": config,
            "image_base64": image_base64,
            "image_size": image.size,
            "document_type": document_type,
            "confidence_threshold": confidence_threshold,
            "timestamp": datetime.now().isoformat(),
        }

        return {
            "success": True,
            "results": results,
            "processing_state": json.dumps(processing_state),
            "metadata": {
                "total_lines": len(results.get("text_lines", [])),
                "average_confidence": calculate_average_confidence(results),
                "document_type": document_type,
                "image_dimensions": image.size,
            },
        }

    except Exception as e:
        # Tool boundary: report the failure in the payload instead of raising.
        return {
            "success": False,
            "error": f"HTR processing failed: {str(e)}",
            "results": None,
        }

    finally:
        # Runs on every exit path, including a failed image.save, so the
        # temporary file is never left behind.
        if temp_image_path is not None and os.path.exists(temp_image_path):
            os.unlink(temp_image_path)
221
+
222
+
223
def visualize_results(
    processing_state: str,
    visualization_type: Literal[
        "overlay", "confidence_heatmap", "text_regions"
    ] = "overlay",
    show_confidence: bool = True,
    highlight_low_confidence: bool = True,
    image: Optional[Image.Image] = None,
) -> Dict:
    """
    Generate visualizations of HTR processing results.

    Renders one of three views on top of the page image: bounding-box overlay,
    confidence heatmap, or region outlines. The image comes from `image` when
    given, otherwise from the base64 PNG stored in `processing_state`.

    Args:
        processing_state (str): JSON string containing HTR processing results and metadata
        visualization_type (str): Type of visualization to generate
        show_confidence (bool): Whether to display confidence scores on visualization
        highlight_low_confidence (bool): Whether to highlight low-confidence regions
        image (Image.Image, optional): PIL Image object to use instead of state image

    Returns:
        dict: On success, `success` (True), `visualization` (base64 PNG plus format,
        type and dimensions), `metadata` and `interactive_elements`. On failure,
        `success` (False), `error` and `visualization` (None).
    """
    try:
        # Reject unsupported modes up front; otherwise no branch below would
        # assign the rendered image and the failure would surface as an
        # unrelated unbound-variable error.
        if visualization_type not in ("overlay", "confidence_heatmap", "text_regions"):
            return {
                "success": False,
                "error": (
                    "Visualization generation failed: unknown visualization_type "
                    f"{visualization_type!r}"
                ),
                "visualization": None,
            }

        state = json.loads(processing_state)
        collection = deserialize_collection(state["collection"])
        confidence_threshold = state["confidence_threshold"]

        if image is not None:
            original_image = image
        else:
            image_data = base64.b64decode(state["image_base64"])
            original_image = Image.open(io.BytesIO(image_data))

        if visualization_type == "overlay":
            viz_image = create_text_overlay_visualization(
                original_image, collection, show_confidence, highlight_low_confidence
            )
        elif visualization_type == "confidence_heatmap":
            viz_image = create_confidence_heatmap(
                original_image, collection, confidence_threshold
            )
        else:  # "text_regions" — the only remaining validated value
            viz_image = create_region_visualization(original_image, collection)

        img_buffer = io.BytesIO()
        viz_image.save(img_buffer, format="PNG")
        img_base64 = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

        viz_metadata = generate_visualization_metadata(collection, visualization_type)

        return {
            "success": True,
            "visualization": {
                "image_base64": img_base64,
                "image_format": "PNG",
                "visualization_type": visualization_type,
                "dimensions": viz_image.size,
            },
            "metadata": viz_metadata,
            "interactive_elements": extract_interactive_elements(collection),
        }

    except Exception as e:
        # Tool boundary: report the failure in the payload instead of raising.
        return {
            "success": False,
            "error": f"Visualization generation failed: {str(e)}",
            "visualization": None,
        }
295
+
296
+
297
def export_results(
    processing_state: str,
    output_formats: Optional[List[Literal["txt", "json", "alto", "page"]]] = None,
    include_metadata: bool = True,
    confidence_filter: float = 0.0,
) -> Dict:
    """
    Export HTR results to plain text, structured JSON, ALTO XML, and/or PAGE XML.

    Args:
        processing_state (str): JSON string containing HTR processing results
        output_formats (List[str]): List of output formats to generate; defaults to
            ["txt"] when omitted. Unrecognised format names are ignored.
        include_metadata (bool): Whether to include processing metadata in exports
        confidence_filter (float): Minimum confidence threshold for included text

    Returns:
        dict: On success, `success` (True), `exports` (format name -> content),
        `statistics` and `export_metadata`. On failure, `success` (False), `error`
        and `exports` (None).
    """
    try:
        # A None default plus a copy keeps the list returned in export_metadata
        # independent of both the caller's list and any shared default object.
        requested_formats = ["txt"] if output_formats is None else list(output_formats)

        state = json.loads(processing_state)
        collection = deserialize_collection(state["collection"])

        exporters = {
            "txt": export_plain_text,
            "json": export_structured_json,
            "alto": export_alto_xml,
            "page": export_page_xml,
        }

        exports = {
            format_type: exporters[format_type](
                collection, confidence_filter, include_metadata
            )
            for format_type in requested_formats
            if format_type in exporters
        }

        export_stats = calculate_export_statistics(collection, confidence_filter)

        return {
            "success": True,
            "exports": exports,
            "statistics": export_stats,
            "export_metadata": {
                "formats_generated": requested_formats,
                "confidence_filter": confidence_filter,
                "include_metadata": include_metadata,
                "timestamp": datetime.now().isoformat(),
            },
        }

    except Exception as e:
        # Tool boundary: report the failure in the payload instead of raising.
        return {
            "success": False,
            "error": f"Export generation failed: {str(e)}",
            "exports": None,
        }
366
+
367
+
368
+ # Helper Functions
369
def extract_processing_results(
    collection: "Collection", confidence_threshold: float
) -> Dict:
    """Extract the recognised text lines that meet *confidence_threshold*.

    Walks every node of every page in *collection* and keeps nodes that have
    non-empty text and a confidence at or above the threshold. Nodes without a
    confidence value (attribute missing or None) are skipped.

    Returns:
        dict with `extracted_text` (kept lines, each followed by a newline),
        `text_lines` (one dict per kept node: text, confidence, bbox, node_id),
        `regions` (always empty here) and `confidence_scores`.
    """
    text_lines = []

    for page in collection.pages:
        for node in page.traverse():
            text = getattr(node, "text", None)
            if not text:
                continue
            confidence = getattr(node, "confidence", None)
            # None cannot be compared with a float; treat it like a missing score.
            if confidence is not None and confidence >= confidence_threshold:
                text_lines.append(
                    {
                        "text": text,
                        "confidence": confidence,
                        "bbox": getattr(node, "bbox", None),
                        "node_id": getattr(node, "id", None),
                    }
                )

    return {
        # Joined once at the end instead of growing a string inside the loop.
        "extracted_text": "".join(f"{line['text']}\n" for line in text_lines),
        "text_lines": text_lines,
        "regions": [],
        "confidence_scores": [line["confidence"] for line in text_lines],
    }
400
+
401
+
402
def serialize_collection(collection: Collection) -> str:
    """Flatten *collection* into a JSON string suitable for state storage.

    The output has a `pages` list and a `metadata` entry. Each page records its
    `image_path`, `dimensions` and a flat `nodes` list; each node records text,
    confidence (1.0 when absent), bbox, node_id and the node's class name.
    """

    def _node_payload(node):
        return {
            "text": getattr(node, "text", ""),
            "confidence": getattr(node, "confidence", 1.0),
            "bbox": getattr(node, "bbox", None),
            "node_id": getattr(node, "id", None),
            "node_type": type(node).__name__,
        }

    def _page_payload(page):
        image_path = getattr(page, "image_path", None)
        dimensions = getattr(page, "dimensions", None)
        return {
            "nodes": [_node_payload(node) for node in page.traverse()],
            "image_path": image_path,
            "dimensions": dimensions,
        }

    metadata = getattr(collection, "metadata", {})
    pages = [_page_payload(page) for page in collection.pages]
    return json.dumps({"pages": pages, "metadata": metadata})
426
+
427
+
428
def deserialize_collection(serialized_data: str):
    """Rebuild a lightweight stand-in collection from its JSON serialisation.

    The returned object is not an HTRflow Collection. It only mimics the parts
    the rest of this module uses: `pages`, each with `nodes` and `traverse()`,
    whose nodes expose `text`, `confidence`, `bbox` and `id`.
    """

    class MockNode:
        def __init__(self, node_data):
            self.text = node_data.get("text", "")
            self.confidence = node_data.get("confidence", 1.0)
            self.bbox = node_data.get("bbox")
            self.id = node_data.get("node_id")

    class MockPage:
        def __init__(self, page_data):
            self.nodes = [MockNode(entry) for entry in page_data.get("nodes", [])]

        def traverse(self):
            # The serialised form is already flat, so traversal is the node list.
            return self.nodes

    class MockCollection:
        def __init__(self, data):
            self.pages = [MockPage(entry) for entry in data.get("pages", [])]

    return MockCollection(json.loads(serialized_data))
458
+
459
+
460
def calculate_average_confidence(results: Dict) -> float:
    """Return the mean of `results["confidence_scores"]`, or 0.0 when there are none."""
    scores = results.get("confidence_scores", [])
    return sum(scores) / len(scores) if scores else 0.0
466
+
467
+
468
def create_text_overlay_visualization(
    image,
    collection,
    show_confidence,
    highlight_low_confidence,
    low_confidence_threshold=0.7,
):
    """Return a copy of *image* with a bounding box drawn around each text node.

    Args:
        image: PIL image to annotate; it is not modified.
        collection: object exposing `pages`, each with `traverse()` yielding nodes.
        show_confidence: when true, print each node's confidence above its box.
        highlight_low_confidence: when true, boxes whose confidence is below
            `low_confidence_threshold` are drawn in orange instead of green.
        low_confidence_threshold: cutoff for the orange highlight (default 0.7).

    Returns:
        The annotated PIL image. Inputs that are neither RGB nor RGBA are
        converted to RGB so the colour tuples below are valid for them.
    """
    if image.mode in ("RGB", "RGBA"):
        viz_image = image.copy()
    else:
        viz_image = image.convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    bbox_color = (0, 255, 0)  # Green for normal confidence
    low_conf_color = (255, 165, 0)  # Orange for low confidence

    try:
        font = ImageFont.truetype("arial.ttf", 12)
    except (OSError, ImportError):
        # Font file unavailable or FreeType support missing: use the built-in font.
        font = ImageFont.load_default()

    for page in collection.pages:
        for node in page.traverse():
            bbox = getattr(node, "bbox", None)
            if not bbox or not getattr(node, "text", None):
                continue

            confidence = getattr(node, "confidence", 1.0)
            is_low = highlight_low_confidence and confidence < low_confidence_threshold
            color = low_conf_color if is_low else bbox_color

            draw.rectangle(bbox, outline=color, width=2)

            if show_confidence:
                # Clamp so labels for boxes at the top edge stay on the canvas.
                label_y = max(0, bbox[1] - 15)
                draw.text((bbox[0], label_y), f"{confidence:.2f}", fill=color, font=font)

    return viz_image
512
+
513
+
514
def create_confidence_heatmap(image, collection, confidence_threshold):
    """Return an RGB copy of *image* with each node's box tinted by confidence.

    Boxes are filled with a semi-transparent colour: red below 0.5, yellow
    below 0.8, green otherwise. Nodes without a bbox or without a confidence
    value (attribute missing or None) are left untinted.

    NOTE(review): `confidence_threshold` is accepted but not used; the colour
    cutoffs are fixed. Confirm whether it was meant to drive the green cutoff.
    """
    viz_image = image.copy()

    for page in collection.pages:
        for node in page.traverse():
            bbox = getattr(node, "bbox", None)
            confidence = getattr(node, "confidence", None)
            # None cannot be compared with the numeric cutoffs below.
            if not bbox or confidence is None:
                continue

            # Color mapping: red (low) -> yellow (medium) -> green (high)
            if confidence < 0.5:
                color = (255, 0, 0, 100)
            elif confidence < 0.8:
                color = (255, 255, 0, 100)
            else:
                color = (0, 255, 0, 100)

            # Only the first tinted node needs a conversion; every composite
            # result is already RGBA.
            if viz_image.mode != "RGBA":
                viz_image = viz_image.convert("RGBA")

            # Composite one box at a time so overlapping boxes blend cumulatively.
            overlay = Image.new("RGBA", viz_image.size, (0, 0, 0, 0))
            ImageDraw.Draw(overlay).rectangle(bbox, fill=color)
            viz_image = Image.alpha_composite(viz_image, overlay)

    return viz_image.convert("RGB")
538
+
539
+
540
def create_region_visualization(image, collection):
    """Return a copy of *image* with every node's bounding box outlined.

    Outline colours cycle through red, green, blue and yellow in traversal
    order, counting only nodes that have a bbox. Inputs that are neither RGB
    nor RGBA are converted to RGB so the colour tuples are valid for them.
    """
    if image.mode in ("RGB", "RGBA"):
        viz_image = image.copy()
    else:
        viz_image = image.convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    region_colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]

    boxes = (
        node.bbox
        for page in collection.pages
        for node in page.traverse()
        if getattr(node, "bbox", None)
    )
    for index, bbox in enumerate(boxes):
        draw.rectangle(
            bbox, outline=region_colors[index % len(region_colors)], width=3
        )

    return viz_image
557
+
558
+
559
def generate_visualization_metadata(collection, visualization_type):
    """Summarise the text nodes of *collection* for a visualization response.

    Counts nodes with non-empty text and reports min/max/avg over the
    confidence values of those nodes (all zero when there are none).
    """
    text_nodes = [
        node
        for page in collection.pages
        for node in page.traverse()
        if hasattr(node, "text") and node.text
    ]
    scores = [node.confidence for node in text_nodes if hasattr(node, "confidence")]

    if scores:
        stats = {
            "min": min(scores),
            "max": max(scores),
            "avg": sum(scores) / len(scores),
        }
    else:
        stats = {"min": 0, "max": 0, "avg": 0}

    return {
        "total_elements": len(text_nodes),
        "visualization_type": visualization_type,
        "confidence_stats": stats,
    }
582
+
583
+
584
def extract_interactive_elements(collection):
    """List bbox, text, confidence and node_id for every node that has both a bbox and text."""
    return [
        {
            "bbox": node.bbox,
            "text": node.text,
            "confidence": getattr(node, "confidence", 1.0),
            "node_id": getattr(node, "id", None),
        }
        for page in collection.pages
        for node in page.traverse()
        if hasattr(node, "bbox") and hasattr(node, "text") and node.bbox and node.text
    ]
606
+
607
+
608
def export_plain_text(
    collection, confidence_filter: float, include_metadata: bool
) -> str:
    """Render the recognised text as newline-separated lines.

    With *include_metadata*, the text is preceded by three `#` header lines
    (title, confidence filter, export time) and a blank line. Only nodes with
    non-empty text and confidence >= *confidence_filter* are included; a node
    without a confidence attribute counts as 1.0.
    """
    header = []
    if include_metadata:
        header = [
            "# HTR Export Results",
            f"# Confidence Filter: {confidence_filter}",
            f"# Export Time: {datetime.now().isoformat()}",
            "",
        ]

    body = [
        node.text
        for page in collection.pages
        for node in page.traverse()
        if hasattr(node, "text")
        and node.text
        and getattr(node, "confidence", 1.0) >= confidence_filter
    ]

    return "\n".join(header + body)
629
+
630
+
631
def export_structured_json(
    collection, confidence_filter: float, include_metadata: bool
) -> str:
    """Render the results as an indented JSON document.

    The document holds one entry per page (`page_id` is the page index) whose
    `regions` list contains every node with non-empty text and confidence >=
    *confidence_filter*. With *include_metadata*, a top-level `metadata` object
    (filter, export time, page count) is added.
    """

    def _regions(page):
        kept = []
        for node in page.traverse():
            if not (hasattr(node, "text") and node.text):
                continue
            score = getattr(node, "confidence", 1.0)
            if score >= confidence_filter:
                kept.append(
                    {
                        "text": node.text,
                        "confidence": score,
                        "bbox": getattr(node, "bbox", None),
                        "node_id": getattr(node, "id", None),
                    }
                )
        return kept

    document = {"document": {"pages": []}}

    if include_metadata:
        document["metadata"] = {
            "confidence_filter": confidence_filter,
            "export_time": datetime.now().isoformat(),
            "total_pages": len(collection.pages),
        }

    document["document"]["pages"].extend(
        {"page_id": index, "regions": _regions(page)}
        for index, page in enumerate(collection.pages)
    )

    return json.dumps(document, indent=2, ensure_ascii=False)
663
+
664
+
665
def export_alto_xml(
    collection, confidence_filter: float, include_metadata: bool
) -> str:
    """Render the results as a simplified ALTO XML document.

    Every node with non-empty text and confidence >= *confidence_filter*
    becomes a `TextLine` holding one `String`. All pages of *collection* are
    written into a single `Page` element. Nodes without a bbox get the
    placeholder box [0, 0, 100, 20].

    Args:
        collection: object exposing `pages`, each with `traverse()` yielding nodes.
        confidence_filter: minimum confidence for a node to be exported.
        include_metadata: when true, emit a `sourceImageInformation` element.

    Returns:
        The XML document as a string.
    """
    placeholder_bbox = [0, 0, 100, 20]

    xml_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#">',
        " <Description>",
    ]
    if include_metadata:
        xml_lines += [
            " <sourceImageInformation>",
            " <fileName>htr_processed_image</fileName>",
            " </sourceImageInformation>",
        ]
    xml_lines += [" </Description>", " <Layout>", " <Page>"]

    for page in collection.pages:
        for node in page.traverse():
            text = getattr(node, "text", None)
            if not text:
                continue
            confidence = getattr(node, "confidence", 1.0)
            if confidence >= confidence_filter:
                # Deserialised nodes always carry a bbox attribute, possibly None,
                # so the fallback must cover None as well as a missing attribute.
                bbox = getattr(node, "bbox", None)
                if bbox is None:
                    bbox = placeholder_bbox
                # Recognised text is arbitrary: escape &, <, > and the double
                # quote so it cannot break out of the CONTENT attribute.
                content = escape(text, {'"': "&quot;"})
                xml_lines.append(
                    f' <TextLine HPOS="{bbox[0]}" VPOS="{bbox[1]}" WIDTH="{bbox[2] - bbox[0]}" HEIGHT="{bbox[3] - bbox[1]}">'
                )
                xml_lines.append(
                    f' <String CONTENT="{content}" WC="{confidence:.3f}"/>'
                )
                xml_lines.append(" </TextLine>")

    xml_lines += [" </Page>", " </Layout>", "</alto>"]

    return "\n".join(xml_lines)
700
+
701
+
702
def export_page_xml(
    collection, confidence_filter: float, include_metadata: bool
) -> str:
    """Render the results as a simplified PAGE XML document.

    Every node with non-empty text and confidence >= *confidence_filter*
    becomes a `TextRegion` with rectangular `Coords` and one `TextLine`. All
    pages of *collection* are written into a single `Page` element. Nodes
    without a bbox get the placeholder box [0, 0, 100, 20].

    Args:
        collection: object exposing `pages`, each with `traverse()` yielding nodes.
        confidence_filter: minimum confidence for a node to be exported.
        include_metadata: when true, emit a `Metadata` element with the creation time.

    Returns:
        The XML document as a string.
    """
    placeholder_bbox = [0, 0, 100, 20]

    xml_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15">',
    ]
    if include_metadata:
        xml_lines += [
            " <Metadata>",
            f" <Created>{datetime.now().isoformat()}</Created>",
            " </Metadata>",
        ]
    xml_lines.append(" <Page>")

    for page in collection.pages:
        for node in page.traverse():
            text = getattr(node, "text", None)
            if not text:
                continue
            confidence = getattr(node, "confidence", 1.0)
            if confidence >= confidence_filter:
                # Deserialised nodes always carry a bbox attribute, possibly None,
                # so the fallback must cover None as well as a missing attribute.
                bbox = getattr(node, "bbox", None)
                if bbox is None:
                    bbox = placeholder_bbox
                left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
                xml_lines += [
                    " <TextRegion>",
                    f' <Coords points="{left},{top} {right},{top} {right},{bottom} {left},{bottom}"/>',
                    " <TextLine>",
                    f' <TextEquiv conf="{confidence:.3f}">',
                    # Recognised text is arbitrary: escape &, < and > so it
                    # cannot produce malformed markup.
                    f" <Unicode>{escape(text)}</Unicode>",
                    " </TextEquiv>",
                    " </TextLine>",
                    " </TextRegion>",
                ]

    xml_lines += [" </Page>", "</PcGts>"]

    return "\n".join(xml_lines)
738
+
739
+
740
def calculate_export_statistics(collection, confidence_filter: float) -> Dict:
    """Summarise how much text survives *confidence_filter*.

    Considers every node with non-empty text (a node without a confidence
    attribute counts as 1.0) and reports totals, the share of nodes kept by
    the filter, the character count of the kept text, and the average and
    range of all confidences. All ratios and ranges are 0 when there is no text.
    """
    scored_text = [
        (node.text, getattr(node, "confidence", 1.0))
        for page in collection.pages
        for node in page.traverse()
        if hasattr(node, "text") and node.text
    ]
    scores = [score for _, score in scored_text]
    kept_text = [text for text, score in scored_text if score >= confidence_filter]

    total = len(scored_text)
    kept = len(kept_text)

    return {
        "total_text_elements": total,
        "filtered_text_elements": kept,
        "filter_retention_rate": kept / total if total > 0 else 0,
        "total_characters": sum(len(text) for text in kept_text),
        "average_confidence": sum(scores) / len(scores) if scores else 0,
        "confidence_range": {
            "min": min(scores) if scores else 0,
            "max": max(scores) if scores else 0,
        },
    }
773
+
774
+
775
+ # Main Gradio Application with MCP Server
776
def create_htrflow_mcp_server():
    """Build the tabbed Gradio app exposing the three HTR tools.

    Each tab wraps one function (process_htr, visualize_results,
    export_results) in a `gr.Interface` with a JSON output and an `api_name`
    equal to the function name.
    """
    document_types = [
        "letter_english",
        "letter_swedish",
        "spread_english",
        "spread_swedish",
    ]

    processing_tab = gr.Interface(
        fn=process_htr,
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Dropdown(
                choices=document_types,
                value="letter_english",
                label="Document Type",
            ),
            gr.Slider(0.0, 1.0, value=0.8, label="Confidence Threshold"),
            gr.Textbox(
                label="Custom Settings (JSON)",
                placeholder="Optional custom pipeline settings",
            ),
        ],
        outputs=gr.JSON(label="Processing Results"),
        title="HTR Processing Tool",
        description="Process handwritten text using configurable HTRflow pipelines",
        api_name="process_htr",
    )

    visualization_tab = gr.Interface(
        fn=visualize_results,
        inputs=[
            gr.Textbox(
                label="Processing State (JSON)",
                placeholder="Paste processing results from HTR tool",
            ),
            gr.Dropdown(
                choices=["overlay", "confidence_heatmap", "text_regions"],
                value="overlay",
                label="Visualization Type",
            ),
            gr.Checkbox(value=True, label="Show Confidence Scores"),
            gr.Checkbox(value=True, label="Highlight Low Confidence"),
            gr.Image(
                type="pil",
                label="Image (optional - will use image from processing state if not provided)",
            ),
        ],
        outputs=gr.JSON(label="Visualization Results"),
        title="Results Visualization Tool",
        description="Generate interactive visualizations of HTR results",
        api_name="visualize_results",
    )

    export_tab = gr.Interface(
        fn=export_results,
        inputs=[
            gr.Textbox(
                label="Processing State (JSON)",
                placeholder="Paste processing results from HTR tool",
            ),
            gr.CheckboxGroup(
                choices=["txt", "json", "alto", "page"],
                value=["txt"],
                label="Output Formats",
            ),
            gr.Checkbox(value=True, label="Include Metadata"),
            gr.Slider(0.0, 1.0, value=0.0, label="Confidence Filter"),
        ],
        outputs=gr.JSON(label="Export Results"),
        title="Export Tool",
        description="Export HTR results to multiple formats",
        api_name="export_results",
    )

    return gr.TabbedInterface(
        [processing_tab, visualization_tab, export_tab],
        ["HTR Processing", "Results Visualization", "Export Results"],
        title="HTRflow MCP Server",
    )
856
+
857
+
858
+ # Launch MCP Server
859
if __name__ == "__main__":
    # Build the app and serve it on all interfaces, port 7860, with the MCP
    # endpoint enabled and no public share link.
    demo = create_htrflow_mcp_server()
    demo.launch(
        mcp_server=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
    )
pyproject.toml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "app"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "gradio>=5.33.0",
9
+ "htrflow==0.2.5",
10
+ "pillow>=11.2.1",
11
+ "ruff>=0.11.13",
12
+ ]
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ htrflow==0.2.5
2
+ ruff
3
+ gradio>=5.33.0
4
+ pillow
uv.lock ADDED
The diff for this file is too large to render. See raw diff