Gleb Vinarskis
committed on
Commit
·
39b46f2
1
Parent(s):
d7605b7
changes
Browse files- .DS_Store +0 -0
- OCR_score.py +97 -5
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
OCR_score.py
CHANGED
@@ -1,12 +1,104 @@
|
|
1 |
from huggingface_hub import hf_hub_download
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
class FloretPipeline:
|
4 |
def __init__(self, language=None):
|
5 |
-
|
6 |
-
exec(open(hf_hub_download("Maslionok/pipeline1", "test.py")).read())
|
7 |
-
# language =
|
8 |
|
9 |
|
10 |
def __call__(self, text):
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from huggingface_hub import hf_hub_download
|
2 |
+
from transformers import Pipeline, AutoModelForSequenceClassification, AutoTokenizer
|
3 |
+
from transformers.pipelines import PIPELINE_REGISTRY, SUPPORTED_TASKS
|
4 |
+
from typing import List, Dict, Union, Optional, Tuple
|
5 |
+
import unicodedata
|
6 |
+
from huggingface_hub import hf_hub_download
|
7 |
+
from pybloomfilter import BloomFilter
|
8 |
+
import unicodedata
|
9 |
+
from typing import Optional
|
10 |
+
from huggingface_hub import hf_hub_download
|
11 |
+
from pybloomfilter import BloomFilter
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
|
16 |
class FloretPipeline:
    """OCR quality-assessment pipeline.

    Normalizes the input text, then checks each token against a per-language
    bloom filter of known word unigrams and partitions the tokens into
    "knowns" and "unknowns". If no language is given, a language-detection
    script is downloaded from the Hub and run on first call.
    """

    # Character classes replaced by a space during normalization.
    QUOTES_PUNCT = "„•<>!\"#%&'’"
    ASCII_PUNCT = "()*,./:;?"
    BRACKETS_SPECIAL = "[]\\~_{}"
    UNICODE_PUNCT = "\xa1\xab\xb7\xbb\xbf"
    DASH_CARET = "—^`"
    SPECIAL_SYMBOLS = "¦§£="
    HYPHEN = "-"
    DIGITS = "0123456789"

    # Punctuation -> " ", digits -> "0"; one C-level pass via str.translate.
    NORMALIZATION_TABLE = str.maketrans(
        {
            char: " "
            for char in (
                QUOTES_PUNCT
                + ASCII_PUNCT
                + BRACKETS_SPECIAL
                + UNICODE_PUNCT
                + DASH_CARET
                + SPECIAL_SYMBOLS
                + HYPHEN
            )
        }
        | {char: "0" for char in DIGITS}
    )

    def __init__(self, language=None):
        # Two-letter language code (e.g. "de"). None -> autodetect on first call.
        self.language = language

    def __call__(self, text):
        """Score `text`: return {"knowns": set, "unknowns": set} of tokens."""
        if self.language is None:  # fix: was `== None`
            # SECURITY NOTE: this executes code downloaded from the Hub at
            # runtime — acceptable only for a trusted repo; flagged for review.
            # BUGFIX: exec() in function scope cannot reliably create local
            # names (CPython), so the original `floret_model(...)` call raised
            # NameError. Run the script in an explicit namespace instead, and
            # close the file handle with a context manager.
            ns = {}
            script_path = hf_hub_download(
                "Maslionok/sudo_pipelines", "floret_language_recognition.py"
            )
            with open(script_path) as fh:
                exec(fh.read(), ns)
            self.language = ns["floret_model"](text)

        bf = self.get_bloomfilter(
            "impresso-project/OCR-quality-assessment-unigram",
            f"ocrqa-wp_v1.0.6-{self.language}.bloom",
        )
        return self.filter_text(text, bf)

    def normalize_text(self, s: str, unicode_normalize: Optional[str] = "NFKC") -> str:
        """Normalize text by replacing punctuation with spaces and digits with '0'."""
        if unicode_normalize:
            s = unicodedata.normalize(unicode_normalize, s).lower()
        return s.translate(self.NORMALIZATION_TABLE)

    @staticmethod
    def get_bloomfilter(model_id: str, filename: str) -> "BloomFilter":
        """Download (Hub-cached) and open the named bloom filter.

        BUGFIX: the original `def get_bloomfilter(model_id, filename)` lacked
        `self`, so the instance-style call `self.get_bloomfilter(a, b)` passed
        the instance as `model_id` and raised TypeError. `@staticmethod` makes
        both `self.get_bloomfilter(...)` and the unbound call work.
        """
        return BloomFilter.open(hf_hub_download(repo_id=model_id, filename=filename))

    def filter(self, text: str, bloom_filter: "BloomFilter"):
        """Print, for each normalized token, whether the filter knows it.

        NOTE: name shadows builtins.filter; kept for interface compatibility.
        """
        for token in self.normalize_text(text).split():
            if token in bloom_filter:
                print(f"'{token}' is in the bloom filter.")
            else:
                print(f"'{token}' is NOT in the bloom filter.")

    def filter_text(self, DE_TEXT: str, bloom_filter: "BloomFilter"):
        """Partition the tokens of DE_TEXT into known/unknown sets.

        Returns {"knowns": set[str], "unknowns": set[str]}.
        """
        knowns = set()
        unknowns = set()

        for token in self.normalize_text(DE_TEXT).split():
            if token in bloom_filter:
                print(f"'{token}' is in the bloom filter.")
                knowns.add(token)
            else:
                print(f"'{token}' is NOT in the bloom filter.")
                unknowns.add(token)
        # fix: was the duplicated assignment `result = result = {...}`
        return {"knowns": knowns, "unknowns": unknowns}