Gleb Vinarskis committed
Commit 39b46f2 · 1 Parent(s): d7605b7
Files changed (2)
  1. .DS_Store +0 -0
  2. OCR_score.py +97 -5

.DS_Store ADDED
Binary file (6.15 kB)
 
OCR_score.py CHANGED
@@ -1,12 +1,104 @@
 from huggingface_hub import hf_hub_download
+from transformers import Pipeline, AutoModelForSequenceClassification, AutoTokenizer
+from transformers.pipelines import PIPELINE_REGISTRY, SUPPORTED_TASKS
+from typing import List, Dict, Union, Optional, Tuple
+import unicodedata
+from huggingface_hub import hf_hub_download
+from pybloomfilter import BloomFilter
+import unicodedata
+from typing import Optional
+from huggingface_hub import hf_hub_download
+from pybloomfilter import BloomFilter
+
+
+
 
 class FloretPipeline:
     def __init__(self, language=None):
-        if language == None:
-            exec(open(hf_hub_download("Maslionok/pipeline1", "test.py")).read())
-            # language =
+        self.language = language
 
 
     def __call__(self, text):
-        output = self.model.predict(text, k=1)
-        return output
+        if self.language is None:
+            exec(open(hf_hub_download("Maslionok/sudo_pipelines", "floret_language_recognition.py")).read())
+            self.language = floret_model(text)
+
+        bf = self.get_bloomfilter("impresso-project/OCR-quality-assessment-unigram", f"ocrqa-wp_v1.0.6-{self.language}.bloom")
+        output = self.filter_text(text, bf)
+
+        return output
+
+
+
+
+
+    # Define normalization table
+    QUOTES_PUNCT = "„•<>!\"#%&'’"
+    ASCII_PUNCT = "()*,./:;?"
+    BRACKETS_SPECIAL = "[]\\~_{}"
+    UNICODE_PUNCT = "\xa1\xab\xb7\xbb\xbf"
+    DASH_CARET = "—^`"
+    SPECIAL_SYMBOLS = "¦§£="
+    HYPHEN = "-"
+    DIGITS = "0123456789"
+
+    NORMALIZATION_TABLE = str.maketrans(
+        {
+            char: " "
+            for char in (
+                QUOTES_PUNCT
+                + ASCII_PUNCT
+                + BRACKETS_SPECIAL
+                + UNICODE_PUNCT
+                + DASH_CARET
+                + SPECIAL_SYMBOLS
+                + HYPHEN
+            )
+        }
+        | {char: "0" for char in DIGITS}
+    )
+
+
+    def normalize_text(self, s: str, unicode_normalize: Optional[str] = "NFKC") -> str:
+        """Normalize text by replacing punctuation with spaces and digits with '0'."""
+        if unicode_normalize:
+            s = unicodedata.normalize(unicode_normalize, s).lower()
+        return s.translate(self.NORMALIZATION_TABLE)
+
+
+    def get_bloomfilter(self, model_id: str, filename: str):
+        return BloomFilter.open(hf_hub_download(repo_id=model_id, filename=filename))
+
+
+    def filter(self, text: str, bloom_filter: BloomFilter):
+        # Normalize and tokenize text
+        normalized_text = self.normalize_text(text)
+        tokens = normalized_text.split()
+
+        # Check tokens against the bloom filter
+        for token in tokens:
+            if token in bloom_filter:
+                print(f"'{token}' is in the bloom filter.")
+            else:
+                print(f"'{token}' is NOT in the bloom filter.")
+
+
+    def filter_text(self, DE_TEXT: str, bloom_filter: BloomFilter):
+
+        knowns = set()
+        unknowns = set()
+
+        # Normalize and tokenize text
+        normalized_text = self.normalize_text(DE_TEXT)
+        tokens = normalized_text.split()
+
+        # Check tokens against the bloom filter
+        for token in tokens:
+            if token in bloom_filter:
+                print(f"'{token}' is in the bloom filter.")
+                knowns.add(token)
+            else:
+                print(f"'{token}' is NOT in the bloom filter.")
+                unknowns.add(token)
+        result = {"knowns": knowns, "unknowns": unknowns}
+        return result
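
For reviewers, a minimal usage sketch of the new pipeline, assuming OCR_score.py is importable as a module and that huggingface_hub and pybloomfilter are installed. The repo ID impresso-project/OCR-quality-assessment-unigram and the ocrqa-wp_v1.0.6-{language}.bloom filename pattern come from the diff above; the language code "de", the sample sentence, and the quality ratio at the end are illustrative assumptions, since the committed code only returns the sets of known and unknown tokens.

# Usage sketch (not part of the commit). Assumes OCR_score.py is on the import
# path and its dependencies (huggingface_hub, pybloomfilter) are installed.
from OCR_score import FloretPipeline

pipe = FloretPipeline(language="de")  # passing a language code skips floret language detection

result = pipe("Der schnelle braune Fuchs springt über den faulen Hund.")
knowns, unknowns = result["knowns"], result["unknowns"]

# Illustrative only: one way to turn the returned sets into a score is the share
# of unique normalized tokens found in the unigram bloom filter.
total = len(knowns) + len(unknowns)
ocr_quality = len(knowns) / total if total else 0.0
print(f"known={len(knowns)} unknown={len(unknowns)} quality={ocr_quality:.2f}")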