from typing import Optional
import unicodedata

from huggingface_hub import hf_hub_download
from pybloomfilter import BloomFilter


def get_bloomfilter(model_id: str, filename: str) -> BloomFilter:
    """Download a serialized bloom filter from the Hub and open it."""
    return BloomFilter.open(hf_hub_download(repo_id=model_id, filename=filename))


class FloretPipeline:
    """OCR-quality scoring pipeline: detects the input language (via floret)
    and scores the text against a language-specific unigram bloom filter."""

    SUPPORTED_LANGUAGES = {"fr", "de"}

    # Normalization table: these characters are replaced by a space ...
    QUOTES_PUNCT = "„•<>!\"#%&'’"
    ASCII_PUNCT = "()*,./:;?"
    BRACKETS_SPECIAL = "[]\\~_{}"
    UNICODE_PUNCT = "\xa1\xab\xb7\xbb\xbf"
    DASH_CARET = "—^`"
    SPECIAL_SYMBOLS = "¦§£="
    HYPHEN = "-"
    # ... and digits are collapsed to "0".
    DIGITS = "0123456789"

    NORMALIZATION_TABLE = str.maketrans(
        {
            char: " "
            for char in (
                QUOTES_PUNCT
                + ASCII_PUNCT
                + BRACKETS_SPECIAL
                + UNICODE_PUNCT
                + DASH_CARET
                + SPECIAL_SYMBOLS
                + HYPHEN
            )
        }
        | {char: "0" for char in DIGITS}
    )

    def __call__(self, text: str, language: Optional[str] = None) -> dict:
        self.language = language
        if self.language is None:
            # The downloaded script is expected to define `floret_model`, a
            # callable that returns the language code for a text. Executing it
            # into an explicit namespace (rather than bare `exec(...)` inside a
            # method) is what makes the name reachable afterwards.
            script = hf_hub_download("Maslionok/sudo_pipelines", "floret_language_recognition.py")
            namespace: dict = {}
            with open(script) as f:
                exec(f.read(), namespace)
            self.language = namespace["floret_model"](text)
        if self.language not in self.SUPPORTED_LANGUAGES:
            raise ValueError(f"Unsupported language: {self.language}")
        # Fetch the bloom filter that matches the detected/requested language.
        bf = get_bloomfilter(
            "impresso-project/OCR-quality-assessment-unigram",
            f"ocrqa-wp_v1.0.6-{self.language}.bloom",
        )
        return self.filter_text(text, bf)

    def normalize_text(self, s: str, unicode_normalize: Optional[str] = "NFKC") -> str:
        """Normalize text by replacing punctuation with spaces and digits with '0'."""
        if unicode_normalize:
            s = unicodedata.normalize(unicode_normalize, s).lower()
        return s.translate(self.NORMALIZATION_TABLE)

    def filter(self, text: str, bloom_filter: BloomFilter) -> None:
        """Debug helper: print, for each token, whether it is in the bloom filter."""
        tokens = self.normalize_text(text).split()
        for token in tokens:
            if token in bloom_filter:
                print(f"'{token}' is in the bloom filter.")
            else:
                print(f"'{token}' is NOT in the bloom filter.")

    def filter_text(self, text: str, bloom_filter: BloomFilter) -> dict:
        """Split tokens into known/unknown sets and compute an OCR-quality score."""
        knowns = set()
        unknowns = set()
        # Normalize and tokenize, then check each token against the bloom filter.
        tokens = self.normalize_text(text).split()
        for token in tokens:
            if token in bloom_filter:
                knowns.add(token)
            else:
                unknowns.add(token)
        # Score = share of distinct tokens recognized by the filter.
        total = len(knowns) + len(unknowns)
        score = len(knowns) / total if total > 0 else 0.0
        score = float(f"{score:.3g}")
        return {"language": self.language, "score": score}


OCR_score = FloretPipeline()
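
# Example usage: a minimal sketch, assuming Hub access and that the sample
# sentence below stands in for real OCR output. With `language=None` (the
# default), the floret detector is downloaded and run before scoring; the
# exact score depends on the contents of the downloaded bloom filter.
if __name__ == "__main__":
    result = OCR_score("Der schnelle braune Fuchs springt über den Hund.", language="de")
    print(result)  # e.g. {'language': 'de', 'score': ...}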