# adgw / Joblib — init — 5c8f9d2 (verified)
"""
Module containing constants, word lists and precompiled regular expressions
used throughout the library for text analysis.
"""
import re
# Polish profanity lexicon. Entries are lowercase and include multi-word
# phrases, so consumers cannot rely on single-token lookups alone.
BAD_WORDS = [
    "burdel", "burdelmama",
    "chuj", "chujnia", "ciota", "cipa", "cyc",
    "debil", "dmuchać", "do kurwy nędzy", "dupa", "dupek", "duperele", "dziwka",
    "fiut",
    "gówno", "gówno prawda",
    "huj", "huj ci w dupę",
    "jajco", "ja pierdolę", "jebać", "jebany",
    "kurwa", "kurwy", "kutafon", "kutas",
    "lizać pałę",
    "obciągać chuja", "obciągać fiuta",
    "pierdolec", "pierdolić", "pierdolnąć", "pierdolnięty", "pizda",
    "pojeb", "pojebany", "popierdolony",
    "robić loda", "ruchać", "rzygać",
    "skurwysyn", "sraczka", "srać", "suka", "syf",
    "wkurwiać",
    "zajebisty",
]
# Polish stop-word list (lowercase), kept in its original order and grouped
# below by first letter for readability.
# NOTE(review): "totobą" and "zeznowu" look like concatenation artifacts of
# "to"/"tobą" and "ze"/"znowu" (all four already present) — confirm before
# removing, since dropping them changes the list contents.
STOP_WORDS = [
    # a
    "a", "aby", "ach", "acz", "aczkolwiek", "aj", "albo", "ale", "ależ", "ani", "aż",
    # b
    "bardziej", "bardzo", "bez", "bo", "bowiem", "by", "byli", "bym", "bynajmniej",
    "być", "był", "była", "było", "były", "będzie", "będą",
    # c
    "cali", "cała", "cały", "chce", "choć", "ci", "ciebie", "cię", "co", "cokolwiek",
    "coraz", "coś", "czasami", "czasem", "czemu", "czy", "czyli", "często",
    # d
    "daleko", "dla", "dlaczego", "dlatego", "do", "dobrze", "dokąd", "dość", "dr",
    "dużo", "dwa", "dwaj", "dwie", "dwoje", "dzisiaj", "dziś",
    # g
    "gdy", "gdyby", "gdyż", "gdzie", "gdziekolwiek", "gdzieś", "go", "godz",
    # h
    "hab",
    # i
    "i", "ich", "ii", "iii", "ile", "im", "inna", "inne", "inny", "innych", "inż",
    "iv", "ix", "iż",
    # j
    "ja", "jak", "jakaś", "jakby", "jaki", "jakichś", "jakie", "jakiś", "jakiż",
    "jakkolwiek", "jako", "jakoś", "je", "jeden", "jedna", "jednak", "jednakże",
    "jedno", "jednym", "jedynie", "jego", "jej", "jemu", "jest", "jestem", "jeszcze",
    "jeśli", "jeżeli", "już", "ją",
    # k
    "każdy", "kiedy", "kierunku", "kilka", "kilku", "kimś", "kto", "ktokolwiek",
    "ktoś", "która", "które", "którego", "której", "który", "których", "którym",
    "którzy", "ku",
    # l
    "lat", "lecz", "lub",
    # m
    "ma", "mają", "mam", "mamy", "mało", "mgr", "mi", "miał", "mimo", "między",
    "mnie", "mną", "mogą", "moi", "moim", "moja", "moje", "może", "możliwe", "można",
    "mu", "musi", "my", "mój",
    # n
    "na", "nad", "nam", "nami", "nas", "nasi", "nasz", "nasza", "nasze", "naszego",
    "naszych", "natomiast", "natychmiast", "nawet", "nic", "nich", "nie", "niech",
    "niego", "niej", "niemu", "nigdy", "nim", "nimi", "nią", "niż", "no", "nowe",
    "np", "nr",
    # o
    "o", "o.o.", "obok", "od", "ok", "około", "on", "ona", "one", "oni", "ono",
    "oraz", "oto", "owszem",
    # p
    "pan", "pana", "pani", "pl", "po", "pod", "podczas", "pomimo", "ponad",
    "ponieważ", "powinien", "powinna", "powinni", "powinno", "poza", "prawie", "prof",
    "przecież", "przed", "przede", "przedtem", "przez", "przy",
    # r
    "raz", "razie", "roku", "również",
    # s
    "sam", "sama", "się", "skąd", "sobie", "sobą", "sposób", "swoje", "są",
    # t
    "ta", "tak", "taka", "taki", "takich", "takie", "także", "tam", "te", "tego",
    "tej", "tel", "temu", "ten", "teraz", "też", "to", "tobie", "tobą", "toteż",
    "totobą", "trzeba", "tu", "tutaj", "twoi", "twoim", "twoja", "twoje", "twym",
    "twój", "ty", "tych", "tylko", "tym", "tys", "tzw",
    # u
    "u", "ul",
    # v
    "vi", "vii", "viii", "vol",
    # w
    "w", "wam", "wami", "was", "wasi", "wasz", "wasza", "wasze", "we", "według",
    "wie", "wiele", "wielu", "więc", "więcej", "wszyscy", "wszystkich", "wszystkie",
    "wszystkim", "wszystko", "wtedy", "www", "wy", "właśnie", "wśród",
    # x
    "xi", "xii", "xiii", "xiv", "xv",
    # z
    "z", "za", "zapewne", "zawsze", "zaś", "ze", "zeznowu", "znowu", "znów",
    "został", "zł",
    # ż
    "żaden", "żadna", "żadne", "żadnych", "że", "żeby",
]
# Precompiled regexes for spotting personally identifiable / structured data
# in text, keyed by pattern name.
# Most patterns open with the lookbehind (?<=\s), so a candidate must be
# preceded by a whitespace character; a value at the very start of the input
# string is therefore not matched by those patterns.
# NOTE(review): COLUMN_ORDER lists every key of this dict except
# 'domestic_phone_reg' — confirm whether that omission is intended.
PII_REGEX_PATTERNS = {
# Dates. Alternative 1: year-first (1000-2999), separator (/ . \ -), month,
# separator, day. Alternative 2: day-first, followed by either a numeric
# month or a Polish month-name stem (stycz..., lut..., ...), then an optional
# 2- or 4-digit year. The trailing (?![\d]) belongs to alternative 2 only.
# NOTE(review): the "maj" stem is followed by \w (exactly one more word
# character) while every other stem uses \w+ — confirm this is deliberate.
'date_reg' : re.compile(r'(?<!\w)([1-2]\d{3}[\/.\\-](12|11|[0-1]?[0-9])[\/.\\-](3[0-1]|2[0-9]|1[0-9]|0?[1-9]))|((?<!\w)(3[0-1]|2[0-9]|1[0-9]|0?[1-9])([\/.\\-](12|11|1[0-9]|0?[1-9])|([\/.\\ -](stycz\w+|lut\w+|mar[cz]\w+|kwie\w+|maj\w|czerw\w+|lip\w+|sierp\w+|wrze[sś]\w+|paź\w+|listop\w+|grud\w+)))(([\/.\\ -]([0-9]){2}|[\/.\\ -]([1-2][0-9]{3})))?)(?![\d])'),
# Street addresses: a street-type prefix (ul., pl., al., aleja, rondo, ...)
# preceded by a non-word character, optional honorific/title abbreviations
# (św., gen., prof., ...), then one or more capitalised words with optional
# 1-3 digit numbers, optionally joined by "i", "z" or "im.". The match must
# not be directly followed by a newline plus an NN-NNN code.
'address_reg' : re.compile(r"(?<=\W)(?:ul\.|pl\.|al\.|bulw\.|os\.|aleja|bulwar|ulica|skwer|park|rondo)[^\S\r\n]?"
r"(?:(?:świętego|trasa|bł\.|im\.|gen\.|dr\.|ks\.|ks\.[^\S\r\n]bp\.|kpt\.|ks\.[^\S\r\n]kan\.|"
r"ks\.[^\S\r\n]+kard\.|marsz\.|mjr\.|o\.|ppłk\.|prof\.|św\.|[^\S\r\n])*)"
r"(?:(?:\d{1,3}\s+)?[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+(?:\s*[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+)*"
r"(?:\s*\d{1,3})*){1,}(?:(?:[^\S\r\n]+(?:i|z|im\.)[^\S\r\n]+)?"
r"(?:[^\S\r\n]*(?:\d{1,3}[^\S\r\n]+)?[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+(?:\s*[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+)*"
r"(?:\s*\d{1,3})*)*)*(?!\n\d{2}-\d{3})"),
# Postal code in NN-NNN form, not followed by a word character.
'post_code_reg' : re.compile(r'(?<=\s)([0-9]{2}-[0-9]{3})(?!\w)'),
# Dotted quad: three octet-like groups each followed by "." (or end of
# string), then a fourth group; rejected when followed by ":" + whitespace.
'ip_reg' : re.compile(r'(?<=\s)((25[0-5]|2[0-4][0-9]|\d?[1-9][0-9]|\d)(\.|$)){3}(25[0-5]|2[0-4][0-9]|\d?[1-9][0-9]|\d)(?!:\s)'),
# NIP number: 10 digits, or dash-grouped 3-2-2-3 / 3-3-2-2, and only when
# immediately preceded by the literal "NIP: " or "NIP ".
'nip_reg' : re.compile(r'(?:(?<=NIP: )|(?<=NIP ))([0-9]{10}|\d{3}-\d{2}-\d{2}-\d{3}|\d{3}-\d{3}-\d{2}-\d{2})(?!\w)'),
# REGON number: 9 digits, only when preceded by "REGON: " or "REGON ".
'regon_reg' : re.compile(r'(?:(?<=REGON: )|(?<=REGON ))([0-9]{9})(?!\w)'),
# Phone number in 3-3-3 digit groups whose first digit is 4-8, with an
# optional "48" / "+48" / "0048" prefix (optionally parenthesised). The
# separator between groups (space, dash or nothing) is captured as "char"
# and must repeat identically via the backreference.
'phone_reg' : re.compile(r'(?<=\s)((?P<a>\()?(\+|00)?48(?(2)\)|)?)?(\s{0,}(4|5|6|7|8)\d{2})((?P<char>[ -]?)\d{3})((?P=char)\d{3})(?!\w)'),
# Phone number with the same optional "48" prefix, an optional 2-digit group,
# then either 3-2-2 or 2-3-2 digit groups (first digit 2-9) separated by one
# or more spaces/dashes that must repeat identically. The trailing \s is
# consumed, so the matched text includes the following whitespace character.
# NOTE(review): this pattern starts with the negative lookahead (?!\s)
# whereas its siblings use the lookbehind (?<=\s) — confirm which is intended.
'domestic_phone_reg' : re.compile(r'(?!\s)((?P<a>\()?(\+|00)?48(?(2)\)|)?)?(\s*\d{2})?((\s*[2-9]\d{2})((?P<char>[ -]+)\d{2})((?P=char)\d{2})|(\s*[2-9]\d{1})((?P<char2>[ -]+)\d{3})((?P=char2)\d{2}))\s'),
# Bank account number: optional "PL", then 2 digits followed by six groups of
# 4 digits (26 digits in total), with optional whitespace before each group.
'iban_reg' : re.compile(r'(?<=\s)(?:PL)?(?:\d{2}(?:[\s]*\d{4}){6})(?!\w)'),
# E-mail address: local part starting with an ASCII letter or digit, "@",
# a domain label, a dot, and 2-20 further domain characters.
'email_reg' : re.compile(r'(?<=\s)([a-zA-Z0-9][\w.+-]*@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,20})(?!\w)'),
# PESEL-shaped number: 11 digits split as 2 digits, a month field
# (01-12, 21-29 or 31-32), a day field (01-31) and 5 trailing digits.
'pesel_reg' : re.compile(r'(?<=\s)(\d{2})(1[0-2]|2[1-9]|3[1-2]|0[1-9])(3[0-1]|2\d|1\d|0[1-9])(\d{5})(?!\w)'),
# Monetary amount: a number followed by zł/Zł/pln/PLN/$/€/£, or a $/€/£
# symbol followed by a number; thousands may be separated by "," "." or " ".
# NOTE(review): the number part requires either a 2-digit decimal part or at
# least two digits, so a bare single-digit amount such as "5 zł" does not
# match — confirm this is acceptable.
'currency_reg' : re.compile(r'(?<=\s)(([1-9]\d{0,2}[,. ]){0,2}[,. ]?([1-9]\d{1,2}|\d{1,3})([,.]\d{2}|\d+)\s*(zł|Zł|pln|PLN|\$|€|£)|(\$|€|£)\s*([1-9]\d{0,2}[,. ]){0,2}[,. ]?([1-9]\d{1,2}|\d{1,3})([,.]\d{2}|\d+))'),
# ISBN-13 with the 978 prefix: "978" plus 10 digits, each optionally
# preceded by a hyphen.
'isbn_reg' : re.compile(r'(?<=\s)978(?:-?\d){10}(?!\w)'),
# ISSN: the literal "ISSN " or "eISSN " followed by 4 non-space characters,
# a dash, and 4 more non-space characters.
'issn_reg' : re.compile(r'(?<=\s)((ISSN|eISSN) [\S]{4}\-[\S]{4})(?!\w)')
}
# Precompiled regexes for common Markdown constructs, keyed by construct name.
# Each row is (name, pattern source, flags); constructs that are anchored to
# the start of a line are compiled with re.MULTILINE, the rest with no flags.
MARKDOWN_PATTERNS = {
    construct: re.compile(source, flags)
    for construct, source, flags in (
        ('header', r'^#+\s', re.MULTILINE),
        ('bold', r'\*\*[^*\n]+?\*\*', 0),
        ('italic', r'\*[^*\n]+?\*', 0),
        ('unordered_list', r'^\s*[-*+]\s', re.MULTILINE),
        ('ordered_list', r'^\s*\d+\.\s', re.MULTILINE),
        ('link', r'\[([^\]]+)\]\(([^\)]+)\)', 0),
        ('image', r'!\[([^\]]*)\]\(([^\)]+)\)', 0),
        ('inline_code', r'`[^`\n]+`', 0),
        ('code_block', r'```[\s\S]*?```', 0),
        ('blockquote', r'^>\s', re.MULTILINE),
        ('horizontal_rule', r'^([-*_])\1{2,}$', re.MULTILINE),
    )
}
# One sentence-level punctuation mark: . , ! ? ; :
PUNCTUATION_PATTERN = re.compile(r'[.,!?;:]')
# One character that is neither a word character nor whitespace.
NON_WORD_CHARS_PATTERN = re.compile(r'[^\w\s]')
# A run of four or more consecutive space characters (spaces only, not tabs).
EXCESSIVE_SPACES_PATTERN = re.compile(r' {4,}')
# camelCase-like token: starts with lowercase letters, then at least one
# uppercase letter, then at least one lowercase letter, then any letters;
# Polish diacritics are included in both cases.
CAMEL_CASE_PATTERN = re.compile(r"\b[a-ząęćłńóśżź]+[A-ZĄĘĆŁŃÓŚŻŹ]+[a-ząęćłńóśżź]+[a-ząęćłńóśżźA-ZĄĘĆŁŃÓŚŻŹ]*\b")
# One "allowed" character: ASCII letters and digits, lowercase Polish
# diacritics, whitespace and basic punctuation.
# NOTE(review): uppercase Polish diacritics (Ą, Ć, Ę, ...) are not in this
# class although A-Z is — confirm whether that is intended.
ALLOWED_CHARS_PATTERN = re.compile(r'[a-zA-Z0-9ąćęłńóśźż\s.,;:\-!?]')
# Individual characters regarded as "common": whitespace, printable ASCII,
# the degree sign and Polish diacritics in both cases.
# The backslash entry is spelled "\\" because "\]" is not a valid escape
# sequence in a non-raw string literal and triggers a warning on current
# Python versions; the resulting list is unchanged.
# NOTE(review): the underscore "_" is absent from this set — confirm intended.
COMMON_CHARACTERS = list(" \t\n!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^`°abcdefghijklmnopqrstuvwxyz{|}~ÓóĄąĆćĘꣳŃńŚśŹźŻż")
# Name of the Polish spaCy pipeline (presumably loaded elsewhere — not
# visible in this module).
SPACY_MODEL_PL = 'pl_core_news_md'
# Length limit of 5,000,000; presumably applied as the NLP pipeline's
# max_length — verify against the caller.
NLP_MAX_LENGTH = 5_000_000
# Ordered list of column names (presumably the output feature columns —
# confirm against the consumer). The order is significant and the names are
# runtime keys, so spellings such as 'bracet_count' are kept exactly as-is.
COLUMN_ORDER = [
    'characters',
    'words',
    'sentences',
    'avg_sentence_length',
    'nouns',
    'verbs',
    'adjectives',
    'adverbs',
    'punctuations',
    'symbols',
    'stopwords',
    'oovs',
    'avg_word_length',
    'noun_ratio',
    'verb_ratio',
    'adj_ratio',
    'lexical_density',
    'gunning_fog',
    'camel_case',
    'pos_x',
    'pos_num',
    'capitalized_words',
    'unique_characters_all',
    'unique_characters_lower',
    'characters_out_of_common',
    'word_isupper<5',
    'word_isupper>5',
    'count_caps',
    'single_char_count',
    'single_char_ratio',
    'digit_count',
    'digit_ratio',
    'punct_frequency',
    'count_digit_to_caps',
    'bracet_count',
    'bracket_ratio',
    'average_lines',
    'short_line_count_3',
    'short_line_count_5',
    'short_line_count_10',
    'short_line_ratio_3',
    'short_line_ratio_5',
    'short_line_ratio_10',
    'lexical_diversity',
    'contextual_word_repetitions_count',
    'contextual_word_repetitions_ratio',
    'html_tags',
    'bbcode_tags',
    'urls',
    'text_to_markup_ratio',
    'emoticons',
    'slang_words',
    'slang_words_ratio',
    'incomplete_sentences',
    'excessive_chars',
    'blank_lines',
    'blank_lines_ratio',
    'duplicated_lines',
    'duplicate_line_ratio',
    'count_special_chars',
    'tabs',
    'multispaces',
    'short_line_count_20',
    # Names shared with the keys of PII_REGEX_PATTERNS.
    'date_reg',
    'address_reg',
    'post_code_reg',
    'ip_reg',
    'nip_reg',
    'regon_reg',
    'phone_reg',
    'iban_reg',
    'email_reg',
    'pesel_reg',
    'currency_reg',
    'isbn_reg',
    'issn_reg',
    'short_line_ratio_20',
    'ellipsis_fractions',
    'line_counts',
    'non_alpha_word_fractions',
    'lorem_ipsum_ratio',
    'mean_word_length',
    'stop_word_ratio',
    'entropy',
    'javascript_counts_per_line',
    'lines_with_bullet',
    'ratio_of_bulletpoints',
    'overall_uppercase_ratio',
    'bad_word_count',
    'fraction_duplicate_5_ngram',
    'fraction_duplicate_6_ngram',
    'fraction_duplicate_7_ngram',
    'fraction_duplicate_8_ngram',
    'fraction_duplicate_9_ngram',
    'fraction_duplicate_10_ngram',
    'fraction_top_2_ngram',
    'fraction_top_3_ngram',
    'fraction_top_4_ngram',
    'fraction_top_5_ngram',
    'symbol_to_word_ratio',
    'avg_paragraph_length',
    'paragraph_length_variance',
    'unique_sentence_beginnings_ratio',
    'semicolons_per_sentence',
    'dashes_per_sentence',
    'colons_per_sentence',
    'formal_words_ratio',
    'commas_per_sentence',
    'short_sentences_ratio',
    'long_sentences_ratio',
    'cohesive_words_per_sentence',
    'quotes_and_references_per_sentence',
    # Markdown-related names (suffix _md).
    'headers_per_1000_chars_md',
    'average_header_level_md',
    'bold_per_1000_chars_md',
    'italic_per_1000_chars_md',
    'unordered_list_items_per_1000_chars_md',
    'ordered_list_items_per_1000_chars_md',
    'links_per_1000_chars_md',
    'images_per_1000_chars_md',
    'inline_code_fragments_per_1000_chars_md',
    'code_blocks_per_1000_chars_md',
    'blockquotes_per_1000_chars_md',
    'horizontal_rules_per_1000_chars_md',
    # Per-character ratio names.
    'char_ratio_#',
    'char_ratio_*',
    'char_ratio_-',
    'char_ratio_+',
    'char_ratio_[',
    'char_ratio_]',
    'char_ratio_(',
    'char_ratio_)',
    'char_ratio_`',
    'char_ratio_>',
    'char_ratio__',
    'char_ratio_!',
    'special_chars_ratio_md',
    'lowercase_ratio_md',
    'uppercase_ratio_md',
    'digit_ratio_md',
    'whitespace_ratio_md',
    # Table-related names.
    'table_pipe_count',
    'table_pipe_ratio',
    'table_pipe_per_1000_chars',
    'table_lines_count',
    'table_lines_ratio',
    'table_header_separators_count',
    'avg_pipes_per_table_line',
    'estimated_avg_columns',
    'word_count',
    'unique_word_count',
    'top_word_count',
    'top_word_ratio',
    'top_5_ratio',
    'top_10_ratio',
    'hapax_legomena_ratio',
    'looping_suspicion',
    'polish_diacritics_count',
    'polish_diacritics_ratio',
    'polish_diacritics_per_word',
    'diacritics_to_letters_ratio',
    'replacement_char_count',
    'replacement_char_ratio',
    'not_allowed_chars_count',
    'not_allowed_chars_ratio',
    'encoding_suspicion',
    'single_char_word_count',
    'single_char_unique_count',
    'single_char_upper_count',
    'single_char_lower_count',
    'single_char_upper_unique_count',
    'single_char_lower_unique_count',
    'single_char_top_1_codepoint',
    'single_char_top_2_codepoint',
    'single_char_top_3_codepoint',
    'question_sentence_ratio',
    'single_word_line_ratio',
    'repeated_word_line_ratio',
    'lix',
    'rix',
    'diacritics_std_dev',
    'ner_count',
    'ner_person_ratio',
    'ner_org_ratio',
    'ner_loc_ratio',
    'ner_misc_ratio',
    'case_diversity',
    'tense_diversity',
    'mood_diversity',
    'top_words_total_count',
    'top_words_noun_ratio',
    'top_words_verb_ratio',
    'top_words_adj_ratio',
    'top_words_other_ratio',
    'top_words_noun_prop_of_all_nouns',
    'top_words_verb_prop_of_all_verbs',
    'top_words_adj_prop_of_all_adjs',
    'top_words_other_prop_of_all_others',
    'avg_dependency_tree_depth',
    'digit_start_lines',
]