"""
Module containing constants, word lists and precompiled regular expressions
used throughout the library for text analysis.
"""

import re

# Polish vulgar words and phrases (some entries are multi-word).
BAD_WORDS = [
    'burdel', 'burdelmama', 'chuj', 'chujnia', 'ciota', 'cipa', 'cyc', 'debil', 'dmuchać',
    'do kurwy nędzy', 'dupa', 'dupek', 'duperele', 'dziwka', 'fiut', 'gówno', 'gówno prawda',
    'huj', 'huj ci w dupę', 'jajco', 'ja pierdolę', 'jebać', 'jebany', 'kurwa', 'kurwy',
    'kutafon', 'kutas', 'lizać pałę', 'obciągać chuja', 'obciągać fiuta', 'pierdolec',
    'pierdolić', 'pierdolnąć', 'pierdolnięty', 'pizda', 'pojeb', 'pojebany', 'popierdolony',
    'robić loda', 'ruchać', 'rzygać', 'skurwysyn', 'sraczka', 'srać', 'suka', 'syf',
    'wkurwiać', 'zajebisty',
]

# Polish stop words, abbreviations and Roman numerals.
# NOTE(review): "totobą" and "zeznowu" look like two neighbouring entries fused
# together; they are kept verbatim because they are runtime data - confirm.
STOP_WORDS = [
    "a", "aby", "ach", "acz", "aczkolwiek", "aj", "albo", "ale", "ależ", "ani", "aż", "bardziej", "bardzo", "bez", "bo", "bowiem",
    "by", "byli", "bym", "bynajmniej", "być", "był", "była", "było", "były", "będzie", "będą", "cali", "cała", "cały", "chce", "choć", "ci",
    "ciebie", "cię", "co", "cokolwiek", "coraz", "coś", "czasami", "czasem", "czemu", "czy", "czyli", "często", "daleko", "dla", "dlaczego",
    "dlatego", "do", "dobrze", "dokąd", "dość", "dr", "dużo", "dwa", "dwaj", "dwie", "dwoje", "dzisiaj", "dziś", "gdy", "gdyby", "gdyż", "gdzie",
    "gdziekolwiek", "gdzieś", "go", "godz", "hab", "i", "ich", "ii", "iii", "ile", "im", "inna", "inne", "inny", "innych", "inż", "iv", "ix", "iż",
    "ja", "jak", "jakaś", "jakby", "jaki", "jakichś", "jakie", "jakiś", "jakiż", "jakkolwiek", "jako", "jakoś", "je", "jeden", "jedna", "jednak",
    "jednakże", "jedno", "jednym", "jedynie", "jego", "jej", "jemu", "jest", "jestem", "jeszcze", "jeśli", "jeżeli", "już", "ją", "każdy", "kiedy",
    "kierunku", "kilka", "kilku", "kimś", "kto", "ktokolwiek", "ktoś", "która", "które", "którego", "której", "który", "których", "którym", "którzy",
    "ku", "lat", "lecz", "lub", "ma", "mają", "mam", "mamy", "mało", "mgr", "mi", "miał", "mimo", "między", "mnie", "mną", "mogą", "moi", "moim", "moja",
    "moje", "może", "możliwe", "można", "mu", "musi", "my", "mój", "na", "nad", "nam", "nami", "nas", "nasi", "nasz", "nasza", "nasze", "naszego", "naszych",
    "natomiast", "natychmiast", "nawet", "nic", "nich", "nie", "niech", "niego", "niej", "niemu", "nigdy", "nim", "nimi", "nią", "niż", "no", "nowe", "np",
    "nr", "o", "o.o.", "obok", "od", "ok", "około", "on", "ona", "one", "oni", "ono", "oraz", "oto", "owszem", "pan", "pana", "pani", "pl", "po", "pod",
    "podczas", "pomimo", "ponad", "ponieważ", "powinien", "powinna", "powinni", "powinno", "poza", "prawie", "prof", "przecież", "przed", "przede", "przedtem",
    "przez", "przy", "raz", "razie", "roku", "również", "sam", "sama", "się", "skąd", "sobie", "sobą", "sposób", "swoje", "są", "ta", "tak", "taka", "taki", "takich",
    "takie", "także", "tam", "te", "tego", "tej", "tel", "temu", "ten", "teraz", "też", "to", "tobie", "tobą", "toteż", "totobą", "trzeba", "tu", "tutaj", "twoi", "twoim",
    "twoja", "twoje", "twym", "twój", "ty", "tych", "tylko", "tym", "tys", "tzw", "u", "ul", "vi", "vii", "viii", "vol", "w", "wam", "wami", "was", "wasi", "wasz", "wasza",
    "wasze", "we", "według", "wie", "wiele", "wielu", "więc", "więcej", "wszyscy", "wszystkich", "wszystkie", "wszystkim", "wszystko", "wtedy", "www", "wy", "właśnie",
    "wśród", "xi", "xii", "xiii", "xiv", "xv", "z", "za", "zapewne", "zawsze", "zaś", "ze", "zeznowu", "znowu", "znów", "został", "zł", "żaden", "żadna", "żadne", "żadnych",
    "że", "żeby",
]

# Precompiled patterns for detecting personally identifiable / structured data.
#
# The named groups in the two phone patterns had lost their names
# ("(?P[ -]?)" instead of "(?P<char>[ -]?)"), which made re.compile() raise
# re.error at import time. The names "char" and "char2" are restored from the
# surviving "(?P=char)" / "(?P=char2)" backreferences; "bracket" is a chosen
# name (the conditional "(?(2)...)" addresses that group by number, so the
# name itself does not influence matching).
#
# NOTE(review): the pattern stored under 'date_reg' matches +48 mobile phone
# numbers, not dates, and its beginning was unrecoverable - the optional
# country-code prefix was rebuilt to mirror 'domestic_phone_reg'. Any leading
# lookbehind it may have had is not restored. COLUMN_ORDER also lists
# 'address_reg', 'post_code_reg', 'ip_reg', 'nip_reg', 'regon_reg' and
# 'phone_reg', none of which has a pattern here. Keys are left exactly as
# found; restore the missing entries from the upstream source.
PII_REGEX_PATTERNS = {
    'date_reg': re.compile(r'((?P<bracket>\()?(\+|00)?48(?(2)\)|)?)?(\s{0,}(4|5|6|7|8)\d{2})((?P<char>[ -]?)\d{3})((?P=char)\d{3})(?!\w)'),
    'domestic_phone_reg': re.compile(r'(?!\s)((?P<bracket>\()?(\+|00)?48(?(2)\)|)?)?(\s*\d{2})?((\s*[2-9]\d{2})((?P<char>[ -]+)\d{2})((?P=char)\d{2})|(\s*[2-9]\d{1})((?P<char2>[ -]+)\d{3})((?P=char2)\d{2}))\s'),
    'iban_reg': re.compile(r'(?<=\s)(?:PL)?(?:\d{2}(?:[\s]*\d{4}){6})(?!\w)'),
    'email_reg': re.compile(r'(?<=\s)([a-zA-Z0-9][\w.+-]*@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,20})(?!\w)'),
    'pesel_reg': re.compile(r'(?<=\s)(\d{2})(1[0-2]|2[1-9]|3[1-2]|0[1-9])(3[0-1]|2\d|1\d|0[1-9])(\d{5})(?!\w)'),
    'currency_reg': re.compile(r'(?<=\s)(([1-9]\d{0,2}[,. ]){0,2}[,. ]?([1-9]\d{1,2}|\d{1,3})([,.]\d{2}|\d+)\s*(zł|Zł|pln|PLN|\$|€|£)|(\$|€|£)\s*([1-9]\d{0,2}[,. ]){0,2}[,. ]?([1-9]\d{1,2}|\d{1,3})([,.]\d{2}|\d+))'),
    'isbn_reg': re.compile(r'(?<=\s)978(?:-?\d){10}(?!\w)'),
    'issn_reg': re.compile(r'(?<=\s)((ISSN|eISSN) [\S]{4}\-[\S]{4})(?!\w)'),
}

# Precompiled patterns for individual Markdown constructs.
MARKDOWN_PATTERNS = {
    'header': re.compile(r'^#+\s', re.MULTILINE),
    'bold': re.compile(r'\*\*[^*\n]+?\*\*'),
    'italic': re.compile(r'\*[^*\n]+?\*'),
    'unordered_list': re.compile(r'^\s*[-*+]\s', re.MULTILINE),
    'ordered_list': re.compile(r'^\s*\d+\.\s', re.MULTILINE),
    'link': re.compile(r'\[([^\]]+)\]\(([^\)]+)\)'),
    'image': re.compile(r'!\[([^\]]*)\]\(([^\)]+)\)'),
    'inline_code': re.compile(r'`[^`\n]+`'),
    'code_block': re.compile(r'```[\s\S]*?```'),
    'blockquote': re.compile(r'^>\s', re.MULTILINE),
    'horizontal_rule': re.compile(r'^([-*_])\1{2,}$', re.MULTILINE),
}

# Single-purpose precompiled patterns.
PUNCTUATION_PATTERN = re.compile(r'[.,!?;:]')
NON_WORD_CHARS_PATTERN = re.compile(r'[^\w\s]')
EXCESSIVE_SPACES_PATTERN = re.compile(r' {4,}')  # runs of four or more spaces
CAMEL_CASE_PATTERN = re.compile(r"\b[a-ząęćłńóśżź]+[A-ZĄĘĆŁŃÓŚŻŹ]+[a-ząęćłńóśżź]+[a-ząęćłńóśżźA-ZĄĘĆŁŃÓŚŻŹ]*\b")
# Matches ONE allowed character. Upper-case Polish diacritics are not in the
# class - NOTE(review): confirm whether that omission is intentional.
ALLOWED_CHARS_PATTERN = re.compile(r'[a-zA-Z0-9ąćęłńóśźż\s.,;:\-!?]')

# Characters treated as "common", stored as a list of one-character strings.
# Two fixes relative to the previous literal:
#   * "\]" was an invalid escape sequence in a non-raw string; it is spelled
#     "\\]" now, which produces the same two characters (backslash and "]")
#     without a syntax warning.
#   * the Polish letters "ęŁł" had been mis-decoded to "ê£³" (ISO-8859-2 bytes
#     read as Latin-1), breaking the upper/lower pair sequence Óó Ąą Ćć Ęę Łł
#     Ńń Śś Źź Żż.
# NOTE(review): "_" is absent from the ASCII run - confirm that is intended.
COMMON_CHARACTERS = list(" \t\n!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^`°abcdefghijklmnopqrstuvwxyz{|}~ÓóĄąĆćĘęŁłŃńŚśŹźŻż")

SPACY_MODEL_PL = 'pl_core_news_md'
NLP_MAX_LENGTH = 5_000_000

# Ordered list of column names.
# NOTE(review): 'bracet_count' looks like a typo of 'bracket_count'; it is a
# runtime key and is therefore left unchanged.
COLUMN_ORDER = [
    'characters', 'words', 'sentences', 'avg_sentence_length', 'nouns', 'verbs', 'adjectives', 'adverbs',
    'punctuations', 'symbols', 'stopwords', 'oovs', 'avg_word_length', 'noun_ratio', 'verb_ratio', 'adj_ratio',
    'lexical_density', 'gunning_fog', 'camel_case', 'pos_x', 'pos_num', 'capitalized_words',
    'unique_characters_all', 'unique_characters_lower', 'characters_out_of_common',
    'word_isupper<5', 'word_isupper>5', 'count_caps', 'single_char_count', 'single_char_ratio',
    'digit_count', 'digit_ratio', 'punct_frequency', 'count_digit_to_caps', 'bracet_count', 'bracket_ratio',
    'average_lines', 'short_line_count_3', 'short_line_count_5', 'short_line_count_10',
    'short_line_ratio_3', 'short_line_ratio_5', 'short_line_ratio_10', 'lexical_diversity',
    'contextual_word_repetitions_count', 'contextual_word_repetitions_ratio',
    'html_tags', 'bbcode_tags', 'urls', 'text_to_markup_ratio', 'emoticons', 'slang_words', 'slang_words_ratio',
    'incomplete_sentences', 'excessive_chars', 'blank_lines', 'blank_lines_ratio',
    'duplicated_lines', 'duplicate_line_ratio', 'count_special_chars', 'tabs', 'multispaces',
    'short_line_count_20',
    'date_reg', 'address_reg', 'post_code_reg', 'ip_reg', 'nip_reg', 'regon_reg', 'phone_reg',
    'iban_reg', 'email_reg', 'pesel_reg', 'currency_reg', 'isbn_reg', 'issn_reg',
    'short_line_ratio_20', 'ellipsis_fractions', 'line_counts', 'non_alpha_word_fractions',
    'lorem_ipsum_ratio', 'mean_word_length', 'stop_word_ratio', 'entropy', 'javascript_counts_per_line',
    'lines_with_bullet', 'ratio_of_bulletpoints', 'overall_uppercase_ratio', 'bad_word_count',
    'fraction_duplicate_5_ngram', 'fraction_duplicate_6_ngram', 'fraction_duplicate_7_ngram',
    'fraction_duplicate_8_ngram', 'fraction_duplicate_9_ngram', 'fraction_duplicate_10_ngram',
    'fraction_top_2_ngram', 'fraction_top_3_ngram', 'fraction_top_4_ngram', 'fraction_top_5_ngram',
    'symbol_to_word_ratio', 'avg_paragraph_length', 'paragraph_length_variance',
    'unique_sentence_beginnings_ratio', 'semicolons_per_sentence', 'dashes_per_sentence',
    'colons_per_sentence', 'formal_words_ratio', 'commas_per_sentence', 'short_sentences_ratio',
    'long_sentences_ratio', 'cohesive_words_per_sentence', 'quotes_and_references_per_sentence',
    'headers_per_1000_chars_md', 'average_header_level_md', 'bold_per_1000_chars_md',
    'italic_per_1000_chars_md', 'unordered_list_items_per_1000_chars_md',
    'ordered_list_items_per_1000_chars_md', 'links_per_1000_chars_md', 'images_per_1000_chars_md',
    'inline_code_fragments_per_1000_chars_md', 'code_blocks_per_1000_chars_md',
    'blockquotes_per_1000_chars_md', 'horizontal_rules_per_1000_chars_md',
    'char_ratio_#', 'char_ratio_*', 'char_ratio_-', 'char_ratio_+', 'char_ratio_[', 'char_ratio_]',
    'char_ratio_(', 'char_ratio_)', 'char_ratio_`', 'char_ratio_>', 'char_ratio__', 'char_ratio_!',
    'special_chars_ratio_md', 'lowercase_ratio_md', 'uppercase_ratio_md', 'digit_ratio_md',
    'whitespace_ratio_md', 'table_pipe_count', 'table_pipe_ratio', 'table_pipe_per_1000_chars',
    'table_lines_count', 'table_lines_ratio', 'table_header_separators_count',
    'avg_pipes_per_table_line', 'estimated_avg_columns',
    'word_count', 'unique_word_count', 'top_word_count', 'top_word_ratio', 'top_5_ratio', 'top_10_ratio',
    'hapax_legomena_ratio', 'looping_suspicion',
    'polish_diacritics_count', 'polish_diacritics_ratio', 'polish_diacritics_per_word',
    'diacritics_to_letters_ratio', 'replacement_char_count', 'replacement_char_ratio',
    'not_allowed_chars_count', 'not_allowed_chars_ratio', 'encoding_suspicion',
    'single_char_word_count', 'single_char_unique_count', 'single_char_upper_count',
    'single_char_lower_count', 'single_char_upper_unique_count', 'single_char_lower_unique_count',
    'single_char_top_1_codepoint', 'single_char_top_2_codepoint', 'single_char_top_3_codepoint',
    'question_sentence_ratio', 'single_word_line_ratio', 'repeated_word_line_ratio',
    'lix', 'rix', 'diacritics_std_dev',
    'ner_count', 'ner_person_ratio', 'ner_org_ratio', 'ner_loc_ratio', 'ner_misc_ratio',
    'case_diversity', 'tense_diversity', 'mood_diversity',
    'top_words_total_count', 'top_words_noun_ratio', 'top_words_verb_ratio', 'top_words_adj_ratio',
    'top_words_other_ratio', 'top_words_noun_prop_of_all_nouns', 'top_words_verb_prop_of_all_verbs',
    'top_words_adj_prop_of_all_adjs', 'top_words_other_prop_of_all_others',
    'avg_dependency_tree_depth', 'digit_start_lines',
]