adgw
/

Joblib
File size: 13,599 Bytes
5c8f9d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
"""

Module containing constants, word lists and precompiled regular expressions
used throughout the library for text analysis.

"""

import re

# Polish vulgar words and multi-word phrases. Entries are lowercase and some
# contain spaces, so consumers cannot rely on single-token matching alone.
# The original ordering of the list is preserved.
BAD_WORDS = [
    "burdel", "burdelmama",
    "chuj", "chujnia", "ciota", "cipa", "cyc",
    "debil", "dmuchać", "do kurwy nędzy", "dupa", "dupek", "duperele", "dziwka",
    "fiut",
    "gówno", "gówno prawda",
    "huj", "huj ci w dupę",
    "jajco", "ja pierdolę", "jebać", "jebany",
    "kurwa", "kurwy", "kutafon", "kutas",
    "lizać pałę",
    "obciągać chuja", "obciągać fiuta",
    "pierdolec", "pierdolić", "pierdolnąć", "pierdolnięty", "pizda", "pojeb", "pojebany", "popierdolony",
    "robić loda", "ruchać", "rzygać",
    "skurwysyn", "sraczka", "srać", "suka", "syf",
    "wkurwiać",
    "zajebisty",
]

# Polish stop words: function words plus common abbreviations ("dr", "mgr",
# "prof", "ul", "tzw", ...) and lowercase Roman numerals ("ii", "xiv", ...).
# All entries are lowercase.
#
# The two non-words "totobą" and "zeznowu" that used to be in this list were
# concatenation artifacts of "to" + "tobą" and "ze" + "znowu"; all four real
# words are listed separately, so the artifacts have been dropped.
STOP_WORDS = [
    "a","aby","ach","acz","aczkolwiek","aj","albo","ale","ależ","ani","aż","bardziej","bardzo","bez","bo","bowiem",
    "by","byli","bym","bynajmniej","być","był","była","było","były","będzie","będą","cali","cała","cały","chce","choć","ci",
    "ciebie","cię","co","cokolwiek","coraz","coś","czasami","czasem","czemu","czy","czyli","często","daleko","dla","dlaczego",
    "dlatego","do","dobrze","dokąd","dość","dr","dużo","dwa","dwaj","dwie","dwoje","dzisiaj","dziś","gdy","gdyby","gdyż","gdzie",
    "gdziekolwiek","gdzieś","go","godz","hab","i","ich","ii","iii","ile","im","inna","inne","inny","innych","inż","iv","ix","iż",
    "ja","jak","jakaś","jakby","jaki","jakichś","jakie","jakiś","jakiż","jakkolwiek","jako","jakoś","je","jeden","jedna","jednak",
    "jednakże","jedno","jednym","jedynie","jego","jej","jemu","jest","jestem","jeszcze","jeśli","jeżeli","już","ją","każdy","kiedy",
    "kierunku","kilka","kilku","kimś","kto","ktokolwiek","ktoś","która","które","którego","której","który","których","którym","którzy",
    "ku","lat","lecz","lub","ma","mają","mam","mamy","mało","mgr","mi","miał","mimo","między","mnie","mną","mogą","moi","moim","moja",
    "moje","może","możliwe","można","mu","musi","my","mój","na","nad","nam","nami","nas","nasi","nasz","nasza","nasze","naszego","naszych",
    "natomiast","natychmiast","nawet","nic","nich","nie","niech","niego","niej","niemu","nigdy","nim","nimi","nią","niż","no","nowe","np",
    "nr","o","o.o.","obok","od","ok","około","on","ona","one","oni","ono","oraz","oto","owszem","pan","pana","pani","pl","po","pod",
    "podczas","pomimo","ponad","ponieważ","powinien","powinna","powinni","powinno","poza","prawie","prof","przecież","przed","przede","przedtem",
    "przez","przy","raz","razie","roku","również","sam","sama","się","skąd","sobie","sobą","sposób","swoje","są","ta","tak","taka","taki","takich",
    "takie","także","tam","te","tego","tej","tel","temu","ten","teraz","też","to","tobie","tobą","toteż","trzeba","tu","tutaj","twoi","twoim",
    "twoja","twoje","twym","twój","ty","tych","tylko","tym","tys","tzw","u","ul","vi","vii","viii","vol","w","wam","wami","was","wasi","wasz","wasza",
    "wasze","we","według","wie","wiele","wielu","więc","więcej","wszyscy","wszystkich","wszystkie","wszystkim","wszystko","wtedy","www","wy","właśnie",
    "wśród","xi","xii","xiii","xiv","xv","z","za","zapewne","zawsze","zaś","ze","znowu","znów","został","zł","żaden","żadna","żadne","żadnych",
    "że","żeby",
]

# Precompiled regular expressions for detecting personally identifiable
# information (PII) and similar structured tokens in Polish-language text.
# Keys are pattern names, values are compiled ``re.Pattern`` objects.
#
# Most patterns start with a ``(?<=\s)`` lookbehind, so a token located at the
# very beginning of the text (with no preceding whitespace) is not matched.
PII_REGEX_PATTERNS  = {
    # Dates: either year-first numeric dates (year, month, day separated by
    # "/", ".", "\" or "-"), or day-first dates whose month is numeric or a
    # Polish month-name stem, with an optional trailing 2- or 4-digit year.
    'date_reg' : re.compile(r'(?<!\w)([1-2]\d{3}[\/.\\-](12|11|[0-1]?[0-9])[\/.\\-](3[0-1]|2[0-9]|1[0-9]|0?[1-9]))|((?<!\w)(3[0-1]|2[0-9]|1[0-9]|0?[1-9])([\/.\\-](12|11|1[0-9]|0?[1-9])|([\/.\\ -](stycz\w+|lut\w+|mar[cz]\w+|kwie\w+|maj\w|czerw\w+|lip\w+|sierp\w+|wrze[sś]\w+|paź\w+|listop\w+|grud\w+)))(([\/.\\ -]([0-9]){2}|[\/.\\ -]([1-2][0-9]{3})))?)(?![\d])'),
    # Street addresses: a street-type prefix (ul., pl., al., aleja, rondo, ...)
    # preceded by a non-word character, optional title/honorific abbreviations
    # (św., gen., prof., ...), then one or more capitalised words that may carry
    # numbers. The final lookahead rejects a match that is directly followed by
    # a newline and a postal code.
    'address_reg' : re.compile(r"(?<=\W)(?:ul\.|pl\.|al\.|bulw\.|os\.|aleja|bulwar|ulica|skwer|park|rondo)[^\S\r\n]?"
                                r"(?:(?:świętego|trasa|bł\.|im\.|gen\.|dr\.|ks\.|ks\.[^\S\r\n]bp\.|kpt\.|ks\.[^\S\r\n]kan\.|"
                                r"ks\.[^\S\r\n]+kard\.|marsz\.|mjr\.|o\.|ppłk\.|prof\.|św\.|[^\S\r\n])*)"
                                r"(?:(?:\d{1,3}\s+)?[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+(?:\s*[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+)*"
                                r"(?:\s*\d{1,3})*){1,}(?:(?:[^\S\r\n]+(?:i|z|im\.)[^\S\r\n]+)?"
                                r"(?:[^\S\r\n]*(?:\d{1,3}[^\S\r\n]+)?[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+(?:\s*[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+)*"
                                r"(?:\s*\d{1,3})*)*)*(?!\n\d{2}-\d{3})"),
    # Postal codes in the NN-NNN format.
    'post_code_reg' : re.compile(r'(?<=\s)([0-9]{2}-[0-9]{3})(?!\w)'),
    # Dotted-quad IPv4-like addresses.
    # NOTE(review): the octet alternation rejects 100-109 as a complete octet
    # and accepts three-digit values above 255 (e.g. 999) - confirm whether
    # this is intended before relying on it for validation.
    'ip_reg' : re.compile(r'(?<=\s)((25[0-5]|2[0-4][0-9]|\d?[1-9][0-9]|\d)(\.|$)){3}(25[0-5]|2[0-4][0-9]|\d?[1-9][0-9]|\d)(?!:\s)'),
    # Tax identification numbers: 10 digits, or dashed 3-2-2-3 / 3-3-2-2 groups,
    # only when directly preceded by "NIP: " or "NIP ".
    'nip_reg' : re.compile(r'(?:(?<=NIP: )|(?<=NIP ))([0-9]{10}|\d{3}-\d{2}-\d{2}-\d{3}|\d{3}-\d{3}-\d{2}-\d{2})(?!\w)'),
    # Nine-digit numbers directly preceded by "REGON: " or "REGON ".
    'regon_reg' : re.compile(r'(?:(?<=REGON: )|(?<=REGON ))([0-9]{9})(?!\w)'),
    # Nine-digit phone numbers in 3-3-3 grouping whose first digit is 4-8, with
    # an optional "+48"/"0048" prefix (optionally parenthesised). The optional
    # single space/hyphen separator must be the same between the digit groups.
    'phone_reg' : re.compile(r'(?<=\s)((?P<a>\()?(\+|00)?48(?(2)\)|)?)?(\s{0,}(4|5|6|7|8)\d{2})((?P<char>[ -]?)\d{3})((?P=char)\d{3})(?!\w)'),
    # Phone numbers written as 3-2-2 or 2-3-2 digit groups (first digit 2-9),
    # with an optional country prefix and an optional leading two-digit group.
    # The trailing whitespace character is required and is part of the match.
    # NOTE(review): this key has no counterpart in COLUMN_ORDER - confirm
    # whether that omission is deliberate.
    'domestic_phone_reg' : re.compile(r'(?!\s)((?P<a>\()?(\+|00)?48(?(2)\)|)?)?(\s*\d{2})?((\s*[2-9]\d{2})((?P<char>[ -]+)\d{2})((?P=char)\d{2})|(\s*[2-9]\d{1})((?P<char2>[ -]+)\d{3})((?P=char2)\d{2}))\s'),
    # Bank account numbers: optional "PL" followed by 26 digits (2 + 6 x 4),
    # where the four-digit groups may be separated by whitespace.
    'iban_reg' : re.compile(r'(?<=\s)(?:PL)?(?:\d{2}(?:[\s]*\d{4}){6})(?!\w)'),
    # E-mail addresses.
    'email_reg' : re.compile(r'(?<=\s)([a-zA-Z0-9][\w.+-]*@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,20})(?!\w)'),
    # PESEL numbers: 11 digits = 2-digit year, month field, day (01-31) and
    # 5 further digits. The month field encodes the century: 01-12 for births
    # in 1900-1999 and 21-32 (month + 20) for births in 2000-2099. The
    # alternation therefore has to accept 30 (October 2000-2099), which the
    # previous ``3[1-2]`` branch missed.
    'pesel_reg' : re.compile(r'(?<=\s)(\d{2})(1[0-2]|2[1-9]|3[0-2]|0[1-9])(3[0-1]|2\d|1\d|0[1-9])(\d{5})(?!\w)'),
    # Monetary amounts: a number followed by a currency marker
    # (zł, Zł, pln, PLN, $, €, £) or a currency symbol ($, €, £) followed by a
    # number.
    'currency_reg' : re.compile(r'(?<=\s)(([1-9]\d{0,2}[,. ]){0,2}[,. ]?([1-9]\d{1,2}|\d{1,3})([,.]\d{2}|\d+)\s*(zł|Zł|pln|PLN|\$|€|£)|(\$|€|£)\s*([1-9]\d{0,2}[,. ]){0,2}[,. ]?([1-9]\d{1,2}|\d{1,3})([,.]\d{2}|\d+))'),
    # ISBN-13 numbers with the 978 prefix; each of the 10 remaining digits may
    # be preceded by a hyphen.
    'isbn_reg' : re.compile(r'(?<=\s)978(?:-?\d){10}(?!\w)'),
    # "ISSN " or "eISSN " followed by two hyphen-separated groups of four
    # non-whitespace characters.
    'issn_reg' : re.compile(r'(?<=\s)((ISSN|eISSN) [\S]{4}\-[\S]{4})(?!\w)')
}

# Compiled regular expressions for recognising Markdown constructs, keyed by
# construct name. Patterns anchored with ``^``/``$`` are compiled with
# ``re.MULTILINE`` so the anchors apply to every line of the text; the others
# are compiled without extra flags (0).
MARKDOWN_PATTERNS = {
    construct: re.compile(regex, flags)
    for construct, regex, flags in (
        ('header', r'^#+\s', re.MULTILINE),
        ('bold', r'\*\*[^*\n]+?\*\*', 0),
        ('italic', r'\*[^*\n]+?\*', 0),
        ('unordered_list', r'^\s*[-*+]\s', re.MULTILINE),
        ('ordered_list', r'^\s*\d+\.\s', re.MULTILINE),
        ('link', r'\[([^\]]+)\]\(([^\)]+)\)', 0),
        ('image', r'!\[([^\]]*)\]\(([^\)]+)\)', 0),
        ('inline_code', r'`[^`\n]+`', 0),
        ('code_block', r'```[\s\S]*?```', 0),
        ('blockquote', r'^>\s', re.MULTILINE),
        ('horizontal_rule', r'^([-*_])\1{2,}$', re.MULTILINE),
    )
}

PUNCTUATION_PATTERN = re.compile(r'[.,!?;:]')
NON_WORD_CHARS_PATTERN = re.compile(r'[^\w\s]')
EXCESSIVE_SPACES_PATTERN = re.compile(r' {4,}')
CAMEL_CASE_PATTERN = re.compile(r"\b[a-ząęćłńóśżź]+[A-ZĄĘĆŁŃÓŚŻŹ]+[a-ząęćłńóśżź]+[a-ząęćłńóśżźA-ZĄĘĆŁŃÓŚŻŹ]*\b")
ALLOWED_CHARS_PATTERN = re.compile(r'[a-zA-Z0-9ąćęłńóśźż\s.,;:\-!?]')
COMMON_CHARACTERS = list(" \t\n!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^`°abcdefghijklmnopqrstuvwxyz{|}~ÓóĄąĆćĘꣳŃńŚśŹźŻż")
SPACY_MODEL_PL = 'pl_core_news_md'
NLP_MAX_LENGTH = 5_000_000


# Ordered list of feature / column names.
# NOTE(review): presumably this fixes the column order expected by downstream
# consumers (usage is not visible in this module), so neither the order nor the
# spelling of any name - including 'bracet_count' - may be changed here.
COLUMN_ORDER = [
    # --- basic counts and part-of-speech statistics ---
    "characters",
    "words",
    "sentences",
    "avg_sentence_length",
    "nouns",
    "verbs",
    "adjectives",
    "adverbs",
    "punctuations",
    "symbols",
    "stopwords",
    "oovs",
    "avg_word_length",
    "noun_ratio",
    "verb_ratio",
    "adj_ratio",
    "lexical_density",
    "gunning_fog",
    "camel_case",
    "pos_x",
    "pos_num",
    # --- character / casing / digit statistics ---
    "capitalized_words",
    "unique_characters_all",
    "unique_characters_lower",
    "characters_out_of_common",
    "word_isupper<5",
    "word_isupper>5",
    "count_caps",
    "single_char_count",
    "single_char_ratio",
    "digit_count",
    "digit_ratio",
    "punct_frequency",
    "count_digit_to_caps",
    "bracet_count",
    "bracket_ratio",
    # --- line-level statistics ---
    "average_lines",
    "short_line_count_3",
    "short_line_count_5",
    "short_line_count_10",
    "short_line_ratio_3",
    "short_line_ratio_5",
    "short_line_ratio_10",
    "lexical_diversity",
    "contextual_word_repetitions_count",
    "contextual_word_repetitions_ratio",
    "html_tags",
    "bbcode_tags",
    "urls",
    "text_to_markup_ratio",
    "emoticons",
    "slang_words",
    "slang_words_ratio",
    "incomplete_sentences",
    "excessive_chars",
    "blank_lines",
    "blank_lines_ratio",
    "duplicated_lines",
    "duplicate_line_ratio",
    "count_special_chars",
    "tabs",
    "multispaces",
    "short_line_count_20",
    # --- names identical to keys of PII_REGEX_PATTERNS ---
    "date_reg",
    "address_reg",
    "post_code_reg",
    "ip_reg",
    "nip_reg",
    "regon_reg",
    "phone_reg",
    "iban_reg",
    "email_reg",
    "pesel_reg",
    "currency_reg",
    "isbn_reg",
    "issn_reg",
    "short_line_ratio_20",
    "ellipsis_fractions",
    "line_counts",
    "non_alpha_word_fractions",
    "lorem_ipsum_ratio",
    "mean_word_length",
    "stop_word_ratio",
    "entropy",
    "javascript_counts_per_line",
    "lines_with_bullet",
    "ratio_of_bulletpoints",
    "overall_uppercase_ratio",
    "bad_word_count",
    # --- n-gram duplication ---
    "fraction_duplicate_5_ngram",
    "fraction_duplicate_6_ngram",
    "fraction_duplicate_7_ngram",
    "fraction_duplicate_8_ngram",
    "fraction_duplicate_9_ngram",
    "fraction_duplicate_10_ngram",
    "fraction_top_2_ngram",
    "fraction_top_3_ngram",
    "fraction_top_4_ngram",
    "fraction_top_5_ngram",
    "symbol_to_word_ratio",
    # --- paragraph / sentence structure ---
    "avg_paragraph_length",
    "paragraph_length_variance",
    "unique_sentence_beginnings_ratio",
    "semicolons_per_sentence",
    "dashes_per_sentence",
    "colons_per_sentence",
    "formal_words_ratio",
    "commas_per_sentence",
    "short_sentences_ratio",
    "long_sentences_ratio",
    "cohesive_words_per_sentence",
    "quotes_and_references_per_sentence",
    # --- names with the "_md" suffix and per-character ratios ---
    "headers_per_1000_chars_md",
    "average_header_level_md",
    "bold_per_1000_chars_md",
    "italic_per_1000_chars_md",
    "unordered_list_items_per_1000_chars_md",
    "ordered_list_items_per_1000_chars_md",
    "links_per_1000_chars_md",
    "images_per_1000_chars_md",
    "inline_code_fragments_per_1000_chars_md",
    "code_blocks_per_1000_chars_md",
    "blockquotes_per_1000_chars_md",
    "horizontal_rules_per_1000_chars_md",
    "char_ratio_#",
    "char_ratio_*",
    "char_ratio_-",
    "char_ratio_+",
    "char_ratio_[",
    "char_ratio_]",
    "char_ratio_(",
    "char_ratio_)",
    "char_ratio_`",
    "char_ratio_>",
    "char_ratio__",
    "char_ratio_!",
    "special_chars_ratio_md",
    "lowercase_ratio_md",
    "uppercase_ratio_md",
    "digit_ratio_md",
    "whitespace_ratio_md",
    # --- table-related names ---
    "table_pipe_count",
    "table_pipe_ratio",
    "table_pipe_per_1000_chars",
    "table_lines_count",
    "table_lines_ratio",
    "table_header_separators_count",
    "avg_pipes_per_table_line",
    "estimated_avg_columns",
    # --- word-frequency names ---
    "word_count",
    "unique_word_count",
    "top_word_count",
    "top_word_ratio",
    "top_5_ratio",
    "top_10_ratio",
    "hapax_legomena_ratio",
    "looping_suspicion",
    # --- diacritics / encoding names ---
    "polish_diacritics_count",
    "polish_diacritics_ratio",
    "polish_diacritics_per_word",
    "diacritics_to_letters_ratio",
    "replacement_char_count",
    "replacement_char_ratio",
    "not_allowed_chars_count",
    "not_allowed_chars_ratio",
    "encoding_suspicion",
    # --- single-character word names ---
    "single_char_word_count",
    "single_char_unique_count",
    "single_char_upper_count",
    "single_char_lower_count",
    "single_char_upper_unique_count",
    "single_char_lower_unique_count",
    "single_char_top_1_codepoint",
    "single_char_top_2_codepoint",
    "single_char_top_3_codepoint",
    "question_sentence_ratio",
    "single_word_line_ratio",
    "repeated_word_line_ratio",
    "lix",
    "rix",
    "diacritics_std_dev",
    # --- named-entity and morphology names ---
    "ner_count",
    "ner_person_ratio",
    "ner_org_ratio",
    "ner_loc_ratio",
    "ner_misc_ratio",
    "case_diversity",
    "tense_diversity",
    "mood_diversity",
    "top_words_total_count",
    "top_words_noun_ratio",
    "top_words_verb_ratio",
    "top_words_adj_ratio",
    "top_words_other_ratio",
    "top_words_noun_prop_of_all_nouns",
    "top_words_verb_prop_of_all_verbs",
    "top_words_adj_prop_of_all_adjs",
    "top_words_other_prop_of_all_others",
    "avg_dependency_tree_depth",
    "digit_start_lines",
]