jwengr committed on
Commit
56210ab
·
verified ·
1 Parent(s): 61ddbf5

Upload folder using huggingface_hub

Browse files
hangul_augmentator/config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "HangulAugmentator"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_hangul_augmentator.HangulAugmentatorConfig",
7
+ "AutoModel": "modeling_hangul_augmentator.HangulAugmentator"
8
+ },
9
+ "do_add_jong": true,
10
+ "do_link_before": true,
11
+ "do_link_next": true,
12
+ "do_replace_similar": true,
13
+ "model_type": "hangul_augmentator",
14
+ "p": 0.5,
15
+ "torch_dtype": "float32",
16
+ "transformers_version": "4.48.0"
17
+ }
hangul_augmentator/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2b29affbce2da50bace6c60697df30b796ff62cba44ab8755d6b264abebc0de
3
+ size 108
hangul_augmentator/modeling_hangul_augmentator.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from jamotools import split_syllables, join_jamos
6
+ from transformers import PretrainedConfig, PreTrainedModel, AutoConfig
7
+
8
class HangulAugmentatorConfig(PretrainedConfig):
    """Settings for ``HangulAugmentator``.

    Args:
        p: Threshold compared against ``random.random()`` each time the
            augmentator decides whether to perturb a character or a pair of
            adjacent characters.
        do_link_next: Enables the ``link_next`` pass.
        do_link_before: Enables the ``link_before`` pass.
        do_replace_similar: Enables the per-character ``replace_similar`` pass.
        do_add_jong: Enables the per-character ``add_jong`` pass.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "hangul_augmentator"

    def __init__(
        self,
        p=0.5,
        do_link_next=True,
        do_link_before=True,
        do_replace_similar=True,
        do_add_jong=True,
        **kwargs
    ):
        # The base class consumes the generic options first; the
        # augmentation-specific ones are attached afterwards.
        super().__init__(**kwargs)

        own_options = (
            ("p", p),
            ("do_link_next", do_link_next),
            ("do_link_before", do_link_before),
            ("do_replace_similar", do_replace_similar),
            ("do_add_jong", do_add_jong),
        )
        for option_name, option_value in own_options:
            setattr(self, option_name, option_value)
26
+
27
+
28
class HangulAugmentator(PreTrainedModel):
    """Rule-based random noise generator for Hangul text.

    Calling an instance with a string runs up to four passes, in this order,
    each enabled by the matching ``config.do_*`` flag and gated per
    character (or per adjacent pair) by ``config.p``:

    1. ``link_next``       - carry a final consonant over to a following
                             syllable whose initial is 'ㅇ'.
    2. ``link_before``     - give a syllable without a final consonant the
                             initial consonant of the following syllable.
    3. ``replace_similar`` - swap each jamo for an entry of the similarity
                             tables.
    4. ``add_jong``        - append a random final consonant to a syllable
                             that has none.

    Characters outside the precomposed Hangul syllable range
    (U+AC00..U+D7A3) are never modified.

    NOTE(review): this class defines ``__call__`` itself rather than
    ``forward``; confirm that replacing the inherited ``__call__`` is intended.
    """

    config_class = HangulAugmentatorConfig

    def __init__(self, config):
        """Build the lookup tables used by the augmentation passes.

        Args:
            config: A ``HangulAugmentatorConfig``; stored by the base class
                and read back through ``self.config``.
        """
        super().__init__(config)
        # No method of this class reads this parameter.
        # NOTE(review): presumably it exists so the saved checkpoint contains
        # at least one tensor - confirm before renaming or removing.
        self.temp_module = torch.nn.Parameter(torch.ones(1))

        # Initial consonant -> candidate replacements (replace_similar).
        self.ja_similar_dict = {
            'ㅂ': ['ㅃ', 'ㅍ'],
            'ㄱ': ['ㄲ', 'ㅋ'],
            'ㄷ': ['ㄸ', 'ㅌ'],
            'ㄲ': ['ㄲ', 'ㅋ'],
            'ㅅ': ['ㅆ'],
            'ㅈ': ['ㅉ', 'ㅊ'],
            'ㅌ': ['ㄸ', 'ㅌ', 'ㄷ'],
            'ㅋ': ['ㄲ', 'ㄱ'],
            'ㅍ': ['ㅃ', 'ㅂ'],
            'ㅃ': ['ㅍ', 'ㅂ'],
            'ㄸ': ['ㅌ', 'ㄷ'],
            'ㅊ': ['ㅉ', 'ㅉ', 'ㅈ'],
            'ㅆ': ['ㅅ'],
            'ㅉ': ['ㅉ', 'ㅈ'],
        }

        # Vowel -> candidate replacements (replace_similar).
        # NOTE(review): the entries involving ㅑ/ㅡ and ㅒ/ㅢ were restored from
        # text in which each pair was indistinguishable; they were resolved so
        # that keys stay unique and ㅏ<->ㅑ, ㅒ<->ㅖ stay symmetric. Confirm
        # against the upstream file.
        self.mo_similar_dict = {
            'ㅕ': ['ㅓ'],
            'ㅏ': ['ㅑ'],
            'ㅐ': ['ㅔ', 'ㅒ'],
            'ㅗ': ['ㅛ'],
            'ㅙ': ['ㅚ', 'ㅞ'],
            'ㅡ': ['ㅜ'],
            'ㅣ': ['ㅟ'],
            'ㅜ': ['ㅠ'],
            'ㅓ': ['ㅕ'],
            'ㅔ': ['ㅖ', 'ㅞ'],
            'ㅛ': ['ㅗ'],
            'ㅚ': ['ㅙ', 'ㅞ'],
            'ㅠ': ['ㅜ'],
            'ㅝ': ['ㅓ'],
            'ㅖ': ['ㅒ'],
            'ㅢ': ['ㅟ'],
            'ㅑ': ['ㅏ'],
            'ㅞ': ['ㅙ', 'ㅚ', 'ㅔ'],
            'ㅒ': ['ㅖ'],
        }

        # Final consonant -> [final kept on the first syllable,
        #                     initial given to the next syllable] (_link_next).
        # The keys double as the pool of finals drawn by add_jong, so their
        # order affects results under a seeded RNG.
        self.jong_link_dict = {
            'ㅂ': ['ㅂ', 'ㅂ'],
            'ㅍ': ['ㅍ', 'ㅍ'],
            'ㄱ': ['ㄱ', 'ㄱ'],
            'ㅊ': ['ㅊ', 'ㅊ'],
            'ㅎ': ['ㅎ', 'ㅎ'],
            'ㅇ': ['ㅇ', 'ㅇ'],
            'ㅌ': ['ㅌ', 'ㅌ'],
            'ㄽ': ['ㄹ', 'ㅅ'],
            'ㄿ': ['ㄹ', 'ㅍ'],
            'ㄵ': ['ㄴ', 'ㅈ'],
            'ㄲ': ['ㄲ', 'ㄲ'],
            'ㅋ': ['ㅋ', 'ㅋ'],
            'ㄴ': ['ㄴ', 'ㄴ'],
            'ㄷ': ['ㄷ', 'ㄷ'],
            'ㅀ': ['ㄹ', 'ㅎ'],
            'ㅈ': ['ㅈ', 'ㅈ'],
            'ㄺ': ['ㄹ', 'ㄱ'],
            'ㄼ': ['ㄹ', 'ㅂ'],
            'ㅅ': ['ㅅ', 'ㅅ'],
            'ㄶ': ['ㄴ', 'ㅎ'],
            'ㄹ': ['ㄹ', 'ㄹ'],
            'ㅁ': ['ㅁ', 'ㅁ'],
            'ㄳ': ['ㄱ', 'ㅅ'],
            'ㅆ': ['ㅆ', 'ㅆ'],
            'ㄾ': ['ㄹ', 'ㅌ'],
            'ㅄ': ['ㅂ', 'ㅅ'],
            'ㄻ': ['ㄹ', 'ㅁ'],
        }

        # Final consonant -> candidate replacements (replace_similar).
        self.jong_similar_dict = {
            'ㄹ': [
                'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ',
                'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㄲ': [
                'ㄱ', 'ㄳ', 'ㄴ', 'ㄹ', 'ㄻ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅋ', 'ㅌ', 'ㅎ',
            ],
            'ㅅ': [
                'ㄱ', 'ㄲ', 'ㄴ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄽ', 'ㅀ',
                'ㅁ', 'ㅂ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㅁ': [
                'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
                'ㄾ', 'ㄿ', 'ㅀ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㄴ': [
                'ㄱ', 'ㄲ', 'ㄳ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
                'ㄾ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㅇ': ['ㅎ'],
            'ㅆ': [
                'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ',
                'ㄼ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㅄ': ['ㅂ', 'ㅍ'],
            'ㅂ': [
                'ㄱ', 'ㄴ', 'ㄵ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㅀ',
                'ㅁ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㄶ': ['ㄴ', 'ㄵ'],
            'ㅍ': [
                'ㄱ', 'ㄲ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ',
                'ㄼ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅌ', 'ㅎ',
            ],
            'ㄱ': [
                'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ',
                'ㄼ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
            ],
            'ㅌ': [
                'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㅁ', 'ㅂ',
                'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅍ', 'ㅎ',
            ],
            'ㄼ': ['ㄹ', 'ㄺ', 'ㄻ', 'ㅀ'],
            'ㄷ': [
                'ㄱ', 'ㄲ', 'ㄴ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㅀ',
                'ㅁ', 'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅌ',
            ],
            'ㅈ': [
                'ㄱ', 'ㄴ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㅁ',
                'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅊ', 'ㅌ', 'ㅎ',
            ],
            'ㅎ': ['ㄱ', 'ㄳ', 'ㄴ', 'ㄹ', 'ㄻ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅍ'],
            'ㅊ': ['ㄷ', 'ㅁ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅌ'],
            'ㄵ': ['ㄴ', 'ㄶ'],
            'ㄻ': ['ㄹ', 'ㄺ', 'ㄽ', 'ㅀ'],
            'ㅀ': ['ㄹ', 'ㄺ', 'ㄻ'],
            'ㄺ': ['ㄹ', 'ㄻ', 'ㄼ', 'ㅀ'],
            'ㅋ': ['ㄱ', 'ㄳ', 'ㅄ'],
            'ㄳ': ['ㄱ'],
            'ㄾ': ['ㄹ'],
        }

    def __call__(self, sentence):
        """Return a randomly perturbed copy of ``sentence``.

        Args:
            sentence: Text to augment.

        Returns:
            The text after every enabled pass has been applied in the order
            link_next, link_before, replace_similar, add_jong.
        """
        p = self.config.p
        if self.config.do_link_next:
            sentence = self.link_next(sentence, p)
        if self.config.do_link_before:
            sentence = self.link_before(sentence, p)
        if self.config.do_replace_similar:
            sentence = self._apply_per_char(sentence, self.replace_similar, p)
        if self.config.do_add_jong:
            sentence = self._apply_per_char(sentence, self.add_jong, p)
        return sentence

    @staticmethod
    def _is_syllable(char):
        """Return True if ``char`` is one precomposed Hangul syllable."""
        return len(char) == 1 and 0xAC00 <= ord(char) <= 0xD7A3

    @staticmethod
    def _apply_per_char(sentence, transform, p):
        """Apply ``transform`` to each character independently with chance ``p``."""
        # random.random() is in [0, 1), so `< p` never fires for p == 0 and
        # always fires for p == 1.
        return ''.join(
            transform(char) if random.random() < p else char
            for char in sentence
        )

    @staticmethod
    def _apply_per_pair(sentence, linker, p):
        """Apply ``linker`` to each adjacent character pair with chance ``p``."""
        chars = list(sentence)
        # Index loop on purpose: a rewritten chars[i + 1] must be what the
        # next iteration sees as its left-hand character.
        for i in range(len(chars) - 1):
            if random.random() < p:
                chars[i], chars[i + 1] = linker(chars[i], chars[i + 1])
        return ''.join(chars)

    def _link_next(self, char1, char2):
        """Carry the final consonant of ``char1`` into ``char2``.

        Applies only when both arguments are Hangul syllables, ``char1``
        decomposes into three jamo and ``char2`` begins with 'ㅇ'. The final of
        ``char1`` is looked up in ``jong_link_dict`` to obtain the final that
        stays on ``char1`` and the initial that replaces 'ㅇ' on ``char2``.

        Returns:
            ``(new_char1, new_char2)``, or the inputs unchanged when the
            conditions are not met.
        """
        if not (self._is_syllable(char1) and self._is_syllable(char2)):
            return char1, char2
        char1_jamo = list(split_syllables(char1))
        if len(char1_jamo) != 3:
            return char1, char2
        char2_jamo = list(split_syllables(char2))
        if char2_jamo[0] != 'ㅇ':
            return char1, char2
        new_jong, new_cho = self.jong_link_dict[char1_jamo[-1]]
        # [:1] keeps only the first character of whatever join_jamos returns.
        new_char1 = join_jamos(char1_jamo[:2] + [new_jong])[:1]
        new_char2 = join_jamos([new_cho] + char2_jamo[1:])[:1]
        return new_char1, new_char2

    def link_next(self, sentence, p):
        """Run ``_link_next`` over adjacent pairs of ``sentence`` with chance ``p``."""
        return self._apply_per_pair(sentence, self._link_next, p)

    def _link_before(self, char1, char2):
        """Copy the initial jamo of ``char2`` onto ``char1`` as a final.

        Applies only when both arguments are Hangul syllables and ``char1``
        decomposes into exactly two jamo. ``char2`` is always returned
        unchanged.

        Returns:
            ``(new_char1, char2)``, or the inputs unchanged when the
            conditions are not met.
        """
        if not (self._is_syllable(char1) and self._is_syllable(char2)):
            return char1, char2
        char1_jamo = list(split_syllables(char1))
        if len(char1_jamo) != 2:
            return char1, char2
        char2_jamo = list(split_syllables(char2))
        # [:1] keeps only the first character of the joined result.
        # NOTE(review): presumably this discards an initial that cannot be
        # composed as a final - confirm against join_jamos.
        new_char1 = join_jamos(char1_jamo[:2] + char2_jamo[:1])[:1]
        return new_char1, char2

    def link_before(self, sentence, p):
        """Run ``_link_before`` over adjacent pairs of ``sentence`` with chance ``p``."""
        return self._apply_per_pair(sentence, self._link_before, p)

    def replace_similar(self, char):
        """Replace each jamo of a Hangul syllable using the similarity tables.

        The initial, the vowel and (if present) the final are each replaced by
        a random entry from ``ja_similar_dict``, ``mo_similar_dict`` and
        ``jong_similar_dict`` respectively; a jamo missing from its table is
        kept. Non-syllable input is returned unchanged.
        """
        if not self._is_syllable(char):
            return char
        jamo = list(split_syllables(char))
        # The .get() fallback is the one-character jamo string itself, so
        # random.choice can only return that jamo. The call is still made, so
        # one random draw is consumed per jamo either way.
        jamo[0] = random.choice(self.ja_similar_dict.get(jamo[0], jamo[0]))
        jamo[1] = random.choice(self.mo_similar_dict.get(jamo[1], jamo[1]))
        if len(jamo) == 3:
            jamo[2] = random.choice(self.jong_similar_dict.get(jamo[2], jamo[2]))
        return join_jamos(jamo)[:1]

    def add_jong(self, char):
        """Append a random final consonant to a syllable that has none.

        The final is drawn uniformly from the keys of ``jong_link_dict``.
        Non-syllable input and syllables that already decompose into three
        jamo are returned unchanged.
        """
        if not self._is_syllable(char):
            return char
        jamo = list(split_syllables(char))
        if len(jamo) == 3:
            return char
        new_jong = random.choice(list(self.jong_link_dict))
        return join_jamos(jamo[:2] + [new_jong])[:1]