Fill-Mask
Transformers
Safetensors
PyTorch
Kazakh
Russian
English
bert
File size: 1,188 Bytes
6d56cee
 
4fd097f
 
3229c59
6d56cee
 
 
 
3229c59
6d56cee
3229c59
 
6d56cee
 
 
3229c59
 
 
6d56cee
 
 
3229c59
6d56cee
 
 
 
3229c59
6d56cee
 
 
 
 
 
3229c59
6d56cee
 
3229c59
6d56cee
 
 
 
3229c59
6d56cee
 
 
 
 
 
 
 
3229c59
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import re
import unicodedata
import random


# Raw text corpus that is read line by line and cleaned below.
# NOTE(review): path looks like a Kaggle dataset mount holding combined
# Kazakh/Russian/English wiki text — confirm against the notebook inputs.
input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
# Output files for the two splits; written relative to the working directory.
train_file = "train.txt"
dev_file = "dev.txt"


def normalize_text(text):
    """Return *text* in Unicode NFC form, converted to lowercase.

    NFC composition is applied first so that visually identical strings
    (precomposed vs. base + combining mark) end up with the same code points
    before lowercasing.
    """
    return unicodedata.normalize("NFC", text).lower()

def clean_text(text):
    """Strip markup from *text* and collapse its whitespace.

    Each of the following is replaced by a single space, in order:
    angle-bracket tags (``<...>``), double-square-bracket spans
    (``[[...]]``, shortest match), and any run of whitespace. Leading and
    trailing whitespace is then removed from the result.
    """
    # Order matters: whitespace is collapsed last so the gaps left behind by
    # removed tags/links are squeezed into single spaces.
    for pattern in (r"<[^>]+>", r"\[\[.*?\]\]", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()


# Lines with fewer characters than this are excluded from the corpus.
MIN_LINE_LENGTH = 10

# Read the raw corpus line by line, normalize and clean each line, and keep
# only the lines that still carry enough text once markup has been removed.
cleaned_lines = []
with open(input_file, "r", encoding="utf-8") as f:
    for raw_line in f:
        line = raw_line.strip()
        # Cheap pre-filter: skip short lines before doing any regex work.
        if len(line) < MIN_LINE_LENGTH:
            continue
        line = clean_text(normalize_text(line))
        # Re-check after cleaning: removing tags/links can shrink a line that
        # was mostly markup below the threshold (or to an empty string), and
        # such fragments should not end up in the train/dev files.
        if len(line) < MIN_LINE_LENGTH:
            continue
        cleaned_lines.append(line)


# Shuffle with a fixed seed so the train/dev split is identical on every run;
# an unseeded shuffle makes results impossible to reproduce or compare.
# A dedicated Random instance is used so the seed applies only to this shuffle.
random.Random(42).shuffle(cleaned_lines)

# The first 80% of the shuffled lines form the train split, the rest dev.
split_index = int(0.8 * len(cleaned_lines))
train_lines = cleaned_lines[:split_index]
dev_lines = cleaned_lines[split_index:]


# Write each split to its own UTF-8 file, one cleaned line per row
# (train first, then dev).
for _path, _lines in ((train_file, train_lines), (dev_file, dev_lines)):
    with open(_path, "w", encoding="utf-8") as f:
        f.writelines(f"{text}\n" for text in _lines)

# Report the size of each split.
print(f"Train: {len(train_lines)}, Dev: {len(dev_lines)}")