File size: 1,188 Bytes
6d56cee 4fd097f 3229c59 6d56cee 3229c59 6d56cee 3229c59 6d56cee 3229c59 6d56cee 3229c59 6d56cee 3229c59 6d56cee 3229c59 6d56cee 3229c59 6d56cee 3229c59 6d56cee 3229c59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import re
import unicodedata
import random
# Input text file; the script reads it line by line as UTF-8.
input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
# Output path for the training portion of the shuffled, cleaned lines.
train_file = "train.txt"
# Output path for the remaining (dev) portion of the same lines.
dev_file = "dev.txt"
def normalize_text(text):
    """Return *text* in Unicode NFC form, converted to lower case.

    NFC composition runs first so that lower-casing operates on
    precomposed characters.
    """
    return unicodedata.normalize("NFC", text).lower()
def clean_text(text):
    """Strip markup from *text* and collapse its whitespace.

    Three substitutions are applied in order, each replacing its match
    with a single space:

    1. ``<...>`` tag-like spans,
    2. ``[[...]]`` double-bracket spans (non-greedy),
    3. any run of whitespace.

    The result is then stripped of leading and trailing whitespace.
    """
    # NOTE(review): the text inside [[...]] is discarded together with the
    # brackets -- confirm that dropping link labels is intended.
    for pattern in (r"<[^>]+>", r"\[\[.*?\]\]", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
# Lines shorter than this many characters are discarded.
MIN_LINE_LENGTH = 10
# Fraction of the shuffled lines that goes to the training file.
TRAIN_FRACTION = 0.8
# Fixed seed so that re-running the script reproduces the same split;
# an unseeded shuffle would move lines between train and dev on every run.
SPLIT_SEED = 42

# Read the corpus, keeping only lines that are long enough both before and
# after normalization/cleaning.
cleaned_lines = []
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Cheap early rejection of short raw lines.
        if len(line) < MIN_LINE_LENGTH:
            continue
        line = clean_text(normalize_text(line))
        # Cleaning can shrink a markup-heavy line to a tiny fragment (or to
        # nothing), so the length requirement is re-checked on the result.
        if len(line) >= MIN_LINE_LENGTH:
            cleaned_lines.append(line)

# Shuffle with a private, seeded generator: deterministic, and the global
# ``random`` state is left untouched.
random.Random(SPLIT_SEED).shuffle(cleaned_lines)

split_index = int(TRAIN_FRACTION * len(cleaned_lines))
train_lines = cleaned_lines[:split_index]
dev_lines = cleaned_lines[split_index:]

# One cleaned line per output line.
with open(train_file, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in train_lines)
with open(dev_file, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in dev_lines)

print(f"Train: {len(train_lines)}, Dev: {len(dev_lines)}")
|