File size: 1,188 Bytes

6d56cee
 
4fd097f
 
3229c59
6d56cee
 
 
 
3229c59
6d56cee
3229c59
 
6d56cee
 
 
3229c59
 
 
6d56cee
 
 
3229c59
6d56cee
 
 
 
3229c59
6d56cee
 
 
 
 
 
3229c59
6d56cee
 
3229c59
6d56cee
 
 
 
3229c59
6d56cee
 
 
 
 
 
 
 
3229c59

import re
import unicodedata
import random


# Path of the raw text corpus; it is opened for reading, line by line, below.
input_file = "/kaggle/input/kaz-rus-eng-wiki/combined.txt"
# Output path (relative to the working directory) for the training split.
train_file = "train.txt"
# Output path (relative to the working directory) for the dev split.
dev_file = "dev.txt"


def normalize_text(text):
    """Return *text* in Unicode NFC form, converted to lowercase.

    NFC composition is applied first, then ``str.lower`` is applied to the
    composed string.
    """
    return unicodedata.normalize("NFC", text).lower()

def clean_text(text):
    """Strip markup from *text* and collapse its whitespace.

    Angle-bracket tags (``<...>``) and double-bracket spans (``[[...]]``,
    including the text between the brackets) are each replaced by a single
    space. Every run of whitespace is then squeezed to one space and the
    result is trimmed at both ends.
    """
    # Order matters only for readability here: tags first, then [[...]] spans.
    for markup_pattern in (r"<[^>]+>", r"\[\[.*?\]\]"):
        text = re.sub(markup_pattern, " ", text)
    return re.sub(r"\s+", " ", text).strip()


# Read the raw corpus and collect every line that survives filtering/cleaning.
cleaned_lines = []
with open(input_file, "r", encoding="utf-8") as corpus:
    for raw_line in corpus:
        stripped = raw_line.strip()
        # The minimum-length filter looks at the raw (stripped) line, i.e. it
        # is applied before normalization and markup removal.
        if len(stripped) >= 10:
            cleaned = clean_text(normalize_text(stripped))
            # Cleaning can leave nothing behind (e.g. a line that was only
            # markup); such lines are dropped.
            if cleaned:
                cleaned_lines.append(cleaned)


# Shuffle with a dedicated, seeded generator so the train/dev split is
# reproducible from run to run; the module-level RNG state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(cleaned_lines)


# The first 80% of the shuffled lines become the training split and the
# remainder becomes the dev split (int() truncates, so dev gets any leftover).
split_index = int(0.8 * len(cleaned_lines))
train_lines = cleaned_lines[:split_index]
dev_lines = cleaned_lines[split_index:]


# Persist each split as UTF-8 text, one example per line (train first, then
# dev), and finish by reporting how many lines landed in each split.
for out_path, split_lines in ((train_file, train_lines), (dev_file, dev_lines)):
    with open(out_path, "w", encoding="utf-8") as out:
        out.writelines(entry + "\n" for entry in split_lines)

print(f"Train: {len(train_lines)}, Dev: {len(dev_lines)}")