# Eraly-ml/KazBERT

# Import required libraries
import json
import os
import random

import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
# Download the NLTK sentence-tokenizer data used by nltk.sent_tokenize() later
# in this script. Recent NLTK releases load the "punkt_tab" resource instead
# of the original pickled "punkt" model, so both are requested; a package that
# cannot be fetched is reported by nltk.download() rather than raised.
nltk.download('punkt')
nltk.download('punkt_tab')
# ------------------------------------------------------------------------------
# SECTION 1: Define file paths for datasets
# ------------------------------------------------------------------------------
# Kazakh corpus: plain-text file, read line by line in Section 2.
kazakh_path = '/kaggle/input/eng-kaz/kk_wiki_articles.txt'
# English corpus: Parquet file, loaded with pandas in Section 3.
english_path = '/kaggle/input/eng-kaz/test-00000-of-00001.parquet'
# Russian corpus: JSON-lines file behind an "hf://" URL, loaded with pandas in
# Section 4.
russian_json_path = "hf://datasets/Den4ikAI/russian_cleared_wikipedia/wiki_dataset.json"
# ------------------------------------------------------------------------------
# SECTION 2: Load and preprocess the Kazakh dataset
# ------------------------------------------------------------------------------
# Load the Kazakh dataset (each line is an article). Lines are stripped while
# streaming through the file, so the raw lines are never all held in memory
# and each line is stripped only once; lines that are empty after stripping
# are dropped.
with open(kazakh_path, "r", encoding="utf-8") as f:
    kazakh_texts = [line for line in map(str.strip, f) if line]
print(f"Number of Kazakh articles: {len(kazakh_texts)}")
# ------------------------------------------------------------------------------
# SECTION 3: Load and preprocess the English dataset (Parquet format)
# ------------------------------------------------------------------------------
# Load the English dataset from a Parquet file
english_df = pd.read_parquet(english_path)
print("English dataset columns:", english_df.columns.tolist())
# Use the 'text' column when it exists; otherwise fall back to the first
# column (selected by position, so the fallback works whatever it is named).
if 'text' in english_df.columns:
    english_column = english_df['text']
else:
    english_column = english_df.iloc[:, 0]
# Missing values are dropped before converting to a plain Python list.
english_texts = english_column.dropna().tolist()
print(f"Number of English articles: {len(english_texts)}")
# ------------------------------------------------------------------------------
# SECTION 4: Load and preprocess the Russian dataset (JSON lines)
# ------------------------------------------------------------------------------
# Load the Russian dataset; lines=True makes pandas parse one JSON object per
# line.
# NOTE(review): reading an "hf://" URL presumably needs the Hugging Face
# filesystem integration to be installed -- confirm for the target environment.
russian_df = pd.read_json(russian_json_path, lines=True)
print("Russian dataset columns:", russian_df.columns.tolist())
# Use the 'text' column when it exists; otherwise fall back to the first
# column (selected by position).
if 'text' in russian_df.columns:
    russian_column = russian_df['text']
else:
    russian_column = russian_df.iloc[:, 0]
# Missing values are dropped before converting to a plain Python list.
russian_texts = russian_column.dropna().tolist()
print(f"Number of Russian articles: {len(russian_texts)}")
# ------------------------------------------------------------------------------
# SECTION 5: Combine all articles and save to a combined file
# ------------------------------------------------------------------------------
# Combine all texts from the three datasets into one list
# (order: Kazakh, then English, then Russian).
all_texts = kazakh_texts + english_texts + russian_texts
print(f"Total number of articles: {len(all_texts)}")
# Save the combined articles to "combined.txt", appending a newline to each
# article. writelines() consumes the generator in one call instead of issuing
# a separate write() per article.
with open("combined.txt", "w", encoding="utf-8") as f:
    f.writelines(article + "\n" for article in all_texts)
print("Combined dataset saved to combined.txt")
# ------------------------------------------------------------------------------
# SECTION 6: Split data into training and validation sets
# ------------------------------------------------------------------------------
# Split data into train (80%) and validation (20%) sets; the fixed
# random_state makes the split reproducible between runs.
train_texts, val_texts = train_test_split(all_texts, test_size=0.2, random_state=42)
print(f"Number of training examples: {len(train_texts)}, Number of validation examples: {len(val_texts)}")
# Save the training data to "train.txt" and the validation data to
# "valid.txt", appending a newline to each article. Both files are written by
# the same loop so the two output formats cannot drift apart.
for split_path, split_texts in (("train.txt", train_texts), ("valid.txt", val_texts)):
    with open(split_path, "w", encoding="utf-8") as f:
        f.writelines(article + "\n" for article in split_texts)
print("Files train.txt and valid.txt have been saved")
# ------------------------------------------------------------------------------
# SECTION 7: Create pretraining data with masked sentences for masked language modeling
# ------------------------------------------------------------------------------
def _mask_random_word(sentence, rng):
    """Return *sentence* with one whitespace-separated word replaced by "[MASK]".

    A sentence with fewer than two words is returned unchanged. Otherwise the
    word position is drawn uniformly from *rng* (a ``random.Random``) and the
    words are re-joined with single spaces.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence
    words[rng.randrange(len(words))] = "[MASK]"
    return " ".join(words)


# Read the complete training text.
# NOTE(review): this path points at a Kaggle input dataset, not at the
# "train.txt" written to the working directory in Section 6 -- confirm this is
# the intended file.
with open("/kaggle/input/kaz-rus-eng-wiki/train.txt", "r", encoding="utf-8") as f:
    text = f.read()
# Tokenize the text into sentences using NLTK
sentences = nltk.sent_tokenize(text)
# Dedicated, seeded generator: mask positions are reproducible between runs
# (same seed as the train/validation split) and do not disturb the global
# `random` state.
mask_rng = random.Random(42)
output_data = []
for sentence in sentences:
    sentence = sentence.strip()
    # Keep only sentences that end with a period
    if not sentence.endswith('.'):
        continue
    output_data.append({
        "original_sentence": sentence,
        "masked_sentence": _mask_random_word(sentence, mask_rng)
    })
# Save the pretraining examples in JSON format to "train_pretrain.json";
# ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
with open("train_pretrain.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=4)
print(f"Saved {len(output_data)} examples to train_pretrain.json")
# ------------------------------------------------------------------------------
# SECTION 8: Train a WordPiece tokenizer using the tokenizers library
# ------------------------------------------------------------------------------
# Target vocabulary size; defined once so the trainer and the saved tokenizer
# configuration cannot disagree.
VOCAB_SIZE = 30_000
# Read the text file for tokenizer training (using the validation file here).
# NOTE(review): only the validation split is used for training the tokenizer
# -- confirm that this is intended rather than the training split.
with open("/kaggle/working/valid.txt", "r", encoding="utf-8") as f:
    texts = f.readlines()
# Create a WordPiece tokenizer with an unknown token; input is pre-split with
# the library's Whitespace pre-tokenizer.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# Define special tokens
special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
# Setup the WordPiece trainer with vocabulary size and minimum frequency
trainer = trainers.WordPieceTrainer(
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    special_tokens=special_tokens
)
# Train the tokenizer on the texts
tokenizer.train_from_iterator(texts, trainer)
# Save the vocabulary to "vocab.txt", one token per line, ordered by token id
# so that the line number of a token matches its id.
vocab = tokenizer.get_vocab()
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.writelines(token + "\n" for token in sorted(vocab, key=vocab.get))
# Save the tokenizer model in JSON format to "tokenizer.json"
tokenizer.save("tokenizer.json")
# Create and save the special tokens map as JSON
special_tokens_map = {
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]"
}
with open("special_tokens_map.json", "w", encoding="utf-8") as f:
    json.dump(special_tokens_map, f, indent=4)
# Create and save the tokenizer configuration as JSON
tokenizer_config = {
    "do_lower_case": False,
    "vocab_size": VOCAB_SIZE,
    "model_max_length": 512,
    "special_tokens_map_file": "special_tokens_map.json"
}
with open("tokenizer_config.json", "w", encoding="utf-8") as f:
    json.dump(tokenizer_config, f, indent=4)
print("✅ Tokenizer training completed! Files 'tokenizer.json', 'vocab.txt', 'special_tokens_map.json', and 'tokenizer_config.json' have been saved.")