Fill-Mask
Transformers
Safetensors
PyTorch
Kazakh
Russian
English
bert
Eraly-ml committed on
Commit 4fd097f · verified · 1 Parent(s): 820ab97

data and tokenizer pipeline

Files changed (1)
  1. data-pipline.py +188 -0
data-pipline.py ADDED
@@ -0,0 +1,188 @@
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import json
import random
import nltk
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# Download NLTK's punkt tokenizer if not already downloaded
nltk.download('punkt')

# ------------------------------------------------------------------------------
# SECTION 1: Define file paths for datasets
# ------------------------------------------------------------------------------

# File paths
kazakh_path = '/kaggle/input/eng-kaz/kk_wiki_articles.txt'
english_path = '/kaggle/input/eng-kaz/test-00000-of-00001.parquet'
russian_json_path = "hf://datasets/Den4ikAI/russian_cleared_wikipedia/wiki_dataset.json"

# ------------------------------------------------------------------------------
# SECTION 2: Load and preprocess the Kazakh dataset
# ------------------------------------------------------------------------------

# Load Kazakh dataset (each line is an article)
with open(kazakh_path, "r", encoding="utf-8") as f:
    kazakh_texts = f.readlines()

# Strip extra spaces and remove empty lines
kazakh_texts = [line.strip() for line in kazakh_texts if line.strip()]
print(f"Number of Kazakh articles: {len(kazakh_texts)}")

# ------------------------------------------------------------------------------
# SECTION 3: Load and preprocess the English dataset (Parquet format)
# ------------------------------------------------------------------------------

# Load the English dataset from a Parquet file
english_df = pd.read_parquet(english_path)
print("English dataset columns:", english_df.columns.tolist())

# Assume the text is stored in the column 'text'
if 'text' in english_df.columns:
    english_texts = english_df['text'].dropna().tolist()
else:
    # If the column name is different, use the first column
    english_texts = english_df.iloc[:, 0].dropna().tolist()
print(f"Number of English articles: {len(english_texts)}")

# ------------------------------------------------------------------------------
# SECTION 4: Load and preprocess the Russian dataset (JSON lines)
# ------------------------------------------------------------------------------

# Load Russian dataset (JSON, with lines=True)
russian_df = pd.read_json(russian_json_path, lines=True)
print("Russian dataset columns:", russian_df.columns.tolist())

# Assume the text is stored in the 'text' column
if 'text' in russian_df.columns:
    russian_texts = russian_df['text'].dropna().tolist()
else:
    russian_texts = russian_df.iloc[:, 0].dropna().tolist()
print(f"Number of Russian articles: {len(russian_texts)}")

# ------------------------------------------------------------------------------
# SECTION 5: Combine all articles and save to a combined file
# ------------------------------------------------------------------------------

# Combine all texts from the three datasets into one list
all_texts = kazakh_texts + english_texts + russian_texts
print(f"Total number of articles: {len(all_texts)}")

# Save the combined articles to a file "combined.txt"
with open("combined.txt", "w", encoding="utf-8") as f:
    for article in all_texts:
        f.write(article + "\n")
print("Combined dataset saved to combined.txt")

# ------------------------------------------------------------------------------
# SECTION 6: Split data into training and validation sets
# ------------------------------------------------------------------------------

# Split data into train (80%) and validation (20%) sets
train_texts, val_texts = train_test_split(all_texts, test_size=0.2, random_state=42)
print(f"Number of training examples: {len(train_texts)}, Number of validation examples: {len(val_texts)}")

# Save the training data to "train.txt"
with open("train.txt", "w", encoding="utf-8") as f:
    for article in train_texts:
        f.write(article + "\n")

# Save the validation data to "valid.txt"
with open("valid.txt", "w", encoding="utf-8") as f:
    for article in val_texts:
        f.write(article + "\n")

print("Files train.txt and valid.txt have been saved")

# ------------------------------------------------------------------------------
# SECTION 7: Create pretraining data with masked sentences for masked language modeling
# ------------------------------------------------------------------------------

# Read the complete training text from "train.txt"
with open("/kaggle/input/kaz-rus-eng-wiki/train.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Tokenize the text into sentences using NLTK
sentences = nltk.sent_tokenize(text)

output_data = []
for sentence in sentences:
    sentence = sentence.strip()
    # Select sentences that end with a period
    if sentence.endswith('.'):
        words = sentence.split()
        if len(words) < 2:
            masked_sentence = sentence
        else:
            # Randomly choose one word to replace with the [MASK] token
            idx = random.randint(0, len(words) - 1)
            words[idx] = "[MASK]"
            masked_sentence = " ".join(words)
        output_data.append({
            "original_sentence": sentence,
            "masked_sentence": masked_sentence
        })

# Save the pretraining examples in JSON format to "train_pretrain.json"
with open("train_pretrain.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=4)

print(f"Saved {len(output_data)} examples to train_pretrain.json")

# ------------------------------------------------------------------------------
# SECTION 8: Train a WordPiece tokenizer using the tokenizers library
# ------------------------------------------------------------------------------

# Read the text file for tokenizer training (using the validation file here)
with open("/kaggle/working/valid.txt", "r", encoding="utf-8") as f:
    texts = f.readlines()

# Create a WordPiece tokenizer with an unknown token
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Define special tokens
special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]

# Set up the WordPiece trainer with vocabulary size and minimum frequency
trainer = trainers.WordPieceTrainer(
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=special_tokens
)

# Train the tokenizer on the texts
tokenizer.train_from_iterator(texts, trainer)

# Save the vocabulary to "vocab.txt"
with open("vocab.txt", "w", encoding="utf-8") as f:
    for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda x: x[1]):
        f.write(token + "\n")

# Save the tokenizer model in JSON format to "tokenizer.json"
tokenizer.save("tokenizer.json")

# Create and save the special tokens map as JSON
special_tokens_map = {
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]"
}
with open("special_tokens_map.json", "w", encoding="utf-8") as f:
    json.dump(special_tokens_map, f, indent=4)

# Create and save the tokenizer configuration as JSON
tokenizer_config = {
    "do_lower_case": False,
    "vocab_size": 30_000,
    "model_max_length": 512,
    "special_tokens_map_file": "special_tokens_map.json"
}
with open("tokenizer_config.json", "w", encoding="utf-8") as f:
    json.dump(tokenizer_config, f, indent=4)

print("✅ Tokenizer training completed! Files 'tokenizer.json', 'vocab.txt', 'special_tokens_map.json', and 'tokenizer_config.json' have been saved.")
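
For a quick check of the output, the files produced by SECTION 8 can be loaded back with the Transformers library. The snippet below is a minimal sketch and not part of this commit: it assumes tokenizer.json sits in the working directory and uses transformers' PreTrainedTokenizerFast to wrap the trained WordPiece tokenizer for the fill-mask setup described by the repo tags.

# Minimal usage sketch (not part of data-pipline.py): load the trained
# WordPiece tokenizer with Transformers and tokenize a masked sentence.
from transformers import PreTrainedTokenizerFast

loaded_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",  # produced by SECTION 8 above
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    model_max_length=512,
)

# Sanity check: encode an example masked sentence and inspect the tokens
encoded = loaded_tokenizer("The capital of Kazakhstan is [MASK].")
print(loaded_tokenizer.convert_ids_to_tokens(encoded["input_ids"]))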