ngia committed on
Commit d91ea77 · 1 Parent(s): 0c55d1c

Deploy on Hugging Face Spaces for inference

Files changed (13)
  1. .gitignore +20 -0
  2. README.md +73 -13
  3. app.py +44 -0
  4. data_collector.py +61 -0
  5. inference.py +43 -0
  6. model.py +318 -0
  7. process_raw_data.py +64 -0
  8. pyproject.toml +20 -0
  9. requirements.txt +8 -0
  10. tokenize_dataset.py +57 -0
  11. tokenizer.py +133 -0
  12. train.py +353 -0
  13. utils.py +20 -0
.gitignore ADDED
@@ -0,0 +1,20 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+ .cache
+
+ # Virtual environments
+ .venv
+ .env
+
+ # Folders
+ trained_tokenizers/
+ checkpoints/
+ work_dir/
+ data/
+ pyproject_copy.toml
+
README.md CHANGED
@@ -1,13 +1,73 @@
- ---
- title: Translation En Fr
- emoji: 🔥
- colorFrom: yellow
- colorTo: green
- sdk: gradio
- sdk_version: 5.21.0
- app_file: app.py
- pinned: false
- short_description: Translation language model from English to French
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Training Transformers from Scratch for Language Translation (English to French)
+
+ ## Overview
+ This project focuses on training a Transformer model from scratch to perform English-to-French translation. It follows a structured approach, from data collection to model deployment using Gradio.
+
+ ![Transformer Architecture](https://dassignies.law/wp-content/uploads/2024/04/DASSIGNIES-avocat-intelligence-artificielle-cybersecurite-strategie-protection-actifs-immateriels-formations-expertises-blog-transformer-architecture.webp)
+
+ ## Project Steps
+
+ 1. **Data Collection**
+    - Gather parallel English-French text data for training.
+
+ 2. **Dataset Creation and Upload to Hugging Face**
+    - Preprocess and structure the dataset.
+    - Upload the dataset to the Hugging Face Hub for easy access.
+
+ 3. **Training Tokenizers**
+    - Train separate tokenizers for English and French.
+    - Save and store the trained tokenizers.
+
+ 4. **Creating a Tokenized Dataset**
+    - Tokenize the dataset using the trained tokenizers.
+    - Publish the tokenized dataset on Hugging Face.
+
+ 5. **Building the Transformer Model from Scratch**
+    - Implement custom Transformer components, including:
+      - Encoder
+      - Decoder
+      - Embedding Layer
+      - Positional Encoding
+
+ 6. **Model Training and Evaluation**
+    - Train the model using the prepared dataset.
+    - Use Weights & Biases (wandb) for real-time metric visualization.
+
+ 7. **Inference**
+    - Test the trained model with sample English inputs.
+    - Generate translated French text (a minimal sketch follows this list).
+
+ 8. **Web Interface with Gradio**
+    - Develop an interactive UI using Gradio for easy model inference.
+
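A minimal sketch of the resulting inference pipeline (the same calls `app.py` and `inference.py` make; the paths assume the tokenizers and checkpoint are available locally or can be fetched from the Hub):

```python
import torch
from tokenizer import CustomTokenizer
from model import Transformer, TransformerConfig

src_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json")
tgt_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_fr.json")

model = Transformer(config=TransformerConfig(max_seq_length=512))
model.load_weights_from_checkpoints("checkpoints/model.safetensors")
model.eval()

# Encode English text, generate French ids greedily, then decode them
src_ids = torch.tensor(src_tokenizer.encode("How are you?")).unsqueeze(0)
output_ids = model.inference(src_ids=src_ids,
                             tgt_start_id=tgt_tokenizer.bos_token_id,
                             tgt_end_id=tgt_tokenizer.eos_token_id,
                             max_seq_length=512)
print(tgt_tokenizer.decode(output_ids, skip_special_tokens=True))
```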
+ ## Installation
+
+ To use the application, install the required dependencies using either `uv` or `pip`:
+
+ Using `uv`:
+ ```bash
+ uv pip install -r requirements.txt
+ ```
+
+ Using `pip`:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## Running the Application
+
+ To launch the application, run:
+ ```bash
+ python app.py
+ ```
+
+ This will start a Gradio interface where users can input English text and receive French translations.
+
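For programmatic access while the app is running, a sketch using `gradio_client` is shown below. The local URL and the `/translate` endpoint name are assumptions based on Gradio's defaults for `demo.launch()` and the `translate` function in `app.py`:

```python
from gradio_client import Client  # pip install gradio_client

client = Client("http://127.0.0.1:7860")  # default local address used by demo.launch()
result = client.predict("How are you?", api_name="/translate")
print(result)  # the French translation returned by the app
```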
+ ## Repository Structure
+ - **data_collector.py** - Dataset wrapper that pads, masks, and shifts token sequences for training.
+ - **tokenize_dataset.py** - Prepares and tokenizes the dataset.
+ - **model.py** - Contains the Transformer model implementation.
+ - **train.py** - Training script.
+ - **inference.py** - Inference script for model predictions.
+ - **app.py** - Web interface with Gradio.
+ - **requirements.txt** - List of dependencies.
app.py ADDED
@@ -0,0 +1,44 @@
+ import torch
+ from tokenizer import CustomTokenizer
+ from model import Transformer, TransformerConfig
+ import gradio as gr
+
+
+ # Load tokenizers
+ path_to_src_tokenizer = "trained_tokenizers/vocab_en.json"
+ path_to_tgt_tokenizer = "trained_tokenizers/vocab_fr.json"
+
+ src_tokenizer = CustomTokenizer(path_to_vocab=path_to_src_tokenizer)
+ tgt_tokenizer = CustomTokenizer(path_to_vocab=path_to_tgt_tokenizer)
+
+
+ # Load model
+ config = TransformerConfig(max_seq_length=512)
+ model = Transformer(config=config)
+
+ path_to_checkpoints = "checkpoints/model.safetensors"
+ model.load_weights_from_checkpoints(path_to_checkpoints=path_to_checkpoints)
+ model.eval()
+
+
+ def translate(input_text, skip_special_tokens=True):
+     src_ids = torch.tensor(src_tokenizer.encode(input_text)).unsqueeze(0)
+     output_ids = model.inference(src_ids=src_ids,
+                                  tgt_start_id=tgt_tokenizer.bos_token_id,
+                                  tgt_end_id=tgt_tokenizer.eos_token_id,
+                                  max_seq_length=512)
+     output_tokens = tgt_tokenizer.decode(input=output_ids, skip_special_tokens=skip_special_tokens)
+     return output_tokens
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## Traduction Anglais → Français")
+
+     with gr.Row():
+         texte_input = gr.Textbox(label="Texte en anglais", lines=4)
+         texte_output = gr.Textbox(label="Texte traduit (Français)", lines=4, interactive=False)
+
+     bouton = gr.Button("Traduire")
+     bouton.click(translate, inputs=texte_input, outputs=texte_output)
+
+ demo.launch()
data_collector.py ADDED
@@ -0,0 +1,61 @@
+ import torch
+ from torch.utils.data import Dataset
+
+
+ class DataCollector(Dataset):
+     def __init__(self, dataset, english_tokenizer, french_tokenizer, max_length=512):
+         self.dataset = dataset
+         self.english_tokenizer = english_tokenizer
+         self.french_tokenizer = french_tokenizer
+         self.max_length = max_length
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, index):
+         english_input_ids = torch.tensor(self.dataset[index]['src_ids'])
+         french_input_ids = torch.tensor(self.dataset[index]['tgt_ids'])
+
+         # Pad manually with torch.cat (or torch.nn.functional.pad)
+         src_pad_token = self.english_tokenizer.pad_token_id
+         tgt_pad_token = self.french_tokenizer.pad_token_id
+
+         # English side: pad or truncate to max_length
+         if len(english_input_ids) < self.max_length:
+             pad_length = self.max_length - len(english_input_ids)
+             english_input_ids = torch.cat([english_input_ids, torch.full((pad_length,), src_pad_token, dtype=english_input_ids.dtype)])
+         else:
+             english_input_ids = english_input_ids[:self.max_length]
+
+         # French side: pad or truncate to max_length
+         if len(french_input_ids) < self.max_length:
+             pad_length = self.max_length - len(french_input_ids)
+             french_input_ids = torch.cat([french_input_ids, torch.full((pad_length,), tgt_pad_token, dtype=french_input_ids.dtype)])
+         else:
+             french_input_ids = french_input_ids[:self.max_length]
+
+         # Build the padding masks
+         src_pad_mask = (english_input_ids != src_pad_token)
+         tgt_pad_mask = (french_input_ids != tgt_pad_token)
+
+         # For translation (or LM) tasks, shift the target by one position
+         input_tgt = french_input_ids[:-1].clone()
+         label_tgt = french_input_ids[1:].clone()
+         input_tgt_mask = (input_tgt != tgt_pad_token)
+         label_tgt[label_tgt == tgt_pad_token] = -100
+
+         return {
+             "src_input_ids": english_input_ids,  # fixed size: (self.max_length,)
+             "src_pad_mask": src_pad_mask,
+             "tgt_input_ids": french_input_ids,   # fixed size: (self.max_length,)
+             "tgt_pad_mask": torch.cat([input_tgt_mask, torch.full((1,), 0, dtype=french_input_ids.dtype)]),
+             "tgt_labels": torch.cat([label_tgt, torch.full((1,), -100, dtype=french_input_ids.dtype)])
+         }
+
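A quick, hypothetical way to sanity-check the collator's output shapes, using a stub tokenizer and a hand-made sample (not part of the repo):

```python
import torch
from data_collector import DataCollector

class StubTokenizer:
    # DataCollector only reads pad_token_id from its tokenizers
    pad_token_id = 0

samples = [{"src_ids": [5, 6, 7], "tgt_ids": [5, 8, 9, 2]}]
collector = DataCollector(samples, StubTokenizer(), StubTokenizer(), max_length=8)

for key, value in collector[0].items():
    print(key, tuple(value.shape))
# Every tensor comes out with shape (8,): inputs are padded to max_length,
# and tgt_labels is the target shifted left by one with -100 on padding positions.
```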
inference.py ADDED
@@ -0,0 +1,43 @@
+ import torch
+ from model import Transformer, TransformerConfig
+ from tokenizer import CustomTokenizer
+ from datasets import load_dataset
+
+ path_to_model_safetensors = "checkpoints/model.safetensors"
+ path_to_src_tokenizer = "trained_tokenizers/vocab_en.json"
+ path_to_tgt_tokenizer = "trained_tokenizers/vocab_fr.json"
+
+ config = TransformerConfig(device='cpu', max_seq_length=512)
+ model = Transformer(config=config)
+
+ # Load the weights
+ model.load_weights_from_checkpoints(path_to_model_safetensors)
+ model.eval()
+
+ src_tokenizer = CustomTokenizer(path_to_vocab=path_to_src_tokenizer)
+ tgt_tokenizer = CustomTokenizer(path_to_vocab=path_to_tgt_tokenizer)
+
+
+ english_text = "I'm very sick and I want to see a doctor."
+
+ src_ids = torch.tensor(src_tokenizer.encode(english_text)).unsqueeze(0)
+
+ # Generation starts from [BOS] and stops at [EOS]
+ translated_ids = model.inference(src_ids=src_ids, tgt_start_id=tgt_tokenizer.bos_token_id, tgt_end_id=tgt_tokenizer.eos_token_id, max_seq_length=512)
+ translated_tokens = tgt_tokenizer.decode(translated_ids, skip_special_tokens=True)
+ print(f"English: {english_text} \nFrench: {translated_tokens}")
+
+
+ dataset = load_dataset("bilalfaye/english-wolof-french-translation", split="train")
+ samples = dataset.shuffle().select(range(50))
+
+ for i in range(50):
+     sample = samples[i]
+     src_ids = torch.tensor(src_tokenizer.encode(sample["en"])).unsqueeze(0)
+     output_ids = model.inference(src_ids=src_ids, tgt_start_id=tgt_tokenizer.bos_token_id, tgt_end_id=tgt_tokenizer.eos_token_id, max_seq_length=512)
+     predicted_tokens = tgt_tokenizer.decode(output_ids, skip_special_tokens=True)
+     print(f"English: {sample['en']}")
+     print(f"French (labels): {sample['fr']}")
+     print(f"French (predicted): {predicted_tokens}")
+     print("--------------------------------\n\n")
model.py ADDED
@@ -0,0 +1,318 @@
+ from dataclasses import dataclass
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ import os
+ from utils import get_file_FROM_HF
+ from safetensors.torch import load_file
+
+
+ @dataclass
+ class TransformerConfig:
+     src_vocab_size: int = 32000
+     tgt_vocab_size: int = 32000
+     max_seq_length: int = 64
+     d_model: int = 512
+     num_heads: int = 8
+     num_encoder_layers: int = 6
+     num_decoder_layers: int = 6
+     dropout_p: float = 0.1
+     dff: int = 2048
+     device: str = 'cpu'
+
+
+ # Source embedding block
+ class SourceEmbedding(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.src_embedding = nn.Embedding(num_embeddings=config.src_vocab_size, embedding_dim=config.d_model)
+
+     def forward(self, x):
+         return self.src_embedding(x)
+
+
+ # Target embedding block
+ class TargetEmbedding(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.tgt_embedding = nn.Embedding(num_embeddings=config.tgt_vocab_size, embedding_dim=config.d_model)
+
+     def forward(self, x):
+         return self.tgt_embedding(x)
+
+
+ # Positional encoding (PE)
+ class PositionEncoding(nn.Module):
+     def __init__(self, config: TransformerConfig, require_grad=False):
+         super().__init__()
+         self.PE = torch.zeros(config.max_seq_length, config.d_model)
+         pos = torch.arange(0, config.max_seq_length).reshape(-1, 1)
+         i = torch.arange(0, config.d_model, step=2)
+
+         denominator = torch.pow(10000, (2*i) / config.d_model)
+         self.PE[:, 0::2] = torch.sin(pos/denominator)
+         self.PE[:, 1::2] = torch.cos(pos/denominator)
+
+         self.PE = nn.Parameter(self.PE, requires_grad=require_grad)
+
+     def forward(self, x):
+         max_seq_length = x.shape[1]
+         return x + self.PE[:max_seq_length]
+
+
+ # Multi-head attention block (covers self-attention, masked self-attention and cross-attention)
+ class MultiheadAttention(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.config = config
+
+         # d_model must be divisible by num_heads to get the head dim
+         assert config.d_model % self.config.num_heads == 0, "d_model is not divisible by num_heads"
+         self.head_dim = self.config.d_model // self.config.num_heads
+
+         self.q_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+         self.k_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+         self.v_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+
+         self.out_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+
+     def forward(self, src, tgt=None, attention_mask=None, causal=False):
+         batch, src_seq_length, d_model = src.shape
+
+         # SELF-ATTENTION (optionally masked / causal)
+         if tgt is None:
+             q = self.q_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+             k = self.k_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+             v = self.v_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+
+             if attention_mask is not None:
+                 attention_mask = attention_mask.bool()
+                 attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1,1,src_seq_length,1).to(self.config.device)
+
+             # MASKED MULTI-HEAD ATTENTION: build the causal mask whenever requested,
+             # even when no padding mask is given (e.g. during autoregressive inference)
+             if causal:
+                 causal_mask = ~torch.triu(torch.ones((src_seq_length, src_seq_length), dtype=torch.bool), diagonal=1)
+                 causal_mask = causal_mask.unsqueeze(0).unsqueeze(0).to(self.config.device)
+
+                 if attention_mask is not None:
+                     # combine the padding mask with the causal mask
+                     combined_mask = causal_mask.int() * attention_mask.int()
+                     attention_mask = combined_mask.bool().to(self.config.device)
+                 else:
+                     attention_mask = causal_mask
+
+             attention_out = F.scaled_dot_product_attention(q, k, v,
+                                                            attn_mask=attention_mask,
+                                                            dropout_p=self.config.dropout_p if self.training else 0.0,
+                                                            is_causal=False)
+
+         # CROSS-ATTENTION (queries come from tgt, keys/values from src)
+         else:
+             tgt_seq_length = tgt.shape[1]
+             q = self.q_proj(tgt).reshape(batch, tgt_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+             k = self.k_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+             v = self.v_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+
+             if attention_mask is not None:
+                 attention_mask = attention_mask.bool()
+                 attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1,1,tgt_seq_length,1)
+
+             attention_out = F.scaled_dot_product_attention(q, k, v,
+                                                            attn_mask=attention_mask,
+                                                            dropout_p=self.config.dropout_p if self.training else 0.0,
+                                                            is_causal=False)
+
+         attention_out = attention_out.transpose(1,2).flatten(2)
+         attention_out = self.out_proj(attention_out)
+         return attention_out
+
+
+ # Position-wise feed-forward network (MLP)
+ class FeedForward(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.hidden_layer = nn.Linear(in_features=config.d_model, out_features=config.dff)    # e.g. 512 -> 2048
+         self.hidden_dropout = nn.Dropout(p=config.dropout_p)
+         self.output_layer = nn.Linear(in_features=config.dff, out_features=config.d_model)    # e.g. 2048 -> 512
+         self.output_dropout = nn.Dropout(p=config.dropout_p)
+
+     def forward(self, x):
+         x = self.hidden_layer(x)
+         x = F.gelu(x)
+         x = self.hidden_dropout(x)
+         x = self.output_layer(x)
+         x = self.output_dropout(x)
+         return x
+
+
+ # Encoder block
+ class EncoderBlock(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.multi_head_attention = MultiheadAttention(config=config)
+         self.feed_forward = FeedForward(config=config)
+         self.layer_norm_1 = nn.LayerNorm(config.d_model)
+         self.layer_norm_2 = nn.LayerNorm(config.d_model)
+         self.dropout = nn.Dropout(config.dropout_p)
+
+     def forward(self, x, attention_mask=None):
+         x = x + self.dropout(self.multi_head_attention(src=x, attention_mask=attention_mask))
+         x = self.layer_norm_1(x)
+
+         x = x + self.feed_forward(x)
+         x = self.layer_norm_2(x)
+         return x
+
+
+ # Decoder block
+ class DecoderBlock(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.masked_multi_head_attention = MultiheadAttention(config=config)
+         self.dropout_masked = nn.Dropout(config.dropout_p)
+
+         self.cross_multi_head_attention = MultiheadAttention(config=config)
+         self.dropout_cross = nn.Dropout(config.dropout_p)
+
+         self.feed_forward = FeedForward(config=config)
+
+         self.layer_norm_1 = nn.LayerNorm(config.d_model)
+         self.layer_norm_2 = nn.LayerNorm(config.d_model)
+         # layer_norm_3 is declared but never applied in forward(); it is kept (unused)
+         # so the state dict stays compatible with the released checkpoint
+         self.layer_norm_3 = nn.LayerNorm(config.d_model)
+
+     def forward(self, src, tgt, src_attention_mask=None, tgt_attention_mask=None):
+         tgt = tgt + self.dropout_masked(self.masked_multi_head_attention(tgt, attention_mask=tgt_attention_mask, causal=True))
+         tgt = self.layer_norm_1(tgt)
+
+         tgt = tgt + self.dropout_cross(self.cross_multi_head_attention(src, tgt, attention_mask=src_attention_mask))
+         tgt = self.layer_norm_2(tgt)
+
+         tgt = tgt + self.feed_forward(tgt)
+         return tgt
+
+
+ # Transformer (putting it all together)
+ class Transformer(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+
+         self.src_embedding = SourceEmbedding(config=config)
+         self.tgt_embedding = TargetEmbedding(config=config)
+
+         self.position_encoding = PositionEncoding(config=config)
+
+         self.encoder = nn.ModuleList(
+             [EncoderBlock(config=config) for _ in range(config.num_encoder_layers)]
+         )
+
+         self.decoder = nn.ModuleList(
+             [DecoderBlock(config=config) for _ in range(config.num_decoder_layers)]
+         )
+
+         self.output = nn.Linear(config.d_model, config.tgt_vocab_size)
+
+         ## Init weights
+         self.apply(_init_weights_)
+
+     def forward(self, src_ids, tgt_ids, src_attention_mask=None, tgt_attention_mask=None):
+         # Embed the token ids
+         src_embed = self.src_embedding(src_ids)
+         tgt_embed = self.tgt_embedding(tgt_ids)
+
+         # Add the positional encoding
+         src_embed = self.position_encoding(src_embed)
+         tgt_embed = self.position_encoding(tgt_embed)
+
+         for layer in self.encoder:
+             src_embed = layer(src_embed, src_attention_mask)
+
+         for layer in self.decoder:
+             tgt_embed = layer(src_embed, tgt_embed, src_attention_mask, tgt_attention_mask)
+
+         pred = self.output(tgt_embed)
+
+         return pred
+
+     @torch.no_grad()
+     def inference(self, src_ids, tgt_start_id, tgt_end_id, max_seq_length):
+         tgt_ids = torch.tensor([tgt_start_id], device=src_ids.device).reshape(1,1)
+
+         # Encode the source once
+         src_embed = self.src_embedding(src_ids)
+         src_embed = self.position_encoding(src_embed)
+         for layer in self.encoder:
+             src_embed = layer(src_embed)
+
+         # Generate the target greedily, token by token
+         for i in range(max_seq_length):
+             tgt_embed = self.tgt_embedding(tgt_ids)
+             tgt_embed = self.position_encoding(tgt_embed)
+             for layer in self.decoder:
+                 tgt_embed = layer(src_embed, tgt_embed)
+
+             tgt_embed = tgt_embed[:, -1]
+
+             pred = self.output(tgt_embed)
+             pred = pred.argmax(dim=-1).unsqueeze(0)
+             tgt_ids = torch.cat([tgt_ids, pred], dim=-1)
+
+             if torch.all(pred == tgt_end_id):
+                 break
+
+         return tgt_ids.squeeze().cpu().tolist()
+
+     def load_weights_from_checkpoints(self, path_to_checkpoints):
+         if not os.path.exists(path_to_checkpoints):
+             print("------------------- LOADING MODEL CHECKPOINTS FROM HUGGING FACE --------------------------")
+             folder = os.path.dirname(path_to_checkpoints)
+             os.makedirs(folder, exist_ok=True)
+             path_to_checkpoints = get_file_FROM_HF(repo_id="ngia/ml-translation-en-fr", file_path="final_checkpoint/model.safetensors", local_dir=folder)
+
+         checkpoints = load_file(filename=path_to_checkpoints)
+         self.load_state_dict(checkpoints)
+         return self
+
+
+ def _init_weights_(module):
+     """
+     Simple weight initialization taken directly from the Hugging Face
+     `modeling_roberta.py` implementation!
+     """
+     if isinstance(module, nn.Linear):
+         module.weight.data.normal_(mean=0.0, std=0.02)
+         if module.bias is not None:
+             module.bias.data.zero_()
+     elif isinstance(module, nn.Embedding):
+         module.weight.data.normal_(mean=0.0, std=0.02)
+         if module.padding_idx is not None:
+             module.weight.data[module.padding_idx].zero_()
+     elif isinstance(module, nn.LayerNorm):
+         module.bias.data.zero_()
+         module.weight.data.fill_(1.0)
+
+
+ if __name__ == "__main__":
+     config = TransformerConfig()
+     model = Transformer(config=config)
+
+     english = torch.randint(low=0, high=1000, size=(1,3))
+     res = model.inference(src_ids=english, tgt_start_id=1, tgt_end_id=2, max_seq_length=config.max_seq_length)
+     print(res)
+
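For reference, the positional encoding built by `PositionEncoding` and the attention computed through `F.scaled_dot_product_attention` correspond to the following, with $i$ ranging over the even embedding dimensions exactly as in the code above:

$$PE_{(pos,\,i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad PE_{(pos,\,i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)$$

$$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V, \qquad d_k = \frac{d_{\text{model}}}{\text{num\_heads}} = \frac{512}{8} = 64$$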
process_raw_data.py ADDED
@@ -0,0 +1,64 @@
+ from datasets import load_dataset, concatenate_datasets, load_from_disk
+ import os
+
+
+ def create_dataset(root_data_path, save_data_path, cache_data_path, test_size=0.01):
+     list_datasets = []
+
+     for directory in os.listdir(root_data_path):
+         path_to_dir = os.path.join(root_data_path, directory)
+
+         if os.path.isdir(path_to_dir):
+             print(f"Processing: {path_to_dir}")
+
+             english_text = None
+             french_text = None
+
+             # Find the aligned .en/.fr file pair in this corpus directory
+             for file_dir in os.listdir(path_to_dir):
+                 if file_dir.endswith(".en"):
+                     english_text = os.path.join(path_to_dir, file_dir)
+                 if file_dir.endswith(".fr"):
+                     french_text = os.path.join(path_to_dir, file_dir)
+
+             if english_text is not None and french_text is not None:
+                 english_dataset = load_dataset("text", data_files=english_text, cache_dir=cache_data_path)["train"]
+                 french_dataset = load_dataset("text", data_files=french_text, cache_dir=cache_data_path)["train"]
+
+                 english_dataset = english_dataset.rename_column("text", "english_src")
+                 dataset = english_dataset.add_column("french_tgt", french_dataset["text"])
+
+                 list_datasets.append(dataset)
+
+     hf_dataset = concatenate_datasets(list_datasets)
+     hf_dataset = hf_dataset.train_test_split(test_size=test_size)
+
+     hf_dataset.save_to_disk(save_data_path)
+     print(f"Dataset successfully saved in: {save_data_path}")
+
+
+ def push_dataset_into_hf_hub(save_data_path):
+     dataset = load_from_disk(dataset_path=save_data_path)
+     dataset = dataset.shuffle()
+     dataset.push_to_hub(repo_id="ngia/translation-en-fr")
+     print("Successfully pushed to the Hugging Face Hub")
+
+
+ if __name__ == "__main__":
+     root_data_path = "data/raw_data/"
+     save_data_path = "data/saved_data/"
+     cache_data_path = "data/cached_data/"
+
+     create_dataset(root_data_path=root_data_path, save_data_path=save_data_path, cache_data_path=cache_data_path)
+     dataset = load_from_disk(dataset_path=save_data_path)
+     print(dataset["train"][10])
+
+     push_dataset_into_hf_hub(save_data_path=save_data_path)
+
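For illustration, `create_dataset` expects each sub-directory of `root_data_path` to hold one aligned `.en`/`.fr` file pair, one sentence per line. The snippet below builds a tiny hypothetical corpus in that layout:

```python
import os

root = "data/raw_data/demo"  # hypothetical corpus directory
os.makedirs(root, exist_ok=True)

with open(os.path.join(root, "demo.en"), "w") as f:
    f.write("Hello.\nHow are you?\n")
with open(os.path.join(root, "demo.fr"), "w") as f:
    f.write("Bonjour.\nComment allez-vous ?\n")

# create_dataset("data/raw_data/", "data/saved_data/", "data/cached_data/")
```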
pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [project]
+ name = "translator-en-fr"
+ version = "0.1.0"
+ description = "Transformer trained from scratch for English-to-French translation"
+ readme = "README.md"
+ requires-python = ">=3.12.0"
+ dependencies = [
+     "accelerate>=1.4.0",
+     "datasets>=3.3.2",
+     "gradio>=5.21.0",
+     "huggingface-hub>=0.29.1",
+     "matplotlib>=3.10.1",
+     "sentencepiece>=0.2.0",
+     "streamlit>=1.43.2",
+     "torch>=2.6.0",
+     "torchvision>=0.21.0",
+     "transformers>=4.49.0",
+     "wandb>=0.19.8",
+ ]
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ accelerate
+ datasets
+ huggingface-hub
+ sentencepiece
+ transformers
+ wandb
+ matplotlib
+ gradio
tokenize_dataset.py ADDED
@@ -0,0 +1,57 @@
+ from tokenizer import CustomTokenizer
+ from datasets import load_from_disk
+
+
+ def tokenize_dataset(path_to_dataset,
+                      path_to_save,
+                      num_workers=24,
+                      truncate=False,
+                      max_length=512,
+                      min_length=3):
+
+     english_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json", truncate=truncate, max_length=max_length)
+     french_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_fr.json", truncate=truncate, max_length=max_length)
+
+     dataset = load_from_disk(path_to_dataset)
+
+     def _tokenize_text(examples):
+         english_text = examples["english_src"]
+         french_text = examples["french_tgt"]
+
+         src_ids = english_tokenizer.encode(english_text)
+         tgt_ids = french_tokenizer.encode(french_text)
+
+         batch = {
+             "src_ids": src_ids,
+             "tgt_ids": tgt_ids
+         }
+         return batch
+
+     tokenized_dataset = dataset.map(_tokenize_text, batched=True, num_proc=num_workers)
+     tokenized_dataset = tokenized_dataset.remove_columns(["english_src", "french_tgt"])
+
+     # Drop pairs whose target is shorter than min_length tokens
+     filter_func = lambda batch: [len(e) >= min_length for e in batch["tgt_ids"]]
+     tokenized_dataset = tokenized_dataset.filter(filter_func, batched=True)
+
+     print(tokenized_dataset)
+
+     tokenized_dataset.save_to_disk(path_to_save)
+     print("Tokenized dataset successfully saved to disk")
+
+
+ if __name__ == "__main__":
+     path_to_dataset = "data/saved_data"
+     path_to_save = "data/tokenized_dataset"
+     tokenize_dataset(path_to_dataset=path_to_dataset, path_to_save=path_to_save)
+
+     # Push the dataset to the Hub
+     tokenized_dataset = load_from_disk(dataset_path=path_to_save)
+     tokenized_dataset.push_to_hub("ngia/tokenized-translation-en-fr")
+     print("Tokenized dataset successfully pushed to the Hugging Face Hub")
+
tokenizer.py ADDED
@@ -0,0 +1,133 @@
+ from tokenizers import Tokenizer, normalizers, decoders
+ from tokenizers.models import WordPiece
+ from tokenizers.trainers import WordPieceTrainer
+ from tokenizers.normalizers import NFC, Lowercase
+ from tokenizers.pre_tokenizers import Whitespace
+ from tokenizers.processors import TemplateProcessing
+ from utils import get_file_FROM_HF
+
+ import glob
+ import os
+
+
+ def train_tokenizer(path_to_data, lang):
+     special_token_dict = {
+         "pad_token": "[PAD]",
+         "start_token": "[BOS]",
+         "end_token": "[EOS]",
+         "unknown_token": "[UNK]"
+     }
+
+     tokenizer = Tokenizer(WordPiece(unk_token=special_token_dict["unknown_token"]))
+     tokenizer.normalizer = normalizers.Sequence([NFC(), Lowercase()])
+     tokenizer.pre_tokenizer = Whitespace()
+
+     files = []
+
+     if lang == "fr":
+         print("---------Training French Tokenizer--------------")
+         files = glob.glob(os.path.join(path_to_data, "**/*.fr"))
+     elif lang == "en":
+         print("---------Training English Tokenizer--------------")
+         files = glob.glob(os.path.join(path_to_data, "**/*.en"))
+
+     trainer = WordPieceTrainer(vocab_size=32000, special_tokens=list(special_token_dict.values()))
+     tokenizer.train(files, trainer)
+     tokenizer.save(f"trained_tokenizers/vocab_{lang}.json")
+     print(f"Tokenizer successfully saved to trained_tokenizers/vocab_{lang}.json")
+
+
+ class CustomTokenizer:
+
+     def __init__(self, path_to_vocab, truncate=False, max_length=512):
+         self.path_to_vocab = path_to_vocab
+         self.truncate = truncate
+         self.max_length = max_length
+         self.tokenizer = self.config_tokenizer()
+         self.vocab_size = self.tokenizer.get_vocab_size()
+
+         self.pad_token = "[PAD]"
+         self.pad_token_id = self.tokenizer.token_to_id("[PAD]")
+
+         self.bos_token = "[BOS]"
+         self.bos_token_id = self.tokenizer.token_to_id("[BOS]")
+
+         self.eos_token = "[EOS]"
+         self.eos_token_id = self.tokenizer.token_to_id("[EOS]")
+
+         self.unk_token = "[UNK]"
+         self.unk_token_id = self.tokenizer.token_to_id("[UNK]")
+
+         self.post_processor = TemplateProcessing(
+             single="[BOS] $A [EOS]",
+             special_tokens=[
+                 (self.bos_token, self.bos_token_id),
+                 (self.eos_token, self.eos_token_id)
+             ]
+         )
+
+         if self.truncate:
+             # Leave room for the [BOS]/[EOS] tokens added by the post-processor
+             self.max_length = max_length - self.post_processor.num_special_tokens_to_add(is_pair=False)
+
+     def config_tokenizer(self):
+         # Fall back to downloading the vocab from the Hub if it is missing locally
+         if not os.path.exists(self.path_to_vocab):
+             self.path_to_vocab = self.load_file_from_hugging_face()
+         tokenizer = Tokenizer.from_file(self.path_to_vocab)
+         tokenizer.decoder = decoders.WordPiece()
+         return tokenizer
+
+     def encode(self, input):
+         def _parse_process_tokenized(tokenized):
+             if self.truncate:
+                 tokenized.truncate(self.max_length, direction="right")
+             tokenized = self.post_processor.process(tokenized)
+             return tokenized.ids
+
+         if isinstance(input, str):
+             tokenized = self.tokenizer.encode(input)
+             tokenized = _parse_process_tokenized(tokenized)
+
+         if isinstance(input, (list, tuple)):
+             tokenized = self.tokenizer.encode_batch(input)
+             tokenized = [_parse_process_tokenized(t) for t in tokenized]
+
+         return tokenized
+
+     def decode(self, input, skip_special_tokens=True):
+         if isinstance(input, list):
+             if all(isinstance(item, list) for item in input):
+                 decoded = self.tokenizer.decode_batch(input, skip_special_tokens=skip_special_tokens)
+             elif all(isinstance(item, int) for item in input):
+                 decoded = self.tokenizer.decode(input, skip_special_tokens=skip_special_tokens)
+
+         return decoded
+
+     def load_file_from_hugging_face(self):
+         filename = os.path.basename(self.path_to_vocab)
+         if filename == "vocab_en.json":
+             print("------------------- LOADING SOURCE TOKENIZER FROM HUGGING FACE --------------------------")
+         elif filename == "vocab_fr.json":
+             print("------------------- LOADING TARGET TOKENIZER FROM HUGGING FACE --------------------------")
+
+         os.makedirs("trained_tokenizers/", exist_ok=True)
+         path_to_tokenizer = get_file_FROM_HF(repo_id="ngia/ml-translation-en-fr", file_path=filename, local_dir="trained_tokenizers/")
+         return path_to_tokenizer
+
+
+ if __name__ == "__main__":
+     path_to_data_root = "/home/ngam/Documents/translator-en-fr/data/raw_data"
+     # Replace False with True to train new tokenizers
+     if False:
+         train_tokenizer(path_to_data_root, lang='fr')
+         train_tokenizer(path_to_data_root, lang='en')
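A minimal round trip with `CustomTokenizer` (assuming `trained_tokenizers/vocab_en.json` exists locally; otherwise the class downloads it from the Hub as shown above):

```python
from tokenizer import CustomTokenizer

tok = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json")

ids = tok.encode("How are you?")  # the post-processor wraps the ids in [BOS] ... [EOS]
print(ids)
print(tok.decode(ids, skip_special_tokens=True))  # note: the normalizer lowercases text
```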
train.py ADDED
@@ -0,0 +1,353 @@
+ import os
+ import numpy as np
+ import torch
+
+ from model import Transformer, TransformerConfig
+ from data_collector import DataCollector
+ from torch.utils.data import DataLoader
+ from datasets import load_dataset
+ from transformers import get_scheduler
+ from tokenizer import CustomTokenizer
+ from tqdm import tqdm
+ from accelerate import Accelerator
+ from huggingface_hub import HfApi, create_repo
+ import shutil
+
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ # MODEL CONFIG
+ src_vocab_size: int = 32000
+ tgt_vocab_size: int = 32000
+ max_seq_length: int = 512
+ d_model: int = 512
+ num_heads: int = 8
+ num_encoder_layers: int = 6
+ num_decoder_layers: int = 6
+ dropout_p: float = 0.1
+ dff: int = 2048
+
+ config = TransformerConfig(
+     src_vocab_size=src_vocab_size,
+     tgt_vocab_size=tgt_vocab_size,
+     max_seq_length=max_seq_length,
+     d_model=d_model,
+     num_heads=num_heads,
+     num_encoder_layers=num_encoder_layers,
+     num_decoder_layers=num_decoder_layers,
+     dropout_p=dropout_p,
+     dff=dff
+ )
+
+
+ # TOKENIZER CONFIG
+ src_tokenizer_path = "trained_tokenizers/vocab_en.json"
+ tgt_tokenizer_path = "trained_tokenizers/vocab_fr.json"
+
+ src_tokenizer = CustomTokenizer(path_to_vocab=src_tokenizer_path, max_length=config.max_seq_length)
+ tgt_tokenizer = CustomTokenizer(path_to_vocab=tgt_tokenizer_path, max_length=config.max_seq_length)
+
+
+ # DATALOADER CONFIG
+ path_to_data = "data/tokenized_dataset"
+ batch_size = 64
+ gradient_accumulation_steps = 2
+ # num_workers = 4
+
+ # TRAINING CONFIG
+ learning_rate = 1e-4
+ training_steps = 170000
+ warmup_steps = 2000
+ scheduler_type = "cosine"
+ evaluation_steps = 5000
+ bias_norm_weight_decay = False
+ weight_decay = 0.001
+ betas = (0.9, 0.98)
+ adam_eps = 1e-6
+
+
+ # LOGGING CONFIG
+ working_directory = "work_dir"
+ experiment_name = "Seq2Seq_Neural_Machine_Translation"
+ logging_interval = 1
+
+ # Resume from checkpoint
+ resume_from_checkpoint = "checkpoint_170000"
+
+
+ # Prepare the Accelerator
+ path_to_experiment = os.path.join(working_directory, experiment_name)
+ accelerator = Accelerator(project_dir=path_to_experiment,
+                           log_with="wandb")
+
+ accelerator.init_trackers(experiment_name)
+
+ # Configure the model device
+ config.device = accelerator.device
+
+
+ # Prepare dataloaders
+ dataset = load_dataset("ngia/tokenized-translation-en-fr")
+
+ accelerator.print("Dataset:", dataset)
+ min_batch_size = batch_size // gradient_accumulation_steps
+ train_dataset = DataCollector(dataset=dataset["train"], english_tokenizer=src_tokenizer, french_tokenizer=tgt_tokenizer, max_length=config.max_seq_length)
+ test_dataset = DataCollector(dataset=dataset["test"], english_tokenizer=src_tokenizer, french_tokenizer=tgt_tokenizer, max_length=config.max_seq_length)
+
+ train_loader = DataLoader(dataset=train_dataset, batch_size=min_batch_size, shuffle=True)
+ test_loader = DataLoader(dataset=test_dataset, batch_size=min_batch_size, shuffle=False)
+
+
+ # Prepare the model
+ model = Transformer(config=config)
+ model_parameters = filter(lambda p: p.requires_grad, model.parameters())
+ params = sum([np.prod(p.size()) for p in model_parameters])
+ accelerator.print("Number of trainable parameters:", params)
+
+
+ # Prepare the optimizer
+ optimizer = torch.optim.AdamW(model.parameters(),
+                               lr=learning_rate,
+                               betas=betas,
+                               eps=adam_eps,
+                               weight_decay=weight_decay)
+
+
+ # Define the scheduler
+ scheduler = get_scheduler(
+     name=scheduler_type,
+     optimizer=optimizer,
+     num_warmup_steps=warmup_steps,
+     num_training_steps=training_steps
+ )
+
+ # Define the loss function (-100 labels are skipped: CrossEntropyLoss defaults to ignore_index=-100)
+ loss_fn = torch.nn.CrossEntropyLoss()
+
+
+ ### Define a Sample Sentence for Testing ###
+ src_ids = torch.tensor(src_tokenizer.encode("I want to learn how to train a machine translation model")).unsqueeze(0)
+
+
+ model, optimizer, trainloader, testloader, scheduler = accelerator.prepare(
+     model, optimizer, train_loader, test_loader, scheduler
+ )
+
+
+ accelerator.register_for_checkpointing(scheduler)
+
+ if resume_from_checkpoint is not None:
+     path_to_checkpoint = os.path.join(path_to_experiment, resume_from_checkpoint)
+
+     with accelerator.main_process_first():
+         accelerator.load_state(path_to_checkpoint)
+
+     completed_steps = int(resume_from_checkpoint.split("_")[-1])
+     accelerator.print(f"Resuming from Iteration: {completed_steps}")
+ else:
+     completed_steps = 0
+
+
+ def push_model_HF(repo_id, path_to_experiment, step):
+     """Push the experiment folder (model and tokenizers) to the Hugging Face Hub."""
+     api = HfApi()
+     create_repo(repo_id, exist_ok=True)
+
+     api.upload_folder(
+         folder_path=path_to_experiment,
+         repo_id=repo_id,
+         repo_type="model"  # or "dataset" if it is a dataset
+     )
+
+     print(f"Checkpoint {step} pushed to {repo_id}")
+
+
+ # Copy the tokenizers next to the checkpoints
+ shutil.copy2("trained_tokenizers/vocab_en.json", f"{path_to_experiment}/vocab_en.json")
+ shutil.copy2("trained_tokenizers/vocab_fr.json", f"{path_to_experiment}/vocab_fr.json")
+
+
+ # Push the model to the Hub
+ push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+
+
+ train = True
+ progress_bar = tqdm(range(completed_steps, training_steps), disable=not accelerator.is_local_main_process)
+
+ while train:
+     accumulate_steps = 0
+     accumulate_loss = 0
+     accuracy = 0
+
+     for batch in trainloader:
+         src_input_ids = batch["src_input_ids"].to(accelerator.device)
+         src_pad_mask = batch["src_pad_mask"].to(accelerator.device)
+         tgt_input_ids = batch["tgt_input_ids"].to(accelerator.device)
+         tgt_pad_mask = batch["tgt_pad_mask"].to(accelerator.device)
+         tgt_labels = batch["tgt_labels"].to(accelerator.device)
+
+         model_output = model(
+             src_input_ids,
+             tgt_input_ids,
+             src_pad_mask,
+             tgt_pad_mask
+         )
+
+         model_output = model_output.flatten(0,1)
+         tgt_labels = tgt_labels.flatten()
+         loss = loss_fn(model_output, tgt_labels)
+
+         ### Scale Loss and Accumulate ###
+         loss = loss / gradient_accumulation_steps
+         accumulate_loss += loss
+
+         ### Compute Gradients ###
+         accelerator.backward(loss)
+
+         ### Compute Accuracy (ignoring -100 padding labels) ###
+         model_output = model_output.argmax(axis=-1)
+         mask = (tgt_labels != -100)
+         output = model_output[mask]
+         tgt_outputs = tgt_labels[mask]
+         acc = (output == tgt_outputs).sum() / len(output)
+         accuracy += acc / gradient_accumulation_steps
+
+         ### Iterate Accumulation ###
+         accumulate_steps += 1
+
+         if accumulate_steps % gradient_accumulation_steps == 0:
+
+             ### Clip and Update Model ###
+             accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
+             optimizer.step()
+             optimizer.zero_grad(set_to_none=True)
+             scheduler.step()
+
+             ### Log Results ###
+             if completed_steps % logging_interval == 0:
+                 accumulate_loss = accumulate_loss.detach()
+                 accuracy = accuracy.detach()
+
+                 if accelerator.num_processes > 1:
+                     accumulate_loss = torch.mean(accelerator.gather_for_metrics(accumulate_loss))
+                     accuracy = torch.mean(accelerator.gather_for_metrics(accuracy))
+
+                 log = {"train_loss": accumulate_loss,
+                        "training_acc": accuracy,
+                        "learning_rate": scheduler.get_last_lr()[0]}
+
+                 accelerator.log(log, step=completed_steps)
+                 logging_string = f"[{completed_steps}/{training_steps}] Training Loss: {accumulate_loss} | Training Acc: {accuracy}"
+                 if accelerator.is_main_process:
+                     progress_bar.write(logging_string)
+
+             if completed_steps % evaluation_steps == 0:
+                 model.eval()
+                 print("Evaluating!")
+
+                 test_losses = []
+                 test_accs = []
+
+                 for batch in tqdm(testloader, disable=not accelerator.is_main_process):
+                     src_input_ids = batch["src_input_ids"].to(accelerator.device)
+                     src_pad_mask = batch["src_pad_mask"].to(accelerator.device)
+                     tgt_input_ids = batch["tgt_input_ids"].to(accelerator.device)
+                     tgt_pad_mask = batch["tgt_pad_mask"].to(accelerator.device)
+                     tgt_labels = batch["tgt_labels"].to(accelerator.device)
+
+                     with torch.inference_mode():
+                         model_output = model(src_input_ids,
+                                              tgt_input_ids,
+                                              src_pad_mask,
+                                              tgt_pad_mask)
+
+                     ### Flatten for Loss ###
+                     model_output = model_output.flatten(0,1)
+                     tgt_labels = tgt_labels.flatten()
+
+                     ### Compute Loss ###
+                     loss = loss_fn(model_output, tgt_labels)
+
+                     ### Compute Accuracy (make sure to ignore -100 targets) ###
+                     model_output = model_output.argmax(axis=-1)
+                     mask = (tgt_labels != -100)
+                     model_output = model_output[mask]
+                     tgt_labels = tgt_labels[mask]
+                     accuracy = (model_output == tgt_labels).sum() / len(model_output)
+
+                     ### Store Results ###
+                     loss = loss.detach()
+                     accuracy = accuracy.detach()
+
+                     if accelerator.num_processes > 1:
+                         loss = torch.mean(accelerator.gather_for_metrics(loss))
+                         accuracy = torch.mean(accelerator.gather_for_metrics(accuracy))
+
+                     ### Store Metrics ###
+                     test_losses.append(loss.item())
+                     test_accs.append(accuracy.item())
+
+                 test_loss = np.mean(test_losses)
+                 test_acc = np.mean(test_accs)
+
+                 log = {"test_loss": test_loss,
+                        "test_acc": test_acc}
+
+                 logging_string = f"Testing Loss: {test_loss} | Testing Acc: {test_acc}"
+                 if accelerator.is_main_process:
+                     progress_bar.write(logging_string)
+
+                 ### Log and Save Model ###
+                 accelerator.log(log, step=completed_steps)
+                 accelerator.save_state(os.path.join(path_to_experiment, f"checkpoint_{completed_steps}"))
+
+                 push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+
+                 ### Testing Sentence ###
+                 if accelerator.is_main_process:
+                     src_ids = src_ids.to(accelerator.device)
+                     unwrapped = accelerator.unwrap_model(model)
+                     translated = unwrapped.inference(src_ids,
+                                                      tgt_start_id=tgt_tokenizer.bos_token_id,
+                                                      tgt_end_id=tgt_tokenizer.eos_token_id,
+                                                      max_seq_length=config.max_seq_length)
+
+                     translated = tgt_tokenizer.decode(translated, skip_special_tokens=False)
+                     progress_bar.write(f"Translation: {translated}")
+
+                 model.train()
+
+             if completed_steps >= training_steps:
+                 train = False
+                 accelerator.save_state(os.path.join(path_to_experiment, "final_checkpoint"))
+                 push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+                 break
+
+             ### Iterate Completed Steps ###
+             completed_steps += 1
+             progress_bar.update(1)
+
+             ### Reset Accumulated Variables ###
+             accumulate_loss = 0
+             accuracy = 0
+
+
+ accelerator.end_training()
+
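A note on the batching arithmetic in the configuration above: each `DataLoader` batch carries `batch_size // gradient_accumulation_steps = 32` sequences, and the optimizer steps once every `gradient_accumulation_steps = 2` batches, so the per-process effective batch per optimizer step is the configured `batch_size`:

$$B_{\text{eff}} = \frac{64}{2} \times 2 = 64,$$

multiplied further by the number of processes when launched with `accelerate` on several GPUs.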
utils.py ADDED
@@ -0,0 +1,20 @@
+ from huggingface_hub import list_repo_files, hf_hub_download
+
+
+ def get_files_from_HF(repo_id, folder_name, local_dir):
+     files = list_repo_files(repo_id)
+
+     folder_files = [f for f in files if f.startswith(folder_name)]
+
+     for file in folder_files:
+         file_path = hf_hub_download(repo_id=repo_id, filename=file, local_dir=local_dir)
+         print(f"Downloaded: {file_path} to {local_dir}")
+
+
+ def get_file_FROM_HF(repo_id, file_path, local_dir):
+     file_path = hf_hub_download(repo_id=repo_id, filename=file_path, local_dir=local_dir)
+     return file_path
+
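As an example, this is how `tokenizer.py` and `model.py` call the download helper (same repo id and arguments as in those files):

```python
from utils import get_file_FROM_HF

# Fetch the English vocab file from the model repo into trained_tokenizers/
path = get_file_FROM_HF(
    repo_id="ngia/ml-translation-en-fr",
    file_path="vocab_en.json",
    local_dir="trained_tokenizers/",
)
print(path)  # local path of the downloaded file
```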