ngia committed on
Commit d91ea77 · 1 Parent(s): 0c55d1c

Deploy on Hugging Face Spaces for inference

Files changed (13)
  1. .gitignore +20 -0
  2. README.md +73 -13
  3. app.py +44 -0
  4. data_collector.py +61 -0
  5. inference.py +43 -0
  6. model.py +318 -0
  7. process_raw_data.py +64 -0
  8. pyproject.toml +20 -0
  9. requirements.txt +8 -0
  10. tokenize_dataset.py +57 -0
  11. tokenizer.py +133 -0
  12. train.py +353 -0
  13. utils.py +20 -0
.gitignore ADDED
@@ -0,0 +1,20 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+ .cache
+
+ # Virtual environments
+ .venv
+ .env
+
+ # Folders
+ trained_tokenizers/
+ checkpoints/
+ work_dir/
+ data/
+ pyproject_copy.toml
+
README.md CHANGED
@@ -1,13 +1,73 @@
- ---
- title: Translation En Fr
- emoji: 🔥
- colorFrom: yellow
- colorTo: green
- sdk: gradio
- sdk_version: 5.21.0
- app_file: app.py
- pinned: false
- short_description: Translation language model from English to French
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Training Transformers from Scratch for Language Translation (English to French)
+
+ ## Overview
+ This project focuses on training a Transformer model from scratch to perform English-to-French translation. It follows a structured approach, from data collection to model deployment using Gradio.
+
+ ![Transformer Architecture](https://dassignies.law/wp-content/uploads/2024/04/DASSIGNIES-avocat-intelligence-artificielle-cybersecurite-strategie-protection-actifs-immateriels-formations-expertises-blog-transformer-architecture.webp)
+
+ ## Project Steps
+
+ 1. **Data Collection**
+    - Gather parallel English-French text data for training.
+
+ 2. **Dataset Creation and Upload to Hugging Face**
+    - Preprocess and structure the dataset.
+    - Upload the dataset to the Hugging Face Hub for easy access.
+
+ 3. **Training Tokenizers**
+    - Train separate tokenizers for English and French.
+    - Save and store the trained tokenizers.
+
+ 4. **Creating a Tokenized Dataset**
+    - Tokenize the dataset using the trained tokenizers.
+    - Publish the tokenized dataset on Hugging Face.
+
+ 5. **Building the Transformer Model from Scratch**
+    - Implement custom Transformer components, including:
+      - Encoder
+      - Decoder
+      - Embedding Layer
+      - Positional Encoding
+
+ 6. **Model Training and Evaluation**
+    - Train the model using the prepared dataset.
+    - Use Weights & Biases (wandb) for real-time metric visualization.
+
+ 7. **Inference**
+    - Test the trained model with sample English inputs.
+    - Generate translated French text (a minimal sketch follows this list).
+
+ 8. **Web Interface with Gradio**
+    - Develop an interactive UI using Gradio for easy model inference.
+
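A minimal sketch of the resulting inference pipeline (the same calls `app.py` and `inference.py` make; the paths assume the tokenizers and checkpoint are available locally or can be fetched from the Hub):

```python
import torch
from tokenizer import CustomTokenizer
from model import Transformer, TransformerConfig

src_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json")
tgt_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_fr.json")

model = Transformer(config=TransformerConfig(max_seq_length=512))
model.load_weights_from_checkpoints("checkpoints/model.safetensors")
model.eval()

# Encode English text, generate French ids greedily, then decode them
src_ids = torch.tensor(src_tokenizer.encode("How are you?")).unsqueeze(0)
output_ids = model.inference(src_ids=src_ids,
                             tgt_start_id=tgt_tokenizer.bos_token_id,
                             tgt_end_id=tgt_tokenizer.eos_token_id,
                             max_seq_length=512)
print(tgt_tokenizer.decode(output_ids, skip_special_tokens=True))
```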
+ ## Installation
+
+ To use the application, install the required dependencies using either `uv` or `pip`:
+
+ Using `uv`:
+ ```bash
+ uv pip install -r requirements.txt
+ ```
+
+ Using `pip`:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## Running the Application
+
+ To launch the application, run:
+ ```bash
+ python app.py
+ ```
+
+ This will start a Gradio interface where users can input English text and receive French translations.
+
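For programmatic access while the app is running, a sketch using `gradio_client` is shown below. The local URL and the `/translate` endpoint name are assumptions based on Gradio's defaults for `demo.launch()` and the `translate` function in `app.py`:

```python
from gradio_client import Client  # pip install gradio_client

client = Client("http://127.0.0.1:7860")  # default local address used by demo.launch()
result = client.predict("How are you?", api_name="/translate")
print(result)  # the French translation returned by the app
```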
+ ## Repository Structure
+ - **data_collector.py** - Dataset wrapper that pads, masks, and shifts token sequences for training.
+ - **tokenize_dataset.py** - Prepares and tokenizes the dataset.
+ - **model.py** - Contains the Transformer model implementation.
+ - **train.py** - Training script.
+ - **inference.py** - Inference script for model predictions.
+ - **app.py** - Web interface with Gradio.
+ - **requirements.txt** - List of dependencies.
app.py ADDED
@@ -0,0 +1,44 @@
+ import torch
+ from tokenizer import CustomTokenizer
+ from model import Transformer, TransformerConfig
+ import gradio as gr
+
+
+ # Load tokenizers
+ path_to_src_tokenizer = "trained_tokenizers/vocab_en.json"
+ path_to_tgt_tokenizer = "trained_tokenizers/vocab_fr.json"
+
+ src_tokenizer = CustomTokenizer(path_to_vocab=path_to_src_tokenizer)
+ tgt_tokenizer = CustomTokenizer(path_to_vocab=path_to_tgt_tokenizer)
+
+
+ # Load model
+ config = TransformerConfig(max_seq_length=512)
+ model = Transformer(config=config)
+
+ path_to_checkpoints = "checkpoints/model.safetensors"
+ model.load_weights_from_checkpoints(path_to_checkpoints=path_to_checkpoints)
+ model.eval()
+
+
+ def translate(input_text, skip_special_tokens=True):
+     src_ids = torch.tensor(src_tokenizer.encode(input_text)).unsqueeze(0)
+     output_ids = model.inference(src_ids=src_ids,
+                                  tgt_start_id=tgt_tokenizer.bos_token_id,
+                                  tgt_end_id=tgt_tokenizer.eos_token_id,
+                                  max_seq_length=512)
+     output_tokens = tgt_tokenizer.decode(input=output_ids, skip_special_tokens=skip_special_tokens)
+     return output_tokens
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## Traduction Anglais → Français")
+
+     with gr.Row():
+         texte_input = gr.Textbox(label="Texte en anglais", lines=4)
+         texte_output = gr.Textbox(label="Texte traduit (Français)", lines=4, interactive=False)
+
+     bouton = gr.Button("Traduire")
+     bouton.click(translate, inputs=texte_input, outputs=texte_output)
+
+ demo.launch()
data_collector.py ADDED
@@ -0,0 +1,61 @@
+ import torch
+ from torch.utils.data import Dataset
+
+
+ class DataCollector(Dataset):
+     def __init__(self, dataset, english_tokenizer, french_tokenizer, max_length=512):
+         self.dataset = dataset
+         self.english_tokenizer = english_tokenizer
+         self.french_tokenizer = french_tokenizer
+         self.max_length = max_length
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, index):
+         english_input_ids = torch.tensor(self.dataset[index]['src_ids'])
+         french_input_ids = torch.tensor(self.dataset[index]['tgt_ids'])
+
+         # Pad manually with torch.cat (or torch.nn.functional.pad)
+         src_pad_token = self.english_tokenizer.pad_token_id
+         tgt_pad_token = self.french_tokenizer.pad_token_id
+
+         # English side: pad or truncate to max_length
+         if len(english_input_ids) < self.max_length:
+             pad_length = self.max_length - len(english_input_ids)
+             english_input_ids = torch.cat([english_input_ids, torch.full((pad_length,), src_pad_token, dtype=english_input_ids.dtype)])
+         else:
+             english_input_ids = english_input_ids[:self.max_length]
+
+         # French side: pad or truncate to max_length
+         if len(french_input_ids) < self.max_length:
+             pad_length = self.max_length - len(french_input_ids)
+             french_input_ids = torch.cat([french_input_ids, torch.full((pad_length,), tgt_pad_token, dtype=french_input_ids.dtype)])
+         else:
+             french_input_ids = french_input_ids[:self.max_length]
+
+         # Build the padding masks
+         src_pad_mask = (english_input_ids != src_pad_token)
+         tgt_pad_mask = (french_input_ids != tgt_pad_token)
+
+         # For translation (or LM) tasks, shift the target by one position
+         input_tgt = french_input_ids[:-1].clone()
+         label_tgt = french_input_ids[1:].clone()
+         input_tgt_mask = (input_tgt != tgt_pad_token)
+         label_tgt[label_tgt == tgt_pad_token] = -100
+
+         return {
+             "src_input_ids": english_input_ids,  # fixed size: (self.max_length,)
+             "src_pad_mask": src_pad_mask,
+             "tgt_input_ids": french_input_ids,   # fixed size: (self.max_length,)
+             "tgt_pad_mask": torch.cat([input_tgt_mask, torch.full((1,), 0, dtype=french_input_ids.dtype)]),
+             "tgt_labels": torch.cat([label_tgt, torch.full((1,), -100, dtype=french_input_ids.dtype)])
+         }
+
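A quick, hypothetical way to sanity-check the collator's output shapes, using a stub tokenizer and a hand-made sample (not part of the repo):

```python
import torch
from data_collector import DataCollector

class StubTokenizer:
    # DataCollector only reads pad_token_id from its tokenizers
    pad_token_id = 0

samples = [{"src_ids": [5, 6, 7], "tgt_ids": [5, 8, 9, 2]}]
collector = DataCollector(samples, StubTokenizer(), StubTokenizer(), max_length=8)

for key, value in collector[0].items():
    print(key, tuple(value.shape))
# Every tensor comes out with shape (8,): inputs are padded to max_length,
# and tgt_labels is the target shifted left by one with -100 on padding positions.
```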
inference.py ADDED
@@ -0,0 +1,43 @@
+ import torch
+ from model import Transformer, TransformerConfig
+ from tokenizer import CustomTokenizer
+ from datasets import load_dataset
+
+ path_to_model_safetensors = "checkpoints/model.safetensors"
+ path_to_src_tokenizer = "trained_tokenizers/vocab_en.json"
+ path_to_tgt_tokenizer = "trained_tokenizers/vocab_fr.json"
+
+ config = TransformerConfig(device='cpu', max_seq_length=512)
+ model = Transformer(config=config)
+
+ # Load the weights
+ model.load_weights_from_checkpoints(path_to_model_safetensors)
+ model.eval()
+
+ src_tokenizer = CustomTokenizer(path_to_vocab=path_to_src_tokenizer)
+ tgt_tokenizer = CustomTokenizer(path_to_vocab=path_to_tgt_tokenizer)
+
+
+ english_text = "I'm very sick and I want to see a doctor."
+
+ src_ids = torch.tensor(src_tokenizer.encode(english_text)).unsqueeze(0)
+
+ # Generation starts from [BOS] and stops at [EOS]
+ translated_ids = model.inference(src_ids=src_ids, tgt_start_id=tgt_tokenizer.bos_token_id, tgt_end_id=tgt_tokenizer.eos_token_id, max_seq_length=512)
+ translated_tokens = tgt_tokenizer.decode(translated_ids, skip_special_tokens=True)
+ print(f"English: {english_text} \nFrench: {translated_tokens}")
+
+
+ dataset = load_dataset("bilalfaye/english-wolof-french-translation", split="train")
+ samples = dataset.shuffle().select(range(50))
+
+ for i in range(50):
+     sample = samples[i]
+     src_ids = torch.tensor(src_tokenizer.encode(sample["en"])).unsqueeze(0)
+     output_ids = model.inference(src_ids=src_ids, tgt_start_id=tgt_tokenizer.bos_token_id, tgt_end_id=tgt_tokenizer.eos_token_id, max_seq_length=512)
+     predicted_tokens = tgt_tokenizer.decode(output_ids, skip_special_tokens=True)
+     print(f"English: {sample['en']}")
+     print(f"French (labels): {sample['fr']}")
+     print(f"French (predicted): {predicted_tokens}")
+     print("--------------------------------\n\n")
model.py ADDED
@@ -0,0 +1,318 @@
+ from dataclasses import dataclass
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ import os
+ from utils import get_file_FROM_HF
+ from safetensors.torch import load_file
+
+
+ @dataclass
+ class TransformerConfig:
+     src_vocab_size: int = 32000
+     tgt_vocab_size: int = 32000
+     max_seq_length: int = 64
+     d_model: int = 512
+     num_heads: int = 8
+     num_encoder_layers: int = 6
+     num_decoder_layers: int = 6
+     dropout_p: float = 0.1
+     dff: int = 2048
+     device: str = 'cpu'
+
+
+ # Source embedding block
+ class SourceEmbedding(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.src_embedding = nn.Embedding(num_embeddings=config.src_vocab_size, embedding_dim=config.d_model)
+
+     def forward(self, x):
+         return self.src_embedding(x)
+
+
+ # Target embedding block
+ class TargetEmbedding(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.tgt_embedding = nn.Embedding(num_embeddings=config.tgt_vocab_size, embedding_dim=config.d_model)
+
+     def forward(self, x):
+         return self.tgt_embedding(x)
+
+
+ # Positional encoding (PE)
+ class PositionEncoding(nn.Module):
+     def __init__(self, config: TransformerConfig, require_grad=False):
+         super().__init__()
+         self.PE = torch.zeros(config.max_seq_length, config.d_model)
+         pos = torch.arange(0, config.max_seq_length).reshape(-1, 1)
+         i = torch.arange(0, config.d_model, step=2)
+
+         denominator = torch.pow(10000, (2*i) / config.d_model)
+         self.PE[:, 0::2] = torch.sin(pos/denominator)
+         self.PE[:, 1::2] = torch.cos(pos/denominator)
+
+         self.PE = nn.Parameter(self.PE, requires_grad=require_grad)
+
+     def forward(self, x):
+         max_seq_length = x.shape[1]
+         return x + self.PE[:max_seq_length]
+
+
+ # Multi-head attention block (covers self-attention, masked self-attention and cross-attention)
+ class MultiheadAttention(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.config = config
+
+         # d_model must be divisible by num_heads to get the head dim
+         assert config.d_model % self.config.num_heads == 0, "d_model is not divisible by num_heads"
+         self.head_dim = self.config.d_model // self.config.num_heads
+
+         self.q_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+         self.k_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+         self.v_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+
+         self.out_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+
+     def forward(self, src, tgt=None, attention_mask=None, causal=False):
+         batch, src_seq_length, d_model = src.shape
+
+         # SELF-ATTENTION (optionally masked / causal)
+         if tgt is None:
+             q = self.q_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+             k = self.k_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+             v = self.v_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+
+             if attention_mask is not None:
+                 attention_mask = attention_mask.bool()
+                 attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1,1,src_seq_length,1).to(self.config.device)
+
+             # MASKED MULTI-HEAD ATTENTION: build the causal mask whenever requested,
+             # even when no padding mask is given (e.g. during autoregressive inference)
+             if causal:
+                 causal_mask = ~torch.triu(torch.ones((src_seq_length, src_seq_length), dtype=torch.bool), diagonal=1)
+                 causal_mask = causal_mask.unsqueeze(0).unsqueeze(0).to(self.config.device)
+
+                 if attention_mask is not None:
+                     # combine the padding mask with the causal mask
+                     combined_mask = causal_mask.int() * attention_mask.int()
+                     attention_mask = combined_mask.bool().to(self.config.device)
+                 else:
+                     attention_mask = causal_mask
+
+             attention_out = F.scaled_dot_product_attention(q, k, v,
+                                                            attn_mask=attention_mask,
+                                                            dropout_p=self.config.dropout_p if self.training else 0.0,
+                                                            is_causal=False)
+
+         # CROSS-ATTENTION (queries come from tgt, keys/values from src)
+         else:
+             tgt_seq_length = tgt.shape[1]
+             q = self.q_proj(tgt).reshape(batch, tgt_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+             k = self.k_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+             v = self.v_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1,2).contiguous()
+
+             if attention_mask is not None:
+                 attention_mask = attention_mask.bool()
+                 attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1,1,tgt_seq_length,1)
+
+             attention_out = F.scaled_dot_product_attention(q, k, v,
+                                                            attn_mask=attention_mask,
+                                                            dropout_p=self.config.dropout_p if self.training else 0.0,
+                                                            is_causal=False)
+
+         attention_out = attention_out.transpose(1,2).flatten(2)
+         attention_out = self.out_proj(attention_out)
+         return attention_out
+
+
+ # Position-wise feed-forward network (MLP)
+ class FeedForward(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.hidden_layer = nn.Linear(in_features=config.d_model, out_features=config.dff)    # e.g. 512 -> 2048
+         self.hidden_dropout = nn.Dropout(p=config.dropout_p)
+         self.output_layer = nn.Linear(in_features=config.dff, out_features=config.d_model)    # e.g. 2048 -> 512
+         self.output_dropout = nn.Dropout(p=config.dropout_p)
+
+     def forward(self, x):
+         x = self.hidden_layer(x)
+         x = F.gelu(x)
+         x = self.hidden_dropout(x)
+         x = self.output_layer(x)
+         x = self.output_dropout(x)
+         return x
+
+
+ # Encoder block
+ class EncoderBlock(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.multi_head_attention = MultiheadAttention(config=config)
+         self.feed_forward = FeedForward(config=config)
+         self.layer_norm_1 = nn.LayerNorm(config.d_model)
+         self.layer_norm_2 = nn.LayerNorm(config.d_model)
+         self.dropout = nn.Dropout(config.dropout_p)
+
+     def forward(self, x, attention_mask=None):
+         x = x + self.dropout(self.multi_head_attention(src=x, attention_mask=attention_mask))
+         x = self.layer_norm_1(x)
+
+         x = x + self.feed_forward(x)
+         x = self.layer_norm_2(x)
+         return x
+
+
+ # Decoder block
+ class DecoderBlock(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.masked_multi_head_attention = MultiheadAttention(config=config)
+         self.dropout_masked = nn.Dropout(config.dropout_p)
+
+         self.cross_multi_head_attention = MultiheadAttention(config=config)
+         self.dropout_cross = nn.Dropout(config.dropout_p)
+
+         self.feed_forward = FeedForward(config=config)
+
+         self.layer_norm_1 = nn.LayerNorm(config.d_model)
+         self.layer_norm_2 = nn.LayerNorm(config.d_model)
+         # layer_norm_3 is declared but never applied in forward(); it is kept (unused)
+         # so the state dict stays compatible with the released checkpoint
+         self.layer_norm_3 = nn.LayerNorm(config.d_model)
+
+     def forward(self, src, tgt, src_attention_mask=None, tgt_attention_mask=None):
+         tgt = tgt + self.dropout_masked(self.masked_multi_head_attention(tgt, attention_mask=tgt_attention_mask, causal=True))
+         tgt = self.layer_norm_1(tgt)
+
+         tgt = tgt + self.dropout_cross(self.cross_multi_head_attention(src, tgt, attention_mask=src_attention_mask))
+         tgt = self.layer_norm_2(tgt)
+
+         tgt = tgt + self.feed_forward(tgt)
+         return tgt
+
+
+ # Transformer (putting it all together)
+ class Transformer(nn.Module):
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+
+         self.src_embedding = SourceEmbedding(config=config)
+         self.tgt_embedding = TargetEmbedding(config=config)
+
+         self.position_encoding = PositionEncoding(config=config)
+
+         self.encoder = nn.ModuleList(
+             [EncoderBlock(config=config) for _ in range(config.num_encoder_layers)]
+         )
+
+         self.decoder = nn.ModuleList(
+             [DecoderBlock(config=config) for _ in range(config.num_decoder_layers)]
+         )
+
+         self.output = nn.Linear(config.d_model, config.tgt_vocab_size)
+
+         ## Init weights
+         self.apply(_init_weights_)
+
+     def forward(self, src_ids, tgt_ids, src_attention_mask=None, tgt_attention_mask=None):
+         # Embed the token ids
+         src_embed = self.src_embedding(src_ids)
+         tgt_embed = self.tgt_embedding(tgt_ids)
+
+         # Add the positional encoding
+         src_embed = self.position_encoding(src_embed)
+         tgt_embed = self.position_encoding(tgt_embed)
+
+         for layer in self.encoder:
+             src_embed = layer(src_embed, src_attention_mask)
+
+         for layer in self.decoder:
+             tgt_embed = layer(src_embed, tgt_embed, src_attention_mask, tgt_attention_mask)
+
+         pred = self.output(tgt_embed)
+
+         return pred
+
+     @torch.no_grad()
+     def inference(self, src_ids, tgt_start_id, tgt_end_id, max_seq_length):
+         tgt_ids = torch.tensor([tgt_start_id], device=src_ids.device).reshape(1,1)
+
+         # Encode the source once
+         src_embed = self.src_embedding(src_ids)
+         src_embed = self.position_encoding(src_embed)
+         for layer in self.encoder:
+             src_embed = layer(src_embed)
+
+         # Generate the target greedily, token by token
+         for i in range(max_seq_length):
+             tgt_embed = self.tgt_embedding(tgt_ids)
+             tgt_embed = self.position_encoding(tgt_embed)
+             for layer in self.decoder:
+                 tgt_embed = layer(src_embed, tgt_embed)
+
+             tgt_embed = tgt_embed[:, -1]
+
+             pred = self.output(tgt_embed)
+             pred = pred.argmax(dim=-1).unsqueeze(0)
+             tgt_ids = torch.cat([tgt_ids, pred], dim=-1)
+
+             if torch.all(pred == tgt_end_id):
+                 break
+
+         return tgt_ids.squeeze().cpu().tolist()
+
+     def load_weights_from_checkpoints(self, path_to_checkpoints):
+         if not os.path.exists(path_to_checkpoints):
+             print("------------------- LOADING MODEL CHECKPOINTS FROM HUGGING FACE --------------------------")
+             folder = os.path.dirname(path_to_checkpoints)
+             os.makedirs(folder, exist_ok=True)
+             path_to_checkpoints = get_file_FROM_HF(repo_id="ngia/ml-translation-en-fr", file_path="final_checkpoint/model.safetensors", local_dir=folder)
+
+         checkpoints = load_file(filename=path_to_checkpoints)
+         self.load_state_dict(checkpoints)
+         return self
+
+
+ def _init_weights_(module):
+     """
+     Simple weight initialization taken directly from the Hugging Face
+     `modeling_roberta.py` implementation!
+     """
+     if isinstance(module, nn.Linear):
+         module.weight.data.normal_(mean=0.0, std=0.02)
+         if module.bias is not None:
+             module.bias.data.zero_()
+     elif isinstance(module, nn.Embedding):
+         module.weight.data.normal_(mean=0.0, std=0.02)
+         if module.padding_idx is not None:
+             module.weight.data[module.padding_idx].zero_()
+     elif isinstance(module, nn.LayerNorm):
+         module.bias.data.zero_()
+         module.weight.data.fill_(1.0)
+
+
+ if __name__ == "__main__":
+     config = TransformerConfig()
+     model = Transformer(config=config)
+
+     english = torch.randint(low=0, high=1000, size=(1,3))
+     res = model.inference(src_ids=english, tgt_start_id=1, tgt_end_id=2, max_seq_length=config.max_seq_length)
+     print(res)
+
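For reference, the positional encoding built by `PositionEncoding` and the attention computed through `F.scaled_dot_product_attention` correspond to the following, with $i$ ranging over the even embedding dimensions exactly as in the code above:

$$PE_{(pos,\,i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad PE_{(pos,\,i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)$$

$$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V, \qquad d_k = \frac{d_{\text{model}}}{\text{num\_heads}} = \frac{512}{8} = 64$$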
process_raw_data.py ADDED
@@ -0,0 +1,64 @@
+ from datasets import load_dataset, concatenate_datasets, load_from_disk
+ import os
+
+
+ def create_dataset(root_data_path, save_data_path, cache_data_path, test_size=0.01):
+     list_datasets = []
+
+     for directory in os.listdir(root_data_path):
+         path_to_dir = os.path.join(root_data_path, directory)
+
+         if os.path.isdir(path_to_dir):
+             print(f"Processing: {path_to_dir}")
+
+             english_text = None
+             french_text = None
+
+             # Find the aligned .en/.fr file pair in this corpus directory
+             for file_dir in os.listdir(path_to_dir):
+                 if file_dir.endswith(".en"):
+                     english_text = os.path.join(path_to_dir, file_dir)
+                 if file_dir.endswith(".fr"):
+                     french_text = os.path.join(path_to_dir, file_dir)
+
+             if english_text is not None and french_text is not None:
+                 english_dataset = load_dataset("text", data_files=english_text, cache_dir=cache_data_path)["train"]
+                 french_dataset = load_dataset("text", data_files=french_text, cache_dir=cache_data_path)["train"]
+
+                 english_dataset = english_dataset.rename_column("text", "english_src")
+                 dataset = english_dataset.add_column("french_tgt", french_dataset["text"])
+
+                 list_datasets.append(dataset)
+
+     hf_dataset = concatenate_datasets(list_datasets)
+     hf_dataset = hf_dataset.train_test_split(test_size=test_size)
+
+     hf_dataset.save_to_disk(save_data_path)
+     print(f"Dataset successfully saved in: {save_data_path}")
+
+
+ def push_dataset_into_hf_hub(save_data_path):
+     dataset = load_from_disk(dataset_path=save_data_path)
+     dataset = dataset.shuffle()
+     dataset.push_to_hub(repo_id="ngia/translation-en-fr")
+     print("Successfully pushed to the Hugging Face Hub")
+
+
+ if __name__ == "__main__":
+     root_data_path = "data/raw_data/"
+     save_data_path = "data/saved_data/"
+     cache_data_path = "data/cached_data/"
+
+     create_dataset(root_data_path=root_data_path, save_data_path=save_data_path, cache_data_path=cache_data_path)
+     dataset = load_from_disk(dataset_path=save_data_path)
+     print(dataset["train"][10])
+
+     push_dataset_into_hf_hub(save_data_path=save_data_path)
+
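For illustration, `create_dataset` expects each sub-directory of `root_data_path` to hold one aligned `.en`/`.fr` file pair, one sentence per line. The snippet below builds a tiny hypothetical corpus in that layout:

```python
import os

root = "data/raw_data/demo"  # hypothetical corpus directory
os.makedirs(root, exist_ok=True)

with open(os.path.join(root, "demo.en"), "w") as f:
    f.write("Hello.\nHow are you?\n")
with open(os.path.join(root, "demo.fr"), "w") as f:
    f.write("Bonjour.\nComment allez-vous ?\n")

# create_dataset("data/raw_data/", "data/saved_data/", "data/cached_data/")
```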
pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [project]
+ name = "translator-en-fr"
+ version = "0.1.0"
+ description = "Transformer trained from scratch for English-to-French translation"
+ readme = "README.md"
+ requires-python = ">=3.12.0"
+ dependencies = [
+     "accelerate>=1.4.0",
+     "datasets>=3.3.2",
+     "gradio>=5.21.0",
+     "huggingface-hub>=0.29.1",
+     "matplotlib>=3.10.1",
+     "sentencepiece>=0.2.0",
+     "streamlit>=1.43.2",
+     "torch>=2.6.0",
+     "torchvision>=0.21.0",
+     "transformers>=4.49.0",
+     "wandb>=0.19.8",
+ ]
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ accelerate
+ datasets
+ huggingface-hub
+ sentencepiece
+ transformers
+ wandb
+ matplotlib
+ gradio
tokenize_dataset.py ADDED
@@ -0,0 +1,57 @@
+ from tokenizer import CustomTokenizer
+ from datasets import load_from_disk
+
+
+ def tokenize_dataset(path_to_dataset,
+                      path_to_save,
+                      num_workers=24,
+                      truncate=False,
+                      max_length=512,
+                      min_length=3):
+
+     english_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json", truncate=truncate, max_length=max_length)
+     french_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_fr.json", truncate=truncate, max_length=max_length)
+
+     dataset = load_from_disk(path_to_dataset)
+
+     def _tokenize_text(examples):
+         english_text = examples["english_src"]
+         french_text = examples["french_tgt"]
+
+         src_ids = english_tokenizer.encode(english_text)
+         tgt_ids = french_tokenizer.encode(french_text)
+
+         batch = {
+             "src_ids": src_ids,
+             "tgt_ids": tgt_ids
+         }
+         return batch
+
+     tokenized_dataset = dataset.map(_tokenize_text, batched=True, num_proc=num_workers)
+     tokenized_dataset = tokenized_dataset.remove_columns(["english_src", "french_tgt"])
+
+     # Drop pairs whose target is shorter than min_length tokens
+     filter_func = lambda batch: [len(e) >= min_length for e in batch["tgt_ids"]]
+     tokenized_dataset = tokenized_dataset.filter(filter_func, batched=True)
+
+     print(tokenized_dataset)
+
+     tokenized_dataset.save_to_disk(path_to_save)
+     print("Tokenized dataset successfully saved to disk")
+
+
+ if __name__ == "__main__":
+     path_to_dataset = "data/saved_data"
+     path_to_save = "data/tokenized_dataset"
+     tokenize_dataset(path_to_dataset=path_to_dataset, path_to_save=path_to_save)
+
+     # Push the dataset to the Hub
+     tokenized_dataset = load_from_disk(dataset_path=path_to_save)
+     tokenized_dataset.push_to_hub("ngia/tokenized-translation-en-fr")
+     print("Tokenized dataset successfully pushed to the Hugging Face Hub")
+
tokenizer.py ADDED
@@ -0,0 +1,133 @@
+ from tokenizers import Tokenizer, normalizers, decoders
+ from tokenizers.models import WordPiece
+ from tokenizers.trainers import WordPieceTrainer
+ from tokenizers.normalizers import NFC, Lowercase
+ from tokenizers.pre_tokenizers import Whitespace
+ from tokenizers.processors import TemplateProcessing
+ from utils import get_file_FROM_HF
+
+ import glob
+ import os
+
+
+ def train_tokenizer(path_to_data, lang):
+     special_token_dict = {
+         "pad_token": "[PAD]",
+         "start_token": "[BOS]",
+         "end_token": "[EOS]",
+         "unknown_token": "[UNK]"
+     }
+
+     tokenizer = Tokenizer(WordPiece(unk_token=special_token_dict["unknown_token"]))
+     tokenizer.normalizer = normalizers.Sequence([NFC(), Lowercase()])
+     tokenizer.pre_tokenizer = Whitespace()
+
+     files = []
+
+     if lang == "fr":
+         print("---------Training French Tokenizer--------------")
+         files = glob.glob(os.path.join(path_to_data, "**/*.fr"))
+     elif lang == "en":
+         print("---------Training English Tokenizer--------------")
+         files = glob.glob(os.path.join(path_to_data, "**/*.en"))
+
+     trainer = WordPieceTrainer(vocab_size=32000, special_tokens=list(special_token_dict.values()))
+     tokenizer.train(files, trainer)
+     tokenizer.save(f"trained_tokenizers/vocab_{lang}.json")
+     print(f"Tokenizer successfully saved to trained_tokenizers/vocab_{lang}.json")
+
+
+ class CustomTokenizer:
+
+     def __init__(self, path_to_vocab, truncate=False, max_length=512):
+         self.path_to_vocab = path_to_vocab
+         self.truncate = truncate
+         self.max_length = max_length
+         self.tokenizer = self.config_tokenizer()
+         self.vocab_size = self.tokenizer.get_vocab_size()
+
+         self.pad_token = "[PAD]"
+         self.pad_token_id = self.tokenizer.token_to_id("[PAD]")
+
+         self.bos_token = "[BOS]"
+         self.bos_token_id = self.tokenizer.token_to_id("[BOS]")
+
+         self.eos_token = "[EOS]"
+         self.eos_token_id = self.tokenizer.token_to_id("[EOS]")
+
+         self.unk_token = "[UNK]"
+         self.unk_token_id = self.tokenizer.token_to_id("[UNK]")
+
+         self.post_processor = TemplateProcessing(
+             single="[BOS] $A [EOS]",
+             special_tokens=[
+                 (self.bos_token, self.bos_token_id),
+                 (self.eos_token, self.eos_token_id)
+             ]
+         )
+
+         if self.truncate:
+             # Leave room for the [BOS]/[EOS] tokens added by the post-processor
+             self.max_length = max_length - self.post_processor.num_special_tokens_to_add(is_pair=False)
+
+     def config_tokenizer(self):
+         # Fall back to downloading the vocab from the Hub if it is missing locally
+         if not os.path.exists(self.path_to_vocab):
+             self.path_to_vocab = self.load_file_from_hugging_face()
+         tokenizer = Tokenizer.from_file(self.path_to_vocab)
+         tokenizer.decoder = decoders.WordPiece()
+         return tokenizer
+
+     def encode(self, input):
+         def _parse_process_tokenized(tokenized):
+             if self.truncate:
+                 tokenized.truncate(self.max_length, direction="right")
+             tokenized = self.post_processor.process(tokenized)
+             return tokenized.ids
+
+         if isinstance(input, str):
+             tokenized = self.tokenizer.encode(input)
+             tokenized = _parse_process_tokenized(tokenized)
+
+         if isinstance(input, (list, tuple)):
+             tokenized = self.tokenizer.encode_batch(input)
+             tokenized = [_parse_process_tokenized(t) for t in tokenized]
+
+         return tokenized
+
+     def decode(self, input, skip_special_tokens=True):
+         if isinstance(input, list):
+             if all(isinstance(item, list) for item in input):
+                 decoded = self.tokenizer.decode_batch(input, skip_special_tokens=skip_special_tokens)
+             elif all(isinstance(item, int) for item in input):
+                 decoded = self.tokenizer.decode(input, skip_special_tokens=skip_special_tokens)
+
+         return decoded
+
+     def load_file_from_hugging_face(self):
+         filename = os.path.basename(self.path_to_vocab)
+         if filename == "vocab_en.json":
+             print("------------------- LOADING SOURCE TOKENIZER FROM HUGGING FACE --------------------------")
+         elif filename == "vocab_fr.json":
+             print("------------------- LOADING TARGET TOKENIZER FROM HUGGING FACE --------------------------")
+
+         os.makedirs("trained_tokenizers/", exist_ok=True)
+         path_to_tokenizer = get_file_FROM_HF(repo_id="ngia/ml-translation-en-fr", file_path=filename, local_dir="trained_tokenizers/")
+         return path_to_tokenizer
+
+
+ if __name__ == "__main__":
+     path_to_data_root = "/home/ngam/Documents/translator-en-fr/data/raw_data"
+     # Replace False with True to train new tokenizers
+     if False:
+         train_tokenizer(path_to_data_root, lang='fr')
+         train_tokenizer(path_to_data_root, lang='en')
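A minimal round trip with `CustomTokenizer` (assuming `trained_tokenizers/vocab_en.json` exists locally; otherwise the class downloads it from the Hub as shown above):

```python
from tokenizer import CustomTokenizer

tok = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json")

ids = tok.encode("How are you?")  # the post-processor wraps the ids in [BOS] ... [EOS]
print(ids)
print(tok.decode(ids, skip_special_tokens=True))  # note: the normalizer lowercases text
```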
train.py ADDED
@@ -0,0 +1,353 @@
+ import os
+ import numpy as np
+ import torch
+
+ from model import Transformer, TransformerConfig
+ from data_collector import DataCollector
+ from torch.utils.data import DataLoader
+ from datasets import load_dataset
+ from transformers import get_scheduler
+ from tokenizer import CustomTokenizer
+ from tqdm import tqdm
+ from accelerate import Accelerator
+ from huggingface_hub import HfApi, create_repo
+ import shutil
+
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ # MODEL CONFIG
+ src_vocab_size: int = 32000
+ tgt_vocab_size: int = 32000
+ max_seq_length: int = 512
+ d_model: int = 512
+ num_heads: int = 8
+ num_encoder_layers: int = 6
+ num_decoder_layers: int = 6
+ dropout_p: float = 0.1
+ dff: int = 2048
+
+ config = TransformerConfig(
+     src_vocab_size=src_vocab_size,
+     tgt_vocab_size=tgt_vocab_size,
+     max_seq_length=max_seq_length,
+     d_model=d_model,
+     num_heads=num_heads,
+     num_encoder_layers=num_encoder_layers,
+     num_decoder_layers=num_decoder_layers,
+     dropout_p=dropout_p,
+     dff=dff
+ )
+
+
+ # TOKENIZER CONFIG
+ src_tokenizer_path = "trained_tokenizers/vocab_en.json"
+ tgt_tokenizer_path = "trained_tokenizers/vocab_fr.json"
+
+ src_tokenizer = CustomTokenizer(path_to_vocab=src_tokenizer_path, max_length=config.max_seq_length)
+ tgt_tokenizer = CustomTokenizer(path_to_vocab=tgt_tokenizer_path, max_length=config.max_seq_length)
+
+
+ # DATALOADER CONFIG
+ path_to_data = "data/tokenized_dataset"
+ batch_size = 64
+ gradient_accumulation_steps = 2
+ # num_workers = 4
+
+ # TRAINING CONFIG
+ learning_rate = 1e-4
+ training_steps = 170000
+ warmup_steps = 2000
+ scheduler_type = "cosine"
+ evaluation_steps = 5000
+ bias_norm_weight_decay = False
+ weight_decay = 0.001
+ betas = (0.9, 0.98)
+ adam_eps = 1e-6
+
+
+ # LOGGING CONFIG
+ working_directory = "work_dir"
+ experiment_name = "Seq2Seq_Neural_Machine_Translation"
+ logging_interval = 1
+
+ # Resume from checkpoint
+ resume_from_checkpoint = "checkpoint_170000"
+
+
+ # Prepare the Accelerator
+ path_to_experiment = os.path.join(working_directory, experiment_name)
+ accelerator = Accelerator(project_dir=path_to_experiment,
+                           log_with="wandb")
+
+ accelerator.init_trackers(experiment_name)
+
+ # Configure the model device
+ config.device = accelerator.device
+
+
+ # Prepare dataloaders
+ dataset = load_dataset("ngia/tokenized-translation-en-fr")
+
+ accelerator.print("Dataset:", dataset)
+ min_batch_size = batch_size // gradient_accumulation_steps
+ train_dataset = DataCollector(dataset=dataset["train"], english_tokenizer=src_tokenizer, french_tokenizer=tgt_tokenizer, max_length=config.max_seq_length)
+ test_dataset = DataCollector(dataset=dataset["test"], english_tokenizer=src_tokenizer, french_tokenizer=tgt_tokenizer, max_length=config.max_seq_length)
+
+ train_loader = DataLoader(dataset=train_dataset, batch_size=min_batch_size, shuffle=True)
+ test_loader = DataLoader(dataset=test_dataset, batch_size=min_batch_size, shuffle=False)
+
+
+ # Prepare the model
+ model = Transformer(config=config)
+ model_parameters = filter(lambda p: p.requires_grad, model.parameters())
+ params = sum([np.prod(p.size()) for p in model_parameters])
+ accelerator.print("Number of trainable parameters:", params)
+
+
+ # Prepare the optimizer
+ optimizer = torch.optim.AdamW(model.parameters(),
+                               lr=learning_rate,
+                               betas=betas,
+                               eps=adam_eps,
+                               weight_decay=weight_decay)
+
+
+ # Define the scheduler
+ scheduler = get_scheduler(
+     name=scheduler_type,
+     optimizer=optimizer,
+     num_warmup_steps=warmup_steps,
+     num_training_steps=training_steps
+ )
+
+ # Define the loss function (-100 labels are skipped: CrossEntropyLoss defaults to ignore_index=-100)
+ loss_fn = torch.nn.CrossEntropyLoss()
+
+
+ ### Define a Sample Sentence for Testing ###
+ src_ids = torch.tensor(src_tokenizer.encode("I want to learn how to train a machine translation model")).unsqueeze(0)
+
+
+ model, optimizer, trainloader, testloader, scheduler = accelerator.prepare(
+     model, optimizer, train_loader, test_loader, scheduler
+ )
+
+
+ accelerator.register_for_checkpointing(scheduler)
+
+ if resume_from_checkpoint is not None:
+     path_to_checkpoint = os.path.join(path_to_experiment, resume_from_checkpoint)
+
+     with accelerator.main_process_first():
+         accelerator.load_state(path_to_checkpoint)
+
+     completed_steps = int(resume_from_checkpoint.split("_")[-1])
+     accelerator.print(f"Resuming from Iteration: {completed_steps}")
+ else:
+     completed_steps = 0
+
+
+ def push_model_HF(repo_id, path_to_experiment, step):
+     """Push the experiment folder (model and tokenizers) to the Hugging Face Hub."""
+     api = HfApi()
+     create_repo(repo_id, exist_ok=True)
+
+     api.upload_folder(
+         folder_path=path_to_experiment,
+         repo_id=repo_id,
+         repo_type="model"  # or "dataset" if it is a dataset
+     )
+
+     print(f"Checkpoint {step} pushed to {repo_id}")
+
+
+ # Copy the tokenizers next to the checkpoints
+ shutil.copy2("trained_tokenizers/vocab_en.json", f"{path_to_experiment}/vocab_en.json")
+ shutil.copy2("trained_tokenizers/vocab_fr.json", f"{path_to_experiment}/vocab_fr.json")
+
+
+ # Push the model to the Hub
+ push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+
+
+ train = True
+ progress_bar = tqdm(range(completed_steps, training_steps), disable=not accelerator.is_local_main_process)
+
+ while train:
+     accumulate_steps = 0
+     accumulate_loss = 0
+     accuracy = 0
+
+     for batch in trainloader:
+         src_input_ids = batch["src_input_ids"].to(accelerator.device)
+         src_pad_mask = batch["src_pad_mask"].to(accelerator.device)
+         tgt_input_ids = batch["tgt_input_ids"].to(accelerator.device)
+         tgt_pad_mask = batch["tgt_pad_mask"].to(accelerator.device)
+         tgt_labels = batch["tgt_labels"].to(accelerator.device)
+
+         model_output = model(
+             src_input_ids,
+             tgt_input_ids,
+             src_pad_mask,
+             tgt_pad_mask
+         )
+
+         model_output = model_output.flatten(0,1)
+         tgt_labels = tgt_labels.flatten()
+         loss = loss_fn(model_output, tgt_labels)
+
+         ### Scale Loss and Accumulate ###
+         loss = loss / gradient_accumulation_steps
+         accumulate_loss += loss
+
+         ### Compute Gradients ###
+         accelerator.backward(loss)
+
+         ### Compute Accuracy (ignoring -100 padding labels) ###
+         model_output = model_output.argmax(axis=-1)
+         mask = (tgt_labels != -100)
+         output = model_output[mask]
+         tgt_outputs = tgt_labels[mask]
+         acc = (output == tgt_outputs).sum() / len(output)
+         accuracy += acc / gradient_accumulation_steps
+
+         ### Iterate Accumulation ###
+         accumulate_steps += 1
+
+         if accumulate_steps % gradient_accumulation_steps == 0:
+
+             ### Clip and Update Model ###
+             accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
+             optimizer.step()
+             optimizer.zero_grad(set_to_none=True)
+             scheduler.step()
+
+             ### Log Results ###
+             if completed_steps % logging_interval == 0:
+                 accumulate_loss = accumulate_loss.detach()
+                 accuracy = accuracy.detach()
+
+                 if accelerator.num_processes > 1:
+                     accumulate_loss = torch.mean(accelerator.gather_for_metrics(accumulate_loss))
+                     accuracy = torch.mean(accelerator.gather_for_metrics(accuracy))
+
+                 log = {"train_loss": accumulate_loss,
+                        "training_acc": accuracy,
+                        "learning_rate": scheduler.get_last_lr()[0]}
+
+                 accelerator.log(log, step=completed_steps)
+                 logging_string = f"[{completed_steps}/{training_steps}] Training Loss: {accumulate_loss} | Training Acc: {accuracy}"
+                 if accelerator.is_main_process:
+                     progress_bar.write(logging_string)
+
+             if completed_steps % evaluation_steps == 0:
+                 model.eval()
+                 print("Evaluating!")
+
+                 test_losses = []
+                 test_accs = []
+
+                 for batch in tqdm(testloader, disable=not accelerator.is_main_process):
+                     src_input_ids = batch["src_input_ids"].to(accelerator.device)
+                     src_pad_mask = batch["src_pad_mask"].to(accelerator.device)
+                     tgt_input_ids = batch["tgt_input_ids"].to(accelerator.device)
+                     tgt_pad_mask = batch["tgt_pad_mask"].to(accelerator.device)
+                     tgt_labels = batch["tgt_labels"].to(accelerator.device)
+
+                     with torch.inference_mode():
+                         model_output = model(src_input_ids,
+                                              tgt_input_ids,
+                                              src_pad_mask,
+                                              tgt_pad_mask)
+
+                     ### Flatten for Loss ###
+                     model_output = model_output.flatten(0,1)
+                     tgt_labels = tgt_labels.flatten()
+
+                     ### Compute Loss ###
+                     loss = loss_fn(model_output, tgt_labels)
+
+                     ### Compute Accuracy (make sure to ignore -100 targets) ###
+                     model_output = model_output.argmax(axis=-1)
+                     mask = (tgt_labels != -100)
+                     model_output = model_output[mask]
+                     tgt_labels = tgt_labels[mask]
+                     accuracy = (model_output == tgt_labels).sum() / len(model_output)
+
+                     ### Store Results ###
+                     loss = loss.detach()
+                     accuracy = accuracy.detach()
+
+                     if accelerator.num_processes > 1:
+                         loss = torch.mean(accelerator.gather_for_metrics(loss))
+                         accuracy = torch.mean(accelerator.gather_for_metrics(accuracy))
+
+                     ### Store Metrics ###
+                     test_losses.append(loss.item())
+                     test_accs.append(accuracy.item())
+
+                 test_loss = np.mean(test_losses)
+                 test_acc = np.mean(test_accs)
+
+                 log = {"test_loss": test_loss,
+                        "test_acc": test_acc}
+
+                 logging_string = f"Testing Loss: {test_loss} | Testing Acc: {test_acc}"
+                 if accelerator.is_main_process:
+                     progress_bar.write(logging_string)
+
+                 ### Log and Save Model ###
+                 accelerator.log(log, step=completed_steps)
+                 accelerator.save_state(os.path.join(path_to_experiment, f"checkpoint_{completed_steps}"))
+
+                 push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+
+                 ### Testing Sentence ###
+                 if accelerator.is_main_process:
+                     src_ids = src_ids.to(accelerator.device)
+                     unwrapped = accelerator.unwrap_model(model)
+                     translated = unwrapped.inference(src_ids,
+                                                      tgt_start_id=tgt_tokenizer.bos_token_id,
+                                                      tgt_end_id=tgt_tokenizer.eos_token_id,
+                                                      max_seq_length=config.max_seq_length)
+
+                     translated = tgt_tokenizer.decode(translated, skip_special_tokens=False)
+                     progress_bar.write(f"Translation: {translated}")
+
+                 model.train()
+
+             if completed_steps >= training_steps:
+                 train = False
+                 accelerator.save_state(os.path.join(path_to_experiment, "final_checkpoint"))
+                 push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+                 break
+
+             ### Iterate Completed Steps ###
+             completed_steps += 1
+             progress_bar.update(1)
+
+             ### Reset Accumulated Variables ###
+             accumulate_loss = 0
+             accuracy = 0
+
+
+ accelerator.end_training()
+
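A note on the batching arithmetic in the configuration above: each `DataLoader` batch carries `batch_size // gradient_accumulation_steps = 32` sequences, and the optimizer steps once every `gradient_accumulation_steps = 2` batches, so the per-process effective batch per optimizer step is the configured `batch_size`:

$$B_{\text{eff}} = \frac{64}{2} \times 2 = 64,$$

multiplied further by the number of processes when launched with `accelerate` on several GPUs.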
utils.py ADDED
@@ -0,0 +1,20 @@
+ from huggingface_hub import list_repo_files, hf_hub_download
+
+
+ def get_files_from_HF(repo_id, folder_name, local_dir):
+     files = list_repo_files(repo_id)
+
+     folder_files = [f for f in files if f.startswith(folder_name)]
+
+     for file in folder_files:
+         file_path = hf_hub_download(repo_id=repo_id, filename=file, local_dir=local_dir)
+         print(f"Downloaded: {file_path} to {local_dir}")
+
+
+ def get_file_FROM_HF(repo_id, file_path, local_dir):
+     file_path = hf_hub_download(repo_id=repo_id, filename=file_path, local_dir=local_dir)
+     return file_path
+
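As an example, this is how `tokenizer.py` and `model.py` call the download helper (same repo id and arguments as in those files):

```python
from utils import get_file_FROM_HF

# Fetch the English vocab file from the model repo into trained_tokenizers/
path = get_file_FROM_HF(
    repo_id="ngia/ml-translation-en-fr",
    file_path="vocab_en.json",
    local_dir="trained_tokenizers/",
)
print(path)  # local path of the downloaded file
```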