Deploy on Hugging Face Spaces for inference
- .gitignore +20 -0
- README.md +73 -13
- app.py +44 -0
- data_collector.py +61 -0
- inference.py +43 -0
- model.py +318 -0
- process_raw_data.py +64 -0
- pyproject.toml +20 -0
- requirements.txt +8 -0
- tokenize_dataset.py +57 -0
- tokenizer.py +133 -0
- train.py +353 -0
- utils.py +20 -0
.gitignore
ADDED
@@ -0,0 +1,20 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+.cache
+
+# Virtual environments
+.venv
+.env
+
+# folders
+trained_tokenizers/
+checkpoints/
+work_dir/
+data/
+pyproject_copy.toml
+
README.md
CHANGED
@@ -1,13 +1,73 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Training Transformers from Scratch for Language Translation (English to French)
+
+## Overview
+This project trains a Transformer model from scratch to perform English-to-French translation. It follows a structured pipeline, from data collection to model deployment with Gradio.
+
+
+
+## Project Steps
+
+1. **Data Collection**
+   - Gather parallel English-French text data for training.
+
+2. **Dataset Creation and Upload to Hugging Face**
+   - Preprocess and structure the dataset.
+   - Upload the dataset to the Hugging Face Hub for easy access.
+
+3. **Training Tokenizers**
+   - Train separate WordPiece tokenizers for English and French.
+   - Save the trained tokenizers.
+
+4. **Creating a Tokenized Dataset**
+   - Tokenize the dataset using the trained tokenizers.
+   - Publish the tokenized dataset on Hugging Face.
+
+5. **Building the Transformer Model from Scratch**
+   - Implement custom Transformer components, including:
+     - Encoder
+     - Decoder
+     - Embedding Layer
+     - Positional Encoding
+
+6. **Model Training and Evaluation**
+   - Train the model on the prepared dataset.
+   - Use Weights & Biases (wandb) for real-time metric visualization.
+
+7. **Inference**
+   - Test the trained model with sample English inputs.
+   - Generate translated French text.
+
+8. **Web Interface with Gradio**
+   - Develop an interactive UI using Gradio for easy model inference.
+
+## Installation
+
+To use the application, install the required dependencies with either `uv` or `pip`.
+
+Using `uv`:
+```bash
+uv pip install -r requirements.txt
+```
+
+Using `pip`:
+```bash
+pip install -r requirements.txt
+```
+
+## Running the Application
+
+To launch the application, run:
+```bash
+python app.py
+```
+
+This starts a Gradio interface where users can input English text and receive French translations.
+
+## Repository Structure
+- **data_collector.py** - PyTorch `Dataset` that pads the tokenized pairs and builds the attention masks and shifted labels.
+- **tokenize_dataset.py** - Prepares and tokenizes the dataset.
+- **model.py** - The Transformer model implementation.
+- **train.py** - Training script.
+- **inference.py** - Inference script for model predictions.
+- **app.py** - Web interface built with Gradio.
+- **requirements.txt** - List of dependencies.
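For scripted use outside the Gradio UI, a minimal sketch mirroring `inference.py` below (the loaders fall back to downloading the checkpoint and vocab files from the Hub when they are missing locally):

```python
import torch
from tokenizer import CustomTokenizer
from model import Transformer, TransformerConfig

config = TransformerConfig(max_seq_length=512)
model = Transformer(config=config).load_weights_from_checkpoints("checkpoints/model.safetensors")
model.eval()

src_tok = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json")
tgt_tok = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_fr.json")

# Greedy decoding: start from [BOS], stop at [EOS] or after 512 tokens.
ids = torch.tensor(src_tok.encode("Hello, how are you?")).unsqueeze(0)
out = model.inference(src_ids=ids, tgt_start_id=tgt_tok.bos_token_id,
                      tgt_end_id=tgt_tok.eos_token_id, max_seq_length=512)
print(tgt_tok.decode(out, skip_special_tokens=True))
```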
app.py
ADDED
@@ -0,0 +1,44 @@
+import torch
+from tokenizer import CustomTokenizer
+from model import Transformer, TransformerConfig
+import gradio as gr
+
+
+# Load tokenizers
+path_to_src_tokenizer = "trained_tokenizers/vocab_en.json"
+path_to_tgt_tokenizer = "trained_tokenizers/vocab_fr.json"
+
+src_tokenizer = CustomTokenizer(path_to_vocab=path_to_src_tokenizer)
+tgt_tokenizer = CustomTokenizer(path_to_vocab=path_to_tgt_tokenizer)
+
+
+# Load model
+config = TransformerConfig(max_seq_length=512)
+model = Transformer(config=config)
+
+path_to_checkpoints = "checkpoints/model.safetensors"
+model.load_weights_from_checkpoints(path_to_checkpoints=path_to_checkpoints)
+model.eval()
+
+
+def translate(input_text, skip_special_tokens=True):
+    src_ids = torch.tensor(src_tokenizer.encode(input_text)).unsqueeze(0)
+    output_ids = model.inference(src_ids=src_ids, tgt_start_id=tgt_tokenizer.bos_token_id, tgt_end_id=tgt_tokenizer.eos_token_id, max_seq_length=512)
+    output_tokens = tgt_tokenizer.decode(input=output_ids, skip_special_tokens=skip_special_tokens)
+    return output_tokens
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## Traduction Anglais → Français")
+
+    with gr.Row():
+        texte_input = gr.Textbox(label="Texte en anglais", lines=4)
+        texte_output = gr.Textbox(label="Texte traduit (Français)", lines=4, interactive=False)
+
+    bouton = gr.Button("Traduire")
+    bouton.click(translate, inputs=texte_input, outputs=texte_output)
+
+demo.launch()
data_collector.py
ADDED
@@ -0,0 +1,61 @@
+import torch
+from torch.utils.data import Dataset
+
+
+class DataCollector(Dataset):
+    def __init__(self, dataset, english_tokenizer, french_tokenizer, max_length=512):
+        self.dataset = dataset
+        self.english_tokenizer = english_tokenizer
+        self.french_tokenizer = french_tokenizer
+        self.max_length = max_length
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, index):
+        english_input_ids = torch.tensor(self.dataset[index]['src_ids'])
+        french_input_ids = torch.tensor(self.dataset[index]['tgt_ids'])
+
+        # Pad manually with torch.nn.functional.pad or with torch.cat
+        src_pad_token = self.english_tokenizer.pad_token_id
+        tgt_pad_token = self.french_tokenizer.pad_token_id
+
+        # English side
+        if len(english_input_ids) < self.max_length:
+            pad_length = self.max_length - len(english_input_ids)
+            english_input_ids = torch.cat([english_input_ids, torch.full((pad_length,), src_pad_token, dtype=english_input_ids.dtype)])
+        else:
+            english_input_ids = english_input_ids[:self.max_length]
+
+        # French side
+        if len(french_input_ids) < self.max_length:
+            pad_length = self.max_length - len(french_input_ids)
+            french_input_ids = torch.cat([french_input_ids, torch.full((pad_length,), tgt_pad_token, dtype=french_input_ids.dtype)])
+        else:
+            french_input_ids = french_input_ids[:self.max_length]
+
+        # Build the padding masks
+        src_pad_mask = (english_input_ids != src_pad_token)
+        tgt_pad_mask = (french_input_ids != tgt_pad_token)
+
+        # For translation / language-modeling tasks, shift the target by one position
+        input_tgt = french_input_ids[:-1].clone()
+        label_tgt = french_input_ids[1:].clone()
+        input_tgt_mask = (input_tgt != tgt_pad_token)
+        label_tgt[label_tgt == tgt_pad_token] = -100
+
+        return {
+            "src_input_ids": english_input_ids,  # fixed size: (self.max_length,)
+            "src_pad_mask": src_pad_mask,
+            "tgt_input_ids": french_input_ids,   # fixed size: (self.max_length,)
+            "tgt_pad_mask": torch.cat([input_tgt_mask, torch.full((1,), 0, dtype=french_input_ids.dtype)]),
+            "tgt_labels": torch.cat([label_tgt, torch.full((1,), -100, dtype=french_input_ids.dtype)])
+        }
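To make the shift in `__getitem__` concrete, a minimal sketch with made-up token ids (1 = BOS, 2 = EOS, 0 = PAD): position i of the decoder input is trained to predict position i+1, and padded labels become -100 so `CrossEntropyLoss` ignores them.

```python
import torch

pad_id = 0
french = torch.tensor([1, 10, 11, 2, pad_id, pad_id])  # [BOS, a, b, EOS, PAD, PAD]

input_tgt = french[:-1].clone()        # decoder input: [BOS, a, b, EOS, PAD]
label_tgt = french[1:].clone()         # targets:       [a, b, EOS, PAD, PAD]
label_tgt[label_tgt == pad_id] = -100  # ignore padded positions in the loss

print(input_tgt.tolist())  # [1, 10, 11, 2, 0]
print(label_tgt.tolist())  # [10, 11, 2, -100, -100]
```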
inference.py
ADDED
@@ -0,0 +1,43 @@
+import torch
+from model import Transformer, TransformerConfig
+from safetensors.torch import load_file
+from tokenizer import CustomTokenizer
+from datasets import load_dataset
+
+path_to_model_safetensors = "checkpoints/model.safetensors"
+path_to_src_tokenizer = "trained_tokenizers/vocab_en.json"
+path_to_tgt_tokenizer = "trained_tokenizers/vocab_fr.json"
+
+config = TransformerConfig(device='cpu', max_seq_length=512)
+model = Transformer(config=config)
+
+# Load the weights dict
+model.load_weights_from_checkpoints(path_to_model_safetensors)
+model.eval()
+
+src_tokenizer = CustomTokenizer(path_to_vocab=path_to_src_tokenizer)
+tgt_tokenizer = CustomTokenizer(path_to_vocab=path_to_tgt_tokenizer)
+
+
+english_text = "I'm very sick and I want to see a doctor."
+
+src_ids = torch.tensor(src_tokenizer.encode(english_text)).unsqueeze(0)
+
+# Decoding starts from [BOS] and stops at [EOS], matching app.py and training
+translated_ids = model.inference(src_ids=src_ids, tgt_start_id=tgt_tokenizer.bos_token_id, tgt_end_id=tgt_tokenizer.eos_token_id, max_seq_length=512)
+translated_tokens = tgt_tokenizer.decode(translated_ids, skip_special_tokens=True)
+print(f"English: {english_text} \nFrench: {translated_tokens}")
+
+
+dataset = load_dataset("bilalfaye/english-wolof-french-translation", split="train")
+samples = dataset.shuffle().select(range(50))
+
+for i in range(50):
+    sample = samples[i]
+    src_ids = torch.tensor(src_tokenizer.encode(sample["en"])).unsqueeze(0)
+    output_ids = model.inference(src_ids=src_ids, tgt_start_id=tgt_tokenizer.bos_token_id, tgt_end_id=tgt_tokenizer.eos_token_id, max_seq_length=512)
+    predicted_tokens = tgt_tokenizer.decode(output_ids, skip_special_tokens=True)
+    print(f"English: {sample['en']}")
+    print(f"French (labels): {sample['fr']}")
+    print(f"French (predicted): {predicted_tokens}")
+    print("--------------------------------\n\n")
model.py
ADDED
@@ -0,0 +1,318 @@
+from dataclasses import dataclass
+import torch
+from torch import nn
+import torch.nn.functional as F
+import os
+from utils import get_file_FROM_HF
+from safetensors.torch import load_file
+
+
+@dataclass
+class TransformerConfig:
+    src_vocab_size: int = 32000
+    tgt_vocab_size: int = 32000
+    max_seq_length: int = 64
+    d_model: int = 512
+    num_heads: int = 8
+    num_encoder_layers: int = 6
+    num_decoder_layers: int = 6
+    dropout_p: float = 0.1
+    dff: int = 2048
+    device: str = 'cpu'
+
+
+# Source embedding block
+class SourceEmbedding(nn.Module):
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.src_embedding = nn.Embedding(num_embeddings=config.src_vocab_size, embedding_dim=config.d_model)
+
+    def forward(self, x):
+        x = self.src_embedding(x)
+        return x
+
+
+# Target embedding block
+class TargetEmbedding(nn.Module):
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.tgt_embedding = nn.Embedding(num_embeddings=config.tgt_vocab_size, embedding_dim=config.d_model)
+
+    def forward(self, x):
+        x = self.tgt_embedding(x)
+        return x
+
+
+# Positional encoding (PE)
+class PositionEncoding(nn.Module):
+    def __init__(self, config: TransformerConfig, require_grad=False):
+        super().__init__()
+        self.PE = torch.zeros(config.max_seq_length, config.d_model)
+        pos = torch.arange(0, config.max_seq_length).reshape(-1, 1)
+        i = torch.arange(0, config.d_model, step=2)
+
+        # Note: `i` already walks the even dimensions, so the exponent here is
+        # 2i/d_model (twice the usual i/d_model). The table is stored as a frozen
+        # Parameter, so the released checkpoints carry exactly what was trained.
+        denominator = torch.pow(10000, (2*i) / config.d_model)
+        self.PE[:, 0::2] = torch.sin(pos/denominator)
+        self.PE[:, 1::2] = torch.cos(pos/denominator)
+
+        self.PE = nn.Parameter(self.PE, requires_grad=require_grad)
+
+    def forward(self, x):
+        max_seq_length = x.shape[1]
+        return x + self.PE[:max_seq_length]
+
+
+# Multi-head attention block (used for self-attention, masked self-attention and cross-attention)
+class MultiheadAttention(nn.Module):
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.config = config
+
+        # d_model must be divisible by num_heads to get the head dim
+        assert config.d_model % self.config.num_heads == 0, "d_model is not divisible by num_heads"
+        self.head_dim = self.config.d_model // self.config.num_heads
+
+        self.q_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+        self.k_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+        self.v_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+
+        self.out_proj = nn.Linear(in_features=self.config.d_model, out_features=self.config.d_model)
+
+    def forward(self, src, tgt=None, attention_mask=None, causal=False):
+        batch, src_seq_length, d_model = src.shape
+        # SELF-ATTENTION (optionally masked)
+        if tgt is None:
+            q = self.q_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1, 2).contiguous()
+            k = self.k_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1, 2).contiguous()
+            v = self.v_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+            if attention_mask is not None:
+                attention_mask = attention_mask.bool()
+                attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, 1, src_seq_length, 1).to(self.config.device)
+
+            if causal and attention_mask is not None:
+                # Combine the padding mask with a causal (lower-triangular) mask
+                causal_mask = ~torch.triu(torch.ones((src_seq_length, src_seq_length), dtype=torch.bool), diagonal=1)
+                causal_mask = causal_mask.unsqueeze(0).unsqueeze(0).to(self.config.device)
+
+                combined_mask = causal_mask.int() * attention_mask.int()
+                attention_mask = combined_mask.bool().to(self.config.device)
+
+            attention_out = F.scaled_dot_product_attention(q, k, v,
+                                                           attn_mask=attention_mask,
+                                                           dropout_p=self.config.dropout_p if self.training else 0.0,
+                                                           is_causal=False)
+
+        # CROSS-ATTENTION: queries come from the target, keys/values from the source
+        else:
+            tgt_seq_length = tgt.shape[1]
+            q = self.q_proj(tgt).reshape(batch, tgt_seq_length, self.config.num_heads, self.head_dim).transpose(1, 2).contiguous()
+            k = self.k_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1, 2).contiguous()
+            v = self.v_proj(src).reshape(batch, src_seq_length, self.config.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+            if attention_mask is not None:
+                attention_mask = attention_mask.bool()
+                attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, 1, tgt_seq_length, 1)
+
+            attention_out = F.scaled_dot_product_attention(q, k, v,
+                                                           attn_mask=attention_mask,
+                                                           dropout_p=self.config.dropout_p if self.training else 0.0,
+                                                           is_causal=False)
+
+        attention_out = attention_out.transpose(1, 2).flatten(2)
+        attention_out = self.out_proj(attention_out)
+        return attention_out
+
+
+# Position-wise feed-forward network (MLP)
+class FeedForward(nn.Module):
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.hidden_layer = nn.Linear(in_features=config.d_model, out_features=config.dff)   # e.g. 512 -> 2048
+        self.hidden_dropout = nn.Dropout(p=config.dropout_p)
+        self.output_layer = nn.Linear(in_features=config.dff, out_features=config.d_model)   # e.g. 2048 -> 512
+        self.output_dropout = nn.Dropout(p=config.dropout_p)
+
+    def forward(self, x):
+        x = self.hidden_layer(x)
+        x = F.gelu(x)
+        x = self.hidden_dropout(x)
+        x = self.output_layer(x)
+        x = self.output_dropout(x)
+        return x
+
+
+# Encoder block
+class EncoderBlock(nn.Module):
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.multi_head_attention = MultiheadAttention(config=config)
+        self.feed_forward = FeedForward(config=config)
+        self.layer_norm_1 = nn.LayerNorm(config.d_model)
+        self.layer_norm_2 = nn.LayerNorm(config.d_model)
+        self.dropout = nn.Dropout(config.dropout_p)
+
+    def forward(self, x, attention_mask=None):
+        x = x + self.dropout(self.multi_head_attention(src=x, attention_mask=attention_mask))
+        x = self.layer_norm_1(x)
+
+        x = x + self.feed_forward(x)
+        x = self.layer_norm_2(x)
+        return x
+
+
+# Decoder block
+class DecoderBlock(nn.Module):
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+        self.masked_multi_head_attention = MultiheadAttention(config=config)
+        self.dropout_masked = nn.Dropout(config.dropout_p)
+
+        self.cross_multi_head_attention = MultiheadAttention(config=config)
+        self.dropout_cross = nn.Dropout(config.dropout_p)
+
+        self.feed_forward = FeedForward(config=config)
+
+        self.layer_norm_1 = nn.LayerNorm(config.d_model)
+        self.layer_norm_2 = nn.LayerNorm(config.d_model)
+        self.layer_norm_3 = nn.LayerNorm(config.d_model)
+
+    def forward(self, src, tgt, src_attention_mask=None, tgt_attention_mask=None):
+        tgt = tgt + self.dropout_masked(self.masked_multi_head_attention(tgt, attention_mask=tgt_attention_mask, causal=True))
+        tgt = self.layer_norm_1(tgt)
+
+        tgt = tgt + self.dropout_cross(self.cross_multi_head_attention(src, tgt, attention_mask=src_attention_mask))
+        tgt = self.layer_norm_2(tgt)
+
+        # NB: layer_norm_3 is defined but never applied after the feed-forward;
+        # the released checkpoints were trained this way, so it is left unchanged.
+        tgt = tgt + self.feed_forward(tgt)
+        return tgt
+
+
+# Transformer (putting it all together)
+class Transformer(nn.Module):
+    def __init__(self, config: TransformerConfig):
+        super().__init__()
+
+        self.src_embedding = SourceEmbedding(config=config)
+        self.tgt_embedding = TargetEmbedding(config=config)
+
+        self.position_encoding = PositionEncoding(config=config)
+
+        self.encoder = nn.ModuleList(
+            [EncoderBlock(config=config) for _ in range(config.num_encoder_layers)]
+        )
+
+        self.decoder = nn.ModuleList(
+            [DecoderBlock(config=config) for _ in range(config.num_decoder_layers)]
+        )
+
+        self.output = nn.Linear(config.d_model, config.tgt_vocab_size)
+
+        ## Init weights
+        self.apply(_init_weights_)
+
+    def forward(self, src_ids, tgt_ids, src_attention_mask=None, tgt_attention_mask=None):
+        # Embed token ids
+        src_embed = self.src_embedding(src_ids)
+        tgt_embed = self.tgt_embedding(tgt_ids)
+
+        # Add positional encoding
+        src_embed = self.position_encoding(src_embed)
+        tgt_embed = self.position_encoding(tgt_embed)
+
+        for layer in self.encoder:
+            src_embed = layer(src_embed, src_attention_mask)
+
+        for layer in self.decoder:
+            tgt_embed = layer(src_embed, tgt_embed, src_attention_mask, tgt_attention_mask)
+
+        pred = self.output(tgt_embed)
+
+        return pred
+
+    @torch.no_grad()
+    def inference(self, src_ids, tgt_start_id, tgt_end_id, max_seq_length):
+        tgt_ids = torch.tensor([tgt_start_id], device=src_ids.device).reshape(1, 1)
+
+        # Encode the source once
+        src_embed = self.src_embedding(src_ids)
+        src_embed = self.position_encoding(src_embed)
+        for layer in self.encoder:
+            src_embed = layer(src_embed)
+
+        # Greedily generate the target, one token at a time
+        for _ in range(max_seq_length):
+            tgt_embed = self.tgt_embedding(tgt_ids)
+            tgt_embed = self.position_encoding(tgt_embed)
+            for layer in self.decoder:
+                tgt_embed = layer(src_embed, tgt_embed)
+
+            tgt_embed = tgt_embed[:, -1]
+
+            pred = self.output(tgt_embed)
+            pred = pred.argmax(dim=-1).unsqueeze(0)
+            tgt_ids = torch.cat([tgt_ids, pred], dim=-1)
+
+            if torch.all(pred == tgt_end_id):
+                break
+
+        return tgt_ids.squeeze().cpu().tolist()
+
+    def load_weights_from_checkpoints(self, path_to_checkpoints):
+        if not os.path.exists(path_to_checkpoints):
+            print("------------------- LOADING MODEL CHECKPOINTS FROM HUGGING FACE --------------------------")
+            folder = os.path.dirname(path_to_checkpoints)
+            os.makedirs(folder, exist_ok=True)
+            path_to_checkpoints = get_file_FROM_HF(repo_id="ngia/ml-translation-en-fr", file_path="final_checkpoint/model.safetensors", local_dir=folder)
+
+        checkpoints = load_file(filename=path_to_checkpoints)
+        self.load_state_dict(checkpoints)
+        return self
+
+
+def _init_weights_(module):
+    """
+    Simple weight initialization taken directly from the Hugging Face
+    `modeling_roberta.py` implementation!
+    """
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=0.02)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=0.02)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+
+
+if __name__ == "__main__":
+    config = TransformerConfig()
+    model = Transformer(config=config)
+
+    english = torch.randint(low=0, high=1000, size=(1, 3))
+    res = model.inference(src_ids=english, tgt_start_id=1, tgt_end_id=2, max_seq_length=config.max_seq_length)
+    print(res)
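A minimal sketch of the mask combination in `MultiheadAttention.forward`, for a toy sequence of length 4 whose last position is padding; each query row may attend only to non-pad positions at or before it:

```python
import torch

pad_mask = torch.tensor([[1, 1, 1, 0]]).bool()  # (batch, seq); last position is padding
seq_len = pad_mask.shape[1]

# Same construction as in MultiheadAttention.forward
attn = pad_mask.unsqueeze(1).unsqueeze(1).repeat(1, 1, seq_len, 1)
causal = ~torch.triu(torch.ones((seq_len, seq_len), dtype=torch.bool), diagonal=1)
combined = (causal.unsqueeze(0).unsqueeze(0).int() * attn.int()).bool()

print(combined[0, 0].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 0]], dtype=torch.int32)
```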
process_raw_data.py
ADDED
@@ -0,0 +1,64 @@
+from datasets import load_dataset, concatenate_datasets, load_from_disk
+
+import os
+
+
+def create_dataset(root_data_path, save_data_path, cache_data_path, test_size=0.01):
+
+    list_datasets = []
+
+    for directory in os.listdir(root_data_path):
+
+        path_to_dir = os.path.join(root_data_path, directory)
+
+        if os.path.isdir(path_to_dir):
+
+            print(f"Processing: {path_to_dir}")
+
+            english_text = None
+            french_text = None
+
+            for file_dir in os.listdir(path_to_dir):
+
+                if file_dir.endswith(".en"):
+                    english_text = os.path.join(path_to_dir, file_dir)
+
+                if file_dir.endswith(".fr"):
+                    french_text = os.path.join(path_to_dir, file_dir)
+
+            if english_text is not None and french_text is not None:
+                english_dataset = load_dataset("text", data_files=english_text, cache_dir=cache_data_path)["train"]
+                french_dataset = load_dataset("text", data_files=french_text, cache_dir=cache_data_path)["train"]
+
+                english_dataset = english_dataset.rename_column("text", "english_src")
+                dataset = english_dataset.add_column("french_tgt", french_dataset["text"])
+
+                list_datasets.append(dataset)
+
+    hf_dataset = concatenate_datasets(list_datasets)
+    hf_dataset = hf_dataset.train_test_split(test_size=test_size)
+
+    hf_dataset.save_to_disk(save_data_path)
+    print(f"Dataset successfully saved in: {save_data_path}")
+
+
+def push_dataset_into_hf_hub(save_data_path):
+    dataset = load_from_disk(dataset_path=save_data_path)
+    dataset = dataset.shuffle()
+    dataset.push_to_hub(repo_id="ngia/translation-en-fr")
+    print("Successfully pushed to the Hugging Face Hub")
+
+
+if __name__ == "__main__":
+    root_data_path = "data/raw_data/"
+    save_data_path = "data/saved_data/"
+    cache_data_path = "data/cached_data/"
+
+    create_dataset(root_data_path=root_data_path, save_data_path=save_data_path, cache_data_path=cache_data_path)
+    dataset = load_from_disk(dataset_path=save_data_path)
+    print(dataset["train"][10])
+
+    push_dataset_into_hf_hub(save_data_path=save_data_path)
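`create_dataset` only requires that each corpus live in its own sub-directory of `root_data_path` with a line-aligned `.en`/`.fr` pair; the directory and file names below are hypothetical:

```text
data/raw_data/
├── corpus_a/
│   ├── text.en   # English side, one sentence per line
│   └── text.fr   # French side, aligned line by line
└── corpus_b/
    ├── text.en
    └── text.fr
```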
pyproject.toml
ADDED
@@ -0,0 +1,20 @@
+[project]
+name = "translator-en-fr"
+version = "0.1.0"
+description = "Transformer trained from scratch for English-to-French translation"
+readme = "README.md"
+requires-python = ">=3.12.0"
+dependencies = [
+    "accelerate>=1.4.0",
+    "datasets>=3.3.2",
+    "gradio>=5.21.0",
+    "huggingface-hub>=0.29.1",
+    "matplotlib>=3.10.1",
+    "sentencepiece>=0.2.0",
+    "streamlit>=1.43.2",
+    "torch>=2.6.0",
+    "torchvision>=0.21.0",
+    "transformers>=4.49.0",
+    "wandb>=0.19.8",
+]
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+accelerate
+datasets
+huggingface-hub
+sentencepiece
+transformers
+wandb
+matplotlib
+gradio
tokenize_dataset.py
ADDED
@@ -0,0 +1,57 @@
+from tokenizer import CustomTokenizer
+from datasets import load_from_disk
+
+
+def tokenize_dataset(path_to_dataset,
+                     path_to_save,
+                     num_workers=24,
+                     truncate=False,
+                     max_length=512,
+                     min_length=3):
+
+    english_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_en.json", truncate=truncate, max_length=max_length)
+    french_tokenizer = CustomTokenizer(path_to_vocab="trained_tokenizers/vocab_fr.json", truncate=truncate, max_length=max_length)
+
+    dataset = load_from_disk(path_to_dataset)
+
+    def _tokenize_text(examples):
+        english_text = examples["english_src"]
+        french_text = examples["french_tgt"]
+
+        src_ids = english_tokenizer.encode(english_text)
+        tgt_ids = french_tokenizer.encode(french_text)
+
+        batch = {
+            "src_ids": src_ids,
+            "tgt_ids": tgt_ids
+        }
+        return batch
+
+    tokenized_dataset = dataset.map(_tokenize_text, batched=True, num_proc=num_workers)
+    tokenized_dataset = tokenized_dataset.remove_columns(["english_src", "french_tgt"])
+
+    # Drop pairs whose target is shorter than min_length tokens
+    filter_func = lambda batch: [len(e) >= min_length for e in batch["tgt_ids"]]
+    tokenized_dataset = tokenized_dataset.filter(filter_func, batched=True)
+
+    print(tokenized_dataset)
+
+    tokenized_dataset.save_to_disk(path_to_save)
+    print("Tokenized dataset successfully saved to disk")
+
+
+if __name__ == "__main__":
+    path_to_dataset = "data/saved_data"
+    path_to_save = "data/tokenized_dataset"
+    tokenize_dataset(path_to_dataset=path_to_dataset, path_to_save=path_to_save)
+
+    # Push the dataset to the Hub:
+    tokenized_dataset = load_from_disk(dataset_path=path_to_save)
+    tokenized_dataset.push_to_hub("ngia/tokenized-translation-en-fr")
+    print("Tokenized dataset successfully pushed to the Hugging Face Hub")
tokenizer.py
ADDED
@@ -0,0 +1,133 @@
+from tokenizers import Tokenizer, normalizers, decoders
+from tokenizers.models import WordPiece
+from tokenizers.trainers import WordPieceTrainer
+from tokenizers.normalizers import NFC, Lowercase
+from tokenizers.pre_tokenizers import Whitespace
+from tokenizers.processors import TemplateProcessing
+from utils import get_file_FROM_HF
+
+import glob
+import os
+
+
+def train_tokenizer(path_to_data, lang):
+
+    special_token_dict = {
+        "pad_token": "[PAD]",
+        "start_token": "[BOS]",
+        "end_token": "[EOS]",
+        "unknown_token": "[UNK]"
+    }
+
+    tokenizer = Tokenizer(WordPiece(unk_token=special_token_dict["unknown_token"]))
+    tokenizer.normalizer = normalizers.Sequence([NFC(), Lowercase()])
+    tokenizer.pre_tokenizer = Whitespace()
+
+    files = []
+
+    if lang == "fr":
+        print("---------Training French Tokenizer--------------")
+        files = glob.glob(os.path.join(path_to_data, "**/*.fr"))
+
+    elif lang == "en":
+        print("---------Training English Tokenizer--------------")
+        files = glob.glob(os.path.join(path_to_data, "**/*.en"))
+
+    trainer = WordPieceTrainer(vocab_size=32000, special_tokens=list(special_token_dict.values()))
+    tokenizer.train(files, trainer)
+    tokenizer.save(f"trained_tokenizers/vocab_{lang}.json")
+    print(f"Tokenizer successfully saved to trained_tokenizers/vocab_{lang}.json")
+
+
+class CustomTokenizer:
+
+    def __init__(self, path_to_vocab, truncate=False, max_length=512):
+        self.path_to_vocab = path_to_vocab
+        self.truncate = truncate
+        self.max_length = max_length
+        self.tokenizer = self.config_tokenizer()
+        self.vocab_size = self.tokenizer.get_vocab_size()
+
+        self.pad_token = "[PAD]"
+        self.pad_token_id = self.tokenizer.token_to_id("[PAD]")
+
+        self.bos_token = "[BOS]"
+        self.bos_token_id = self.tokenizer.token_to_id("[BOS]")
+
+        self.eos_token = "[EOS]"
+        self.eos_token_id = self.tokenizer.token_to_id("[EOS]")
+
+        self.unk_token = "[UNK]"
+        self.unk_token_id = self.tokenizer.token_to_id("[UNK]")
+
+        self.post_processor = TemplateProcessing(
+            single="[BOS] $A [EOS]",
+            special_tokens=[
+                (self.bos_token, self.bos_token_id),
+                (self.eos_token, self.eos_token_id)
+            ]
+        )
+
+        if self.truncate:
+            # Leave room for the [BOS]/[EOS] tokens added by the post-processor
+            self.max_length = max_length - self.post_processor.num_special_tokens_to_add(is_pair=False)
+
+    def config_tokenizer(self):
+        if not os.path.exists(self.path_to_vocab):
+            self.path_to_vocab = self.load_file_from_hugging_face()
+        tokenizer = Tokenizer.from_file(self.path_to_vocab)
+        tokenizer.decoder = decoders.WordPiece()
+        return tokenizer
+
+    def encode(self, input):
+
+        def _parse_process_tokenized(tokenized):
+            if self.truncate:
+                tokenized.truncate(self.max_length, direction="right")
+            tokenized = self.post_processor.process(tokenized)
+            return tokenized.ids
+
+        if isinstance(input, str):
+            tokenized = self.tokenizer.encode(input)
+            tokenized = _parse_process_tokenized(tokenized)
+
+        elif isinstance(input, (list, tuple)):
+            tokenized = self.tokenizer.encode_batch(input)
+            tokenized = [_parse_process_tokenized(t) for t in tokenized]
+
+        else:
+            raise TypeError(f"Unsupported input type for encode: {type(input)}")
+
+        return tokenized
+
+    def decode(self, input, skip_special_tokens=True):
+        if isinstance(input, list):
+            if all(isinstance(item, list) for item in input):
+                decoded = self.tokenizer.decode_batch(input, skip_special_tokens=skip_special_tokens)
+            elif all(isinstance(item, int) for item in input):
+                decoded = self.tokenizer.decode(input, skip_special_tokens=skip_special_tokens)
+            else:
+                raise TypeError("decode expects a list of ints or a list of lists of ints")
+        else:
+            raise TypeError(f"Unsupported input type for decode: {type(input)}")
+
+        return decoded
+
+    def load_file_from_hugging_face(self):
+        filename = os.path.basename(self.path_to_vocab)
+        if filename == "vocab_en.json":
+            print("------------------- LOADING SOURCE TOKENIZER FROM HUGGING FACE --------------------------")
+
+        elif filename == "vocab_fr.json":
+            print("------------------- LOADING TARGET TOKENIZER FROM HUGGING FACE --------------------------")
+
+        os.makedirs("trained_tokenizers/", exist_ok=True)
+        path_to_tokenizer = get_file_FROM_HF(repo_id="ngia/ml-translation-en-fr", file_path=filename, local_dir="trained_tokenizers/")
+        return path_to_tokenizer
+
+
+if __name__ == "__main__":
+
+    path_to_data_root = "/home/ngam/Documents/translator-en-fr/data/raw_data"
+    # Set to True to train new tokenizers from the raw data
+    train_new_tokenizers = False
+    if train_new_tokenizers:
+        train_tokenizer(path_to_data_root, lang='fr')
+        train_tokenizer(path_to_data_root, lang='en')
train.py
ADDED
@@ -0,0 +1,353 @@
+import os
+import numpy as np
+import torch
+
+from model import Transformer, TransformerConfig
+from data_collector import DataCollector
+from torch.utils.data import DataLoader
+from datasets import load_from_disk, load_dataset, DatasetDict, concatenate_datasets
+from transformers import get_scheduler
+from tokenizer import CustomTokenizer
+from tqdm import tqdm
+from accelerate import Accelerator
+import wandb
+from huggingface_hub import HfApi, create_repo
+import shutil
+
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+# MODEL CONFIG
+src_vocab_size: int = 32000
+tgt_vocab_size: int = 32000
+max_seq_length: int = 512
+d_model: int = 512
+num_heads: int = 8
+num_encoder_layers: int = 6
+num_decoder_layers: int = 6
+dropout_p: float = 0.1
+dff: int = 2048
+
+config = TransformerConfig(
+    src_vocab_size=src_vocab_size,
+    tgt_vocab_size=tgt_vocab_size,
+    max_seq_length=max_seq_length,
+    d_model=d_model,
+    num_heads=num_heads,
+    num_encoder_layers=num_encoder_layers,
+    num_decoder_layers=num_decoder_layers,
+    dropout_p=dropout_p,
+    dff=dff
+)
+
+
+# TOKENIZER CONFIG
+src_tokenizer_path = "trained_tokenizers/vocab_en.json"
+tgt_tokenizer_path = "trained_tokenizers/vocab_fr.json"
+
+src_tokenizer = CustomTokenizer(path_to_vocab=src_tokenizer_path, max_length=config.max_seq_length)
+tgt_tokenizer = CustomTokenizer(path_to_vocab=tgt_tokenizer_path, max_length=config.max_seq_length)
+
+
+# DATALOADER CONFIG
+path_to_data = "data/tokenized_dataset"
+batch_size = 64
+gradient_accumulation_steps = 2
+# num_workers = 4
+
+# TRAINING CONFIG
+learning_rate = 1e-4
+training_steps = 170000
+warmup_steps = 2000
+scheduler_type = "cosine"
+evaluation_steps = 5000
+bias_norm_weight_decay = False
+weight_decay = 0.001
+betas = (0.9, 0.98)
+adam_eps = 1e-6
+
+
+# LOGGING CONFIG
+working_directory = "work_dir"
+experiment_name = "Seq2Seq_Neural_Machine_Translation"
+logging_interval = 1
+
+# Resume from checkpoint
+resume_from_checkpoint = "checkpoint_170000"
+
+
+# Prepare Accelerator
+path_to_experiment = os.path.join(working_directory, experiment_name)
+accelerator = Accelerator(project_dir=path_to_experiment,
+                          log_with="wandb")
+
+accelerator.init_trackers(experiment_name)
+
+# Configure the model device
+config.device = accelerator.device
+
+
+# Prepare dataloaders
+dataset = load_dataset("ngia/tokenized-translation-en-fr")
+
+accelerator.print("Dataset:", dataset)
+min_batch_size = batch_size // gradient_accumulation_steps
+train_dataset = DataCollector(dataset=dataset["train"], english_tokenizer=src_tokenizer, french_tokenizer=tgt_tokenizer, max_length=config.max_seq_length)
+test_dataset = DataCollector(dataset=dataset["test"], english_tokenizer=src_tokenizer, french_tokenizer=tgt_tokenizer, max_length=config.max_seq_length)
+
+train_loader = DataLoader(dataset=train_dataset, batch_size=min_batch_size, shuffle=True)
+test_loader = DataLoader(dataset=test_dataset, batch_size=min_batch_size, shuffle=False)
+
+
+# Prepare model
+model = Transformer(config=config)
+model_parameters = filter(lambda p: p.requires_grad, model.parameters())
+params = sum([np.prod(p.size()) for p in model_parameters])
+accelerator.print("Number of trainable parameters:", params)
+
+
+# Prepare optimizer
+optimizer = torch.optim.AdamW(model.parameters(),
+                              lr=learning_rate,
+                              betas=betas,
+                              eps=adam_eps)
+
+
+# Define scheduler
+scheduler = get_scheduler(
+    name=scheduler_type,
+    optimizer=optimizer,
+    num_warmup_steps=warmup_steps,
+    num_training_steps=training_steps
+)
+
+# Define loss function
+loss_fn = torch.nn.CrossEntropyLoss()
+
+
+### Define a sample sentence for testing ###
+src_ids = torch.tensor(src_tokenizer.encode("I want to learn how to train a machine translation model")).unsqueeze(0)
+
+
+model, optimizer, trainloader, testloader, scheduler = accelerator.prepare(
+    model, optimizer, train_loader, test_loader, scheduler
+)
+
+
+accelerator.register_for_checkpointing(scheduler)
+
+if resume_from_checkpoint is not None:
+    path_to_checkpoint = os.path.join(path_to_experiment, resume_from_checkpoint)
+
+    with accelerator.main_process_first():
+        accelerator.load_state(path_to_checkpoint)
+
+    completed_steps = int(resume_from_checkpoint.split("_")[-1])
+    accelerator.print(f"Resuming from Iteration: {completed_steps}")
+else:
+    completed_steps = 0
+
+
+def push_model_HF(repo_id, path_to_experiment, step):
+    """Save model and tokenizer locally, then push to the Hugging Face Hub."""
+
+    api = HfApi()
+    create_repo(repo_id, exist_ok=True)
+
+    api.upload_folder(
+        folder_path=path_to_experiment,
+        repo_id=repo_id,
+        repo_type="model"  # or "dataset" if it's a dataset
+    )
+
+    print(f"Checkpoint {step} pushed to {repo_id}")
+
+
+# Copy tokenizers
+shutil.copy2("trained_tokenizers/vocab_en.json", f"{path_to_experiment}/vocab_en.json")
+shutil.copy2("trained_tokenizers/vocab_fr.json", f"{path_to_experiment}/vocab_fr.json")
+
+
+# Push the model to HF
+push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+
+
+train = True
+progress_bar = tqdm(range(completed_steps, training_steps), disable=not accelerator.is_local_main_process)
+
+while train:
+    accumulate_steps = 0
+    accumulate_loss = 0
+    accuracy = 0
+
+    for batch in trainloader:
+        src_input_ids = batch["src_input_ids"].to(accelerator.device)
+        src_pad_mask = batch["src_pad_mask"].to(accelerator.device)
+        tgt_input_ids = batch["tgt_input_ids"].to(accelerator.device)
+        tgt_pad_mask = batch["tgt_pad_mask"].to(accelerator.device)
+        tgt_labels = batch["tgt_labels"].to(accelerator.device)
+
+        model_output = model(
+            src_input_ids,
+            tgt_input_ids,
+            src_pad_mask,
+            tgt_pad_mask
+        )
+
+        model_output = model_output.flatten(0, 1)
+        tgt_labels = tgt_labels.flatten()
+        loss = loss_fn(model_output, tgt_labels)
+
+        ### Scale loss and accumulate ###
+        loss = loss / gradient_accumulation_steps
+        accumulate_loss += loss
+
+        ### Compute gradients ###
+        accelerator.backward(loss)
+
+        ### Compute accuracy (ignoring -100 padding labels) ###
+        model_output = model_output.argmax(dim=-1)
+        mask = (tgt_labels != -100)
+        output = model_output[mask]
+        tgt_outputs = tgt_labels[mask]
+        acc = (output == tgt_outputs).sum() / len(output)
+        accuracy += acc / gradient_accumulation_steps
+
+        ### Iterate accumulation ###
+        accumulate_steps += 1
+
+        if accumulate_steps % gradient_accumulation_steps == 0:
+
+            ### Clip and update the model ###
+            accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
+            optimizer.step()
+            optimizer.zero_grad(set_to_none=True)
+            scheduler.step()
+
+            ### Log results ###
+            if completed_steps % logging_interval == 0:
+                accumulate_loss = accumulate_loss.detach()
+                accuracy = accuracy.detach()
+
+                if accelerator.num_processes > 1:
+                    accumulate_loss = torch.mean(accelerator.gather_for_metrics(accumulate_loss))
+                    accuracy = torch.mean(accelerator.gather_for_metrics(accuracy))
+
+                log = {"train_loss": accumulate_loss,
+                       "training_acc": accuracy,
+                       "learning_rate": scheduler.get_last_lr()[0]}
+
+                accelerator.log(log, step=completed_steps)
+                logging_string = f"[{completed_steps}/{training_steps}] Training Loss: {accumulate_loss} | Training Acc: {accuracy}"
+                if accelerator.is_main_process:
+                    progress_bar.write(logging_string)
+
+            if completed_steps % evaluation_steps == 0:
+                model.eval()
+                print("Evaluating!")
+
+                test_losses = []
+                test_accs = []
+
+                for batch in tqdm(testloader, disable=not accelerator.is_main_process):
+                    src_input_ids = batch["src_input_ids"].to(accelerator.device)
+                    src_pad_mask = batch["src_pad_mask"].to(accelerator.device)
+                    tgt_input_ids = batch["tgt_input_ids"].to(accelerator.device)
+                    tgt_pad_mask = batch["tgt_pad_mask"].to(accelerator.device)
+                    tgt_labels = batch["tgt_labels"].to(accelerator.device)
+
+                    with torch.inference_mode():
+                        model_output = model(src_input_ids,
+                                             tgt_input_ids,
+                                             src_pad_mask,
+                                             tgt_pad_mask)
+
+                    ### Flatten for loss ###
+                    model_output = model_output.flatten(0, 1)
+                    tgt_labels = tgt_labels.flatten()
+
+                    ### Compute loss ###
+                    loss = loss_fn(model_output, tgt_labels)
+
+                    ### Compute accuracy (make sure to ignore -100 targets) ###
+                    model_output = model_output.argmax(dim=-1)
+                    mask = (tgt_labels != -100)
+                    model_output = model_output[mask]
+                    tgt_labels = tgt_labels[mask]
+                    accuracy = (model_output == tgt_labels).sum() / len(model_output)
+
+                    ### Store results ###
+                    loss = loss.detach()
+                    accuracy = accuracy.detach()
+
+                    if accelerator.num_processes > 1:
+                        loss = torch.mean(accelerator.gather_for_metrics(loss))
+                        accuracy = torch.mean(accelerator.gather_for_metrics(accuracy))
+
+                    ### Store metrics ###
+                    test_losses.append(loss.item())
+                    test_accs.append(accuracy.item())
+
+                test_loss = np.mean(test_losses)
+                test_acc = np.mean(test_accs)
+
+                log = {"test_loss": test_loss,
+                       "test_acc": test_acc}
+
+                logging_string = f"Testing Loss: {test_loss} | Testing Acc: {test_acc}"
+                if accelerator.is_main_process:
+                    progress_bar.write(logging_string)
+
+                ### Log and save the model ###
+                accelerator.log(log, step=completed_steps)
+                accelerator.save_state(os.path.join(path_to_experiment, f"checkpoint_{completed_steps}"))
+
+                push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+
+                ### Testing sentence ###
+                if accelerator.is_main_process:
+                    src_ids = src_ids.to(accelerator.device)
+                    unwrapped = accelerator.unwrap_model(model)
+                    translated = unwrapped.inference(src_ids,
+                                                     tgt_start_id=tgt_tokenizer.bos_token_id,
+                                                     tgt_end_id=tgt_tokenizer.eos_token_id, max_seq_length=config.max_seq_length)
+
+                    translated = tgt_tokenizer.decode(translated, skip_special_tokens=False)
+
+                    progress_bar.write(f"Translation: {translated}")
+
+                model.train()
+
+            if completed_steps >= training_steps:
+                train = False
+                accelerator.save_state(os.path.join(path_to_experiment, "final_checkpoint"))
+                push_model_HF(repo_id="ngia/ml-translation-en-fr", path_to_experiment=path_to_experiment, step=completed_steps)
+                break
+
+            ### Iterate completed steps ###
+            completed_steps += 1
+            progress_bar.update(1)
+
+            ### Reset accumulated variables ###
+            accumulate_loss = 0
+            accuracy = 0
+
+
+accelerator.end_training()
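The accumulation pattern above, distilled into a standalone sketch with a toy model and random data: the loss is divided by `gradient_accumulation_steps` so that gradients summed over micro-batches match one full batch, and the optimizer only steps once per accumulation window.

```python
import torch

model = torch.nn.Linear(4, 2)
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
loss_fn = torch.nn.CrossEntropyLoss()

gradient_accumulation_steps = 2
batches = [(torch.randn(32, 4), torch.randint(0, 2, (32,))) for _ in range(4)]

for step, (x, y) in enumerate(batches):
    loss = loss_fn(model(x), y) / gradient_accumulation_steps  # scale micro-batch loss
    loss.backward()                                            # gradients accumulate
    if (step + 1) % gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        opt.step()
        opt.zero_grad(set_to_none=True)
```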
utils.py
ADDED
@@ -0,0 +1,20 @@
+from huggingface_hub import list_repo_files, hf_hub_download
+
+
+def get_files_from_HF(repo_id, folder_name, local_dir):
+    files = list_repo_files(repo_id)
+
+    folder_files = [f for f in files if f.startswith(folder_name)]
+
+    for file in folder_files:
+        file_path = hf_hub_download(repo_id=repo_id, filename=file, local_dir=local_dir)
+        print(f"Downloaded: {file_path} to {local_dir}")
+
+
+def get_file_FROM_HF(repo_id, file_path, local_dir):
+    file_path = hf_hub_download(repo_id=repo_id, filename=file_path, local_dir=local_dir)
+    return file_path
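Example use, mirroring the call `model.load_weights_from_checkpoints` makes when the checkpoint is absent locally:

```python
from utils import get_file_FROM_HF

path = get_file_FROM_HF(repo_id="ngia/ml-translation-en-fr",
                        file_path="final_checkpoint/model.safetensors",
                        local_dir="checkpoints/")
print(path)  # local path to the downloaded safetensors file
```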