crpatel committed on
Commit ab3efd5 · 1 Parent(s): fb995a8

gradio app

Files changed (6)
  1. app.py +177 -0
  2. config_smollm2_135M.yaml +108 -0
  3. deepseek_v3.py +459 -0
  4. requirements.txt +14 -0
  5. train.py +417 -0
  6. utils.py +182 -0
app.py ADDED
@@ -0,0 +1,177 @@
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer
4
+ import yaml
5
+ from deepseek_v3 import DeepSeekV3Model
6
+ import os
7
+
8
+
9
+ def generate_helper(model, idx, max_new_tokens, context_length, temperature=1.0, top_k=None, eos_token=None, device=None):
10
+
11
+ model = model.to(device)
12
+ idx = idx.to(device)
13
+ model.eval()
14
+ for _ in range(max_new_tokens):
15
+ idx_cond = idx[:, -context_length:]
16
+ with torch.no_grad():
17
+ logits, _ = model(idx_cond) # Unpack both logits and loss (ignore loss)
18
+ logits = logits.view(idx_cond.shape[0], -1, model.config['vocab_size']) # Reshape to [batch, seq, vocab]
19
+
20
+ # Get the logits for the last token only
21
+ logits = logits[:, -1, :] # Shape: [batch_size, vocab_size]
22
+
23
+ if top_k is not None:
24
+ # top k sampling
25
+ top_logits, top_pos = torch.topk(logits, top_k)
26
+ min_logit = top_logits[:, -1].unsqueeze(-1)
27
+ logits = torch.where(logits < min_logit,
28
+ torch.tensor(float('-inf')).to(logits.device),
29
+ logits)
30
+
31
+ # temperature scaling
32
+ if temperature > 0.0:
33
+ logits /= temperature
34
+ probs = torch.softmax(logits, dim=-1)
35
+ idx_next = torch.multinomial(probs, num_samples=1)
36
+ else:
37
+ idx_next = torch.argmax(logits, dim=-1, keepdim=True)
38
+
39
+ if idx_next.item() == eos_token:
40
+ break
41
+
42
+ idx = torch.cat((idx, idx_next), dim=1)
43
+ model.train()
44
+ return idx
45
+
46
+ def get_config(config_path):
47
+ config = yaml.load(open(config_path, "r"), Loader=yaml.FullLoader)
48
+ return config
49
+
50
+ def extract_and_save_weights(config_path, checkpoint_path, weights_path, device):
51
+ """Extract model weights from checkpoint and save as a separate .pt file"""
52
+ print(f"Extracting weights from checkpoint: {checkpoint_path}")
53
+ config = get_config(config_path)
54
+ model = DeepSeekV3Model(config['model'])
55
+
56
+ # Load checkpoint
57
+ checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
58
+ state_dict = checkpoint['model_state_dict']
59
+ state_dict = {k.replace('_orig_mod.', ''): v for k, v in state_dict.items()}
60
+
61
+ # Save just the model weights
62
+ torch.save(state_dict, weights_path)
63
+ print(f"Model weights saved to: {weights_path}")
64
+ return state_dict
65
+
66
+ def load_weights(config, weights_path, device):
67
+ """Load model from weights file"""
68
+ print(f"Loading model from weights: {weights_path}")
69
+ model = DeepSeekV3Model(config['model'])
70
+ state_dict = torch.load(weights_path, map_location=torch.device(device))
71
+ model.load_state_dict(state_dict)
72
+ return model
73
+
74
+ def get_tokenizer(config):
75
+ tokenizer_path = config['tokenizer']['tokenizer_name_or_path']
76
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
77
+ tokenizer.pad_token = tokenizer.eos_token
78
+ vocab_size = tokenizer.vocab_size
79
+ return tokenizer, vocab_size
80
+
81
+ def generate_text(model, tokenizer, input_text, max_new_tokens, context_length, temperature, top_k, eos_token, device):
82
+ encoded_text = tokenizer.encode(input_text, return_tensors="pt").to(device)
83
+ generated_text = generate_helper(model,
84
+ idx=encoded_text,
85
+ max_new_tokens=max_new_tokens,
86
+ context_length=context_length,
87
+ temperature=temperature,
88
+ top_k=top_k,
89
+ eos_token=eos_token,
90
+ device=device)
91
+ return tokenizer.decode(generated_text.squeeze(0))
92
+
93
+
94
+
95
+ # Initialize model and tokenizer
96
+ def initialize_model():
97
+ config_path = "config_smollm2_135M.yaml"
98
+ # Use HF Hub or another external storage instead of local path
99
+ model_id = "crpatel/DeepSeek-V3-SmolLm2" # Replace with your actual model ID
100
+ weights_path = "model.pt"
101
+ device = "cuda" if torch.cuda.is_available() else "cpu"
102
+
103
+ # Load configuration
104
+ config = get_config(config_path)
105
+
106
+ # Check if weights exist locally, otherwise download from HF Hub
107
+ if not os.path.exists(weights_path):
108
+ try:
109
+ from huggingface_hub import hf_hub_download
110
+ print(f"Downloading model weights from Hugging Face Hub: {model_id}")
111
+ weights_path = hf_hub_download(
112
+ repo_id=model_id,
113
+ filename="model.pt"
114
+ )
115
+ except Exception as e:
116
+ print(f"Error downloading weights: {e}")
117
+ print("Falling back to local checkpoint extraction if available")
118
+ checkpoint_path = "checkpoints/model_100000_step_avg_loss_4.61663.pth"
119
+ if os.path.exists(checkpoint_path):
120
+ extract_and_save_weights(config_path, checkpoint_path, weights_path, device)
121
+ else:
122
+ raise FileNotFoundError("Neither weights file nor checkpoint found. Please upload model to HF Hub first.")
123
+
124
+ # Load model from weights
125
+ model = load_weights(config, weights_path, device)
126
+ model.to(device)
127
+ model.eval()
128
+
129
+ # Load tokenizer
130
+ tokenizer, vocab_size = get_tokenizer(config)
131
+
132
+ return model, tokenizer, device
133
+
134
+ def generate_response(prompt, max_new_tokens):
135
+ generated_text = generate_text(
136
+ model=model,
137
+ tokenizer=tokenizer,
138
+ input_text=prompt,
139
+ max_new_tokens=max_new_tokens,
140
+ context_length=256,
141
+ temperature=0.9,
142
+ top_k=2,
143
+ eos_token=tokenizer.eos_token_id,
144
+ device=device
145
+ )
146
+ return generated_text
147
+
148
+ # Initialize global variables
149
+ model, tokenizer, device = initialize_model()
150
+
151
+ # Create Gradio interface
152
+ iface = gr.Interface(
153
+ fn=generate_response,
154
+ inputs=[
155
+ gr.Textbox(
156
+ lines=3,
157
+ placeholder="Enter your prompt here...",
158
+ label="Input Prompt"
159
+ ),
160
+ gr.Slider(
161
+ minimum=50,
162
+ maximum=256,
163
+ value=100,
164
+ step=10,
165
+ label="Max New Tokens"
166
+ )
167
+ ],
168
+ outputs=gr.Textbox(
169
+ lines=5,
170
+ label="Generated Text"
171
+ ),
172
+ title="DeepSeek-V3 Text Generator",
173
+ description="Enter a prompt and adjust the maximum number of tokens to generate text with DeepSeek-V3 SmolLM2 model."
174
+ )
175
+
176
+ if __name__ == "__main__":
177
+ iface.launch()
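
A minimal sketch (not part of the commit) of driving the same helpers without the Gradio UI, assuming config_smollm2_135M.yaml and a local model.pt are available; the argument values mirror generate_response above:

config = get_config("config_smollm2_135M.yaml")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_weights(config, "model.pt", device)
model.to(device).eval()
tokenizer, _ = get_tokenizer(config)
print(generate_text(model, tokenizer, "Once upon a time", max_new_tokens=100,
                    context_length=256, temperature=0.9, top_k=2,
                    eos_token=tokenizer.eos_token_id, device=device))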
config_smollm2_135M.yaml ADDED
@@ -0,0 +1,108 @@
1
+ checkpoints:
2
+ checkpoint_interval: 2000
3
+ checkpoints_path: checkpoints
4
+ checkpoints_path_is_shared_file_system: false
5
+ resume_checkpoint_path: null
6
+ save_final_state: false
7
+ save_initial_state: false
8
+ data_stages:
9
+ - data:
10
+ dataset:
11
+ dataset_folder:
12
+ - datasets/smollm2-corpus
13
+ dataset_weights:
14
+ - 1.0
15
+ num_loading_workers: 0
16
+ seed: 8
17
+ name: stable phase
18
+ start_training_step: 1
19
+ general:
20
+ benchmark_csv_path: null
21
+ consumed_train_samples: null
22
+ ignore_sanity_checks: true
23
+ project: smollm2
24
+ run: smollm2-135M
25
+ seed: 8
26
+ step: null
27
+ logging:
28
+ iteration_step_info_interval: 1
29
+ log_level: info
30
+ log_level_replica: info
31
+ model:
32
+ ddp_bucket_cap_mb: 25
33
+ dtype: bfloat16
34
+ init_method:
35
+ std: 0.041666666666666664
36
+ make_vocab_size_divisible_by: 1
37
+ model_config:
38
+ bos_token_id: 0
39
+ eos_token_id: 0
40
+ hidden_act: silu
41
+ hidden_size: 576
42
+ initializer_range: 0.041666666666666664
43
+ intermediate_size: 1536
44
+ is_llama_config: true
45
+ max_position_embeddings: 2048
46
+ num_attention_heads: 9
47
+ num_hidden_layers: 30
48
+ num_key_value_heads: 3
49
+ pad_token_id: null
50
+ pretraining_tp: 1
51
+ rms_norm_eps: 1.0e-05
52
+ rope_interleaved: false
53
+ rope_scaling: null
54
+ rope_theta: 10000.0
55
+ tie_word_embeddings: true
56
+ use_cache: true
57
+ vocab_size: 49152
58
+ s3_bucket: deepseek-v3-train-mar-2025
59
+ s3_checkpoint_folder: checkpoints
60
+ s3_log_folder: logs
61
+ s3_log_file_name: training.log
62
+ # deepseek
63
+ compression_ratio: 4
64
+ num_experts: 4
65
+ num_shared_experts: 1
66
+ top_k: 2
67
+ optimizer:
68
+ accumulate_grad_in_fp32: true
69
+ clip_grad: 1.0
70
+ learning_rate_scheduler:
71
+ learning_rate: 0.003
72
+ lr_decay_starting_step: 1600000
73
+ lr_decay_steps: 400000
74
+ lr_decay_style: linear
75
+ lr_warmup_steps: 2000
76
+ lr_warmup_style: linear
77
+ min_decay_lr: 0
78
+ optimizer_factory:
79
+ adam_beta1: 0.9
80
+ adam_beta2: 0.95
81
+ adam_eps: 1.0e-08
82
+ name: adamW
83
+ torch_adam_is_fused: true
84
+ weight_decay: 0.01
85
+ zero_stage: 0
86
+ parallelism:
87
+ dp: 64
88
+ expert_parallel_size: 1
89
+ pp: 1
90
+ pp_engine: 1f1b
91
+ recompute_layer: false
92
+ tp: 1
93
+ tp_linear_async_communication: true
94
+ tp_mode: REDUCE_SCATTER
95
+ tp_recompute_allgather: true
96
+ profiler: null
97
+ tokenizer:
98
+ tokenizer_max_length: null
99
+ tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
100
+ tokenizer_revision: null
101
+ tokens:
102
+ batch_accumulation_per_replica: 1
103
+ limit_test_batches: 0
104
+ limit_val_batches: 0
105
+ micro_batch_size: 8
106
+ sequence_length: 512
107
+ train_steps: 2000000
108
+ val_check_interval: 1000
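
For reference, a small sketch (not part of the committed files) of the dimensions this model_config implies for MultiHeadLatentAttention in deepseek_v3.py:

hidden_size = 576
num_attention_heads = 9
compression_ratio = 4
head_dim = hidden_size // num_attention_heads   # 576 // 9 = 64
latent_dim = hidden_size // compression_ratio   # 576 // 4 = 144 (width of the compressed KV/Q projections)
rope_width = hidden_size // 2                   # 288 in total, i.e. head_dim // 2 = 32 per head for the RoPE half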
deepseek_v3.py ADDED
@@ -0,0 +1,459 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.nn import SiLU
5
+ import yaml
6
+
7
+
8
+ def _init_weights(module, std=0.041666666666666664):
9
+ if isinstance(module, nn.Linear):
10
+ module.weight.data.normal_(mean=0.0, std=std)
11
+ elif isinstance(module, nn.Embedding):
12
+ module.weight.data.normal_(mean=0.0, std=std)
13
+
14
+ class RotaryPositionalEmbedding(nn.Module):
15
+ """
16
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L240
17
+ Rotary Positional Embedding (RoPE) for transformers. Implementation derived from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
18
+ """
19
+ def __init__(self, dim: int, theta: float = 10000.0):
20
+ super().__init__()
21
+ self.dim = dim
22
+ self.theta = theta
23
+
24
+ def forward(self, x: torch.Tensor, seq_len: int) -> torch.Tensor:
25
+ """
26
+ Apply rotary positional embedding to the input tensor.
27
+
28
+ Args:
29
+ x (torch.Tensor): Input tensor of shape [B, T, H, D] or [B, T, D]
30
+ seq_len (int): Sequence length.
31
+
32
+ Returns:
33
+ torch.Tensor: Output tensor with rotary positional embeddings applied.
34
+ """
35
+ # Handle different input shapes
36
+ if len(x.shape) == 3:
37
+ B, T, D = x.shape
38
+ is_4d = False
39
+ else:
40
+ B, T, H, D = x.shape
41
+ is_4d = True
42
+
43
+ # For 3D tensors, we need to ensure D is even
44
+ if not is_4d and D % 2 != 0:
45
+ raise ValueError(f"Feature dimension {D} must be divisible by 2 for RoPE")
46
+
47
+ # Generate position indices
48
+ position = torch.arange(T, dtype=torch.float32, device=x.device).unsqueeze(-1)
49
+
50
+ # Generate frequencies
51
+ if is_4d:
52
+ # For 4D tensors, use the head dimension
53
+ freqs = torch.exp(
54
+ torch.arange(0, D, 2, dtype=torch.float32, device=x.device) *
55
+ -(torch.log(torch.tensor(self.theta)) / D)
56
+ )
57
+ else:
58
+ # For 3D tensors, use the full dimension
59
+ freqs = torch.exp(
60
+ torch.arange(0, D, 2, dtype=torch.float32, device=x.device) *
61
+ -(torch.log(torch.tensor(self.theta)) / D)
62
+ )
63
+
64
+ # Compute sinusoids
65
+ sinusoid = position * freqs
66
+ sin = torch.sin(sinusoid)
67
+ cos = torch.cos(sinusoid)
68
+
69
+ # Reshape sin and cos to match the input tensor's shape
70
+ if is_4d:
71
+ sin = sin.unsqueeze(0).unsqueeze(2) # Shape: (1, T, 1, D // 2)
72
+ cos = cos.unsqueeze(0).unsqueeze(2) # Shape: (1, T, 1, D // 2)
73
+ else:
74
+ sin = sin.unsqueeze(0) # Shape: (1, T, D // 2)
75
+ cos = cos.unsqueeze(0) # Shape: (1, T, D // 2)
76
+
77
+ # Apply rotary embeddings
78
+ x_rotated = x.clone()
79
+
80
+ if is_4d:
81
+ x_rotated[..., 0::2] = x[..., 0::2] * cos - x[..., 1::2] * sin
82
+ x_rotated[..., 1::2] = x[..., 1::2] * cos + x[..., 0::2] * sin
83
+ else:
84
+ x_rotated[..., 0::2] = x[..., 0::2] * cos - x[..., 1::2] * sin
85
+ x_rotated[..., 1::2] = x[..., 1::2] * cos + x[..., 0::2] * sin
86
+
87
+ return x_rotated
88
+
89
+ class MultiHeadLatentAttention(nn.Module):
90
+ def __init__(self, config):
91
+ super().__init__()
92
+ self.config = config
93
+ self.num_attention_heads = self.config['num_attention_heads']
94
+ self.hidden_size = self.config['hidden_size']
95
+ # Ensure the hidden size is divisible by the number of attention heads
96
+ if self.hidden_size % self.num_attention_heads != 0:
97
+ raise ValueError(
98
+ f"hidden_size ({self.hidden_size}) must be divisible by num_attention_heads ({self.num_attention_heads})"
99
+ )
100
+ self.head_dim = self.hidden_size // self.num_attention_heads
101
+ self.latent_dim = self.hidden_size // self.config['compression_ratio']
102
+
103
+ # Matrix is decomposed into D and U matrix
104
+ # Compression KV Projection Matrix
105
+ self.kv_proj_D = nn.Linear(self.hidden_size, self.latent_dim, bias=False)
106
+ # Compression Q Projection Matrix
107
+ self.q_proj_D = nn.Linear(self.hidden_size, self.latent_dim, bias=False)
108
+
109
+ # Decompression (up-projection) matrix for K
110
+ self.k_proj_U = nn.Linear(self.latent_dim, self.hidden_size//2, bias=False)
111
+ # Decompression (up-projection) matrix for V
112
+ self.v_proj_U = nn.Linear(self.latent_dim, self.hidden_size, bias=False)
113
+ # Decompression (up-projection) matrix for Q
114
+ self.q_proj_U = nn.Linear(self.latent_dim, self.hidden_size//2, bias=False)
115
+
116
+ # RoPE components: K is built from x and Q is built from q_proj_D
117
+ self.rope_k = nn.Linear(self.hidden_size, self.hidden_size//2, bias=False)
118
+ self.rope_q = nn.Linear(self.latent_dim, self.hidden_size//2, bias=False)
119
+ # output projection matrix
120
+ self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
121
+
122
+ self.rotary_emb = RotaryPositionalEmbedding(self.hidden_size//2, self.config['rope_theta'])
123
+
124
+ def forward(self, x, attn_mask=None):
125
+ B, T, C = x.size() # Batch Size, Sequence Length, Hidden Size
126
+ # Compression KV Projection Matrix
127
+ kv_d = self.kv_proj_D(x) # [B, T, Latent Dim]
128
+ # Compression Q Projection Matrix
129
+ q_d = self.q_proj_D(x) # [B, T, Latent Dim]
130
+ # Uncompress KV & Q Projection Matrix
131
+ k_proj_2 = self.k_proj_U(kv_d) # [B, T, Hidden Size//2]
132
+ q_proj_2 = self.q_proj_U(q_d) # [B, T, Hidden Size//2]
133
+ v = self.v_proj_U(kv_d) # [B, T, Hidden Size]
134
+
135
+ # Rope components
136
+ k_rope_2 = self.rope_k(x) # [B, T, Hidden Size//2]
137
+ q_rope_2 = self.rope_q(q_d) # [B, T, Hidden Size//2]
138
+
139
+ # Apply ROPE to the rope components
140
+ k_rope_2 = self.rotary_emb(k_rope_2, T) # [B, T, Hidden Size//2]
141
+ q_rope_2 = self.rotary_emb(q_rope_2, T) # [B, T, Hidden Size//2]
142
+
143
+ # Reshape Components for Multi-Head Attention
144
+ k_proj_2 = k_proj_2.view(B, T, self.num_attention_heads, self.head_dim//2)
145
+ k_rope_2 = k_rope_2.view(B, T, self.num_attention_heads, self.head_dim//2)
146
+ q_proj_2 = q_proj_2.view(B, T, self.num_attention_heads, self.head_dim//2)
147
+ q_rope_2 = q_rope_2.view(B, T, self.num_attention_heads, self.head_dim//2)
148
+
149
+ # Concatenate Components
150
+ k = torch.cat((k_proj_2, k_rope_2), dim=-1) # [B, T, H, D]
151
+ q = torch.cat((q_proj_2, q_rope_2), dim=-1) # [B, T, H, D]
152
+ v = v.view(B, T, self.num_attention_heads, self.head_dim)
153
+
154
+ # Reshape Components for Multi-Head Attention
155
+ k = k.transpose(1, 2) # [B, H, T, D]
156
+ q = q.transpose(1, 2) # [B, H, T, D]
157
+ v = v.transpose(1, 2) # [B, H, T, D]
158
+
159
+ # Apply Scaled Dot-Product Attention
160
+ attn_out = F.scaled_dot_product_attention(q, k, v,
161
+ dropout_p=0.0,
162
+ is_causal=True,
163
+ attn_mask=attn_mask)
164
+ attn_out = attn_out.transpose(1, 2).contiguous().view(B, T, C) # [B, T, C]
165
+ return self.o_proj(attn_out) # [B, T, C]
166
+
167
+ class DeepSeekExpertLayer(nn.Module):
168
+ def __init__(self, hidden_size, intermediate_size):
169
+ super().__init__()
170
+ self.hidden_size = hidden_size
171
+ self.intermediate_size = intermediate_size
172
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
173
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
174
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
175
+ self.act_fn = SiLU()
176
+
177
+ def forward(self, x):
178
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
179
+
180
+ class DeepSeekMOE(nn.Module):
181
+ """
182
+ A Mixture of Experts (MoE) layer that routes input through a set of expert layers.
183
+
184
+ This class implements a mixture of experts mechanism where a subset of experts is selected
185
+ for each input token based on learned routing logits. The output is a combination of the
186
+ shared experts and the routed experts, allowing for efficient computation and increased
187
+ model capacity.
188
+
189
+ Attributes:
190
+ hidden_size (int): The size of the hidden layer.
191
+ intermediate_size (int): The size of the intermediate layer.
192
+ num_experts (int): Total number of experts available.
193
+ num_shared_experts (int): Number of shared experts that are used for all inputs.
194
+ top_k (int): The number of top experts to route each input to.
195
+ shared_experts (nn.ModuleList): List of shared expert layers.
196
+ routed_experts (nn.ModuleList): List of routed expert layers.
197
+ routing_fn (nn.Linear): Linear layer for computing routing logits.
198
+ routing_bias (nn.Parameter): Bias for the routing logits.
199
+
200
+ Methods:
201
+ forward(x): Forward pass through the MoE layer, routing input through selected experts.
202
+ """
203
+ def __init__(self, hidden_size, intermediate_size, num_experts, num_shared_experts, top_k):
204
+ super().__init__()
205
+ self.hidden_size = hidden_size
206
+ self.intermediate_size = intermediate_size
207
+ self.num_experts = num_experts
208
+ self.num_shared_experts = num_shared_experts
209
+ self.top_k = top_k
210
+ self.num_routed_experts = num_experts - num_shared_experts
211
+ self.shared_experts = nn.ModuleList(
212
+ [DeepSeekExpertLayer(self.hidden_size, self.intermediate_size) for _ in range(self.num_shared_experts)]
213
+ )
214
+ self.routed_experts = nn.ModuleList(
215
+ [DeepSeekExpertLayer(self.hidden_size, self.intermediate_size) for _ in range(self.num_routed_experts)]
216
+ )
217
+
218
+ # Routing Function
219
+ self.routing_fn = nn.Linear(self.hidden_size, self.num_routed_experts, bias=False)
220
+ self.routing_bias = nn.Parameter(torch.zeros(self.num_routed_experts))
221
+ def forward(self, x):
222
+ B, T, C = x.size()
223
+ shared_out = sum(expert(x) for expert in self.shared_experts)
224
+ if self.num_shared_experts>1:
225
+ shared_out = shared_out/self.num_shared_experts # normalize the shared experts
226
+ # calculate the routing function
227
+ routing_logits = self.routing_fn(x) + self.routing_bias # [B, T, num_routed_experts]
228
+ # Get top-k experts per token
229
+ routing_probs = torch.sigmoid(routing_logits) # [B, T, num_routed_experts]
230
+ scores, indices = torch.topk(routing_probs, self.top_k, dim=-1) # [B, T, top_k]
231
+ # normalize the top k scores
232
+ scores = scores/torch.sum(scores, dim=-1, keepdim=True)
233
+ # process the routed experts
234
+ #combined_output = torch.zeros(B, T, C, device=x.device)
235
+ combined_output = torch.zeros_like(x)
236
+
237
+ # Calculate expert load for all experts
238
+ expert_load = torch.zeros(self.num_routed_experts, device=x.device)
239
+ for i in range(self.top_k):
240
+ expert_idx = indices[:, :, i] # [B, T]
241
+
242
+ expert_scores = scores[...,i:i+1]
243
+ # process the routed experts
244
+ for j in range(self.num_routed_experts):
245
+ mask = (expert_idx == j) # [B, T] boolean mask
246
+ if mask.any():
247
+ # Track expert usage (load)
248
+ expert_load[j] += mask.sum().float() / (B * T * self.top_k)
249
+ # Process tokens through this expert
250
+ expert_input = x[mask] # [num_selected_tokens, C]
251
+ expert_output = self.routed_experts[j](expert_input)
252
+ combined_output[mask] += expert_scores[mask] * expert_output
253
+ final_output = shared_out + combined_output
254
+ router_z_loss = self.update_bias_terms(expert_load)
255
+ return final_output, router_z_loss
256
+
257
+ def update_bias_terms(self, expert_load, router_z_loss_coef=0.001):
258
+ # Balance expert routing by adjusting the bias terms
259
+ # Target load is uniform distribution across experts
260
+ target_load = 1.0 / self.num_routed_experts
261
+
262
+ # Calculate load imbalance for each expert
263
+ load_diff = expert_load - target_load
264
+
265
+ # Dynamic update rate based on the magnitude of imbalance
266
+ # Larger imbalances get larger corrections
267
+ update_rate = 0.1 * torch.abs(load_diff)
268
+
269
+ # Update the routing bias to counteract imbalance
270
+ # Decrease bias for overutilized experts, increase for underutilized
271
+ self.routing_bias.data -= update_rate * load_diff
272
+
273
+ # Calculate the router z-loss to discourage extreme routing probabilities
274
+ # This helps stabilize training without auxiliary losses
275
+ # Z-loss encourages routing probabilities to stay away from 0 and 1
276
+ router_z_loss = router_z_loss_coef * torch.mean(torch.log(torch.sum(
277
+ torch.exp(self.routing_fn.weight), dim=-1)))
278
+
279
+ return router_z_loss
280
+
281
+ def update_bias_terms_old(self, expert_load):
282
+ # adjust the bias terms based on the expert load
283
+ target_load = 1/self.num_experts
284
+ load_diff = expert_load - target_load
285
+ # dynamically update the bias based on the load imbalance
286
+ update_rate = 0.1 * torch.abs(load_diff)
287
+ # dynamically update the bias terms using the update rate
288
+ self.routing_bias = self.routing_bias - update_rate * load_diff
289
+
290
+ # for i in range(self.num_routed_experts):
291
+ # if expert_load[i] < target_load:
292
+ # self.routing_bias[i] -= 1
293
+ # else:
294
+ # self.routing_bias[i] += 1
295
+ class LlamaMLP(nn.Module):
296
+ """
297
+ (mlp): LlamaMLP(
298
+ (moe): DeepSeekMOE(
299
+ (shared_experts): ModuleList(
300
+ (0): DeepSeekExpertLayer(
301
+ (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
302
+ (up_proj): Linear(in_features=576, out_features=1536, bias=False)
303
+ (down_proj): Linear(in_features=1536, out_features=576, bias=False)
304
+ (act_fn): SiLU()
305
+ )
306
+ )
307
+ (routed_experts): ModuleList(
308
+ (0-2): 3 x DeepSeekExpertLayer(
309
+ (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
310
+ (up_proj): Linear(in_features=576, out_features=1536, bias=False)
311
+ (down_proj): Linear(in_features=1536, out_features=576, bias=False)
312
+ (act_fn): SiLU()
313
+ )
314
+ )
315
+ (routing_fn): Linear(in_features=576, out_features=3, bias=False)
316
+ )
317
+ )
318
+ """
319
+ def __init__(self, config):
320
+ super().__init__()
321
+ self.config = config
322
+ self.moe = DeepSeekMOE(hidden_size=config['hidden_size'],
323
+ intermediate_size=config['intermediate_size'],
324
+ num_experts=config['num_experts'],
325
+ num_shared_experts= config['num_shared_experts'],
326
+ top_k=config['top_k'])
327
+ # self.gate_proj = nn.Linear(self.config['hidden_size'], self.config['intermediate_size'], bias=False)
328
+ # self.up_proj = nn.Linear(self.config['hidden_size'], self.config['intermediate_size'], bias=False)
329
+ # self.down_proj = nn.Linear(self.config['intermediate_size'], self.config['hidden_size'], bias=False)
330
+ # self.act_fn = SiLU()
331
+ def forward(self, x):
332
+ output, router_z_loss = self.moe(x)
333
+ return output, router_z_loss
334
+ # gate = self.gate_proj(x)
335
+ # up = self.up_proj(x)
336
+ # down = self.down_proj(self.act_fn(gate)*up)
337
+ # return down
338
+
339
+ class LlamaRMSNorm(nn.Module):
340
+ """
341
+ (norm): LlamaRMSNorm((576,), eps=1e-05)
342
+ # RMSNorm Formula:
343
+ # RMS(x) = sqrt((1 / d) * sum(x_i^2 for i in range(d)))
344
+ # x_normalized = x / RMS(x)
345
+ # output = gamma * x_normalized
346
+
347
+ """
348
+ def __init__(self, config):
349
+ super().__init__()
350
+ self.config = config
351
+ self.eps = self.config['rms_norm_eps']
352
+ self.weight = nn.Parameter(torch.ones(self.config['hidden_size']))
353
+ def forward(self, x):
354
+ rms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)
355
+ return self.weight * rms * x
356
+
357
+ class LlamaDecoderLayer(nn.Module):
358
+ def __init__(self, config):
359
+ super().__init__()
360
+ self.config = config
361
+ self.self_attn = MultiHeadLatentAttention(self.config)
362
+ self.input_layernorm = LlamaRMSNorm(self.config)
363
+ self.mlp = LlamaMLP(self.config)
364
+ self.post_attention_layernorm = LlamaRMSNorm(self.config)
365
+
366
+ def forward(self, x):
367
+ residual = x
368
+ x = self.input_layernorm(x)
369
+ x = self.self_attn(x)
370
+ x = x + residual
371
+ residual = x
372
+ x = self.post_attention_layernorm(x)
373
+ x, router_z_loss = self.mlp(x)
374
+ x = x + residual
375
+ return x, router_z_loss
376
+
377
+ class DeepSeekV3Model(nn.Module):
378
+ def __init__(self, config):
379
+ super().__init__()
380
+ self.init_method = config['init_method']
381
+ self.config = config['model_config']
382
+ self.embed_tokens = nn.Embedding(self.config['vocab_size'], self.config['hidden_size'])
383
+ self.rotary_emb = RotaryPositionalEmbedding(self.config['hidden_size'], self.config['rope_theta'])
384
+ self.layers = nn.ModuleList([LlamaDecoderLayer(self.config) for _ in range(self.config['num_hidden_layers'])])
385
+ self.norm = LlamaRMSNorm(self.config)
386
+ self.lm_head = nn.Linear(self.config['hidden_size'], self.config['vocab_size'], bias=False)
387
+
388
+ if self.config['tie_word_embeddings']:
389
+ self.lm_head.weight = self.embed_tokens.weight
390
+
391
+ self.apply(lambda m: _init_weights(m, self.init_method['std']))
392
+
393
+ def forward(self, x, y=None):
394
+ x = self.embed_tokens(x)
395
+ total_router_z_loss = 0.0
396
+ for layer in self.layers:
397
+ x, router_z_loss = layer(x)
398
+ total_router_z_loss += router_z_loss
399
+ x = self.norm(x)
400
+ logits = self.lm_head(x) # B,T,V
401
+ logits = logits.view(-1, logits.size(-1)) # Shape: [B*T, V] # 20, 49152
402
+ if y is not None:
403
+ y = y.view(-1) # Shape: [B*T] # 20
404
+ ce_loss = torch.nn.functional.cross_entropy(logits, y)
405
+ # Combine CE loss with router z-loss
406
+ loss = ce_loss + total_router_z_loss
407
+ return logits, loss
408
+ else:
409
+ return logits, None
410
+
411
+
412
+ def generate(self, idx, max_new_tokens, context_length, temperature=1.0, top_k=None, eos_token=None, device=None):
413
+ model = self.to(device)
414
+ idx = idx.to(device)
415
+ model.eval()
416
+ for _ in range(max_new_tokens):
417
+ idx_cond = idx[:, -context_length:]
418
+ with torch.no_grad():
419
+ logits, _ = model(idx_cond) # Unpack both logits and loss (ignore loss)
420
+ logits = logits.view(idx_cond.shape[0], -1, model.config['vocab_size']) # Reshape to [batch, seq, vocab]
421
+
422
+ # Get the logits for the last token only
423
+ logits = logits[:, -1, :] # Shape: [batch_size, vocab_size]
424
+
425
+ if top_k is not None:
426
+ # top k sampling
427
+ top_logits, top_pos = torch.topk(logits, top_k)
428
+ min_logit = top_logits[:, -1].unsqueeze(-1)
429
+ logits = torch.where(logits < min_logit,
430
+ torch.tensor(float('-inf')).to(logits.device),
431
+ logits)
432
+
433
+ # temperature scaling
434
+ if temperature > 0.0:
435
+ logits /= temperature
436
+ probs = torch.softmax(logits, dim=-1)
437
+ idx_next = torch.multinomial(probs, num_samples=1)
438
+ else:
439
+ idx_next = torch.argmax(logits, dim=-1, keepdim=True)
440
+
441
+ if idx_next.item() == eos_token:
442
+ break
443
+
444
+ idx = torch.cat((idx, idx_next), dim=1)
445
+ model.train()
446
+ return idx
447
+
448
+ # if __name__ == "__main__":
449
+ # torch.manual_seed(0)
450
+ # config = yaml.load(open("config_smollm2_135M.yaml", "r"), Loader=yaml.FullLoader)
451
+ # print(config.keys())
452
+ # model_config = config['model']['model_config']
453
+ # print(model_config)
454
+ # model = DeepSeekV3Model(config['model'])
455
+ # x_tokens = torch.randint(0, model_config['vocab_size'], (1, 10)) # Generate random token indices
456
+ # print(model(x_tokens).shape)
457
+ # total_params = sum(p.numel() for p in model.parameters())
458
+ # print(f"Total parameters: {total_params}") #134515008
459
+ # print(model)
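
A quick smoke-test sketch, adapted from the commented-out block above; note the model returns a (logits, loss) tuple and the logits come back flattened to [B*T, vocab_size], so they must be unpacked:

config = yaml.load(open("config_smollm2_135M.yaml", "r"), Loader=yaml.FullLoader)
model = DeepSeekV3Model(config['model'])
x_tokens = torch.randint(0, config['model']['model_config']['vocab_size'], (1, 10))
logits, loss = model(x_tokens)   # loss is None when no targets are passed
print(logits.shape)              # torch.Size([10, 49152])
print(sum(p.numel() for p in model.parameters()))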
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ torch
2
+ torchtext
3
+ pandas
4
+ numpy==1.26.1
5
+ matplotlib
6
+ tiktoken
7
+ tensorflow>=2.15.0
8
+ tqdm
9
+ # urllib
10
+ requests
11
+ boto3
12
+ datasets
13
+ transformers
14
+ gradio
train.py ADDED
@@ -0,0 +1,417 @@
1
+ from deepseek_v3 import DeepSeekV3Model
2
+ import torch
3
+ import yaml
4
+ from transformers import AutoTokenizer
5
+ # from gptdataloader import GPTDataLoader
6
+ from torch.utils.data import DataLoader
7
+ import numpy as np
8
+ from datasets import load_dataset
9
+ import logging
10
+ import math
11
+
12
+ from utils import upload_file_to_s3
13
+ # At the start of training loop
14
+ # print(f"GPU Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
15
+ # print(f"GPU Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
20
+ file_handler = logging.FileHandler('training.log')
21
+ file_handler.setFormatter(formatter) # Set formatter on the handler, not the logger
22
+ logger.addHandler(file_handler)
23
+ logger.setLevel(logging.INFO)
24
+
25
+ def encode_text(examples, tokenizer, seq_length):
26
+ """Tokenize and prepare text examples for training."""
27
+ tokens = tokenizer(
28
+ examples["text"],
29
+ truncation=True,
30
+ padding="max_length",
31
+ max_length=seq_length + 1,
32
+ return_tensors="pt",
33
+ )
34
+ # Use clone().detach() as recommended
35
+ input_ids = tokens["input_ids"].squeeze(0).clone().detach()
36
+ input_ids = torch.clamp(input_ids, min=0, max=tokenizer.vocab_size - 1)
37
+ labels = input_ids.clone().detach()
38
+ labels = labels[1:].to(torch.int64)
39
+ input_ids = input_ids[:-1].to(torch.int64)
40
+
41
+ return {"input_ids": input_ids, "labels": labels}
42
+
43
+ def load_cosmopedia_dataset(batch_size=8, seq_length=1024, tokenizer=None):
44
+ """
45
+ Returns a torch dataloader for the cosmopedia dataset
46
+ """
47
+ # Set tokenizer parallelism explicitly
48
+ import os
49
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
50
+ logger.info("tokenizer parallelism set to false")
51
+ try:
52
+ # Increase timeout and retries for dataset loading
53
+ from datasets import config
54
+ config.HF_DATASETS_TIMEOUT = 300 # 5 minutes timeout
55
+ config.MAX_RETRIES = 10 # Increase retry attempts
56
+ logger.info("dataset loading config set")
57
+ train_dataset = load_dataset(
58
+ "HuggingFaceTB/smollm-corpus",
59
+ name="cosmopedia-v2",
60
+ split="train",
61
+ streaming=True,
62
+ )
63
+ logger.info("dataset loaded")
64
+
65
+ # Use partial to bind tokenizer and seq_length to the encode function
66
+ from functools import partial
67
+ encode_fn = partial(encode_text, tokenizer=tokenizer, seq_length=seq_length)
68
+
69
+ train_dataset = train_dataset.map(
70
+ encode_fn,
71
+ remove_columns=["text"],
72
+ batched=False
73
+ )
74
+ train_dataset = train_dataset.with_format("torch")
75
+
76
+ train_dataloader = DataLoader(
77
+ train_dataset,
78
+ batch_size=batch_size,
79
+ num_workers=2,
80
+ pin_memory=True,
81
+ prefetch_factor=4,
82
+ persistent_workers=True
83
+ )
84
+ return train_dataloader
85
+ except Exception as e:
86
+ logger.error(f"Error loading dataset: {str(e)}")
87
+ return None
88
+
89
+ # def create_dataloader(file_path, tokenizer, context_size, stride):
90
+ # with open(file_path, "r") as file:
91
+ # text_data = file.read()
92
+ # total_characters = len(text_data)
93
+ # total_tokens = len(tokenizer.encode(text_data))
94
+ # logger.info(f"Characters: {total_characters}")
95
+ # logger.info(f"Tokens: {total_tokens}")
96
+
97
+ # # create dataloader
98
+ # train_ratio = 0.9
99
+ # val_ratio = 0.1
100
+ # split_idx = int(train_ratio * total_characters)
101
+
102
+ # train_data = text_data[:split_idx]
103
+
104
+ # valid_data = text_data[split_idx:]
105
+
106
+ # train_dataset = GPTDataLoader(train_data, tokenizer, context_size, stride)
107
+ # valid_dataset = GPTDataLoader(valid_data, tokenizer, context_size, stride)
108
+ # return DataLoader(train_dataset, batch_size=10, shuffle=True, drop_last=True), DataLoader(valid_dataset, batch_size=10, shuffle=False, drop_last=True)
109
+
110
+
111
+
112
+
113
+ # def calculate_loss_batch(input_batch, target_batch, model, device):
114
+ # input_batch = input_batch.to(device)
115
+ # target_batch = target_batch.to(device)
116
+
117
+ # logits, loss = model(input_batch, target_batch) # e.g. 10, 32, 49152
118
+ # logits = logits.view(-1, logits.size(-1)) # Shape: [320, 49152]
119
+ # target_batch = target_batch.view(-1) # Shape: [320]
120
+ # loss = torch.nn.functional.cross_entropy(logits, target_batch)
121
+ # return loss
122
+
123
+ # def calc_loss_loader(data_loader, model, device, num_batches=None):
124
+ # total_loss = 0.0
125
+ # if len(data_loader) == 0:
126
+ # return float("nan")
127
+ # elif num_batches is None:
128
+ # num_batches = len(data_loader)
129
+ # else:
130
+ # num_batches = min(num_batches, len(data_loader))
131
+ # for i, (input_batch, target_batch) in enumerate(data_loader):
132
+ # if i < num_batches:
133
+ # loss = calculate_loss_batch(input_batch, target_batch, model, device)
134
+ # total_loss += loss.item()
135
+ # else:
136
+ # break
137
+ # return total_loss / num_batches
138
+
139
+ # def evaluate_model(model, train_dataloader, valid_dataloader, device, eval_iter=100):
140
+ # model.eval()
141
+ # with torch.no_grad():
142
+ # train_loss = calc_loss_loader(train_dataloader, model, device, num_batches=eval_iter)
143
+ # valid_loss = calc_loss_loader(valid_dataloader, model, device, num_batches=eval_iter)
144
+ # model.train()
145
+ # return train_loss, valid_loss
146
+
147
+ def generate(model, idx, max_new_tokens, context_length, temperature=1.0, top_k=None, eos_token=None, device=None):
148
+ logger.info(f"Generating on device {device}")
149
+ model = model.to(device)
150
+ idx = idx.to(device)
151
+ model.eval()
152
+ for _ in range(max_new_tokens):
153
+ idx_cond = idx[:, -context_length:]
154
+ with torch.no_grad():
155
+ logits, _ = model(idx_cond) # Unpack both logits and loss (ignore loss)
156
+ logits = logits.view(idx_cond.shape[0], -1, model.config['vocab_size']) # Reshape to [batch, seq, vocab]
157
+
158
+ # Get the logits for the last token only
159
+ logits = logits[:, -1, :] # Shape: [batch_size, vocab_size]
160
+
161
+ if top_k is not None:
162
+ # top k sampling
163
+ top_logits, top_pos = torch.topk(logits, top_k)
164
+ min_logit = top_logits[:, -1].unsqueeze(-1)
165
+ logits = torch.where(logits < min_logit,
166
+ torch.tensor(float('-inf')).to(logits.device),
167
+ logits)
168
+
169
+ # temperature scaling
170
+ if temperature > 0.0:
171
+ logits /= temperature
172
+ probs = torch.softmax(logits, dim=-1)
173
+ idx_next = torch.multinomial(probs, num_samples=1)
174
+ else:
175
+ idx_next = torch.argmax(logits, dim=-1, keepdim=True)
176
+
177
+ if idx_next.item() == eos_token:
178
+ break
179
+
180
+ idx = torch.cat((idx, idx_next), dim=1)
181
+ model.train()
182
+ return idx
183
+
184
+ def sync_device(device):
185
+ if device.startswith('cuda'):
186
+ torch.cuda.synchronize()
187
+ elif device == 'cpu':
188
+ torch.cpu.synchronize() if hasattr(torch.cpu, 'synchronize') else None
189
+ elif device.startswith('mps'): # For Apple Silicon
190
+ torch.mps.synchronize()
191
+
192
+ def print_gpu_memory(step_name=""):
193
+ """
194
+ Print GPU memory statistics with a specified step name
195
+ """
196
+ if torch.cuda.is_available():
197
+ logger.info(f"\nGPU Memory Stats {step_name}:")
198
+ logger.info(f"GPU Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
199
+ logger.info(f"GPU Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
200
+ logger.info(f"Max GPU Memory allocated: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB")
201
+
202
+ # Learning rate scheduler
203
+ def get_lr_lambda(current_step, warmup_steps, max_steps, max_lr):
204
+ """
205
+ Modified learning rate scheduler with:
206
+ 1. Linear warmup for first 3000 steps
207
+ 2. Cosine decay from 3000 to 60000 steps
208
+ 3. Minimum learning rate of 1.5e-5 (5% of max_lr)
209
+ """
210
+ min_lr = max_lr * 0.05 # Minimum learning rate (5% of max_lr)
211
+
212
+ if current_step < warmup_steps:
213
+ # Linear warmup from 0 to max_lr
214
+ return float(current_step) / float(max(1, warmup_steps))
215
+ else:
216
+ # Cosine decay from max_lr to min_lr, returned as a multiplier of max_lr for LambdaLR (progress clamped at 1.0)
217
+ progress = min(float(current_step - warmup_steps) / float(max(1, max_steps - warmup_steps)), 1.0)
218
+ return (min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))) / max_lr
219
+
220
+
221
+ def train_model(config, model, train_loader, test_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context="Jack Gisburn rather a cheap genius- ", tokenizer=None):
222
+ total_loss = 0
223
+ tokens_seen, global_step = 0, -1
224
+
225
+ # Adjusted gradient accumulation setup for batch size 8
226
+ actual_batch_size = config['tokens']['micro_batch_size'] # Now 8
227
+ effective_batch_size_multiplier = 1 # Adjusted for batch size 8
228
+ target_batch_size = effective_batch_size_multiplier * config['tokens']['micro_batch_size']
229
+ gradient_accumulation_steps = target_batch_size // actual_batch_size
230
+
231
+ # Learning rate parameters adjusted for batch size 8
232
+ max_lr = 3e-4 # Keep the same max learning rate
233
+ warmup_steps = 3000 # Keep warmup steps
234
+ max_steps = 60000 # Keep max steps
235
+ min_lr = max_lr * 0.05 # Keep minimum LR at 5% of max
236
+
237
+ # Create LambdaLR scheduler with the improved lambda function
238
+ lr_lambda = lambda step: get_lr_lambda(step, warmup_steps, max_steps, max_lr)
239
+ scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
240
+
241
+ logger.info(f"Training with learning rate schedule:")
242
+ logger.info(f"Max LR: {max_lr}")
243
+ logger.info(f"Warmup Steps: {warmup_steps}")
244
+ logger.info(f"Max Steps: {max_steps}")
245
+ logger.info(f"Min LR: {max_lr * 0.05}")
246
+ logger.info(f"Gradient Accumulation Steps: {gradient_accumulation_steps}")
247
+ logger.info(f"Effective Batch Size: {actual_batch_size * gradient_accumulation_steps}")
248
+
249
+ print_gpu_memory("at start of training")
250
+
251
+ # Add these near the start of training loop
252
+ torch.cuda.empty_cache()
253
+ torch.backends.cudnn.benchmark = True
254
+ for epoch in range(num_epochs):
255
+ model.train()
256
+ optimizer.zero_grad() # Zero gradients at start of epoch
257
+
258
+ for batch_idx, batch in enumerate(train_loader):
259
+ input_batch = batch['input_ids'].to(device)
260
+ target_batch = batch['labels'].to(device)
261
+
262
+ # Forward pass
263
+ with torch.autocast(device_type=device, dtype=torch.bfloat16):
264
+ logits, original_loss = model(input_batch, target_batch)
265
+
266
+ # Scale loss for gradient accumulation
267
+ scaled_loss = original_loss / gradient_accumulation_steps
268
+ scaled_loss.backward()
269
+
270
+ # Add the original loss to total_loss for logging
271
+ total_loss += original_loss.item() # Don't multiply back up
272
+ tokens_seen += input_batch.numel()
273
+
274
+ # Calculate running average loss
275
+ total_batches = batch_idx + 1
276
+ avg_loss = total_loss / total_batches
277
+ if batch_idx % 25 == 0:
278
+ logger.info(f"Batch {batch_idx + 1}, Running Avg Loss: {avg_loss:.5f}")
279
+ # Only update weights after accumulating gradients
280
+ if (batch_idx + 1) % gradient_accumulation_steps == 0:
281
+ # Gradient clipping
282
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
283
+
284
+ optimizer.step()
285
+ scheduler.step() # Update learning rate
286
+ optimizer.zero_grad()
287
+ global_step += 1
288
+
289
+ # Evaluation block
290
+ if global_step % eval_freq == 0 and global_step > 0:
291
+ # Use total batches processed instead of global_step
292
+ current_lr = scheduler.get_last_lr()[0]
293
+ optimizer_lr = optimizer.param_groups[0]['lr']
294
+
295
+ print_gpu_memory(f"at step {global_step}")
296
+ logger.info(f"learning rate: {current_lr:.8f}")
297
+ logger.info(f"Ep {epoch+1} (Step {global_step:06d}): "
298
+ f"Avg loss {avg_loss:.3f} | {tokens_seen} tokens seen")
299
+ logger.info(f"optimizer lr: {optimizer_lr:.8f}")
300
+ logger.info(f"scheduler lr: {current_lr:.8f}")
301
+
302
+ # Generate sample text
303
+ start_context_list = ["In today's ever-evolving world, technology has become an integral part of our lives","Once upon a time, there was a friendly agency called Gaudette Insurance Agency, Inc. They help","A couple of years ago, I was working as an extra on the set of a low-budget British film.","Introduction: The Art of Crafting Vegan Sandwich Delights Sandwiches occupy a unique space in","Meet Chris, a superhero of supplies! Just like how Batman protects Gotham City","Identity formation is a complex and multifaceted process that involves the development of", "With the development of science and technology, computer has become more and more ","Just as there are many variants and forms of electronic malware and Internet-based ","Correctly identifying what is causing a problem is the most important step in pest control.","Lobster, California spiny The California Spiny Lobster fishery is a small but locally ","Bees are vital for pollination. You can buy leafcutter bee houses to attract ","Feeling Alone Together: Exploring Alienation and Isolation in Literature", "Imagine if someone got their hands on dangerous weapons","Once upon a time, in a colorful town called Popville, ","he bell above the door jangled as Sarah walked into her family's hardware store"]
304
+ # Randomly select a prompt from the list
305
+ random_prompt = np.random.choice(start_context_list)
306
+ logger.info(f"Selected prompt: {random_prompt}")
307
+ logger.info(f"+++"*30)
308
+ encoded_text = tokenizer.encode(random_prompt, return_tensors="pt")
309
+ random_topk = np.random.randint(1, 10)
310
+ logger.info(f"random_topk: {random_topk}")
311
+ random_temperature = np.random.uniform(0.7, 0.9)
312
+ logger.info(f"random_temperature: {random_temperature}")
313
+ logger.info(f"global step {global_step} , batch_idx {batch_idx} => generating text")
314
+ generated_text = generate(model,
315
+ idx=encoded_text,
316
+ max_new_tokens=256,
317
+ context_length=256,
318
+ temperature=random_temperature,
319
+ top_k=random_topk,
320
+ eos_token=tokenizer.eos_token_id,
321
+ device=device)
322
+ logger.info(f"+++"*30)
323
+ logger.info(tokenizer.decode(generated_text.squeeze(0)))
324
+ logger.info(f"+++"*30)
325
+
326
+ # Save checkpoint
327
+ model_file_name = f"model_{global_step}_steps_avg_loss_{avg_loss:.5f}_optimizer_lr_{optimizer_lr:.8f}.pth"
328
+ torch.save({
329
+ 'step': global_step,
330
+ 'model_state_dict': model.state_dict(),
331
+ 'optimizer_state_dict': optimizer.state_dict(),
332
+ 'scheduler_state_dict': scheduler.state_dict(),
333
+ 'loss': avg_loss,
334
+ }, model_file_name)
335
+
336
+ s3_path = upload_file_to_s3(model_file_name, config['model']['model_config']['s3_bucket'],
337
+ config['model']['model_config']['s3_checkpoint_folder'])
338
+ logger.info(f"Model saved to S3: {s3_path}")
339
+
340
+ log_path = upload_file_to_s3(config['model']['model_config']['s3_log_file_name'], config['model']['model_config']['s3_bucket'],
341
+ config['model']['model_config']['s3_log_folder'])
342
+ logger.info(f"Log saved to S3: {log_path}")
343
+
344
+ if batch_idx % 100 == 0:
345
+ logger.info(f"Batch {batch_idx} finished")
346
+ logger.info(f"+++"*30)
347
+
348
+ logger.info("Training complete")
349
+
350
+ if __name__ == "__main__":
351
+ config = yaml.load(open("config_smollm2_135M.yaml", "r"), Loader=yaml.FullLoader)
352
+ logger.info(config)
353
+
354
+ # Set memory efficient settings
355
+ torch.set_float32_matmul_precision('high')
356
+ torch.backends.cudnn.benchmark = True
357
+ torch.backends.cuda.matmul.allow_tf32 = True
358
+
359
+ # Empty cache before model creation
360
+ torch.cuda.empty_cache()
361
+ import os
362
+ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:64'
363
+
364
+ model = DeepSeekV3Model(config['model'])
365
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
366
+
367
+ # Enable gradient checkpointing for memory efficiency
368
+ # model.gradient_checkpointing_enable()
369
+
370
+ model.to(device)
371
+ #model = torch.compile(model)
372
+ logger.info(model)
373
+ logger.info("++"*30)
374
+ total_params = sum(p.numel() for p in model.parameters())
375
+ logger.info(f"Total parameters: {total_params}")
376
+
377
+ optimizer = torch.optim.AdamW(
378
+ model.parameters(),
379
+ lr=3e-4,
380
+ weight_decay=0.15,
381
+ betas=(0.9, 0.95)
382
+ )
383
+
384
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
385
+ tokenizer.pad_token = tokenizer.eos_token
386
+ vocab_size = tokenizer.vocab_size
387
+
388
+ # Adjusted batch size to 8
389
+ train_loader = load_cosmopedia_dataset(
390
+ batch_size=8, # Changed from 4 to 8
391
+ seq_length=512, # Kept at 512
392
+ tokenizer=tokenizer
393
+ )
394
+
395
+ import time
396
+ t1 = time.time()
397
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
398
+
399
+ # Set environment variable for memory allocation
400
+ import os
401
+ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
402
+
403
+ train_model(
404
+ config,
405
+ model,
406
+ train_loader,
407
+ train_loader,
408
+ optimizer=optimizer,
409
+ device=device,
410
+ num_epochs=1,
411
+ eval_freq=2500, # Evaluate, generate a sample and checkpoint every 2500 optimizer steps
412
+ eval_iter=2500,
413
+ start_context="Once Upon a Time far far away in a galaxy",
414
+ tokenizer=tokenizer
415
+ )
416
+ t2 = time.time()
417
+ logger.info(f"Time taken for training: {t2 - t1:.2f} seconds")
utils.py ADDED
@@ -0,0 +1,182 @@
1
+ import boto3
2
+ from boto3.s3.transfer import TransferConfig
3
+ from tqdm import tqdm
4
+ import os
5
+
6
+ def upload_file_to_s3(file_path, bucket_name, s3_prefix):
7
+
8
+
9
+ class ProgressPercentage(object):
10
+ def __init__(self, filename):
11
+ self._filename = filename
12
+ self._size = float(os.path.getsize(filename))
13
+ self._seen_so_far = 0
14
+ self._pbar = tqdm(total=self._size, unit='B', unit_scale=True, desc=f"Uploading {os.path.basename(filename)}")
15
+
16
+ def __call__(self, bytes_amount):
17
+ self._seen_so_far += bytes_amount
18
+ self._pbar.update(bytes_amount)
19
+
20
+ s3_client = boto3.client('s3')
21
+ file_name = os.path.basename(file_path)
22
+ s3_path = f"{s3_prefix}/{file_name}"
23
+
24
+ # Configure multipart upload
25
+ config = TransferConfig(
26
+ multipart_threshold=1024 * 25, # 25MB
27
+ max_concurrency=10,
28
+ multipart_chunksize=1024 * 25, # 25MB
29
+ use_threads=True
30
+ )
31
+
32
+ try:
33
+ s3_client.upload_file(
34
+ file_path,
35
+ bucket_name,
36
+ s3_path,
37
+ Config=config,
38
+ Callback=ProgressPercentage(file_path)
39
+ )
40
+ return f"s3://{bucket_name}/{s3_path}"
41
+ except Exception as e:
42
+ print(f"Failed to upload {file_path} to S3: {str(e)}")
43
+ return None
44
+
45
+ max_lr = 1e-3
46
+ warmup_steps = 10
47
+ max_steps = 25000
48
+ import math
49
+ def get_lr_lambda(current_step, warmup_steps, max_steps, max_lr):
50
+ """
51
+ Learning rate scheduler with:
52
+ 1. Linear warmup
53
+ 2. Cosine decay
54
+ 3. Minimum learning rate of 10% of max_lr
55
+ """
56
+ min_lr = max_lr * 0.1 # Minimum learning rate (10% of max_lr)
57
+
58
+ if current_step < warmup_steps:
59
+ # Linear warmup
60
+ return max_lr * (current_step + 1) / warmup_steps
61
+ elif current_step > max_steps:
62
+ # After max_steps, return minimum learning rate
63
+ return min_lr
64
+ else:
65
+ # Cosine decay between warmup_steps and max_steps
66
+ decay_ratio = (current_step - warmup_steps) / (max_steps - warmup_steps)
67
+ coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
68
+ return min_lr + coeff * (max_lr - min_lr)
69
+
70
+
71
+ def plot_lr_schedule():
72
+ """
73
+ Helper function to visualize the learning rate schedule
74
+ """
75
+ import matplotlib.pyplot as plt
76
+ steps = list(range(0, max_steps + 100))
77
+ lrs = [get_lr_lambda(step, warmup_steps, max_steps, max_lr) for step in steps]
78
+
79
+ plt.figure(figsize=(10, 5))
80
+ plt.plot(steps, lrs)
81
+ plt.title('Learning Rate Schedule')
82
+ plt.xlabel('Steps')
83
+ plt.ylabel('Learning Rate')
84
+ plt.grid(True)
85
+ plt.show()
86
+
87
+ def plot_training_loss(log_file_path, output_path=None):
88
+ """
89
+ Parse a training log file and plot the running average loss against batch steps.
90
+ Also adds a trend line to visualize the overall training progress.
91
+
92
+ Args:
93
+ log_file_path (str): Path to the training log file
94
+ output_path (str, optional): Path to save the plot as PNG. If None, displays the plot instead.
95
+ """
96
+ import re
97
+ import matplotlib.pyplot as plt
98
+ import numpy as np
99
+ from scipy.optimize import curve_fit
100
+
101
+ # Regular expression to extract batch number and loss
102
+ pattern = r"Batch (\d+), Running Avg Loss: ([0-9.]+)"
103
+
104
+ steps = []
105
+ losses = []
106
+
107
+ # Read and parse the log file
108
+ with open(log_file_path, 'r') as file:
109
+ for line in file:
110
+ match = re.search(pattern, line)
111
+ if match:
112
+ batch_num = int(match.group(1))
113
+ loss = float(match.group(2))
114
+ steps.append(batch_num)
115
+ losses.append(loss)
116
+
117
+ if not steps:
118
+ print("No loss data found in the log file.")
119
+ return
120
+
121
+ # Create the plot
122
+ plt.figure(figsize=(12, 6))
123
+ plt.plot(steps, losses, 'b-', alpha=0.5, label='Running Avg Loss')
124
+
125
+ # Add trend line (using polynomial fit)
126
+ def poly_func(x, a, b, c):
127
+ return a * x**2 + b * x + c
128
+
129
+ # Convert to numpy arrays for curve fitting
130
+ x_array = np.array(steps)
131
+ y_array = np.array(losses)
132
+
133
+ # Fit the curve
134
+ try:
135
+ popt, _ = curve_fit(poly_func, x_array, y_array)
136
+ x_line = np.linspace(min(steps), max(steps), 1000)
137
+ y_line = poly_func(x_line, *popt)
138
+ plt.plot(x_line, y_line, 'r-', label='Trend Line')
139
+ except Exception as e:
140
+ print(f"Could not fit trend line: {e}")
141
+ # Fallback to simple moving average for trend
142
+ window_size = min(len(steps) // 10, 100) if len(steps) > 100 else len(steps) // 2
143
+ if window_size > 0:
144
+ moving_avg = np.convolve(y_array, np.ones(window_size)/window_size, mode='valid')
145
+ plt.plot(steps[window_size-1:], moving_avg, 'r-', label='Moving Average Trend')
146
+
147
+ # Add labels and title
148
+ plt.xlabel('Batch Number')
149
+ plt.ylabel('Running Average Loss')
150
+ plt.title('Training Loss Over Time')
151
+ plt.grid(True)
152
+ plt.legend()
153
+
154
+ # Add min and max loss annotations
155
+ min_loss = min(losses)
156
+ min_idx = losses.index(min_loss)
157
+ max_loss = max(losses)
158
+ max_idx = losses.index(max_loss)
159
+
160
+ plt.annotate(f'Min: {min_loss:.5f}',
161
+ xy=(steps[min_idx], min_loss),
162
+ xytext=(steps[min_idx], min_loss*1.05),
163
+ arrowprops=dict(facecolor='green', shrink=0.05),
164
+ fontsize=10)
165
+
166
+ plt.annotate(f'Max: {max_loss:.5f}',
167
+ xy=(steps[max_idx], max_loss),
168
+ xytext=(steps[max_idx], max_loss*0.95),
169
+ arrowprops=dict(facecolor='red', shrink=0.05),
170
+ fontsize=10)
171
+
172
+ # Save or show the plot
173
+ plt.tight_layout()
174
+ if output_path:
175
+ plt.savefig(output_path, dpi=300, bbox_inches='tight')
176
+ print(f"Plot saved to {output_path}")
177
+ else:
178
+ plt.show()
179
+
180
+ if __name__ == "__main__":
181
+ # plot_lr_schedule()
182
+ plot_training_loss("training.log", "train_loss.png")