zaydzuhri's picture
Add files using upload-large-folder tool
0298ad2 verified
raw
history blame
2.38 kB
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from dataclasses import dataclass, field
from datetime import timedelta
from io import BytesIO
from typing import Any, Dict, List
import torch
from torch.distributed.checkpoint.stateful import Stateful
@dataclass
class TrainState(Stateful):
step: int = 0
skipped_step: int = 0
token: int = 0
elapsed: timedelta = timedelta(0)
global_avg_losses: List[float] = field(default_factory=list)
global_max_losses: List[float] = field(default_factory=list)
log_steps: List[int] = field(default_factory=list)
def state_dict(self) -> Dict[str, Any]:
# Only checkpoint global_avg_losses and global_max_losses per log frequency
# to avoid sync overhead in every iteration.
global_avg_losses_bytes = BytesIO()
torch.save(self.global_avg_losses, global_avg_losses_bytes)
global_max_losses_bytes = BytesIO()
torch.save(self.global_max_losses, global_max_losses_bytes)
log_steps_bytes = BytesIO()
torch.save(self.log_steps, log_steps_bytes)
return {
"step": torch.tensor(self.step, dtype=torch.int32),
"skipped_step": torch.tensor(self.skipped_step, dtype=torch.int32),
"token": torch.tensor(self.token, dtype=torch.int64),
"elapsed": self.elapsed,
"global_avg_losses": global_avg_losses_bytes,
"global_max_losses": global_max_losses_bytes,
"log_steps": log_steps_bytes,
}
def load_state_dict(self, state_dict) -> None:
self.step = state_dict["step"].item()
self.skipped_step = state_dict.get("skipped_step", 0).item()
self.token = state_dict["token"].item()
self.elapsed = state_dict["elapsed"]
state_dict["global_avg_losses"].seek(0)
self.global_avg_losses = torch.load(
state_dict["global_avg_losses"], weights_only=False
)
state_dict["global_max_losses"].seek(0)
self.global_max_losses = torch.load(
state_dict["global_max_losses"], weights_only=False
)
state_dict["log_steps"].seek(0)
self.log_steps = torch.load(state_dict["log_steps"], weights_only=False)