dongyh committed on
Commit 55c82d2 · verified · 1 Parent(s): 3dedebd

Upload 15 files

Files changed (14)
  1. beam_search.py +1078 -0
  2. checkpoint.py +2023 -0
  3. config.json +2 -13
  4. config.py +1371 -0
  5. exceptions.py +50 -0
  6. initialization.py +22 -0
  7. model.py +1959 -0
  8. modeling_fan.py +271 -0
  9. optim.py +1040 -0
  10. safetensors_util.py +81 -0
  11. torch_util.py +158 -0
  12. train.py +1384 -0
  13. util.py +929 -0
  14. version.py +11 -0
beam_search.py ADDED
@@ -0,0 +1,1078 @@
1
+ """
2
+ This is a self-contained and flexible beam search implementation adapted from
3
+ AllenNLP's beam search: https://github.com/allenai/allennlp/blob/main/allennlp/nn/beam_search.py
4
+ """
5
+
6
+ import copy
7
+ import warnings
8
+ from abc import abstractmethod
9
+ from inspect import signature
10
+ from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, cast
11
+
12
+ import torch
13
+
14
+ __all__ = [
15
+ "Sampler",
16
+ "DeterministicSampler",
17
+ "MultinomialSampler",
18
+ "TopKSampler",
19
+ "TopPSampler",
20
+ "GumbelSampler",
21
+ "FinalSequenceScorer",
22
+ "SequenceLogProbabilityScorer",
23
+ "LengthNormalizedSequenceLogProbabilityScorer",
24
+ "Constraint",
25
+ "RepeatedNGramBlockingConstraint",
26
+ "BeamSearch",
27
+ ]
28
+
29
+ StateType = Dict[str, torch.Tensor]
30
+ StepFunctionTypeWithTimestep = Callable[[torch.Tensor, StateType, int], Tuple[torch.Tensor, StateType]]
31
+ StepFunctionTypeNoTimestep = Callable[[torch.Tensor, StateType], Tuple[torch.Tensor, StateType]]
32
+
33
+ StepFunctionType = TypeVar("StepFunctionType", StepFunctionTypeWithTimestep, StepFunctionTypeNoTimestep)
34
+ """
35
+ The type of step function that can be passed to [`BeamSearch.search`](#search).
36
+
37
+ This can either be [`StepFunctionTypeWithTimestep`](#stepfunctiontypewithtimestep)
38
+ or [`StepFunctionTypeNoTimestep`](#stepfunctiontypenotimestep).
39
+ """
40
+
41
+ ConstraintStateType = List[List[Dict[str, Any]]]
42
+
43
+
44
+ class Sampler:
45
+ """
46
+ An abstract class that can be used to sample candidates (either nodes or beams)
47
+ within `BeamSearch`.
48
+
49
+ A `Sampler` just has three methods, `init_state()`, `sample_nodes()` and `sample_beams()`.
50
+
51
+ `init_state()` takes three arguments:
52
+
53
+ - a tensor of starting log probs with shape `(batch_size, num_classes)`,
54
+ - the batch size, an int,
55
+ - and the number of classes, also an int.
56
+
57
+ It returns a state dictionary with any state tensors needed for subsequent
58
+ calls to `sample_nodes()` and `sample_beams()`.
59
+
60
+ By default this method just returns an empty dictionary.
61
+
62
+ Both `sample_nodes()` and `sample_beams()` should take three arguments:
63
+
64
+ - tensor of normalized log probabilities with shape `(batch_size, num_examples)`,
65
+ - an integer representing the number of samples to take for each example in the batch,
66
+ - and a state dictionary which could contain any tensors needed for the `Sampler` to keep
67
+ track of state.
68
+
69
+ For `sample_nodes()`, `num_examples = num_classes`, but for `sample_beams`,
70
+ `num_examples = beam_size * per_node_beam_size`.
71
+
72
+ The return value should be a tuple containing:
73
+
74
+ - a tensor of log probabilities of the sampled examples with shape `(batch_size, num_samples)`,
75
+ - a tensor of indices of the sampled examples with shape `(batch_size, num_samples)`,
76
+ - and the updated state dictionary.
77
+
78
+ A default implementation of `sample_beams` is provided, which just deterministically
79
+ picks the `k` examples with highest log probability.
80
+ """
81
+
82
+ def init_state(
83
+ self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int
84
+ ) -> StateType:
85
+ del start_class_log_probabilities, batch_size, num_classes
86
+ return {}
87
+
88
+ @abstractmethod
89
+ def sample_nodes(
90
+ self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
91
+ ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
92
+ raise NotImplementedError
93
+
94
+ def sample_beams(
95
+ self, log_probs: torch.Tensor, beam_size: int, state: StateType
96
+ ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
97
+ del state
98
+ selected_log_probs, selected_indices = torch.topk(log_probs, beam_size, dim=-1)
99
+ return selected_log_probs, selected_indices, {}
100
+
101
+
102
+ class DeterministicSampler(Sampler):
103
+ """
104
+ A `Sampler` that just deterministically returns the `k` nodes or beams with highest
105
+ log probability.
106
+ """
107
+
108
+ def sample_nodes(
109
+ self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
110
+ ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
111
+ del state
112
+ selected_log_probs, selected_indices = torch.topk(log_probs, per_node_beam_size, dim=-1)
113
+ return selected_log_probs, selected_indices, {}
114
+
115
+
116
+ class MultinomialSampler(Sampler):
117
+ """
118
+ A `Sampler` which samples nodes from the given multinomial distribution. Beams are sampled
119
+ in the default, deterministic way.
120
+
121
+ :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
122
+ above 1.0 produces a flatter probability distribution.
123
+ :param with_replacement: Whether to sample with replacement.
124
+
125
+ """
126
+
127
+ def __init__(
128
+ self,
129
+ temperature: float = 1.0,
130
+ with_replacement: bool = False,
131
+ ) -> None:
132
+ self.temperature = temperature
133
+ self.with_replacement = with_replacement
134
+
135
+ def sample_nodes(
136
+ self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
137
+ ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
138
+ if self.temperature != 1.0:
139
+ _probabilities = torch.nn.functional.softmax(log_probs / self.temperature, dim=-1)
140
+ else:
141
+ _probabilities = log_probs.exp()
142
+
143
+ selected_indices = torch.multinomial(_probabilities, per_node_beam_size, replacement=self.with_replacement)
144
+
145
+ return torch.gather(log_probs, 1, selected_indices), selected_indices, state
146
+
147
+
148
+ class TopKSampler(Sampler):
149
+ """
150
+ A `Sampler` which redistributes the probability mass function for nodes among the
151
+ top `k` choices, then samples from that subset after re-normalizing the probabilities.
152
+
153
+ Beams are sampled in the default, deterministic way.
154
+
155
+ :param k: The number of top choices to be selected from.
156
+ :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
157
+ above 1.0 produces a flatter probability distribution.
158
+ :param with_replacement: If set to `True`, samples will be selected with replacement from the top k choices.
159
+ """
160
+
161
+ def __init__(
162
+ self,
163
+ k: int = 1,
164
+ temperature: float = 1.0,
165
+ with_replacement: bool = False,
166
+ ):
167
+ self.k = k
168
+ self.temperature = temperature or 1.0
169
+ self.with_replacement = with_replacement
170
+
171
+ def sample_nodes(
172
+ self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
173
+ ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
174
+ if not per_node_beam_size <= self.k <= log_probs.size()[1]:
175
+ raise ValueError(
176
+ "k must be a positive integer no less than per_node_beam_size and no greater than vocabulary size"
177
+ )
178
+
179
+ # shape (both): (batch_size, k)
180
+ top_k_log_probs, top_k_indices = log_probs.topk(self.k, dim=-1)
181
+
182
+ # Apply temperature if necessary.
183
+ # shape: (batch_size, k)
184
+ if self.temperature != 1.0:
185
+ top_k_log_probs = top_k_log_probs / self.temperature
186
+
187
+ # Re-normalize the subset.
188
+ # shape: (batch_size, k)
189
+ normalized_top_k_probs = torch.nn.functional.softmax(top_k_log_probs, dim=-1)
190
+
191
+ # Sample from the re-normalized subset.
192
+ # NOTE: These indices are not indices into `log_probs`, they are indices into `top_k_log_probs`.
193
+ # shape: (batch_size, per_node_beam_size)
194
+ sampled_indices = torch.multinomial(
195
+ normalized_top_k_probs, per_node_beam_size, replacement=self.with_replacement
196
+ )
197
+
198
+ # Convert `sampled_indices` back to indices in the original `log_probs` tensor.
199
+ # shape: (batch_size, per_node_beam_size)
200
+ indices = top_k_indices.gather(-1, sampled_indices)
201
+
202
+ return log_probs.gather(1, indices), indices, state
203
+
204
+
205
+ class TopPSampler(Sampler):
206
+ """
207
+ A `Sampler` which redistributes the probability mass function for nodes among
208
+ the top choices with a cumulative probability of at least `p`, then samples from that subset
209
+ after re-normalizing the probabilities.
210
+
211
+ Beams are sampled in the default, deterministic way.
212
+
213
+ :param p:
214
+ The cumulative probability cutoff threshold. A higher value of `p` will result in more possible
215
+ examples to sample from. If `with_replacement` is `False` and the number of possible samples is
216
+ insufficient to sample from without replacement when calling `sample_nodes`, then the top
217
+ `per_node_beam_size` examples will be chosen.
218
+ :param temperature:
219
+ A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
220
+ above 1.0 produces a flatter probability distribution.
221
+ :param with_replacement:
222
+ If set to `True`, samples will be selected with replacement from the top choices.
223
+
224
+ """
225
+
226
+ def __init__(
227
+ self,
228
+ p: float = 0.9,
229
+ temperature: float = 1.0,
230
+ with_replacement: bool = False,
231
+ ):
232
+ if p < 0.0 or p > 1.0:
233
+ raise ValueError("p must be a positive float no greater than 1.0")
234
+ self.p = p
235
+ self.temperature = temperature or 1.0
236
+ self.with_replacement = with_replacement
237
+
238
+ def sample_nodes(
239
+ self, log_probs: torch.Tensor, per_node_beam_size: int, state: StateType
240
+ ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
241
+ if not per_node_beam_size <= log_probs.size()[1]:
242
+ raise ValueError("per_node_beam_size cannot be greater than vocabulary size")
243
+
244
+ # First apply temperature coefficient:
245
+ if self.temperature != 1.0:
246
+ _log_probs = torch.nn.functional.log_softmax(log_probs / self.temperature, dim=-1)
247
+ else:
248
+ _log_probs = log_probs
249
+
250
+ # Sort the probabilities in descending order to then find cumulative sum
251
+ log_probs_descending, sorting_indices = torch.sort(_log_probs, descending=True)
252
+
253
+ # shape: (batch_size, num_classes)
254
+ probabilities_descending = log_probs_descending.exp()
255
+ probabilities_summed = torch.cumsum(probabilities_descending, dim=-1)
256
+
257
+ # Create a mask for filtering out probabilities that don't make the top `p`.
258
+ # shape: (batch_size, num_classes)
259
+ exclusion_mask = probabilities_summed >= self.p
260
+
261
+ # We want to include the first index where probabilities_summed >= p, so we shift over one.
262
+ exclusion_mask[..., 1:] = exclusion_mask[..., :-1].clone()
263
+ exclusion_mask[..., 0] = False
264
+
265
+ # Make sure there's at least `per_node_beam_size` options to be selected.
266
+ if not self.with_replacement:
267
+ exclusion_mask[..., :per_node_beam_size] = False
268
+
269
+ log_probs_descending[exclusion_mask] = torch.finfo(log_probs.dtype).min
270
+
271
+ # Now re-normalize the included log probs.
272
+ # shape: (batch_size, num_classes)
273
+ filtered_probabilities = torch.nn.functional.softmax(log_probs_descending, dim=-1)
274
+
275
+ # Sample from the re-normalized subset.
276
+ # NOTE: These indices are not indices into `log_probs`, they are indices into `log_probs_descending`.
277
+ # shape: (batch_size, per_node_beam_size)
278
+ sampled_indices = torch.multinomial(
279
+ filtered_probabilities, per_node_beam_size, replacement=self.with_replacement
280
+ )
281
+
282
+ # Convert `sampled_indices` back to indices in the original `log_probs` tensor.
283
+ # shape: (batch_size, per_node_beam_size)
284
+ selected_indices = sorting_indices.gather(-1, sampled_indices)
285
+
286
+ # Return (selected log probabilities, selected classes)
287
+ # shape (both): (batch_size, per_node_beam_size)
288
+ return torch.gather(log_probs, 1, selected_indices), selected_indices, state
289
+
290
+
291
+ class GumbelSampler(Sampler):
292
+ """
293
+ A `Sampler` which uses the Gumbel-Top-K trick to sample without replacement. See
294
+ [*Stochastic Beams and Where to Find Them: The Gumbel-Top-k Trick for Sampling
295
+ Sequences Without Replacement*, W Kool, H Van Hoof and M Welling, 2019]
296
+ (https://api.semanticscholar.org/CorpusID:76662039).
297
+
298
+ :param temperature: A `temperature` below 1.0 produces a sharper probability distribution and a `temperature`
299
+ above 1.0 produces a flatter probability distribution.
300
+ """
301
+
302
+ def __init__(self, temperature: float = 1.0):
303
+ self.temperature = temperature
304
+
305
+ def init_state(
306
+ self, start_class_log_probabilities: torch.Tensor, batch_size: int, num_classes: int
307
+ ) -> StateType:
308
+ # shape: (batch_size, num_classes)
309
+ zeros = start_class_log_probabilities.new_zeros((batch_size, num_classes))
310
+
311
+ # shape: (batch_size, num_classes)
312
+ G_phi_S = self.gumbel_with_max(start_class_log_probabilities, zeros)
313
+
314
+ return {"G_phi_S": G_phi_S}
315
+
316
+ def sample_nodes(
317
+ self,
318
+ log_probs: torch.Tensor,
319
+ per_node_beam_size: int,
320
+ state: StateType,
321
+ ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
322
+ # First apply temperature coefficient:
323
+ # shape: (batch_size * beam_size, num_classes)
324
+ if self.temperature != 1.0:
325
+ _log_probs = torch.nn.functional.log_softmax(log_probs / self.temperature, dim=-1)
326
+ else:
327
+ _log_probs = log_probs
328
+
329
+ # shape: (group_size,)
330
+ phi_S = state["phi_S"]
331
+
332
+ # shape: (group_size, num_classes)
333
+ phi_S = phi_S.unsqueeze(-1).expand_as(_log_probs)
334
+
335
+ # shape: (group_size, num_classes)
336
+ phi_S_new = phi_S + _log_probs
337
+
338
+ # shape: (group_size, 1)
339
+ G_phi_S = state["G_phi_S"].unsqueeze(-1)
340
+
341
+ # shape: (group_size, num_classes)
342
+ G_phi_S_new = self.gumbel_with_max(phi_S_new, G_phi_S)
343
+
344
+ # Replace NaNs with very negative number.
345
+ # shape: (group_size, num_classes)
346
+ # G_phi_S_new[G_phi_S_new.isnan()] = torch.finfo(G_phi_S_new.dtype).min
347
+
348
+ # shape (both): (group_size, per_node_beam_size)
349
+ top_G_phi_S_new, top_indices = torch.topk(G_phi_S_new, per_node_beam_size, dim=-1)
350
+
351
+ # shape: (group_size, per_node_beam_size)
352
+ top_log_probs = log_probs.gather(1, top_indices)
353
+
354
+ return top_log_probs, top_indices, {"G_phi_S": top_G_phi_S_new}
355
+
356
+ def sample_beams(
357
+ self,
358
+ log_probs: torch.Tensor,
359
+ beam_size: int,
360
+ state: StateType,
361
+ ) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
362
+ """
363
+ Returns the beams with the highest perturbed log probabilities.
364
+ """
365
+ # shape (log_probs): (batch_size, beam_size * per_node_beam_size)
366
+
367
+ batch_size = log_probs.size()[0]
368
+
369
+ # shape: (batch_size * beam_size, per_node_beam_size)
370
+ G_phi_S = state["G_phi_S"]
371
+
372
+ # shape: (batch_size, beam_size * per_node_beam_size)
373
+ G_phi_S = G_phi_S.reshape_as(log_probs)
374
+
375
+ # shape (both): (batch_size, beam_size)
376
+ G_phi_S_new, selected_indices = torch.topk(G_phi_S, beam_size, dim=-1)
377
+
378
+ # shape: (batch_size, beam_size)
379
+ selected_log_probs = log_probs.gather(1, selected_indices)
380
+
381
+ # Now sort the selected beams by their true log prob.
382
+ # shape (all): (batch_size, beam_size)
383
+ selected_log_probs, sort_indices = selected_log_probs.sort(dim=-1, descending=True)
384
+ selected_indices = selected_indices.gather(1, sort_indices)
385
+ G_phi_S_new = G_phi_S_new.gather(1, sort_indices)
386
+
387
+ # shape: (batch_size * beam_size,)
388
+ G_phi_S_new = G_phi_S_new.reshape(batch_size * beam_size)
389
+
390
+ # shape: (batch_size * beam_size,)
391
+ phi_S = selected_log_probs.reshape(batch_size * beam_size)
392
+
393
+ return selected_log_probs, selected_indices, {"G_phi_S": G_phi_S_new, "phi_S": phi_S}
394
+
395
+ def gumbel(self, phi) -> torch.Tensor:
396
+ """
397
+ Sample `Gumbel(phi)`.
398
+
399
+ `phi` should have shape `(batch_size, num_classes)`.
400
+ """
401
+ return -torch.log(-torch.log(torch.rand_like(phi))) + phi
402
+
403
+ def gumbel_with_max(self, phi, T) -> torch.Tensor:
404
+ """
405
+ Sample `Gumbel(phi)` conditioned on the maximum value being equal to `T`.
406
+
407
+ `phi` should have shape `(batch_size, num_classes)` and `T` should have
408
+ shape `(batch_size, 1)`.
409
+ """
410
+ # Shape: (batch_size, num_classes)
411
+ G_phi = self.gumbel(phi)
412
+
413
+ # Now we find the maximum from these samples.
414
+ # Shape: (batch_size, )
415
+ Z, _ = G_phi.max(dim=-1)
416
+
417
+ # Shape: (batch_size, num_classes)
418
+ v = T - G_phi + torch.log1p(-torch.exp(G_phi - Z.unsqueeze(-1)))
419
+
420
+ # Shape: (batch_size, num_classes)
421
+ return T - torch.nn.functional.relu(v) - torch.log1p(torch.exp(-v.abs()))
422
+
423
+
424
+ class FinalSequenceScorer:
425
+ """
426
+ An abstract class that can be used to score the final generated sequences found
427
+ by beam search. Given the predicted sequences and the corresponding log probabilities of
428
+ those sequences, the class calculates and returns the final score of the sequences.
429
+
430
+ The default implementation scores the sequences using the sum of the log probabilities of
431
+ the sequence, which is passed as input.
432
+ """
433
+
434
+ @abstractmethod
435
+ def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
436
+ """
437
+ Score the final predictions found by beam search.
438
+ Returns a tensor of the final sequence scores of shape `(batch_size, beam_size)`.
439
+
440
+ :param predictions: A tensor containing the initial predictions with shape `(batch_size, beam_size, max_steps)`.
441
+ :param log_probabilities: A tensor containing the log probabilities of the sequence, defined as the sum
442
+ of the log probabilities per token, with shape `(batch_size, beam_size)`.
443
+ :param end_index: The index of the end symbol.
444
+
445
+ """
446
+ raise NotImplementedError
447
+
448
+
449
+ class SequenceLogProbabilityScorer(FinalSequenceScorer):
450
+ """
451
+ A :class:`FinalSequenceScorer` which scores the sequences by the sum of the log probabilities
452
+ across the sequence's tokens.
453
+ """
454
+
455
+ def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
456
+ del predictions, end_index
457
+ # The sum of the sequence log probabilities is the input parameter, so just
458
+ # return it.
459
+ return log_probabilities
460
+
461
+
462
+ class LengthNormalizedSequenceLogProbabilityScorer(FinalSequenceScorer):
463
+ """
464
+ A :class:`FinalSequenceScorer` which scores the sequences by the average log probability of the
465
+ tokens in the sequence. It optionally includes a length penalty which promotes
466
+ or demotes sequences based on their lengths. The final score for a sequence will
467
+ be `(sequence_log_probability) / (sequence_length ** length_penalty)`. The sequence length
468
+ here includes the end token.
469
+
470
+ :param length_penalty: The length penalty to use. A value of 1.0 means no length penalty is used.
471
+ A value > 1.0 favors longer sequences, and < 1.0 favors shorter sequences.
472
+ """
473
+
474
+ def __init__(self, length_penalty: float = 1.0):
475
+ super().__init__()
476
+ self.length_penalty = length_penalty
477
+
478
+ def score(self, predictions: torch.Tensor, log_probabilities: torch.Tensor, end_index: int) -> torch.Tensor:
479
+ # shape: (batch_size, beam_size)
480
+ lengths = (predictions != end_index).long().sum(dim=2)
481
+
482
+ # If the sequence ended during beam search, the `log_probabilities` will include
483
+ # the transition to the end token. Therefore, in such situations, `lengths` is
484
+ # actually off by 1. This corrects for that.
485
+ # shape: (batch_size, beam_size)
486
+ is_end_token = predictions[:, :, -1] == end_index
487
+ lengths += is_end_token.long()
488
+
489
+ # shape: (batch_size, beam_size)
490
+ average_log_probs = log_probabilities / (lengths**self.length_penalty)
491
+ return average_log_probs
492
+
493
+
494
+ class Constraint:
495
+ """
496
+ An abstract class that can be used to enforce constraints on the output predictions
497
+ by manipulating the class log probabilities during beam search.
498
+
499
+ A `Constraint` just has three methods that need to be implemented by subclasses:
500
+ `init_state()`, `apply()` and `_update_state()`.
501
+
502
+ `init_state()` takes one argument:
503
+
504
+ - the batch size, an int
505
+
506
+ It returns a constraint state, which is a nested list of dictionaries, with any state needed for subsequent
507
+ calls to `apply()` and `update_state()`. The length of the outer list should be equal to `batch_size`.
508
+ Each inner list should be of length 1.
509
+
510
+ `apply()` takes two arguments:
511
+
512
+ - the constraint state, which is a nested list of dictionaries. The length of the outer list is `batch_size`
513
+ and the length of each inner list is `beam_size` except on the first time `apply()` is called when it is 1.
514
+ - `class_log_probabilities`, a tensor of shape `(batch_size, beam_size, num_classes)` that contains the
515
+ log probabilities for the classes during search. The first time `apply()` is called, `beam_size = 1`.
516
+
517
+ The `apply()` method should return new `class_log_probabilities` that enforce the constraint
518
+ for this step of beam search. For instance, it may prevent a specific class from being selected by setting
519
+ the corresponding log probability to a negligible value such as `float("-inf")` or
520
+ `torch.finfo(class_log_probabilities.dtype).min`.
521
+
522
+ `_update_state()` takes two arguments:
523
+
524
+ - the copied parent constraint state, which is a nested list of dictionaries. `state[i][j]` contains the
525
+ copied state for the parent of `last_prediction[i, j]`. It is unique to that batch and beam, so it can be
526
+ directly edited in-place without affecting the others.
527
+ - last_prediction, a tensor of shape `(batch_size, beam_size)` containing the predictions from the last
528
+ step of beam search.
529
+
530
+ The `_update_state()` function should return a new constraint state, a nested list of dictionaries of
531
+ length `batch_size` and inner list of length `beam_size`, one for each of the predictions in `last_prediction`.
532
+
533
+ """
534
+
535
+ @abstractmethod
536
+ def init_state(
537
+ self,
538
+ batch_size: int,
539
+ ) -> ConstraintStateType:
540
+ raise NotImplementedError
541
+
542
+ @abstractmethod
543
+ def apply(
544
+ self,
545
+ state: ConstraintStateType,
546
+ class_log_probabilities: torch.Tensor,
547
+ ) -> torch.Tensor:
548
+ raise NotImplementedError
549
+
550
+ @staticmethod
551
+ def _copy_state(
552
+ state: ConstraintStateType,
553
+ batch_size: int,
554
+ beam_size: int,
555
+ last_backpointer: Optional[torch.Tensor] = None,
556
+ ) -> ConstraintStateType:
557
+ """
558
+ Copies the `state`. This method copies the data in `state` using `copy.deepcopy()`. If this
559
+ is not appropriate for your constraint, you will need to implement the copying yourself.
560
+ """
561
+ new_state = []
562
+ for i in range(batch_size):
563
+ batch_state = []
564
+ for j in range(beam_size):
565
+ if last_backpointer is None:
566
+ # This is the first prediction, so the backpointer is 0
567
+ backpointer = 0
568
+ else:
569
+ backpointer = last_backpointer[i, j].item()
570
+ batch_state.append(copy.deepcopy(state[i][backpointer])) # type: ignore
571
+ new_state.append(batch_state)
572
+ return new_state
573
+
574
+ def update_state(
575
+ self,
576
+ state: ConstraintStateType,
577
+ last_prediction: torch.Tensor,
578
+ last_backpointer: Optional[torch.Tensor] = None,
579
+ ) -> ConstraintStateType:
580
+ batch_size, beam_size = last_prediction.size()
581
+ new_state = self._copy_state(state, batch_size, beam_size, last_backpointer)
582
+ return self._update_state(new_state, last_prediction)
583
+
584
+ @abstractmethod
585
+ def _update_state(
586
+ self,
587
+ state: ConstraintStateType,
588
+ last_prediction: torch.Tensor,
589
+ ) -> ConstraintStateType:
590
+ raise NotImplementedError
591
+
592
+
593
+ class RepeatedNGramBlockingConstraint(Constraint):
594
+ def __init__(self, ngram_size: int, **kwargs) -> None:
595
+ super().__init__(**kwargs)
596
+ self.ngram_size = ngram_size
597
+
598
+ def init_state(
599
+ self,
600
+ batch_size: int,
601
+ ) -> ConstraintStateType:
602
+ return [[{"seen_ngrams": {}, "current_prefix": []}] for _ in range(batch_size)]
603
+
604
+ def apply(
605
+ self,
606
+ state: ConstraintStateType,
607
+ class_log_probabilities: torch.Tensor,
608
+ ) -> torch.Tensor:
609
+ for i, batch in enumerate(state):
610
+ for j, beam in enumerate(batch):
611
+ current_prefix = tuple(beam["current_prefix"])
612
+ seen_ngrams = beam["seen_ngrams"]
613
+ try:
614
+ disallowed_indices = seen_ngrams[current_prefix]
615
+ class_log_probabilities[i, j, disallowed_indices] = torch.finfo(
616
+ class_log_probabilities.dtype
617
+ ).min
618
+ except KeyError:
619
+ # We have not seen this prefix before, so there is no index
620
+ # that needs to be blocked
621
+ pass
622
+ return class_log_probabilities
623
+
624
+ def _update_state(
625
+ self,
626
+ state: ConstraintStateType,
627
+ last_prediction: torch.Tensor,
628
+ ) -> ConstraintStateType:
629
+ for i, batch in enumerate(state):
630
+ for j, beam in enumerate(batch):
631
+ prediction = last_prediction[i, j].item()
632
+ prefix = beam["current_prefix"]
633
+ seen_ngrams = beam["seen_ngrams"]
634
+
635
+ if len(prefix) == self.ngram_size - 1:
636
+ # This is a new ngram that we have to remember
637
+ if tuple(prefix) not in seen_ngrams:
638
+ seen_ngrams[tuple(prefix)] = []
639
+ seen_ngrams[tuple(prefix)].append(prediction)
640
+
641
+ # Create the new prefix, removing the oldest index if the prefix
642
+ # is too long
643
+ prefix.append(prediction)
644
+ if len(prefix) == self.ngram_size:
645
+ prefix.pop(0)
646
+ return state
647
+
648
+
649
+ class BeamSearch:
650
+ """
651
+ Implements the beam search algorithm for decoding the most likely sequences.
652
+
653
+ :param end_index: The index of the "stop" or "end" token in the vocabulary. Usually the EOS token ID.
654
+
655
+ :param max_steps: The maximum number of decoding steps to take, i.e. the maximum length
656
+ of the predicted sequences.
657
+
658
+ :param beam_size: The width of the beam used.
659
+
660
+ :param per_node_beam_size: The maximum number of candidates to consider per node, at each step in the search.
661
+ If not given, this just defaults to `beam_size`. Setting this parameter
662
+ to a number smaller than `beam_size` may give better results, as it can introduce
663
+ more diversity into the search. See
664
+ [*Beam Search Strategies for Neural Machine Translation*, Freitag and Al-Onaizan, 2017]
665
+ (https://api.semanticscholar.org/CorpusID:2229477).
666
+
667
+ :param sampler: An optional `Sampler` which is used to pick next candidate nodes and beams.
668
+ If not specified, `DeterministicSampler` will be used, which just takes the
669
+ `per_node_beam_size` most likely nodes and the `beam_size` most likely beams.
670
+
671
+ Using the [`GumbelSampler`](#gumbelsampler), on the other hand, will give you
672
+ [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039).
673
+
674
+ :param min_steps: The minimum number of decoding steps to take, i.e. the minimum length of
675
+ the predicted sequences. This does not include the start or end tokens. If `None`,
676
+ no minimum is enforced.
677
+
678
+ :param final_sequence_scorer: An optional `FinalSequenceScorer` which is used to score the final generated sequences.
679
+ The output from this module is what is returned by the `search` method. If not
680
+ specified, `SequenceLogProbabilityScorer` will be used, which scores the sequences
681
+ by the sum of the token log probabilities.
682
+
683
+ :param constraints: An optional list of `Constraint`s which should be applied during beam search. If not
684
+ provided, no constraints will be enforced.
685
+
686
+ """
687
+
688
+ def __init__(
689
+ self,
690
+ end_index: int,
691
+ *,
692
+ max_steps: int = 50,
693
+ beam_size: int = 10,
694
+ per_node_beam_size: Optional[int] = None,
695
+ sampler: Optional[Sampler] = None,
696
+ min_steps: Optional[int] = None,
697
+ final_sequence_scorer: Optional[FinalSequenceScorer] = None,
698
+ constraints: Optional[List[Constraint]] = None,
699
+ ) -> None:
700
+ if not max_steps > 0:
701
+ raise ValueError("max_steps must be positive")
702
+ if not beam_size > 0:
703
+ raise ValueError("beam_size must be positive")
704
+ if per_node_beam_size is not None and not per_node_beam_size > 0:
705
+ raise ValueError("per_node_beam_size must be positive")
706
+ if min_steps is not None:
707
+ if not min_steps >= 0:
708
+ raise ValueError("min_steps must be non-negative")
709
+ if not min_steps <= max_steps:
710
+ raise ValueError("min_steps must be less than or equal to max_steps")
711
+
712
+ self._end_index = end_index
713
+ self.max_steps = max_steps
714
+ self.beam_size = beam_size
715
+ self.per_node_beam_size = per_node_beam_size or beam_size
716
+ self.sampler = sampler or DeterministicSampler()
717
+ self.min_steps = min_steps or 0
718
+ self.final_sequence_scorer = final_sequence_scorer or SequenceLogProbabilityScorer()
719
+ self.constraints = constraints or []
720
+
721
+ @staticmethod
722
+ def _reconstruct_sequences(predictions, backpointers):
723
+ # Reconstruct the sequences.
724
+ # shape: [(batch_size, beam_size, 1)]
725
+ reconstructed_predictions = [predictions[-1].unsqueeze(2)]
726
+
727
+ if not backpointers:
728
+ return reconstructed_predictions
729
+
730
+ # shape: (batch_size, beam_size)
731
+ cur_backpointers = backpointers[-1]
732
+
733
+ for timestep in range(len(predictions) - 2, 0, -1):
734
+ # shape: (batch_size, beam_size, 1)
735
+ cur_preds = predictions[timestep].gather(1, cur_backpointers).unsqueeze(2)
736
+
737
+ reconstructed_predictions.append(cur_preds)
738
+
739
+ # shape: (batch_size, beam_size)
740
+ cur_backpointers = backpointers[timestep - 1].gather(1, cur_backpointers)
741
+
742
+ # shape: (batch_size, beam_size, 1)
743
+ final_preds = predictions[0].gather(1, cur_backpointers).unsqueeze(2)
744
+
745
+ reconstructed_predictions.append(final_preds)
746
+
747
+ return reconstructed_predictions
748
+
749
+ def search(
750
+ self,
751
+ start_predictions: torch.Tensor,
752
+ start_state: StateType,
753
+ step: StepFunctionType,
754
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
755
+ """
756
+ Given a starting state and a step function, apply beam search to find the
757
+ most likely target sequences.
758
+
759
+ Returns a tuple of `(predictions, final_scores)`, where `predictions`
760
+ has shape `(batch_size, beam_size, max_steps)` and `final_scores`
761
+ has shape `(batch_size, beam_size)`.
762
+
763
+ .. note::
764
+ If your step function returns `-inf` for some log probabilities
765
+ (like if you're using a masked log-softmax) then some of the "best"
766
+ sequences returned may also have `-inf` log probability. Specifically
767
+ this happens when the beam size is larger than the number of actions
768
+ with finite log probability (non-zero probability) returned by the step function.
769
+ Therefore if you're using a mask you may want to check the results from `search`
770
+ and potentially discard sequences with non-finite log probability.
771
+
772
+ :param start_predictions: A tensor containing the initial predictions with shape `(batch_size,)`.
773
+ Usually the initial predictions are just the index of the "start" token
774
+ in the target vocabulary.
775
+
776
+ :param start_state: The initial state passed to the `step` function. Each value of the state dict
777
+ should be a tensor of shape `(batch_size, *)`, where `*` means any other
778
+ number of dimensions.
779
+
780
+ :param step: A function that is responsible for computing the next most likely tokens,
781
+ given the current state and the predictions from the last time step.
782
+ The function should accept two or three arguments:
783
+
784
+ - a tensor of shape `(group_size,)` representing the index of the predicted
785
+ tokens from the last time step,
786
+ - the current state, a `StateType`, and
787
+ - optionally, the timestep, an `int`.
788
+
789
+ The `group_size` will be `batch_size * beam_size`, except in the initial
790
+ step, for which it will just be `batch_size`.
791
+
792
+ The function is expected to return a tuple, where the first element
793
+ is a tensor of shape `(group_size, vocab_size)` containing
794
+ the log probabilities of the tokens for the next step, and the second
795
+ element is the updated state. The tensor in the state should have shape
796
+ `(group_size, *)`, where `*` means any other number of dimensions.
797
+
798
+ """
799
+ step_signature = signature(step)
800
+ if len(step_signature.parameters) < 3:
801
+ # If the step function we're given does not take the time step argument, wrap it
802
+ # in one that does.
803
+ old_step = cast(StepFunctionTypeNoTimestep, step)
804
+
805
+ def new_step(last_predictions: torch.Tensor, state: Dict[str, torch.Tensor], time_step: int):
806
+ del time_step
807
+ return old_step(last_predictions, state)
808
+
809
+ return self._search(start_predictions, start_state, new_step)
810
+ else:
811
+ return self._search(start_predictions, start_state, cast(StepFunctionTypeWithTimestep, step))
812
+
813
+ def _search(
814
+ self,
815
+ start_predictions: torch.Tensor,
816
+ start_state: StateType,
817
+ step: StepFunctionTypeWithTimestep,
818
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
819
+ batch_size = start_predictions.size()[0]
820
+
821
+ # List of (batch_size, beam_size) tensors. One for each time step. Does not
822
+ # include the start symbols, which are implicit.
823
+ predictions: List[torch.Tensor] = []
824
+
825
+ # List of (batch_size, beam_size) tensors. One for each time step. None for
826
+ # the first. Stores the index n for the parent prediction, i.e.
827
+ # predictions[t-1][i][n], that it came from.
828
+ backpointers: List[torch.Tensor] = []
829
+
830
+ constraint_states = [constraint.init_state(batch_size) for constraint in self.constraints]
831
+
832
+ # Calculate the first timestep. This is done outside the main loop
833
+ # because we are going from a single decoder input (the output from the
834
+ # encoder) to the top `beam_size` decoder outputs. On the other hand,
835
+ # within the main loop we are going from the `beam_size` elements of the
836
+ # beam to `beam_size`^2 candidates from which we will select the top
837
+ # `beam_size` elements for the next iteration.
838
+ # shape: (batch_size, num_classes)
839
+ start_class_log_probabilities, state = step(start_predictions, start_state, 0)
840
+
841
+ num_classes = start_class_log_probabilities.size()[1]
842
+
843
+ # Make sure `per_node_beam_size` is not larger than `num_classes`.
844
+ if self.per_node_beam_size > num_classes:
845
+ raise ValueError(
846
+ f"Vocab size ({num_classes:d}) too small "
847
+ f"relative to per_node_beam_size ({self.per_node_beam_size:d}).\n"
848
+ f"Please decrease beam_size or per_node_beam_size."
849
+ )
850
+
851
+ sampler_state = self.sampler.init_state(start_class_log_probabilities, batch_size, num_classes)
852
+
853
+ # Apply all constraints.
854
+ if self.constraints:
855
+ # shape: (batch_size, 1, num_classes)
856
+ expanded_start_class_log_probabilities = start_class_log_probabilities.unsqueeze(1)
857
+ for constraint, constraint_state in zip(self.constraints, constraint_states):
858
+ expanded_start_class_log_probabilities = constraint.apply(
859
+ constraint_state, expanded_start_class_log_probabilities
860
+ )
861
+ start_class_log_probabilities = expanded_start_class_log_probabilities.squeeze(1)
862
+
863
+ # Prevent selecting the end symbol if there is any min_steps constraint
864
+ if self.min_steps >= 1:
865
+ start_class_log_probabilities[:, self._end_index] = torch.finfo(
866
+ start_class_log_probabilities.dtype
867
+ ).min
868
+
869
+ # Get the initial predicted classes and their log probabilities.
870
+ # shape: (batch_size, beam_size), (batch_size, beam_size)
871
+ (
872
+ start_top_log_probabilities,
873
+ start_predicted_classes,
874
+ sampler_state,
875
+ ) = self.sampler.sample_beams(start_class_log_probabilities, self.beam_size, sampler_state)
876
+
877
+ if self.beam_size == 1 and (start_predicted_classes == self._end_index).all():
878
+ warnings.warn(
879
+ "Empty sequences predicted. You may want to increase the beam size or ensure "
880
+ "your step function is working properly.",
881
+ RuntimeWarning,
882
+ )
883
+ return start_predicted_classes.unsqueeze(-1), start_top_log_probabilities
884
+
885
+ # The log probabilities for the last time step.
886
+ # shape: (batch_size, beam_size)
887
+ last_log_probabilities = start_top_log_probabilities
888
+
889
+ # shape: [(batch_size, beam_size)]
890
+ predictions.append(start_predicted_classes)
891
+
892
+ # Log probability tensor that mandates that the end token is selected.
893
+ # shape: (batch_size * beam_size, num_classes)
894
+ log_probs_after_end = start_class_log_probabilities.new_full(
895
+ (batch_size * self.beam_size, num_classes),
896
+ torch.finfo(start_class_log_probabilities.dtype).min,
897
+ )
898
+ log_probs_after_end[:, self._end_index] = 0.0
899
+
900
+ # Set the same state for each element in the beam.
901
+ self._update_initial_state(state, batch_size)
902
+
903
+ for i, constraint in enumerate(self.constraints):
904
+ constraint_states[i] = constraint.update_state(constraint_states[i], start_predicted_classes)
905
+
906
+ for timestep in range(self.max_steps - 1):
907
+ # shape: (batch_size * beam_size,)
908
+ last_predictions = predictions[-1].reshape(batch_size * self.beam_size)
909
+
910
+ # If every predicted token from the last step is `self._end_index`,
911
+ # then we can stop early.
912
+ if (last_predictions == self._end_index).all():
913
+ break
914
+ # Take a step. This gets the predicted log probs of the next classes
915
+ # and updates the state.
916
+ # shape: (batch_size * beam_size, num_classes)
917
+ class_log_probabilities, state = step(last_predictions, state, timestep + 1)
918
+
919
+ # Apply all constraints.
920
+ if self.constraints:
921
+ # shape: (batch_size, beam_size, num_classes)
922
+ reshaped_class_log_probabilities = class_log_probabilities.view(batch_size, self.beam_size, -1)
923
+ for constraint, constraint_state in zip(self.constraints, constraint_states):
924
+ reshaped_class_log_probabilities = constraint.apply(
925
+ constraint_state, reshaped_class_log_probabilities
926
+ )
927
+ # shape: (batch_size * beam_size, num_classes)
928
+ class_log_probabilities = reshaped_class_log_probabilities.view(batch_size * self.beam_size, -1)
929
+
930
+ # The `timestep`-th iteration of the for loop is generating the `timestep + 2`-th token
931
+ # of the sequence (because `timestep` is 0-indexed and we generated the first token
932
+ # before the for loop). Here we block the end index if the search is not allowed to
933
+ # terminate on this iteration.
934
+ if timestep + 2 <= self.min_steps:
935
+ class_log_probabilities[:, self._end_index] = torch.finfo(class_log_probabilities.dtype).min
936
+
937
+ # shape: (batch_size * beam_size, num_classes)
938
+ last_predictions_expanded = last_predictions.unsqueeze(-1).expand(
939
+ batch_size * self.beam_size, num_classes
940
+ )
941
+
942
+ # Here we are finding any beams where we predicted the end token in
943
+ # the previous timestep and replacing the distribution with a
944
+ # one-hot distribution, forcing the beam to predict the end token
945
+ # this timestep as well.
946
+ # shape: (batch_size * beam_size, num_classes)
947
+ cleaned_log_probabilities = torch.where(
948
+ last_predictions_expanded == self._end_index,
949
+ log_probs_after_end,
950
+ class_log_probabilities,
951
+ )
952
+
953
+ # shape (both): (batch_size * beam_size, per_node_beam_size)
954
+ top_log_probabilities, predicted_classes, sampler_state = self.sampler.sample_nodes(
955
+ cleaned_log_probabilities, self.per_node_beam_size, sampler_state
956
+ )
957
+
958
+ # Here we expand the last log probabilities to (batch_size * beam_size, per_node_beam_size)
959
+ # so that we can add them to the current log probs for this timestep.
960
+ # This lets us maintain the log probability of each element on the beam.
961
+ # shape: (batch_size * beam_size, per_node_beam_size)
962
+ expanded_last_log_probabilities = (
963
+ last_log_probabilities.unsqueeze(2)
964
+ .expand(batch_size, self.beam_size, self.per_node_beam_size)
965
+ .reshape(batch_size * self.beam_size, self.per_node_beam_size)
966
+ )
967
+
968
+ # shape: (batch_size * beam_size, per_node_beam_size)
969
+ summed_top_log_probabilities = top_log_probabilities + expanded_last_log_probabilities
970
+
971
+ # shape: (batch_size, beam_size * per_node_beam_size)
972
+ reshaped_summed = summed_top_log_probabilities.reshape(
973
+ batch_size, self.beam_size * self.per_node_beam_size
974
+ )
975
+
976
+ # shape: (batch_size, beam_size * per_node_beam_size)
977
+ reshaped_predicted_classes = predicted_classes.reshape(
978
+ batch_size, self.beam_size * self.per_node_beam_size
979
+ )
980
+
981
+ # Keep only the top `beam_size` beam indices.
982
+ # shape (both): (batch_size, beam_size)
983
+ (
984
+ restricted_beam_log_probs,
985
+ restricted_beam_indices,
986
+ sampler_state,
987
+ ) = self.sampler.sample_beams(reshaped_summed, self.beam_size, sampler_state)
988
+
989
+ # Use the beam indices to extract the corresponding classes.
990
+ # shape: (batch_size, beam_size)
991
+ restricted_predicted_classes = reshaped_predicted_classes.gather(1, restricted_beam_indices)
992
+
993
+ predictions.append(restricted_predicted_classes)
994
+
995
+ # shape: (batch_size, beam_size)
996
+ last_log_probabilities = restricted_beam_log_probs
997
+
998
+ # The beam indices come from a `beam_size * per_node_beam_size` dimension where the
999
+ # indices with a common ancestor are grouped together. Hence
1000
+ # dividing by per_node_beam_size gives the ancestor. (Note that this is integer
1001
+ # division as the tensor is a LongTensor.)
1002
+ # shape: (batch_size, beam_size)
1003
+ backpointer = torch.divide(restricted_beam_indices, self.per_node_beam_size, rounding_mode="trunc")
1004
+ backpointers.append(backpointer)
1005
+
1006
+ # Keep only the pieces of the state tensors corresponding to the
1007
+ # ancestors created this iteration.
1008
+ self._update_state(state, backpointer)
1009
+
1010
+ for i, constraint in enumerate(self.constraints):
1011
+ constraint_states[i] = constraint.update_state(
1012
+ constraint_states[i], restricted_predicted_classes, last_backpointer=backpointer
1013
+ )
1014
+
1015
+ # Warn about "-inf" log probabilities if not using any constraints (negligible
1016
+ # log probabilities are expected when using constraints).
1017
+ if not self.constraints and (
1018
+ not torch.isfinite(last_log_probabilities).all()
1019
+ or (last_log_probabilities == torch.finfo(last_log_probabilities.dtype).min).any()
1020
+ ):
1021
+ warnings.warn(
1022
+ "Negligible log probabilities encountered ('-inf' or equivalent). "
1023
+ "Some final sequences may not make sense. "
1024
+ "This can happen when the beam size is larger than the number of valid (non-zero "
1025
+ "probability) transitions that the step function produces.",
1026
+ RuntimeWarning,
1027
+ )
1028
+
1029
+ reconstructed_predictions = self._reconstruct_sequences(predictions, backpointers)
1030
+
1031
+ # shape: (batch_size, beam_size, max_steps)
1032
+ all_predictions = torch.cat(list(reversed(reconstructed_predictions)), 2)
1033
+
1034
+ # Calculate the final sequence scores
1035
+ # shape: (batch_size, beam_size)
1036
+ final_scores = self.final_sequence_scorer.score(all_predictions, last_log_probabilities, self._end_index)
1037
+
1038
+ # Sort the sequences based on the final scores so the best scoring
1039
+ # sequence is at index 0
1040
+ sorted_final_scores, sorted_indices = torch.sort(final_scores, dim=1, descending=True)
1041
+ sorted_all_predictions = torch.gather(
1042
+ all_predictions, 1, sorted_indices.unsqueeze(-1).expand_as(all_predictions)
1043
+ )
1044
+
1045
+ return sorted_all_predictions, sorted_final_scores
1046
+
1047
+ def _update_initial_state(self, state: StateType, batch_size: int):
1048
+ """
1049
+ Expand tensors in a state dictionary from `(batch_size, *)` to `(batch_size * beam_size, *)`.
1050
+ """
1051
+ for key, state_tensor in state.items():
1052
+ if state_tensor is None:
1053
+ continue
1054
+ # shape: (batch_size * beam_size, *)
1055
+ _, *last_dims = state_tensor.size()
1056
+ state[key] = (
1057
+ state_tensor.unsqueeze(1)
1058
+ .expand(batch_size, self.beam_size, *last_dims)
1059
+ .reshape(batch_size * self.beam_size, *last_dims)
1060
+ )
1061
+
1062
+ def _update_state(self, state: StateType, backpointer: torch.Tensor):
1063
+ batch_size = backpointer.size()[0]
1064
+
1065
+ for key, state_tensor in state.items():
1066
+ if state_tensor is None:
1067
+ continue
1068
+ _, *last_dims = state_tensor.size()
1069
+ # shape: (batch_size, beam_size, *)
1070
+ expanded_backpointer = backpointer.view(batch_size, self.beam_size, *([1] * len(last_dims))).expand(
1071
+ batch_size, self.beam_size, *last_dims
1072
+ )
1073
+ # shape: (batch_size * beam_size, *)
1074
+ state[key] = (
1075
+ state_tensor.reshape(batch_size, self.beam_size, *last_dims)
1076
+ .gather(1, expanded_backpointer)
1077
+ .reshape(batch_size * self.beam_size, *last_dims)
1078
+ )
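
For orientation, here is a minimal usage sketch of the `BeamSearch` class added above. It is a hypothetical example, not part of the uploaded files: the toy `step` function, vocabulary size, and token IDs are assumptions made for illustration, and the import assumes `beam_search.py` is importable as a module.

import torch

from beam_search import BeamSearch  # assumption: beam_search.py is on the Python path

VOCAB_SIZE = 8          # hypothetical vocabulary size
BOS_ID, EOS_ID = 0, 1   # hypothetical start / end token ids

def step(last_predictions: torch.Tensor, state: dict, timestep: int):
    # A real step function would run the decoder on `last_predictions` and `state`
    # (e.g. carrying a KV cache); here it just returns uniform log probabilities.
    group_size = last_predictions.size(0)
    log_probs = torch.log_softmax(torch.zeros(group_size, VOCAB_SIZE), dim=-1)
    return log_probs, state

beam_search = BeamSearch(end_index=EOS_ID, max_steps=10, beam_size=4)
start_predictions = torch.full((2,), BOS_ID, dtype=torch.long)  # batch_size = 2
predictions, scores = beam_search.search(start_predictions, {}, step)
# predictions: (batch_size, beam_size, num_generated_steps), best beam at index 0
# scores:      (batch_size, beam_size), sorted in descending order

In practice the step function wraps the model's forward pass and keeps decoder state (such as a key/value cache) in the `state` dictionary, as described in the `search` docstring.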
checkpoint.py ADDED
@@ -0,0 +1,2023 @@
1
+ import gc
2
+ import io
3
+ import logging
4
+ import pickle
5
+ import shutil
6
+ import traceback
7
+ from abc import ABCMeta, abstractmethod
8
+ from collections import defaultdict
9
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
10
+ from contextlib import contextmanager
11
+ from copy import deepcopy
12
+ from dataclasses import dataclass, field, replace
13
+ from functools import reduce
14
+ from multiprocessing import shared_memory
15
+ from pathlib import Path
16
+ from typing import Any, Dict, Generator, List, Optional, Set, Tuple, cast
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torch.distributed.checkpoint as dist_cp
21
+ import torch.multiprocessing as mp
22
+ import torch.nn as nn
23
+ from packaging import version
24
+ from torch.distributed import _remote_device
25
+ from torch.distributed._shard._utils import narrow_tensor_by_index
26
+ from torch.distributed._shard.metadata import ShardMetadata
27
+ from torch.distributed._shard.sharded_tensor import ShardedTensor
28
+ from torch.distributed.checkpoint.filesystem import WriteResult, _StorageInfo
29
+ from torch.distributed.checkpoint.metadata import Metadata, MetadataIndex
30
+ from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict
31
+ from torch.distributed.checkpoint.planner import LoadItemType, ReadItem
32
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
33
+ from torch.distributed.fsdp import StateDictType
34
+ from torch.distributed.fsdp.api import (
35
+ FullOptimStateDictConfig,
36
+ FullStateDictConfig,
37
+ ShardedOptimStateDictConfig,
38
+ ShardedStateDictConfig,
39
+ )
40
+ from torch.futures import Future
41
+ from torch.nn.parallel import DistributedDataParallel as DDP
42
+
43
+ try:
44
+ from torch.distributed.fsdp.flat_param import FlatParamHandle # type: ignore
45
+ except ModuleNotFoundError:
46
+ from torch.distributed.fsdp._flat_param import FlatParamHandle # type: ignore
47
+
48
+ from olmo import util
49
+
50
+ from .aliases import PathOrStr
51
+ from .config import BaseConfig, ShardedCheckpointerType, TrainConfig
52
+ from .exceptions import OLMoCheckpointError
53
+ from .optim import Optimizer, fix_optim_state_dict
54
+ from .safetensors_util import safetensors_file_to_state_dict
55
+ from .torch_util import (
56
+ barrier,
57
+ gc_cuda,
58
+ get_fs_local_rank,
59
+ get_global_rank,
60
+ get_local_rank,
61
+ get_local_world_size,
62
+ get_world_size,
63
+ )
64
+ from .util import (
65
+ _get_s3_client,
66
+ default_thread_count,
67
+ dir_is_empty,
68
+ get_bytes_range,
69
+ get_progress_bar,
70
+ resource_path,
71
+ upload,
72
+ wait_for,
73
+ )
74
+
75
+ __all__ = [
76
+ "save_fsdp_model_and_optim_state",
77
+ "load_fsdp_model_and_optim_state",
78
+ "load_fsdp_optim_state",
79
+ "save_state_dict",
80
+ "load_state_dict",
81
+ "load_model_state",
82
+ "RemoteFileSystemWriter",
83
+ "RemoteFileSystemReader",
84
+ "Checkpointer",
85
+ "FullCheckpointer",
86
+ "TorchNewStyleShardedCheckpointer",
87
+ "TorchLegacyShardedCheckpointer",
88
+ "LocalShardedCheckpointer",
89
+ "build_sharded_checkpointer",
90
+ ]
91
+
92
+
93
+ log = logging.getLogger(__name__)
94
+
95
+ MODEL_AND_OPTIM_FOLDER = "model_and_optim"
96
+
97
+
98
+ def save_fsdp_model_and_optim_state(
99
+ checkpoint_dir: PathOrStr,
100
+ fsdp_model: FSDP,
101
+ optim: Optimizer,
102
+ *,
103
+ upload_to: Optional[str] = None,
104
+ save_overwrite: bool = False,
105
+ ):
106
+ """
107
+ Use this to save a state dict for an FSDP model and its optimizer via :module:`torch.distributed.checkpoint`
108
+ functions. This should be used during distributed training and should be called by all ranks.
109
+
110
+ :param checkpoint_dir: The directory to save to.
111
+ :param fsdp_model: The FSDP model.
112
+ :param optim: The FSDP model's optimizer.
113
+ :param upload_to: Optional, a remote "directory" to upload the checkpoint files to.
114
+ :param save_overwrite: Overwrite existing files.
115
+
116
+ :raises FileExistsError: If a model and optim checkpoint already exists in ``checkpoint_dir`` and ``save_overwrite=False``.
117
+ """
118
+ checkpoint_dir = Path(checkpoint_dir)
119
+ target_dir = checkpoint_dir / MODEL_AND_OPTIM_FOLDER
120
+ if save_overwrite:
121
+ if get_fs_local_rank() == 0:
122
+ shutil.rmtree(target_dir, ignore_errors=True)
123
+ elif not dir_is_empty(target_dir):
124
+ raise FileExistsError(target_dir)
125
+ barrier()
126
+ if get_fs_local_rank() == 0:
127
+ target_dir.mkdir(exist_ok=True, parents=True)
128
+ barrier()
129
+ with FSDP.state_dict_type(
130
+ fsdp_model,
131
+ state_dict_type=StateDictType.SHARDED_STATE_DICT,
132
+ state_dict_config=ShardedStateDictConfig(offload_to_cpu=True),
133
+ optim_state_dict_config=ShardedOptimStateDictConfig(offload_to_cpu=True),
134
+ ):
135
+ model_and_optim_state = {
136
+ "model": fsdp_model.state_dict(),
137
+ "optim": FSDP.optim_state_dict(fsdp_model, optim),
138
+ }
139
+ dist_cp.save_state_dict(
140
+ model_and_optim_state,
141
+ RemoteFileSystemWriter(
142
+ target_dir,
143
+ upload_to=None if upload_to is None else f"{upload_to.rstrip('/')}/{MODEL_AND_OPTIM_FOLDER}",
144
+ save_overwrite=save_overwrite,
145
+ ),
146
+ )
147
+
148
+
149
+ def load_fsdp_model_and_optim_state(
150
+ checkpoint_dir: PathOrStr,
151
+ fsdp_model: FSDP,
152
+ optim: Optimizer,
153
+ *,
154
+ local_cache: Optional[PathOrStr] = None,
155
+ load_optimizer_state: bool = True,
156
+ ):
157
+ """
158
+ Use this to load a state dict for an FSDP model and its optimizer via :module:`torch.distributed.checkpoint`
159
+ functions. This should be used during distributed training and should be called by all ranks.
160
+
161
+ :param checkpoint_dir: The checkpoint directory to load from. This can be a local or remote directory.
162
+ :param fsdp_model: The FSDP model.
163
+ :param optim: The FSDP model's optimizer.
164
+ :param local_cache: A local cache of the checkpoint directory. Use this when the ``checkpoint_dir`` is a
165
+ remote "directory" but there might be a cached version of the same artifacts.
166
+ :param load_optimizer_state: Set to ``False`` to skip loading the optimizer state.
167
+
168
+ :raises FileNotFoundError: If the ``checkpoint_dir`` doesn't contain a model and optimizer checkpoint.
169
+ """
170
+ load_path = str(checkpoint_dir).rstrip("/")
171
+ local_cache = None if local_cache is None else Path(local_cache)
172
+ with FSDP.state_dict_type(
173
+ fsdp_model,
174
+ state_dict_type=StateDictType.SHARDED_STATE_DICT,
175
+ state_dict_config=ShardedStateDictConfig(offload_to_cpu=True),
176
+ optim_state_dict_config=ShardedOptimStateDictConfig(offload_to_cpu=True),
177
+ ):
178
+ # Load the model state dict in place.
179
+ log.info("Loading model state...")
180
+ model_state = {"model": fsdp_model.state_dict()}
181
+ dist_cp.load_state_dict(
182
+ model_state,
183
+ RemoteFileSystemReader(
184
+ f"{load_path}/{MODEL_AND_OPTIM_FOLDER}",
185
+ local_cache=None if local_cache is None else local_cache / MODEL_AND_OPTIM_FOLDER,
186
+ ),
187
+ )
188
+ fsdp_model.load_state_dict(model_state["model"])
189
+
190
+ if not load_optimizer_state:
191
+ return
192
+
193
+ # Load optim state dict in place.
194
+ log.info("Loading sharded optimizer state...")
195
+ optim_state = load_sharded_optimizer_state_dict(
196
+ model_state_dict=model_state["model"],
197
+ optimizer_key="optim",
198
+ storage_reader=RemoteFileSystemReader(
199
+ f"{load_path}/{MODEL_AND_OPTIM_FOLDER}",
200
+ local_cache=None if local_cache is None else local_cache / MODEL_AND_OPTIM_FOLDER,
201
+ ),
202
+ )
203
+ # optim_state["optim"] = {
204
+ # 'state': { fqn: { 'grad_norm_exp_avg': Tensor, 'step': Tensor, 'exp_avg': ShardedTensor, 'exp_avg_sq': ShardedTensor } },
205
+ # 'param_groups': [{ 'param_names': [ fsdp_fqn, ... ], 'params': [ fqn, ... ], ... }],
206
+ # }
207
+ del model_state
208
+
209
+ # Make sure tensors are on CPU! PyTorch puts them on GPU even though we have `offload_to_cpu=True`.
210
+ for state in optim_state["optim"]["state"].values():
211
+ for k in state.keys():
212
+ state[k] = state[k].cpu()
213
+ gc_cuda()
214
+
215
+ load_fsdp_optim_state(fsdp_model, optim, optim_state["optim"])
216
+
217
+
218
+ def load_fsdp_optim_state(fsdp_model: FSDP, optim: Optimizer, optim_state: Dict[str, Any]):
219
+ log.info("Flattening sharded optimizer state...")
220
+ # flattened_osd = {
221
+ # 'state': { id: { 'grad_norm_exp_avg': Tensor, 'step': Tensor, 'exp_avg': Tensor, 'exp_avg_sq': Tensor } },
222
+ # 'param_groups': [{ 'param_names': [ fsdp_fqn, ... ], 'params': [ id, ... ], ... }],
223
+ # }
224
+ # NOTE: Careful! The order of these arguments has changed from 2.0 to 2.1... ¯\_(ツ)_/¯
225
+ if version.parse(torch.__version__) < version.parse("2.1.0"):
226
+ flattened_osd = FSDP.optim_state_dict_to_load(optim_state, fsdp_model, optim) # type: ignore
227
+ else:
228
+ flattened_osd = FSDP.optim_state_dict_to_load(fsdp_model, optim, optim_state) # type: ignore
229
+
230
+ del optim_state
231
+ gc_cuda()
232
+
233
+ log.info("Loading flattened optimizer state...")
234
+
235
+ # Put optim state on CPU since `Optimizer.load_state_dict()` will create a deepcopy of the whole state dict,
236
+ # which takes up unnecessary GPU memory.
237
+ for state in flattened_osd["state"].values():
238
+ for k in state.keys():
239
+ state[k] = state[k].cpu()
240
+ gc_cuda()
241
+
242
+ optim.load_state_dict(fix_optim_state_dict(optim, flattened_osd))
243
+
244
+
245
+ def save_state_dict(
246
+ checkpoint_dir: PathOrStr,
247
+ fname: str,
248
+ state_dict: Dict[str, Any],
249
+ *,
250
+ upload_to: Optional[str] = None,
251
+ save_overwrite: bool = False,
252
+ synchronize: bool = True,
253
+ ):
254
+ """
255
+ Save a regular state dict to the file ``fname`` within ``checkpoint_dir`` using :func:`torch.save()`.
256
+ This can be used during distributed training or not. If used during distributed training, the ``fname`` should be unique
257
+ for each rank.
258
+
259
+ :param checkpoint_dir: The directory to save to.
260
+ :param fname: The target file within ``checkpoint_dir`` to save to. This should be a path relative to the ``checkpoint_dir``.
261
+ :param state_dict: The state dict to save.
262
+ :param upload_to: Optional, a remote "directory" to upload the file to.
263
+ :param save_overwrite: Overwrite existing files.
264
+ :param synchronize: If ``False``, don't do any distributed synchronization. Use this when only calling
265
+ this function from a single rank.
266
+
267
+ :raises FileExistsError: If the ``fname`` already exists within ``checkpoint_dir`` and ``save_overwrite=False``.
268
+ """
269
+ checkpoint_dir = Path(checkpoint_dir)
270
+ target_path = checkpoint_dir / fname
271
+ if save_overwrite:
272
+ target_path.unlink(missing_ok=True)
273
+ elif target_path.is_file():
274
+ raise FileExistsError(target_path)
275
+ if synchronize:
276
+ barrier()
277
+ target_path.parent.mkdir(exist_ok=True, parents=True)
278
+ if synchronize:
279
+ barrier()
280
+ torch.save(state_dict, target_path)
281
+ if upload_to is not None:
282
+ upload_target = f"{upload_to.rstrip('/')}/{fname}"
283
+ log.info(f"Uploading {target_path} to {upload_target}...")
284
+ upload(target_path, upload_target, save_overwrite=save_overwrite)
285
+
286
+
287
+ def load_state_dict(
288
+ checkpoint_dir: PathOrStr,
289
+ fname: str,
290
+ *,
291
+ local_cache: Optional[PathOrStr] = None,
292
+ map_location: Optional[str] = None,
293
+ ):
294
+ """
295
+ Load a regular state dict from the file ``fname`` within ``checkpoint_dir`` using :func:`torch.load()`.
296
+ This can be used during distributed training or not.
297
+
298
+ :param checkpoint_dir: A local or remote checkpoint directory.
299
+ :param fname: The target file within the ``checkpoint_dir``. This should be a path relative to the ``checkpoint_dir``.
300
+ :param local_cache: A local cache of the checkpoint directory. Use this when the ``checkpoint_dir`` is a
301
+ remote "directory" but there might be a cached version of the same artifacts.
302
+
303
+ :raises FileNotFoundError: If ``fname`` doesn't exist in the ``checkpoint_dir`` or the local cache.
304
+ """
305
+ if fname.endswith(".pt"):
306
+ # Try safetensors version first.
307
+ try:
308
+ path = resource_path(
309
+ str(checkpoint_dir).rstrip("/"), fname[:-2] + "safetensors", local_cache=local_cache
310
+ )
311
+ return safetensors_file_to_state_dict(path, map_location=map_location)
312
+ except FileNotFoundError:
313
+ pass
314
+
315
+ path = resource_path(str(checkpoint_dir).rstrip("/"), fname, local_cache=local_cache)
316
+ return torch.load(path, map_location=map_location)
317
+
318
+
319
+ def load_model_state(checkpoint_dir: PathOrStr, model: torch.nn.Module):
320
+ """
321
+ Load model state from a distributed FSDP model checkpoint created from :func:`save_fsdp_model_and_optim_state()`.
322
+ Note that ``model`` should not be wrapped with FSDP.
323
+ """
324
+ state_dict = {"model": model.state_dict()}
325
+ dist_cp.load_state_dict(
326
+ state_dict,
327
+ RemoteFileSystemReader(f"{str(checkpoint_dir).rstrip('/')}/{MODEL_AND_OPTIM_FOLDER}"),
328
+ no_dist=True,
329
+ )
330
+ model.load_state_dict(state_dict["model"])
331
+
332
+
333
+ class RemoteFileSystemWriter(dist_cp.FileSystemWriter):
334
+ """
335
+ A subclass of :class:`~torch.distributed.checkpoint.FileSystemWriter` that can upload files
336
+ directly to a cloud bucket when ``upload_to`` is specified.
337
+ """
338
+
339
+ def __init__(
340
+ self,
341
+ path: PathOrStr,
342
+ single_file_per_rank: bool = True,
343
+ sync_files: bool = True,
344
+ thread_count: Optional[int] = None,
345
+ per_thread_copy_ahead: int = 10_000_000,
346
+ upload_to: Optional[str] = None,
347
+ save_overwrite: bool = False,
348
+ ) -> None:
349
+ if thread_count is not None and thread_count <= 0:
350
+ raise ValueError("thread count must be at least 1")
351
+ super().__init__(
352
+ path,
353
+ single_file_per_rank=single_file_per_rank,
354
+ sync_files=sync_files,
355
+ # NOTE: we default to 1 thread here instead of whatever `default_thread_count()`
356
+ # returns because uploading big checkpoint files with multiple threads causes
357
+ # boto3 to fail in weird ways.
358
+ thread_count=thread_count or 1,
359
+ per_thread_copy_ahead=per_thread_copy_ahead,
360
+ )
361
+ self.upload_to = None if upload_to is None else upload_to.rstrip("/")
362
+ self.save_overwrite = save_overwrite
363
+
364
+ def write_data(
365
+ self,
366
+ plan: dist_cp.SavePlan,
367
+ planner: dist_cp.SavePlanner,
368
+ ) -> Future[List[WriteResult]]:
369
+ fut = super().write_data(plan, planner)
370
+ if self.upload_to is not None:
371
+ files_to_upload = set()
372
+ for write_result in fut.wait():
373
+ files_to_upload.add(write_result.storage_data.relative_path)
374
+
375
+ # Create the global S3 client up front to work around a threading issue in boto.
376
+ if self.upload_to.startswith("s3://"):
377
+ _get_s3_client("s3")
378
+ elif self.upload_to.startswith("r2://"):
379
+ _get_s3_client("r2")
380
+ elif self.upload_to.startswith("weka://"):
381
+ _get_s3_client("weka")
382
+
383
+ with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
384
+ futures = []
385
+ for fname in files_to_upload:
386
+ source = self.path / fname
387
+ target = f"{self.upload_to}/{fname}"
388
+ log.info(f"Uploading {source} to {target}...")
389
+ futures.append(executor.submit(upload, source, target, save_overwrite=self.save_overwrite))
390
+ for f in as_completed(futures):
391
+ try:
392
+ f.result()
393
+ except BaseException:
394
+ # NOTE: we might get an error here that can't be pickled, which causes a different failure
395
+ # later when PyTorch tries to reduce that error across ranks. So here we just make
396
+ # sure we're raising a simple error type that can be pickled.
397
+ raise OLMoCheckpointError(f"Original error:\n{traceback.format_exc()}")
398
+ return fut
399
+
400
+ def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None:
401
+ super().finish(metadata, results)
402
+ if self.upload_to is not None:
403
+ source = self.path / ".metadata"
404
+ target = f"{self.upload_to}/.metadata"
405
+ log.info(f"Uploading {source} to {target}...")
406
+ upload(source, target, save_overwrite=self.save_overwrite)
407
+
408
+
409
+ class RemoteFileSystemReader(dist_cp.StorageReader):
410
+ """
411
+ A :class:`~torch.distributed.checkpoint.StorageReader` based on :class:`~torch.distributed.checkpoint.FileSystemReader`
412
+ that can read data directly from cloud storage as well as a local directory.
413
+ """
414
+
415
+ def __init__(
416
+ self, path: PathOrStr, *, local_cache: Optional[PathOrStr] = None, thread_count: Optional[int] = None
417
+ ):
418
+ super().__init__()
419
+ if thread_count is not None and thread_count <= 0:
420
+ raise ValueError("thread count must be at least 1")
421
+ self.path = str(path).rstrip("/")
422
+ self.cache = None if local_cache is None else Path(local_cache)
423
+ self.thread_count = thread_count or default_thread_count()
424
+ self.storage_data: Dict[MetadataIndex, _StorageInfo] = dict()
425
+ self._metadata: Optional[Metadata] = None
426
+
427
+ def _get_bytes(self, relative_path: str, offset: int, length: int) -> bytes:
428
+ if self.cache is not None and (path := self.cache / relative_path).is_file():
429
+ return get_bytes_range(path, offset, length)
430
+ else:
431
+ return get_bytes_range(f"{self.path}/{relative_path}", offset, length)
432
+
433
+ def _get_content_for_read(self, read_item: ReadItem) -> Tuple[ReadItem, bytes]:
434
+ sinfo = self.storage_data[read_item.storage_index]
435
+ content = self._get_bytes(sinfo.relative_path, sinfo.offset, sinfo.length)
436
+ return (read_item, content)
437
+
438
+ def read_data(self, plan: dist_cp.LoadPlan, planner: dist_cp.LoadPlanner) -> Future[None]:
439
+ # Create the global S3 client up front to work around a threading issue in boto.
440
+ if isinstance(self.path, str):
441
+ if self.path.startswith("s3://"):
442
+ _get_s3_client("s3")
443
+ elif self.path.startswith("r2://"):
444
+ _get_s3_client("r2")
445
+ elif self.path.startswith("weka://"):
446
+ _get_s3_client("weka")
447
+
448
+ with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
449
+ read_item_content_futures = []
450
+ for read_item in plan.items:
451
+ read_item_content_futures.append(executor.submit(self._get_content_for_read, read_item))
452
+ read_item_content_results = []
453
+ for f in as_completed(read_item_content_futures):
454
+ try:
455
+ read_item_content_results.append(f.result())
456
+ except BaseException:
457
+ # NOTE: we might get an error here that can't be pickled, which causes a different failure
458
+ # later when PyTorch tries to reduce that error across ranks. So here we just make
459
+ # sure we're raising a simple error type that can be pickled.
460
+ raise OLMoCheckpointError(f"Original error:\n{traceback.format_exc()}")
461
+
462
+ # Modified from `FileSystemReader.read_data()`
463
+ for read_item, content in read_item_content_results:
464
+ bytes = io.BytesIO(content)
465
+ bytes.seek(0)
466
+ if read_item.type == LoadItemType.BYTE_IO:
467
+ planner.load_bytes(read_item, bytes)
468
+ else:
469
+ tensor = cast(torch.Tensor, torch.load(bytes, map_location="cpu"))
470
+ tensor = narrow_tensor_by_index(tensor, read_item.storage_offsets, read_item.lengths)
471
+ target_tensor = planner.resolve_tensor(read_item).detach()
472
+
473
+ assert (
474
+ target_tensor.size() == tensor.size()
475
+ ), f"req {read_item.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}"
476
+ target_tensor.copy_(tensor)
477
+ planner.commit_tensor(read_item, target_tensor)
478
+
479
+ fut: Future = Future()
480
+ fut.set_result(None)
481
+ return fut
482
+
483
+ def read_metadata(self) -> Metadata:
484
+ if self._metadata is None:
485
+ with resource_path(self.path, ".metadata", local_cache=self.cache).open("rb") as metadata_file:
486
+ self._metadata = pickle.load(metadata_file)
487
+ return self._metadata
488
+
489
+ def set_up_storage_reader(self, metadata: Metadata, is_coordinator: bool) -> None:
490
+ del is_coordinator
491
+ self.storage_data = metadata.storage_data
492
+ assert self.storage_data is not None
493
+
494
+ def prepare_local_plan(self, plan: dist_cp.LoadPlan) -> dist_cp.LoadPlan:
495
+ return plan
496
+
497
+ def prepare_global_plan(self, global_plan: List[dist_cp.LoadPlan]) -> List[dist_cp.LoadPlan]:
498
+ return global_plan
499
+
500
+
501
+ class Checkpointer(metaclass=ABCMeta):
502
+ def __init__(self, cfg: TrainConfig, thread_count: Optional[int] = None):
503
+ self.cfg = cfg
504
+ self.thread_count = thread_count or default_thread_count()
505
+
506
+ @abstractmethod
507
+ def save_checkpoint(
508
+ self,
509
+ dir: PathOrStr,
510
+ dist_model: nn.Module,
511
+ optim: Optimizer,
512
+ train_state: Dict[str, Any],
513
+ *,
514
+ upload_to: Optional[str] = None,
515
+ ) -> None:
516
+ raise NotImplementedError
517
+
518
+ @abstractmethod
519
+ def restore_checkpoint(
520
+ self,
521
+ load_path: PathOrStr,
522
+ dist_model: nn.Module,
523
+ optim: Optimizer,
524
+ *,
525
+ local_cache: Optional[PathOrStr] = None,
526
+ load_optimizer_state: bool = True,
527
+ ) -> Dict[str, Any]:
528
+ """
529
+ Restores a checkpoint to the model and optimizer. Returns the remaining trainer state.
530
+ """
531
+ raise NotImplementedError
532
+
533
+ def unshard_checkpoint(
534
+ self,
535
+ load_path: PathOrStr,
536
+ *,
537
+ local_cache: Optional[PathOrStr] = None,
538
+ load_optimizer_state: bool = True,
539
+ load_trainer_state: bool = True,
540
+ device: Optional[torch.device] = None,
541
+ ) -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
542
+ """
543
+ Unshard a checkpoint.
544
+
545
+ Note this is not marked abstract because child classes are not required to implement this.
546
+ """
547
+ raise NotImplementedError
548
+
549
+ @contextmanager
550
+ def _temporary_wd(self, dir: PathOrStr) -> Generator[Path, None, None]:
551
+ # Make sure checkpoint directory doesn't exist unless it's okay to overwrite it.
552
+ checkpoint_dir = Path(dir)
553
+ if not dir_is_empty(checkpoint_dir):
554
+ if self.cfg.save_overwrite:
555
+ if get_fs_local_rank() == 0:
556
+ shutil.rmtree(checkpoint_dir, ignore_errors=True)
557
+ else:
558
+ raise FileExistsError(checkpoint_dir)
559
+ # No need to mkdir here since we'll directly replace the temporary directory with
560
+ # this directory below.
561
+ barrier()
562
+
563
+ # Prepare temporary directory. We don't have to be as careful here, we can
564
+ # just remove it if it already exists.
565
+ checkpoint_dir_tmp = checkpoint_dir.with_name(checkpoint_dir.name + "-tmp")
566
+ if get_fs_local_rank() == 0:
567
+ shutil.rmtree(checkpoint_dir_tmp, ignore_errors=True)
568
+ checkpoint_dir_tmp.mkdir(exist_ok=True, parents=True)
569
+
570
+ # In the cases where we're using a shared NFS drive between ranks to save checkpoints,
571
+ # creating the temp directory from rank 0 might not be immediately
572
+ # realized in the file systems of the other ranks.
573
+ # So we wait here across all ranks until that tmp checkpoint directory is visible.
574
+ wait_for(lambda: checkpoint_dir_tmp.exists(), "Waiting for checkpoint directory", timeout=10.0)
575
+
576
+ barrier()
577
+
578
+ # Yield temporary directory for `.save_checkpoint()` to use.
579
+ yield checkpoint_dir_tmp
580
+
581
+ barrier()
582
+
583
+ # Finally if all went well replace the temporary directory with the actual
584
+ # checkpoint directory.
585
+ if get_fs_local_rank() == 0:
586
+ # Replace temp directory with target checkpoint directory.
587
+ try:
588
+ checkpoint_dir_tmp.replace(checkpoint_dir)
589
+ except FileNotFoundError:
590
+ # Caught when another (file-system) local rank 0 has already replaced the tmp directory.
591
+ # This can happen when nodes are saving to a common NFS drive but otherwise have distinct
592
+ # file-systems.
593
+ if not checkpoint_dir.exists():
594
+ raise
595
+
596
+ # In the cases where we're using a shared NFS drive between ranks to save checkpoints,
597
+ # replacing the temp directory with the final directory from rank 0 might not be immediately
598
+ # realized in the file systems of the other ranks.
599
+ # So we wait here across all ranks until that final checkpoint directory is visible.
600
+ wait_for(lambda: checkpoint_dir.exists(), "Waiting for checkpoint directory", timeout=10.0)
601
+
602
+ barrier()
603
+
604
+ def _save_config(self, dir: PathOrStr, *, upload_to: Optional[str] = None) -> None:
605
+ if get_global_rank() == 0:
606
+ log.info("Saving config...")
607
+ self.cfg.save(config_path := Path(dir) / "config.yaml")
608
+ if upload_to is not None:
609
+ upload_target = f"{upload_to}/config.yaml"
610
+ log.info(f"Uploading {config_path} to {upload_target}")
611
+ upload(config_path, upload_target, save_overwrite=self.cfg.save_overwrite)
612
+
613
+
614
+ class FullCheckpointer(Checkpointer):
615
+ """
616
+ A :class:`Checkpointer` that saves a single full model and optimizer state dictionary.
617
+ """
618
+
619
+ def save_checkpoint(
620
+ self,
621
+ dir: PathOrStr,
622
+ dist_model: nn.Module,
623
+ optim: Optimizer,
624
+ trainer_state: Dict[str, Any],
625
+ *,
626
+ upload_to: Optional[str] = None,
627
+ ) -> None:
628
+ with self._temporary_wd(dir) as checkpoint_dir:
629
+ if isinstance(dist_model, FSDP):
630
+ with FSDP.state_dict_type(
631
+ dist_model,
632
+ state_dict_type=StateDictType.FULL_STATE_DICT,
633
+ state_dict_config=FullStateDictConfig(rank0_only=True, offload_to_cpu=True),
634
+ optim_state_dict_config=FullOptimStateDictConfig(rank0_only=True, offload_to_cpu=True),
635
+ ):
636
+ # We'll write the model and optimizer state dicts individually to reduce (CPU) memory consumption.
637
+ # First the model state.
638
+ model_state_dict = dist_model.state_dict()
639
+ self._write_model_dict(
640
+ model_state_dict, checkpoint_dir, upload_to, save_overwrite=self.cfg.save_overwrite
641
+ )
642
+
643
+ # Then the optimizer state.
644
+ optim_state_dict = FSDP.optim_state_dict(dist_model, optim)
645
+ self._write_optim_dict(
646
+ optim_state_dict, checkpoint_dir, upload_to, save_overwrite=self.cfg.save_overwrite
647
+ )
648
+ elif isinstance(dist_model, DDP):
649
+ # _write_model_dict and _write_optim_dict only write checkpoints for rank 0
650
+ # First, get the model state dict from DDP wrapped model
651
+ model_state_dict = dist_model.module.state_dict()
652
+ self._write_model_dict(
653
+ model_state_dict, checkpoint_dir, upload_to, save_overwrite=self.cfg.save_overwrite
654
+ )
655
+
656
+ # Then get the optimizer state dict
657
+ optim_state_dict = optim.state_dict()
658
+ self._write_optim_dict(
659
+ optim_state_dict, checkpoint_dir, upload_to, save_overwrite=self.cfg.save_overwrite
660
+ )
661
+ else:
662
+ log.info(
663
+ "`FullCheckpointer.save_checkpoint` only supported for FSDP and DDP distributed strategies!"
664
+ )
665
+
666
+ # Save trainer state.
667
+ if get_global_rank() == 0:
668
+ log.info("Saving trainer state...")
669
+ save_state_dict(
670
+ checkpoint_dir,
671
+ "train.pt",
672
+ trainer_state,
673
+ upload_to=upload_to,
674
+ save_overwrite=self.cfg.save_overwrite,
675
+ synchronize=False,
676
+ )
677
+ # Save config.
678
+ self._save_config(checkpoint_dir, upload_to=upload_to)
679
+
680
+ def restore_checkpoint(
681
+ self,
682
+ load_path: PathOrStr,
683
+ dist_model: nn.Module,
684
+ optim: Optimizer,
685
+ *,
686
+ local_cache: Optional[PathOrStr] = None,
687
+ load_optimizer_state: bool = True,
688
+ ) -> Dict[str, Any]:
689
+ if isinstance(dist_model, FSDP):
690
+ with FSDP.state_dict_type(
691
+ dist_model,
692
+ state_dict_type=StateDictType.FULL_STATE_DICT,
693
+ state_dict_config=FullStateDictConfig(rank0_only=False, offload_to_cpu=True),
694
+ optim_state_dict_config=FullOptimStateDictConfig(rank0_only=False, offload_to_cpu=True),
695
+ ):
696
+ with torch.no_grad():
697
+ # fill everything with NaN, so we can check afterwards that every parameter has been restored
698
+ for module_name, module in dist_model.named_modules():
699
+ if not isinstance(module, FSDP):
700
+ continue
701
+ for param in module.params:
702
+ param.fill_(torch.nan)
703
+
704
+ # restore params from checkpoint
705
+ state_dict_to_load = load_state_dict(
706
+ load_path, "model.pt", local_cache=local_cache, map_location="cpu"
707
+ )
708
+ (
709
+ state_dict_to_load,
710
+ og_keys_to_new,
711
+ ) = dist_model._fsdp_wrapped_module._make_state_dict_compatible(state_dict_to_load)
712
+
713
+ for module_name, module in dist_model.named_modules():
714
+ if not isinstance(module, FSDP):
715
+ continue
716
+ for param in module.params:
717
+ assert param._is_flat_param
718
+ for fqn, spi in zip(param._fqns, param._shard_param_infos):
719
+ if not spi.in_shard:
720
+ continue
721
+ key = f"{module_name}.{fqn}"
722
+ key = key.replace("_fsdp_wrapped_module.", "")
723
+ key = key.lstrip(".")
724
+ t = state_dict_to_load[key]
725
+ t = t.flatten()
726
+ param[spi.offset_in_shard : spi.offset_in_shard + spi.numel_in_shard].copy_(
727
+ t[spi.intra_param_start_idx : spi.intra_param_end_idx + 1]
728
+ )
729
+
730
+ # make sure that every parameter has been restored
731
+ for module_name, module in dist_model.named_modules():
732
+ if not isinstance(module, FSDP):
733
+ continue
734
+ for param in module.params:
735
+ if torch.isnan(param).any():
736
+ raise ValueError(
737
+ f"Module '{module_name}' contains NaNs, this is likely a bug restoring from full checkpoints"
738
+ )
739
+
740
+ # Load optimizer state.
741
+ if load_optimizer_state:
742
+ optim_state_dict_to_load = load_state_dict(
743
+ load_path, "optim.pt", local_cache=local_cache, map_location="cpu"
744
+ )
745
+ optim_state_dict_to_load = self._make_optim_state_dict_compatible(
746
+ optim_state_dict_to_load,
747
+ og_keys_to_new,
748
+ )
749
+ gc.collect()
750
+ torch.cuda.empty_cache()
751
+ barrier()
752
+ for turn in range(get_local_world_size()):
753
+ log.info("Loading optimizer state turn %d ...", turn)
754
+ if turn == get_local_rank():
755
+ load_fsdp_optim_state(dist_model, optim, optim_state_dict_to_load)
756
+ gc.collect()
757
+ torch.cuda.empty_cache()
758
+ barrier()
759
+ del optim_state_dict_to_load
760
+ elif isinstance(dist_model, DDP):
761
+ # Load model state.
762
+ with torch.no_grad():
763
+ state_dict_to_load = load_state_dict(
764
+ load_path, "model.pt", local_cache=local_cache, map_location="cpu"
765
+ )
766
+ dist_model.module.load_state_dict(state_dict_to_load, strict=True)
767
+
768
+ # Load optimizer state.
769
+ if load_optimizer_state:
770
+ optim_state_dict_to_load = load_state_dict(
771
+ load_path, "optim.pt", local_cache=local_cache, map_location="cpu"
772
+ )
773
+ optim.load_state_dict(optim_state_dict_to_load)
774
+
775
+ gc.collect()
776
+ torch.cuda.empty_cache()
777
+ barrier()
778
+ else:
779
+ raise NotImplementedError(
780
+ "`FullCheckpointer.restore_checkpoint` only supported for FSDP and DDP distributed strategies!"
781
+ )
782
+
783
+ # Load other state.
784
+ try:
785
+ trainer_state = load_state_dict(load_path, "train.pt", local_cache=local_cache)
786
+ except FileNotFoundError:
787
+ # for backwards compatibility
788
+ trainer_state = load_state_dict(load_path, "other.pt", local_cache=local_cache)
789
+ barrier()
790
+ return trainer_state
791
+
792
+ def _write_model_dict(self, model_state_dict, checkpoint_dir, upload_to, save_overwrite):
793
+ if get_global_rank() == 0:
794
+ log.info("Saving model state...")
795
+ save_state_dict(
796
+ checkpoint_dir,
797
+ "model.pt",
798
+ model_state_dict,
799
+ upload_to=upload_to,
800
+ save_overwrite=save_overwrite,
801
+ synchronize=False,
802
+ )
803
+
804
+ del model_state_dict
805
+ barrier()
806
+
807
+ def _write_optim_dict(self, optim_state_dict, checkpoint_dir, upload_to, save_overwrite):
808
+ if get_global_rank() == 0:
809
+ log.info("Saving optim state...")
810
+ save_state_dict(
811
+ checkpoint_dir,
812
+ "optim.pt",
813
+ optim_state_dict,
814
+ upload_to=upload_to,
815
+ save_overwrite=save_overwrite,
816
+ synchronize=False,
817
+ )
818
+
819
+ del optim_state_dict
820
+ barrier()
821
+
822
+ def _make_optim_state_dict_compatible(
823
+ self, optim_state_dict: Dict[str, Any], og_keys_to_new: Dict[str, Set[str]]
824
+ ) -> Dict[str, Any]:
825
+ # This state dict comes in two forms: one where the state keys are integers and one where the
826
+ # keys are fully qualified parameter names. The latter case is easier to deal with here so we
827
+ # first transform the integer key form into the FQN key form.
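+ # For instance (hypothetical FQN), the integer-keyed form
+ #   {"state": {0: {...}}, "param_groups": [{"param_names": ["transformer.wte.weight"], "params": [0]}]}
+ # becomes the FQN-keyed form
+ #   {"state": {"transformer.wte.weight": {...}}, "param_groups": [{"param_names": [...], "params": ["transformer.wte.weight"]}]}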
828
+ if isinstance(optim_state_dict["param_groups"][0]["params"][0], int):
829
+ id_to_fqn: Dict[int, str] = {}
830
+ for group in optim_state_dict["param_groups"]:
831
+ new_param_names = []
832
+ for fqn, id in zip(group["param_names"], group["params"]):
833
+ fqn = fqn.replace("_fsdp_wrapped_module.", "")
834
+ id_to_fqn[id] = fqn
835
+ new_param_names.append(fqn)
836
+ group["param_names"] = new_param_names
837
+ group["params"] = new_param_names
838
+ for id in list(optim_state_dict["state"].keys()):
839
+ optim_state_dict["state"][id_to_fqn[id]] = optim_state_dict["state"].pop(id)
840
+ else:
841
+ # Otherwise we still want to clean up the param names to remove the "_fsdp_wrapped_module." prefix.
842
+ for group in optim_state_dict["param_groups"]:
843
+ group["param_names"] = [fqn.replace("_fsdp_wrapped_module.", "") for fqn in group["param_names"]]
844
+ group["params"] = [fqn.replace("_fsdp_wrapped_module.", "") for fqn in group["params"]]
845
+ assert group["param_names"] == group["params"]
846
+ for key in list(optim_state_dict["state"].keys()):
847
+ optim_state_dict["state"][key.replace("_fsdp_wrapped_module.", "")] = optim_state_dict[
848
+ "state"
849
+ ].pop(key)
850
+
851
+ # Now we can transform the state dict by renaming parameters according to `og_keys_to_new`.
852
+ # First fix param names in the state.
853
+ for og_key, new_keys in og_keys_to_new.items():
854
+ og_state = optim_state_dict["state"].pop(og_key, None)
855
+ if og_state is None:
856
+ continue
857
+ for i, new_key in enumerate(new_keys):
858
+ if i == len(new_keys) - 1:
859
+ optim_state_dict["state"][new_key] = og_state
860
+ else:
861
+ optim_state_dict["state"][new_key] = deepcopy(og_state)
862
+ # Now fix param names in the param groups.
863
+ for group in optim_state_dict["param_groups"]:
864
+ og_names = group["params"]
865
+ new_names = []
866
+ for og_key in og_names:
867
+ for new_key in og_keys_to_new[og_key]:
868
+ new_names.append(new_key)
869
+ group["params"] = new_names
870
+ group["param_names"] = new_names
871
+
872
+ return optim_state_dict
873
+
874
+ def load_checkpoint(
875
+ self,
876
+ load_path: PathOrStr,
877
+ *,
878
+ local_cache: Optional[PathOrStr] = None,
879
+ load_optimizer_state: bool = True,
880
+ device: Optional[torch.device] = None,
881
+ ) -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, Any]]]:
882
+ device = device if device is not None else torch.device("cpu")
883
+ model_state = load_state_dict(load_path, "model.pt", local_cache=local_cache, map_location=device) # type: ignore
884
+ optim_state = None
885
+ if load_optimizer_state:
886
+ optim_state = load_state_dict(load_path, "optim.pt", local_cache=local_cache, map_location=device) # type: ignore
887
+ return model_state, optim_state
888
+
889
+
890
+ class TorchNewStyleShardedCheckpointer(Checkpointer):
891
+ """
892
+ A sharded :class:`Checkpointer` that uses PyTorch's new distributed checkpointing functionality.
893
+ """
894
+
895
+ def save_checkpoint(
896
+ self,
897
+ dir: PathOrStr,
898
+ dist_model: nn.Module,
899
+ optim: Optimizer,
900
+ trainer_state: Dict[str, Any],
901
+ *,
902
+ upload_to: Optional[str] = None,
903
+ ) -> None:
904
+ assert isinstance(
905
+ dist_model, FSDP
906
+ ), f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
907
+ with self._temporary_wd(dir) as checkpoint_dir:
908
+ # Save model and optim state.
909
+ save_fsdp_model_and_optim_state(
910
+ checkpoint_dir,
911
+ dist_model,
912
+ optim,
913
+ upload_to=upload_to,
914
+ save_overwrite=self.cfg.save_overwrite,
915
+ )
916
+
917
+ # Save trainer state.
918
+ log.info("Saving trainer state...")
919
+ save_state_dict(
920
+ checkpoint_dir,
921
+ f"train/rank{get_global_rank()}.pt",
922
+ trainer_state,
923
+ upload_to=upload_to,
924
+ save_overwrite=self.cfg.save_overwrite,
925
+ )
926
+
927
+ # Save config.
928
+ self._save_config(checkpoint_dir, upload_to=upload_to)
929
+
930
+ def restore_checkpoint(
931
+ self,
932
+ load_path: PathOrStr,
933
+ dist_model: nn.Module,
934
+ optim: Optimizer,
935
+ *,
936
+ local_cache: Optional[PathOrStr] = None,
937
+ load_optimizer_state: bool = True,
938
+ ) -> Dict[str, Any]:
939
+ # Load model and optimizer state in place.
940
+ log.info("Loading model and optimizer state...")
941
+ assert isinstance(
942
+ dist_model, FSDP
943
+ ), f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
944
+
945
+ load_fsdp_model_and_optim_state(
946
+ load_path,
947
+ dist_model,
948
+ optim,
949
+ local_cache=local_cache,
950
+ load_optimizer_state=load_optimizer_state,
951
+ )
952
+
953
+ # Load trainer state dict.
954
+ log.info("Loading trainer state...")
955
+ try:
956
+ trainer_state = load_state_dict(
957
+ load_path, f"train/rank{get_global_rank()}.pt", local_cache=local_cache
958
+ )
959
+ except FileNotFoundError:
960
+ # Fall back to rank 0 train state.
961
+ # This can happen when we're restoring a checkpoint with a different world size.
962
+ trainer_state = load_state_dict(load_path, "train/rank0.pt", local_cache=local_cache)
963
+ barrier()
964
+ return trainer_state
965
+
966
+
967
+ class TorchLegacyShardedCheckpointer(Checkpointer):
968
+ """
969
+ A sharded :class:`Checkpointer` that just uses `torch.save()` with extra logic for handling FSDP model
970
+ and optim state.
971
+
972
+ The world size must be kept consistent when using this checkpointer.
973
+ """
974
+
975
+ def __init__(self, cfg: TrainConfig, thread_count: Optional[int] = None, use_shared_mem_impl: bool = False):
976
+ super().__init__(cfg, thread_count)
977
+ self.use_shared_mem_impl = use_shared_mem_impl
978
+
979
+ def save_checkpoint(
980
+ self,
981
+ dir: PathOrStr,
982
+ dist_model: nn.Module,
983
+ optim: Optimizer,
984
+ trainer_state: Dict[str, Any],
985
+ *,
986
+ upload_to: Optional[str] = None,
987
+ ) -> None:
988
+ assert isinstance(
989
+ dist_model, FSDP
990
+ ), f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
991
+ with self._temporary_wd(dir) as checkpoint_dir:
992
+ with FSDP.state_dict_type(
993
+ dist_model,
994
+ state_dict_type=StateDictType.SHARDED_STATE_DICT,
995
+ state_dict_config=ShardedStateDictConfig(offload_to_cpu=True),
996
+ optim_state_dict_config=ShardedOptimStateDictConfig(offload_to_cpu=True),
997
+ ):
998
+ state_dict = {
999
+ "model": dist_model.state_dict(),
1000
+ "optim": FSDP.optim_state_dict(dist_model, optim),
1001
+ **trainer_state,
1002
+ }
1003
+ save_state_dict(
1004
+ checkpoint_dir,
1005
+ f"rank{get_global_rank()}.pt",
1006
+ state_dict,
1007
+ upload_to=upload_to,
1008
+ save_overwrite=self.cfg.save_overwrite,
1009
+ )
1010
+
1011
+ # Save config.
1012
+ self._save_config(checkpoint_dir, upload_to=upload_to)
1013
+
1014
+ def restore_checkpoint(
1015
+ self,
1016
+ load_path: PathOrStr,
1017
+ dist_model: nn.Module,
1018
+ optim: Optimizer,
1019
+ *,
1020
+ local_cache: Optional[PathOrStr] = None,
1021
+ load_optimizer_state: bool = True,
1022
+ ) -> Dict[str, Any]:
1023
+ assert isinstance(
1024
+ dist_model, FSDP
1025
+ ), f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
1026
+ with FSDP.state_dict_type(
1027
+ dist_model,
1028
+ state_dict_type=StateDictType.SHARDED_STATE_DICT,
1029
+ state_dict_config=ShardedStateDictConfig(offload_to_cpu=True),
1030
+ optim_state_dict_config=ShardedOptimStateDictConfig(offload_to_cpu=True),
1031
+ ):
1032
+ # Deserialize state dict.
1033
+ state_dict = load_state_dict(
1034
+ load_path, f"rank{get_global_rank()}.pt", local_cache=local_cache, map_location="cpu"
1035
+ )
1036
+
1037
+ # Load model and optimizer state.
1038
+ log.info("Loading model state...")
1039
+ dist_model.load_state_dict(state_dict["model"])
1040
+ del state_dict["model"]
1041
+ if load_optimizer_state:
1042
+ log.info("Loading optimizer state...")
1043
+ load_fsdp_optim_state(dist_model, optim, state_dict["optim"])
1044
+ del state_dict["optim"]
1045
+
1046
+ barrier()
1047
+ return state_dict
1048
+
1049
+ def unshard_checkpoint(
1050
+ self,
1051
+ load_path: PathOrStr,
1052
+ *,
1053
+ local_cache: Optional[PathOrStr] = None,
1054
+ load_optimizer_state: bool = True,
1055
+ load_trainer_state: bool = True,
1056
+ device: Optional[torch.device] = None,
1057
+ ) -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
1058
+ assert local_cache is None, "this method currently only supports local files"
1059
+ full_state_dict = self._unshard(load_path, device or torch.device("cpu"), skip_keys={"rng"})
1060
+ model_state = full_state_dict.pop("model")
1061
+ optim_state = full_state_dict.pop("optim")
1062
+ return (
1063
+ model_state,
1064
+ optim_state if load_optimizer_state else None,
1065
+ full_state_dict if load_trainer_state else None,
1066
+ )
1067
+
1068
+ def _copy_sharded_tensors_to_shared_mem(self, state: Dict, world_size: int, rank: int, key: Tuple):
1069
+ key = tuple() if key is None else key
1070
+ if isinstance(state, (list, tuple, set)):
1071
+ for i, sub_state in enumerate(state):
1072
+ self._copy_sharded_tensors_to_shared_mem(sub_state, world_size, rank, key + (i,))
1073
+ elif isinstance(state, dict):
1074
+ for name in state.keys():
1075
+ self._copy_sharded_tensors_to_shared_mem(state[name], world_size, rank, key + (name,))
1076
+ elif isinstance(state, ShardedTensor):
1077
+ self._copy_sharded_tensor_to_shared_mem(state, world_size, rank, key)
1078
+ return
1079
+ else:
1080
+ return
1081
+
1082
+ def _get_shard_placement_and_rank_sizes(
1083
+ self, shards_metadata: List[ShardMetadata], world_size: int
1084
+ ) -> Tuple[Dict[ShardMetadata, Tuple[int, int]], List[int]]:
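+ # Returns, for each shard, its (owning rank, element offset within that rank's flattened
+ # local data), along with the total number of elements stored on each rank.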
1085
+ def shard_size(shard_md):
1086
+ return reduce((lambda x, y: x * y), shard_md.shard_sizes) # type: ignore[attr-defined]
1087
+
1088
+ rank_sizes = [0 for _ in range(world_size)]
1089
+ shard_placement: Dict[ShardMetadata, Tuple[int, int]] = {}
1090
+ for shard_md in shards_metadata:
1091
+ shard_rank = cast(_remote_device, shard_md.placement).rank()
1092
+ assert shard_rank is not None
1093
+ if shard_rank >= world_size:
1094
+ raise RuntimeError(f"Shard rank {shard_rank} exceeds world size {world_size}")
1095
+
1096
+ shard_placement[shard_md] = (shard_rank, rank_sizes[shard_rank])
1097
+ rank_sizes[shard_rank] += shard_size(shard_md)
1098
+
1099
+ return shard_placement, rank_sizes
1100
+
1101
+ def _copy_sharded_tensor_to_shared_mem(
1102
+ self, sharded_tensor: ShardedTensor, world_size: int, rank: int, key: Tuple
1103
+ ) -> Any:
1104
+ shard0_md = sharded_tensor.metadata()
1105
+ shard_placement, rank_sizes = self._get_shard_placement_and_rank_sizes(
1106
+ shard0_md.shards_metadata, world_size
1107
+ )
1108
+
1109
+ rank_size = rank_sizes[rank]
1110
+ assert rank_size >= 0
1111
+ if rank_size == 0:
1112
+ return
1113
+
1114
+ assert shard0_md.tensor_properties.dtype == torch.float32, "Expected sharded tensor to be fp32"
1115
+ numpy_type = np.float32
1116
+
1117
+ sharded_memory_name = "-".join(key + (str(rank),))
1118
+
1119
+ shm = shared_memory.SharedMemory(
1120
+ create=True, size=rank_size * np.dtype(numpy_type).itemsize, name=sharded_memory_name
1121
+ )
1122
+ np_arr = np.ndarray((rank_size,), dtype=numpy_type, buffer=shm.buf)
1123
+
1124
+ for local_shard in sharded_tensor.local_shards():
1125
+ shard_rank = cast(_remote_device, local_shard.metadata.placement).rank()
1126
+ assert shard_rank == rank
1127
+
1128
+ src = local_shard.tensor.flatten()
1129
+ shard_offset = shard_placement[local_shard.metadata][1]
1130
+
1131
+ np_arr[shard_offset : shard_offset + src.numel()] = src.numpy()
1132
+
1133
+ shm.close()
1134
+
1135
+ def _copy_sharded_data_to_shared_mem(self, world_size: int, shard_filepath: Path):
1136
+ shard_number = int(shard_filepath.name[4:-3])
1137
+ log.info("Starting unsharding shard number %d to shared memory", shard_number)
1138
+
1139
+ with self._patch_sharded_tensor_load():
1140
+ shard = torch.load(shard_filepath, map_location="cpu")
1141
+ log.debug("Done loading shard number %d", shard_number)
1142
+
1143
+ self._copy_sharded_tensors_to_shared_mem(
1144
+ shard, world_size, shard_number, (str(shard_filepath.parent).replace("/", "_"),)
1145
+ )
1146
+ log.info("Done unsharding shard number %d to shared memory", shard_number)
1147
+
1148
+ def _unshard_using_sharded_mem(
1149
+ self, state: Any, world_size: int, device: torch.device, shard_dir: PathOrStr
1150
+ ) -> Any:
1151
+ return self._unshard_state_using_shared_mem(state, world_size, device, (str(shard_dir).replace("/", "_"),))
1152
+
1153
+ def _unshard_state_using_shared_mem(
1154
+ self, state: Any, world_size: int, device: torch.device, key: Tuple
1155
+ ) -> Any:
1156
+ if isinstance(state, (list, tuple, set)):
1157
+ return state.__class__(
1158
+ self._unshard_state_using_shared_mem(sub_state, world_size, device, key + (i,))
1159
+ for i, sub_state in enumerate(state)
1160
+ )
1161
+ elif isinstance(state, dict):
1162
+ return {
1163
+ name: self._unshard_state_using_shared_mem(state[name], world_size, device, key + (name,))
1164
+ for name in state.keys()
1165
+ }
1166
+ elif isinstance(state, ShardedTensor):
1167
+ return self._unshard_tensor_using_shared_mem(state, world_size, device, key)
1168
+ elif isinstance(state, torch.Tensor):
1169
+ return state.to(device=device)
1170
+ else:
1171
+ return state
1172
+
1173
+ def _unshard_tensor_using_shared_mem(
1174
+ self, sharded_tensor: ShardedTensor, world_size: int, device: torch.device, key: Tuple
1175
+ ) -> torch.Tensor:
1176
+ shard0_md = sharded_tensor.metadata()
1177
+
1178
+ def shard_size(shard_md):
1179
+ return reduce((lambda x, y: x * y), shard_md.shard_sizes) # type: ignore[attr-defined]
1180
+
1181
+ shard_placement, rank_sizes = self._get_shard_placement_and_rank_sizes(
1182
+ shard0_md.shards_metadata, world_size
1183
+ )
1184
+
1185
+ assert shard0_md.tensor_properties.dtype == torch.float32, "Expected sharded tensor to be fp32"
1186
+ numpy_type = np.float32
1187
+
1188
+ out = torch.empty(
1189
+ *sharded_tensor.metadata().size, dtype=sharded_tensor.metadata().tensor_properties.dtype, device=device
1190
+ )
1191
+ dims = len(sharded_tensor.metadata().size)
1192
+ for shard_md, (rank, rank_offset) in shard_placement.items():
1193
+ if rank >= world_size:
1194
+ raise RuntimeError(f"Shard rank {rank} exceeds world size {world_size}")
1195
+
1196
+ sharded_memory_name = "-".join(key + (str(rank),))
1197
+ shm = shared_memory.SharedMemory(name=sharded_memory_name)
1198
+
1199
+ rank_size = rank_sizes[rank]
1200
+ assert rank_size >= 0
1201
+ if rank_size == 0:
1202
+ continue
1203
+
1204
+ np_arr = np.ndarray((rank_size,), dtype=numpy_type, buffer=shm.buf)
1205
+
1206
+ tensor = torch.from_numpy(np_arr)[rank_offset : rank_offset + shard_size(shard_md)]
1207
+ tensor = tensor.view(shard_md.shard_sizes)
1208
+
1209
+ out_narrow_view = out
1210
+ for dim in range(dims):
1211
+ out_narrow_view = out_narrow_view.narrow(
1212
+ dim,
1213
+ shard_md.shard_offsets[dim],
1214
+ shard_md.shard_sizes[dim],
1215
+ )
1216
+
1217
+ out_narrow_view.copy_(tensor)
1218
+
1219
+ shm.close()
1220
+ shm.unlink()
1221
+
1222
+ return out
1223
+
1224
+ @contextmanager
1225
+ def _patch_sharded_tensor_load(self):
1226
+ """
1227
+ Monkeypatch for torch's ShardedTensor, so we can unpickle without having torch.distributed set up.
1228
+ """
1229
+
1230
+ def _rebuild_from_type_v2_monkey(func, new_type, args, state):
1231
+ ret = func(*args)
1232
+ if type(ret) is not new_type:
1233
+ ret = ret.as_subclass(new_type)
1234
+
1235
+ # Shortcut the construction of ShardedTensor
1236
+ # This is in the top 5 of my worst hacks.
1237
+ if isinstance(ret, ShardedTensor):
1238
+ ret._local_shards, ret._metadata, _, ret._sharding_spec, ret._init_rrefs = state
1239
+ return ret
1240
+
1241
+ # The rest of this function ought to be in the top 5 of somebody else's worst hacks.
1242
+ # Tensor does define __setstate__ even though it doesn't define
1243
+ # __getstate__. So only use __setstate__ if it is NOT the one defined
1244
+ # on Tensor
1245
+ if getattr(ret.__class__, "__setstate__", torch.Tensor.__setstate__) is not torch.Tensor.__setstate__:
1246
+ ret.__setstate__(state)
1247
+ else:
1248
+ ret = torch._utils._set_obj_state(ret, state)
1249
+ return ret
1250
+
1251
+ original_rebuild_from_type_v2 = torch._tensor._rebuild_from_type_v2
1252
+ try:
1253
+ torch._tensor._rebuild_from_type_v2 = _rebuild_from_type_v2_monkey
1254
+ yield
1255
+ finally:
1256
+ torch._tensor._rebuild_from_type_v2 = original_rebuild_from_type_v2
1257
+
1258
+ def _unshard_using_shared_memory(
1259
+ self, input_dir: PathOrStr, device: torch.device, skip_keys: Optional[Set[str]] = None
1260
+ ):
1261
+ """
1262
+ This unsharding implementation consists of:
1263
+
1264
+ 1. Loading each shard on a separate process and copying their sharded tensors to shared memory.
1265
+ 2. Loading 1 shard on the main process as a base unsharded object.
1266
+ 3. Using the sharded tensors in shared memory to populate the base unsharded object.
1267
+
1268
+ This implementation is an alternative to a prior implementation that instead loaded
1269
+ all shards using threads, because that implementation turned out to
1270
+ be extremely slow (e.g. 6+ hours) sometimes when the world size was 1024.
1271
+ The current implementation is slower than the old one in many scenarios,
1272
+ but is significantly faster in the above-mentioned case (e.g. 30 minutes)
1273
+ if there are enough CPUs.
1274
+
1275
+ We keep the other implementation since this one can be more unreliable,
1276
+ likely due to its dependence on a large amount of shared memory.
1277
+ """
1278
+
1279
+ input_dir = Path(input_dir)
1280
+ skip_keys = skip_keys or set()
1281
+
1282
+ shard_filepaths = list(input_dir.glob("rank*.pt"))
1283
+ world_size = len(shard_filepaths)
1284
+ if world_size == 0:
1285
+ raise RuntimeError("No shards found for unsharding")
1286
+
1287
+ log.info("Number of shards: %d", world_size)
1288
+ shard_size_gb = shard_filepaths[0].stat().st_size / (1024 * 1024 * 1024)
1289
+ min_ram_required_estimate_gb = shard_size_gb * world_size
1290
+ log.info(
1291
+ "Shards are %.2fGB each, at least %.2fGB RAM is required", shard_size_gb, min_ram_required_estimate_gb
1292
+ )
1293
+
1294
+ log.info("Copying sharded tensors to shared memory using multiple processes")
1295
+ # Copy sharded data to shared memory using multiple processes, so this process can load
1296
+ # from memory rather than disk. We spawn a new process instead of forking since shared memory
1297
+ # appears to get deleted when forked processes end for some reason.
1298
+ executor = ProcessPoolExecutor(
1299
+ mp_context=mp.get_context("spawn"), initializer=util.prepare_cli_environment
1300
+ )
1301
+ futures = []
1302
+ for shard_filepath in shard_filepaths:
1303
+ shard_rank = int(shard_filepath.name[4:-3])
1304
+
1305
+ if shard_rank >= world_size:
1306
+ raise RuntimeError(
1307
+ f"Shard rank {shard_rank} of file {shard_filepath} exceeds world size {world_size}"
1308
+ )
1309
+
1310
+ futures.append(executor.submit(self._copy_sharded_data_to_shared_mem, world_size, shard_filepath))
1311
+
1312
+ for f in as_completed(futures):
1313
+ f.result()
1314
+ executor.shutdown()
1315
+
1316
+ log.info("Loading a shard on the main process to be unsharded state")
1317
+ with self._patch_sharded_tensor_load():
1318
+ state = torch.load(shard_filepaths[0], map_location="cpu")
1319
+
1320
+ for key in skip_keys:
1321
+ if key in state:
1322
+ del state[key]
1323
+
1324
+ log.info("Unsharding from %d shards ...", world_size)
1325
+ return self._unshard_using_sharded_mem(state, world_size, device, input_dir)
1326
+
1327
+ def _unshard(self, input_dir: PathOrStr, device: torch.device, skip_keys: Optional[Set[str]] = None):
1328
+ if self.use_shared_mem_impl:
1329
+ return self._unshard_using_shared_memory(input_dir, device, skip_keys)
1330
+
1331
+ input_dir = Path(input_dir)
1332
+ skip_keys = skip_keys or set()
1333
+
1334
+ with self._patch_sharded_tensor_load():
1335
+ # We load in threads because it's faster.
1336
+ executor = ThreadPoolExecutor()
1337
+ shards_dict = {}
1338
+ for shard_name in input_dir.glob("rank*.pt"):
1339
+ log.info("Loading %s ...", shard_name)
1340
+ shard_number = int(shard_name.name[4:-3]) # shard names look like "rankXX.pt"
1341
+ shards_dict[shard_number] = executor.submit(torch.load, shard_name, map_location="cpu")
1342
+ shards = [None] * len(shards_dict)
1343
+ for rank, shard_future in shards_dict.items():
1344
+ shard = shard_future.result()
1345
+ for key in skip_keys:
1346
+ if key in shard:
1347
+ del shard[key]
1348
+ shards[rank] = shard
1349
+ assert all(shard is not None for shard in shards)
1350
+ executor.shutdown()
1351
+ del shards_dict
1352
+
1353
+ log.info("Unsharding from %d shards ...", len(shards))
1354
+
1355
+ unsharded_state_dict = self._unshard_object(shards, device=device)
1356
+ # At this point in time we need 2x memory :-(
1357
+ del shards
1358
+
1359
+ return unsharded_state_dict
1360
+
1361
+ def _unshard_object(self, os: List[Any], device: torch.device) -> Any:
1362
+ rank0_item = os[0]
1363
+ assert all(type(o) is type(rank0_item) for o in os)
1364
+ if isinstance(rank0_item, str):
1365
+ assert all(o == rank0_item for o in os)
1366
+ return rank0_item
1367
+ elif isinstance(rank0_item, (list, tuple, set)):
1368
+ assert all(len(o) == len(rank0_item) for o in os)
1369
+ return rank0_item.__class__(self._unshard_object(o, device=device) for o in zip(*os))
1370
+ elif isinstance(rank0_item, dict):
1371
+ assert all(o.keys() == rank0_item.keys() for o in os)
1372
+ return {key: self._unshard_object([o[key] for o in os], device=device) for key in rank0_item.keys()}
1373
+ elif isinstance(rank0_item, ShardedTensor):
1374
+ return self._gather(os, device=device)
1375
+ else:
1376
+ assert all(self._objects_are_equal(o, rank0_item) for o in os)
1377
+ return rank0_item
1378
+
1379
+ def _gather(self, shards: List[ShardedTensor], device: torch.device) -> torch.Tensor:
1380
+ world_size = len(shards)
1381
+ shard0_md = shards[0].metadata()
1382
+ # Make sure all shards agree on the metadata
1383
+ assert all(shard.metadata() == shard0_md for shard in shards)
1384
+ # Make sure the nth shard expects to be the nth shard.
1385
+ assert all(
1386
+ shard_md.placement.rank() == rank # type: ignore
1387
+ for rank, shard_md in enumerate(shard0_md.shards_metadata)
1388
+ )
1389
+
1390
+ def shard_size(shard_md):
1391
+ return reduce((lambda x, y: x * y), shard_md.shard_sizes) # type: ignore[attr-defined]
1392
+
1393
+ rank_sizes = [0 for _ in range(world_size)]
1394
+ max_rank_size = 0
1395
+ shard_placement: Dict[ShardMetadata, Tuple[int, int]] = {}
1396
+ for shard_md in shard0_md.shards_metadata:
1397
+ shard_rank = cast(_remote_device, shard_md.placement).rank()
1398
+ assert shard_rank is not None
1399
+
1400
+ shard_placement[shard_md] = (shard_rank, rank_sizes[shard_rank])
1401
+ rank_sizes[shard_rank] += shard_size(shard_md)
1402
+ max_rank_size = max(max_rank_size, rank_sizes[shard_rank])
1403
+
1404
+ gather_list: List[torch.Tensor] = [torch.empty((max_rank_size,)) for _ in range(world_size)]
1405
+
1406
+ datas = []
1407
+ with torch.no_grad():
1408
+ for shard in shards:
1409
+ data = torch.empty(max_rank_size)
1410
+
1411
+ for local_shard in shard.local_shards():
1412
+ src = local_shard.tensor.flatten()
1413
+ shard_offset = shard_placement[local_shard.metadata][1]
1414
+ data[shard_offset : shard_offset + src.numel()].copy_(src)
1415
+
1416
+ datas.append(data)
1417
+
1418
+ # torch.gather in a nutshell
1419
+ for rank, data in enumerate(datas):
1420
+ gather_list[rank].copy_(data)
1421
+
1422
+ full_size = shard0_md.size
1423
+ out = torch.empty(*full_size, dtype=shard0_md.tensor_properties.dtype, device=device)
1424
+ dims = len(full_size)
1425
+ for shard_md in shard0_md.shards_metadata:
1426
+ rank, rank_offset = shard_placement[shard_md]
1427
+ tensor = gather_list[rank]
1428
+ tensor = tensor[rank_offset : rank_offset + shard_size(shard_md)]
1429
+ tensor = tensor.view(shard_md.shard_sizes)
1430
+
1431
+ out_narrow_view = out
1432
+ for dim in range(dims):
1433
+ out_narrow_view = out_narrow_view.narrow(
1434
+ dim,
1435
+ shard_md.shard_offsets[dim],
1436
+ shard_md.shard_sizes[dim],
1437
+ )
1438
+
1439
+ out_narrow_view.copy_(tensor)
1440
+
1441
+ return out
1442
+
1443
+ def _objects_are_equal(self, a: Any, b: Any) -> bool:
1444
+ if type(a) is not type(b):
1445
+ return False
1446
+ if isinstance(a, np.ndarray):
1447
+ return np.array_equal(a, b)
1448
+ elif isinstance(a, torch.Tensor):
1449
+ return torch.equal(a, b)
1450
+ else:
1451
+ return a == b
1452
+
1453
+
1454
+ @dataclass
1455
+ class _LocalShardedCheckpointerMetadata(BaseConfig):
1456
+ world_size: int = field(default_factory=get_world_size)
1457
+
1458
+
1459
+ @dataclass
1460
+ class _FlatParamShard:
1461
+ full_shape: torch.Size
1462
+ shard_offsets: Tuple[int, int]
1463
+ shard_data: Optional[torch.Tensor]
1464
+
1465
+ def copy_into(self, full_tensor: torch.Tensor) -> None:
1466
+ assert self.shard_data is not None
1467
+ full_tensor_shard_view = full_tensor.view(-1)[self.shard_offsets[0] : self.shard_offsets[1] + 1]
1468
+ assert self.shard_data.shape == full_tensor_shard_view.shape
1469
+ full_tensor_shard_view.copy_(self.shard_data)
1470
+
1471
+
1472
+ class LocalShardedCheckpointer(Checkpointer):
1473
+ """
1474
+ A sharded :class:`Checkpointer` that directly saves the local FSDP flat params data.
1475
+ The optimizer state is saved directly with `torch.save()` without reformatting via FSDP methods.
1476
+
1477
+ The world size must be kept consistent when using this checkpointer. However, you can easily
1478
+ reconstruct a full unsharded model and/or optimizer state dictionary from a single Python process
1479
+ using :meth:`unshard_checkpoint()` (no distributed initialization required).
1480
+ """
1481
+
1482
+ # These correspond to metadata attributes on `torch.distributed.fsdp.flat_param.FlatParameter`.
1483
+ _FLAT_PARAM_METADATA_TO_SAVE = (
1484
+ "_fqns",
1485
+ "_shard_param_offsets",
1486
+ "_shard_indices",
1487
+ "_numels",
1488
+ "_numels_with_padding",
1489
+ "_shapes",
1490
+ "_shard_numel_padded",
1491
+ "_shard_param_infos",
1492
+ )
1493
+
1494
+ def _fsdp_modules(self, fsdp_model: FSDP) -> List[Tuple[str, FSDP]]:
1495
+ """
1496
+ Returns a list of FSDP modules with their FQN.
1497
+ """
1498
+ modules = []
1499
+ for name, module in fsdp_model.named_modules():
1500
+ if isinstance(module, FSDP):
1501
+ modules.append((name, module))
1502
+ return modules
1503
+
1504
+ def _prepare_fsdp_model(self, fsdp_model: FSDP) -> None:
1505
+ from torch.distributed.fsdp._runtime_utils import _lazy_init
1506
+
1507
+ # TODO (epwalsh): I'm not sure if this is necessary, but this is what PyTorch does before saving/loading
1508
+ # an FSDP state dict through the built-in methods.
1509
+ if torch.cuda.is_available():
1510
+ torch.cuda.synchronize()
1511
+ _lazy_init(fsdp_model, fsdp_model)
1512
+
1513
+ def _fsdp_handles(self, fsdp_model: FSDP) -> List[FlatParamHandle]:
1514
+ if version.parse(torch.__version__) < version.parse("2.1.0"):
1515
+ return fsdp_model._handles # type: ignore
1516
+ elif version.parse(torch.__version__) < version.parse("2.3.0"):
1517
+ # Handle could be None if the FSDP wrapper doesn't manage any parameters.
1518
+ if hasattr(fsdp_model, "_handle") and fsdp_model._handle is not None:
1519
+ return [fsdp_model._handle] # type: ignore
1520
+ else:
1521
+ return []
1522
+ else:
1523
+ # Need to verify FSDP internals with newer versions.
1524
+ raise NotImplementedError
1525
+
1526
+ @torch.no_grad()
1527
+ def _get_flat_param_state_to_save(self, fsdp_model: FSDP) -> Dict[str, Any]:
1528
+ self._prepare_fsdp_model(fsdp_model)
1529
+ module_data = []
1530
+ for module_fqn, fsdp_module in self._fsdp_modules(fsdp_model):
1531
+ handle_data = []
1532
+ for handle in self._fsdp_handles(fsdp_module):
1533
+ data: Dict[str, Any] = {}
1534
+ # This is a `FlatParameter` instance.
1535
+ # See `torch.distributed.fsdp.flat_param` for the API.
1536
+ flat_param = handle.flat_param
1537
+ data["flat_param.data"] = flat_param.detach()
1538
+ for key in self._FLAT_PARAM_METADATA_TO_SAVE:
1539
+ if hasattr(flat_param, key):
1540
+ data[f"flat_param.{key}"] = getattr(flat_param, key)
1541
+ handle_data.append(data)
1542
+ module_data.append({"handles": handle_data, "name": module_fqn})
1543
+ return {"modules": module_data}
1544
+
1545
+ @torch.no_grad()
1546
+ def _load_flat_param_state(self, fsdp_model: FSDP, model_state: Dict[str, Any]):
1547
+ """Load the state produced from `self._get_flat_param_state_to_save()`."""
1548
+ self._prepare_fsdp_model(fsdp_model)
1549
+ fsdp_modules = self._fsdp_modules(fsdp_model)
1550
+ assert len(model_state["modules"]) == len(fsdp_modules)
1551
+ for (_, fsdp_module), module_data in zip(fsdp_modules, model_state["modules"]):
1552
+ handles = self._fsdp_handles(fsdp_module)
1553
+ assert len(handles) == len(module_data["handles"])
1554
+ for handle, data in zip(handles, module_data["handles"]):
1555
+ flat_param = handle.flat_param
1556
+ # Make sure metadata matches.
1557
+ for key in self._FLAT_PARAM_METADATA_TO_SAVE:
1558
+ if hasattr(flat_param, key):
1559
+ assert getattr(flat_param, key) == data[f"flat_param.{key}"]
1560
+ # Load the flat sharded data.
1561
+ flat_param.copy_(data["flat_param.data"])
1562
+
1563
+ def _save_metadata(self, dir: PathOrStr, *, upload_to: Optional[str] = None) -> None:
1564
+ if get_fs_local_rank() == 0:
1565
+ log.info("Saving metadata...")
1566
+ metadata = _LocalShardedCheckpointerMetadata()
1567
+ metadata.save(metadata_path := Path(dir) / "metadata.yaml")
1568
+ if upload_to is not None and get_global_rank() == 0:
1569
+ upload_target = f"{upload_to}/metadata.yaml"
1570
+ log.info(f"Uploading {metadata_path} to {upload_target}")
1571
+ upload(metadata_path, upload_target, save_overwrite=self.cfg.save_overwrite)
1572
+
1573
+ def _load_metadata(
1574
+ self, load_path: PathOrStr, *, local_cache: Optional[PathOrStr] = None
1575
+ ) -> _LocalShardedCheckpointerMetadata:
1576
+ metadata_path = resource_path(load_path, "metadata.yaml", local_cache=local_cache)
1577
+ return _LocalShardedCheckpointerMetadata.load(metadata_path)
1578
+
1579
+ def save_checkpoint(
1580
+ self,
1581
+ dir: PathOrStr,
1582
+ dist_model: nn.Module,
1583
+ optim: Optimizer,
1584
+ trainer_state: Dict[str, Any],
1585
+ *,
1586
+ upload_to: Optional[str] = None,
1587
+ ) -> None:
1588
+ assert isinstance(
1589
+ dist_model, FSDP
1590
+ ), f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
1591
+
1592
+ with self._temporary_wd(dir) as checkpoint_dir:
1593
+ # Gather local FSDP flat params data to save.
1594
+ # We also save some flat param metadata like the corresponding fully qualified names (fqns)
1595
+ # of each original parameter so we can validate that the sharding is the same when loading
1596
+ # one of these checkpoints.
1597
+ log.info("Saving local FSDP flat params data...")
1598
+ save_state_dict(
1599
+ checkpoint_dir,
1600
+ f"model/rank{get_global_rank()}.pt",
1601
+ self._get_flat_param_state_to_save(dist_model),
1602
+ upload_to=upload_to,
1603
+ save_overwrite=self.cfg.save_overwrite,
1604
+ )
1605
+
1606
+ # Save optimizer state.
1607
+ log.info("Saving local optimizer state...")
1608
+ save_state_dict(
1609
+ checkpoint_dir,
1610
+ f"optim/rank{get_global_rank()}.pt",
1611
+ optim.state_dict(),
1612
+ upload_to=upload_to,
1613
+ save_overwrite=self.cfg.save_overwrite,
1614
+ )
1615
+
1616
+ # Save trainer state.
1617
+ log.info("Saving trainer state...")
1618
+ save_state_dict(
1619
+ checkpoint_dir,
1620
+ f"train/rank{get_global_rank()}.pt",
1621
+ trainer_state,
1622
+ upload_to=upload_to,
1623
+ save_overwrite=self.cfg.save_overwrite,
1624
+ )
1625
+
1626
+ # Save metadata.
1627
+ self._save_metadata(checkpoint_dir, upload_to=upload_to)
1628
+
1629
+ # Save config. We do this last b/c the presence of a config in a remote checkpoint
1630
+ # "directory" indicates that the folder is valid, as a opposed to a partially
1631
+ # uploaded checkpoint directory that failed before completing.
1632
+ self._save_config(checkpoint_dir, upload_to=upload_to)
1633
+
1634
+ def restore_checkpoint(
1635
+ self,
1636
+ load_path: PathOrStr,
1637
+ dist_model: nn.Module,
1638
+ optim: Optimizer,
1639
+ *,
1640
+ local_cache: Optional[PathOrStr] = None,
1641
+ load_optimizer_state: bool = True,
1642
+ ) -> Dict[str, Any]:
1643
+ # Load metadata and make sure checkpoint is compatible.
1644
+ metadata = self._load_metadata(load_path, local_cache=local_cache)
1645
+ assert metadata.world_size == get_world_size()
1646
+
1647
+ # Load local FSDP flat param data.
1648
+ log.info("Loading local FSDP flat params data...")
1649
+ assert isinstance(
1650
+ dist_model, FSDP
1651
+ ), f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
1652
+
1653
+ model_state = load_state_dict(
1654
+ load_path, f"model/rank{get_global_rank()}.pt", local_cache=local_cache, map_location="cpu"
1655
+ )
1656
+ self._load_flat_param_state(dist_model, model_state)
1657
+ del model_state
1658
+
1659
+ # Load local optim state.
1660
+ if load_optimizer_state:
1661
+ log.info("Loading local optimizer state...")
1662
+ optim_state = load_state_dict(
1663
+ load_path, f"optim/rank{get_global_rank()}.pt", local_cache=local_cache, map_location="cpu"
1664
+ )
1665
+ # HACK/TODO (epwalsh): When we use adaptive clipping we track the 'grad_norm_exp_avg' for every param
1666
+ # in every rank, and keep this in the optimizer state. But this causes issues when loading the
1667
+ # state since torch sees the state is non-empty for some params which would normally be empty,
1668
+ # and then assumes it should have all of the other state tensors for that param, which it doesn't.
1669
+ # So for now we just remove 'grad_norm_exp_avg' everywhere from the state, which resets that metric.
1670
+ # Not the end of the world but there's probably a better way around this without resetting
1671
+ # the metric.
1672
+ for param_id in list(optim_state["state"].keys()):
1673
+ state = optim_state["state"][param_id]
1674
+ if "grad_norm_exp_avg" in state:
1675
+ del state["grad_norm_exp_avg"]
1676
+ if len(state) == 0:
1677
+ del optim_state["state"][param_id]
1678
+ optim.load_state_dict(optim_state)
1679
+ del optim_state
1680
+
1681
+ # Load local trainer state.
1682
+ log.info("Loading local trainer state...")
1683
+ trainer_state = load_state_dict(load_path, f"train/rank{get_global_rank()}.pt", local_cache=local_cache)
1684
+ barrier()
1685
+ return trainer_state
1686
+
1687
+ def _iter_flat_param_shards(
1688
+ self, model_state: Dict[str, Any]
1689
+ ) -> Generator[Tuple[str, _FlatParamShard], None, None]:
1690
+ for module_data in model_state["modules"]:
1691
+ module_prefix = module_data["name"].replace("_fsdp_wrapped_module.", "")
1692
+ for handle in module_data["handles"]:
1693
+ flat_data = handle["flat_param.data"]
1694
+ if (num_padding := handle["flat_param._shard_numel_padded"]) > 0:
1695
+ # If there's padding in the flat param it should be on the right.
1696
+ assert (flat_data[-num_padding:] == 0).all()
1697
+ # NOTE: this changes depending on the torch version, but we don't do a version
1698
+ # check since we might be trying to unshard an old checkpoint that was stored
1699
+ # with a different torch version than we're currently running with.
1700
+ if "flat_param._shard_indices" in handle:
1701
+ # torch <=2.0.1
1702
+ param_start = handle["flat_param._shard_indices"][0]
1703
+ current_flat_index = 0
1704
+ for relative_fqn, full_shape, (offset_start, offset_end) in zip(
1705
+ handle["flat_param._fqns"][param_start:],
1706
+ handle["flat_param._shapes"][param_start:],
1707
+ handle["flat_param._shard_param_offsets"],
1708
+ ):
1709
+ root_fqn = relative_fqn if not module_prefix else f"{module_prefix}.{relative_fqn}"
1710
+ numel_shard = offset_end - offset_start + 1
1711
+ flat_param_shard = _FlatParamShard(
1712
+ full_shape=full_shape,
1713
+ shard_offsets=(offset_start, offset_end),
1714
+ shard_data=flat_data[current_flat_index : current_flat_index + numel_shard],
1715
+ )
1716
+ current_flat_index += numel_shard
1717
+ yield root_fqn, flat_param_shard
1718
+ else:
1719
+ # torch >=2.1.0
1720
+ for relative_fqn, full_shape, shard_param_info in zip(
1721
+ handle["flat_param._fqns"],
1722
+ handle["flat_param._shapes"],
1723
+ handle["flat_param._shard_param_infos"],
1724
+ ):
1725
+ if not shard_param_info.in_shard:
1726
+ continue
1727
+ root_fqn = relative_fqn if not module_prefix else f"{module_prefix}.{relative_fqn}"
1728
+ flat_param_shard = _FlatParamShard(
1729
+ full_shape=full_shape,
1730
+ shard_offsets=(
1731
+ shard_param_info.intra_param_start_idx,
1732
+ shard_param_info.intra_param_end_idx,
1733
+ ),
1734
+ shard_data=flat_data[
1735
+ shard_param_info.offset_in_shard : shard_param_info.offset_in_shard
1736
+ + shard_param_info.numel_in_shard
1737
+ ],
1738
+ )
1739
+ yield root_fqn, flat_param_shard
1740
+
1741
+ def unshard_checkpoint(
1742
+ self,
1743
+ load_path: PathOrStr,
1744
+ *,
1745
+ local_cache: Optional[PathOrStr] = None,
1746
+ load_optimizer_state: bool = True,
1747
+ load_trainer_state: bool = True,
1748
+ device: Optional[torch.device] = None,
1749
+ ) -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
1750
+ device = device or torch.device("cpu")
1751
+ metadata = self._load_metadata(load_path, local_cache=local_cache)
1752
+
1753
+ # Gather paths model state, potentially downloading them.
1754
+ log.info("Gathering model state dicts...")
1755
+ model_state_paths = self._gather_state_dict_paths(
1756
+ load_path, "model", metadata.world_size, local_cache=local_cache
1757
+ )
1758
+
1759
+ # Load model state dicts one-by-one, materializing and populating the full parameters as we go.
1760
+ log.info("Materializing full parameters...")
1761
+ full_model_state: Dict[str, torch.Tensor] = {}
1762
+ # We keep a copy of the flat param metadata minus the actual tensors so we can reconstruct
1763
+ # the full optimizer state below without having to reload the model state dicts.
1764
+ flat_params_data: Dict[int, Dict[str, _FlatParamShard]] = defaultdict(dict)
1765
+ for rank, path in enumerate(model_state_paths):
1766
+ log.info(f"Loading shards from rank {rank}...")
1767
+ model_state = torch.load(path, map_location="cpu")
1768
+ for root_fqn, flat_param_shard in self._iter_flat_param_shards(model_state):
1769
+ if root_fqn not in full_model_state:
1770
+ log.info(
1771
+ f"Materializing full parameter '{root_fqn}' with shape {flat_param_shard.full_shape}..."
1772
+ )
1773
+ assert flat_param_shard.shard_data is not None
1774
+ full_model_state[root_fqn] = torch.empty(
1775
+ flat_param_shard.full_shape, dtype=flat_param_shard.shard_data.dtype, device=device
1776
+ )
1777
+ # Fill with NaNs so we can validate that the whole parameter has been populated
1778
+ # afterwards.
1779
+ full_model_state[root_fqn].fill_(torch.nan)
1780
+ # Copy over the local shard to the relevant part of the full parameter.
1781
+ full_param = full_model_state[root_fqn]
1782
+ log.info(f"Loading rank {rank} shard for '{root_fqn}'...")
1783
+ flat_param_shard.copy_into(full_param)
1784
+ flat_params_data[rank][root_fqn] = replace(flat_param_shard, shard_data=None)
1785
+
1786
+ log.info("Validating full parameters...")
1787
+ for key, tensor in full_model_state.items():
1788
+ if torch.isnan(tensor).any():
1789
+ raise ValueError(f"Parameter '{key}' contains NaNs, this is likely a bug with the unsharder")
1790
+
1791
+ trainer_state: Optional[Dict[str, Any]] = None
1792
+ if load_trainer_state:
1793
+ trainer_state = load_state_dict(load_path, "train/rank0.pt", local_cache=local_cache)
1794
+
1795
+ if not load_optimizer_state:
1796
+ return full_model_state, None, trainer_state
1797
+
1798
+ log.info("Gathering optim state dicts...")
1799
+ optim_state_paths = self._gather_state_dict_paths(
1800
+ load_path, "optim", metadata.world_size, local_cache=local_cache
1801
+ )
1802
+
1803
+ log.info("Materializing full optim state...")
1804
+ full_optim_state: Dict[str, Any] = {"state": defaultdict(dict)}
1805
+ fqn_to_id: Dict[str, int] = {}
1806
+ id_to_fqn: Dict[int, str] = {}
1807
+ for rank, path in enumerate(optim_state_paths):
1808
+ log.info(f"Loading sharded optim state from rank {rank}...")
1809
+ optim_state = torch.load(path, map_location="cpu")
1810
+
1811
+ # Initialize param groups.
1812
+ # We assume parameter groups are the same across all ranks.
1813
+ # The only thing that differs across ranks is the state for each local sharded param.
1814
+ if "param_groups" not in full_optim_state:
1815
+ full_optim_state["param_groups"] = optim_state["param_groups"]
1816
+ else:
1817
+ assert full_optim_state["param_groups"] == optim_state["param_groups"]
1818
+
1819
+ # Generate mapping of parameter FQNs to optimizer param IDs and vice-versa.
1820
+ if not fqn_to_id or not id_to_fqn:
1821
+ for group in full_optim_state["param_groups"]:
1822
+ for fqn, id in zip(group["param_names"], group["params"]):
1823
+ fqn = fqn.replace("_fsdp_wrapped_module.", "")
1824
+ fqn_to_id[fqn] = id
1825
+ id_to_fqn[id] = fqn
1826
+
1827
+ # Iterate over local shard state and copy into the full state.
1828
+ for id, shard_state in optim_state["state"].items():
1829
+ fqn = id_to_fqn[id]
1830
+ flat_param_shard = flat_params_data[rank].get(fqn) # type: ignore[assignment]
1831
+ full_state = full_optim_state["state"][id]
1832
+ for key, shard_value in shard_state.items():
1833
+ assert isinstance(shard_value, torch.Tensor)
1834
+ if shard_value.shape == torch.Size([]):
1835
+ # Add singleton tensors directly to full state. These should be the same across
1836
+ # all ranks.
1837
+ assert key in ("step", "grad_norm_exp_avg") # sanity check
1838
+ if key not in full_state:
1839
+ full_state[key] = shard_value.to(device)
1840
+ else:
1841
+ assert full_state[key] == shard_value
1842
+ else:
1843
+ # Otherwise we have a sharded param state.
1844
+ # If the corresponding full param state hasn't been materialized yet, do so now.
1845
+ assert flat_param_shard is not None, f"missing flat_params_data for {fqn} from rank {rank}"
1846
+ if key not in full_state:
1847
+ log.info(
1848
+ f"Materializing full state '{key}' for '{fqn}' with shape {flat_param_shard.full_shape}..."
1849
+ )
1850
+ full_state[key] = torch.empty(
1851
+ flat_param_shard.full_shape, dtype=shard_value.dtype, device=device
1852
+ )
1853
+ full_state_value = full_state[key]
1854
+
1855
+ # Copy over the local shard state to the relevant part of the full parameter state.
1856
+ log.info(f"Loading rank {rank} shard state of '{key}' for '{fqn}'...")
1857
+ replace(flat_param_shard, shard_data=shard_value).copy_into(full_state_value)
1858
+
1859
+ # Lastly, clean up the parameter names in param groups.
1860
+ for group in full_optim_state["param_groups"]:
1861
+ group["param_names"] = [n.replace("_fsdp_wrapped_module.", "") for n in group["param_names"]]
1862
+
1863
+ return full_model_state, full_optim_state, trainer_state
1864
+
1865
+ def _get_state_dict_path(
1866
+ self,
1867
+ load_path: PathOrStr,
1868
+ state_dict_type: str,
1869
+ rank: int,
1870
+ *,
1871
+ local_cache: Optional[PathOrStr] = None,
1872
+ progress=None,
1873
+ ) -> Tuple[int, Path]:
1874
+ fname = f"{state_dict_type}/rank{rank}.pt"
1875
+ return rank, resource_path(str(load_path).rstrip("/"), fname, local_cache=local_cache, progress=progress)
1876
+
1877
+ def _gather_state_dict_paths(
1878
+ self,
1879
+ load_path: PathOrStr,
1880
+ state_dict_type: str,
1881
+ world_size: int,
1882
+ *,
1883
+ local_cache: Optional[PathOrStr] = None,
1884
+ ) -> List[Path]:
1885
+ progress = get_progress_bar()
1886
+ with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
1887
+ futures = []
1888
+ for rank in range(world_size):
1889
+ future = executor.submit(
1890
+ self._get_state_dict_path,
1891
+ load_path,
1892
+ state_dict_type,
1893
+ rank,
1894
+ local_cache=local_cache,
1895
+ progress=progress,
1896
+ )
1897
+ futures.append(future)
1898
+
1899
+ results: Dict[int, Path] = {}
1900
+ for future in as_completed(futures):
1901
+ rank, path = future.result()
1902
+ results[rank] = path
1903
+
1904
+ return [results[rank] for rank in range(world_size)]
1905
+
1906
+
1907
+ class OlmoCoreCheckpointer(Checkpointer):
1908
+ def save_checkpoint(
1909
+ self,
1910
+ dir: PathOrStr,
1911
+ dist_model: nn.Module,
1912
+ optim: Optimizer,
1913
+ trainer_state: Dict[str, Any],
1914
+ *,
1915
+ upload_to: Optional[str] = None,
1916
+ ) -> None:
1917
+ from olmo_core.distributed.checkpoint import ( # type: ignore
1918
+ save_model_and_optim_state,
1919
+ )
1920
+
1921
+ with self._temporary_wd(dir) as checkpoint_dir:
1922
+ log.info("Saving model and optim state...")
1923
+ if get_fs_local_rank() == 0:
1924
+ (checkpoint_dir / "model").mkdir(exist_ok=True, parents=True)
1925
+ (checkpoint_dir / "optim").mkdir(exist_ok=True, parents=True)
1926
+ (checkpoint_dir / "train").mkdir(exist_ok=True, parents=True)
1927
+
1928
+ wait_for(
1929
+ lambda: (checkpoint_dir / "model").exists(), "Waiting for checkpoint model directory", timeout=10.0
1930
+ )
1931
+ wait_for(
1932
+ lambda: (checkpoint_dir / "optim").exists(), "Waiting for checkpoint optim directory", timeout=10.0
1933
+ )
1934
+ wait_for(
1935
+ lambda: (checkpoint_dir / "train").exists(), "Waiting for checkpoint train directory", timeout=10.0
1936
+ )
1937
+
1938
+ local_files_created = save_model_and_optim_state(checkpoint_dir, dist_model, optim)
1939
+ if upload_to is not None:
1940
+ for path in local_files_created:
1941
+ path = Path(path)
1942
+ upload_target = f"{upload_to.rstrip('/')}/{path.relative_to(checkpoint_dir)}"
1943
+ log.info(f"Uploading {path} to {upload_target}...")
1944
+ upload(path, upload_target, save_overwrite=self.cfg.save_overwrite)
1945
+
1946
+ log.info("Saving trainer state...")
1947
+ save_state_dict(
1948
+ checkpoint_dir,
1949
+ f"train/rank{get_global_rank()}.pt",
1950
+ trainer_state,
1951
+ upload_to=upload_to,
1952
+ save_overwrite=self.cfg.save_overwrite,
1953
+ )
1954
+
1955
+ self._save_config(checkpoint_dir, upload_to=upload_to)
1956
+
1957
+ def restore_checkpoint(
1958
+ self,
1959
+ load_path: PathOrStr,
1960
+ dist_model: nn.Module,
1961
+ optim: Optimizer,
1962
+ *,
1963
+ local_cache: Optional[PathOrStr] = None,
1964
+ load_optimizer_state: bool = True,
1965
+ ) -> Dict[str, Any]:
1966
+ from olmo_core.distributed.checkpoint import ( # type: ignore
1967
+ load_model_and_optim_state,
1968
+ )
1969
+
1970
+ log.info("Loading model and optim state...")
1971
+ load_model_and_optim_state(load_path, dist_model, optim if load_optimizer_state else None)
1972
+
1973
+ log.info("Loading trainer state...")
1974
+ try:
1975
+ trainer_state = load_state_dict(
1976
+ load_path, f"train/rank{get_global_rank()}.pt", local_cache=local_cache
1977
+ )
1978
+ except FileNotFoundError:
1979
+ # Fall back to rank 0 train state.
1980
+ # This can happen when we're restoring a checkpoint with a different world size.
1981
+ trainer_state = load_state_dict(load_path, "train/rank0.pt", local_cache=local_cache)
1982
+
1983
+ barrier()
1984
+ return trainer_state
1985
+
1986
+ def unshard_checkpoint(
1987
+ self,
1988
+ load_path: PathOrStr,
1989
+ *,
1990
+ local_cache: Optional[PathOrStr] = None,
1991
+ load_optimizer_state: bool = True,
1992
+ load_trainer_state: bool = True,
1993
+ device: Optional[torch.device] = None,
1994
+ ) -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
1995
+ from olmo_core.distributed.checkpoint import ( # type: ignore
1996
+ unshard_model_state,
1997
+ unshard_optim_state,
1998
+ )
1999
+
2000
+ model_state = unshard_model_state(load_path, device=device)
2001
+ optim_state: Optional[Dict[str, Any]] = None
2002
+ train_state: Optional[Dict[str, Any]] = None
2003
+ if load_optimizer_state:
2004
+ optim_state = cast(Dict[str, Any], unshard_optim_state(load_path, device=device))
2005
+ if load_trainer_state:
2006
+ train_state = load_state_dict(load_path, "train/rank0.pt", local_cache=local_cache)
2007
+ return model_state, optim_state, train_state
2008
+
2009
+
2010
+ def build_sharded_checkpointer(
2011
+ cfg: TrainConfig, *, name: Optional[ShardedCheckpointerType] = None, use_shared_mem_impl: bool = False
2012
+ ) -> Checkpointer:
2013
+ name = name or cfg.sharded_checkpointer
2014
+ if name == ShardedCheckpointerType.torch_new:
2015
+ return TorchNewStyleShardedCheckpointer(cfg)
2016
+ elif name == ShardedCheckpointerType.torch_legacy:
2017
+ return TorchLegacyShardedCheckpointer(cfg, use_shared_mem_impl=use_shared_mem_impl)
2018
+ elif name == ShardedCheckpointerType.local:
2019
+ return LocalShardedCheckpointer(cfg)
2020
+ elif name == ShardedCheckpointerType.olmo_core:
2021
+ return OlmoCoreCheckpointer(cfg)
2022
+ else:
2023
+ raise NotImplementedError(name)
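For reference, a minimal sketch of driving these checkpointers offline to rebuild a full state dict from a `local`-sharded checkpoint. The package layout and paths below are assumptions for illustration, not part of this upload.

import torch
from olmo.config import TrainConfig, ShardedCheckpointerType   # assumed module paths
from olmo.checkpoint import build_sharded_checkpointer

# Load the training config saved alongside the checkpoint (path is hypothetical).
cfg = TrainConfig.load("run_dir/step1000/config.yaml", validate_paths=False)
checkpointer = build_sharded_checkpointer(cfg, name=ShardedCheckpointerType.local)

# Reconstruct the full, unsharded model weights on CPU; skip optimizer state to save memory.
model_state, _, trainer_state = checkpointer.unshard_checkpoint(
    "run_dir/step1000",
    load_optimizer_state=False,
    device=torch.device("cpu"),
)
torch.save(model_state, "model_unsharded.pt")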
config.json CHANGED
@@ -1,12 +1,10 @@
 {
-  "_name_or_path": "/nfs100/dongyh/FANformer-1B",
   "activation_type": "swiglu",
   "alibi": false,
   "alibi_bias_max": 8.0,
   "architectures": [
     "OLMoForCausalLM"
   ],
-  "att_nolinear": false,
   "attention_activation": null,
   "attention_dropout": 0.0,
   "attention_layer_norm": false,
@@ -25,7 +23,6 @@
   "embedding_layer_norm": false,
   "embedding_size": 50304,
   "eos_token_id": 50279,
-  "ffn_activation": null,
   "flash_attention": true,
   "include_bias": false,
   "init_cutoff_factor": null,
@@ -55,17 +52,9 @@
   "rope_theta": 10000,
   "scale_emb_init": false,
   "scale_logits": false,
-  "torch_dtype": "float32",
-  "transformers_version": "4.49.0",
-  "use_A": false,
-  "use_ATF": true,
+  "transformers_version": "4.46.0",
   "use_cache": true,
-  "use_fpn": false,
-  "use_fpneq": false,
-  "use_fpnnow": false,
-  "use_fpnpn": false,
-  "use_mod": false,
-  "use_mod_ffn": 0,
+  "use_ATF": true,
   "vocab_size": 50280,
   "weight_tying": true
 }
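As a quick, hypothetical sanity check of the change above: the local `_name_or_path` and the unused experimental flags are gone from the cleaned config.json, while `use_ATF` is kept.

import json

with open("config.json") as f:
    cfg = json.load(f)

assert "_name_or_path" not in cfg and "use_mod" not in cfg and "use_fpn" not in cfg
assert cfg["use_ATF"] is True
assert cfg["transformers_version"] == "4.46.0"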
config.py ADDED
@@ -0,0 +1,1371 @@
1
+ from __future__ import annotations
2
+
3
+ from copy import deepcopy
4
+ from dataclasses import asdict, dataclass, field
5
+ from glob import glob
6
+ from pathlib import Path
7
+ from typing import (
8
+ Any,
9
+ Dict,
10
+ Iterable,
11
+ List,
12
+ Optional,
13
+ Tuple,
14
+ Type,
15
+ TypeVar,
16
+ Union,
17
+ cast,
18
+ )
19
+
20
+ import numpy as np
21
+ import torch
22
+ from omegaconf import DictConfig, ListConfig
23
+ from omegaconf import OmegaConf as om
24
+ from omegaconf.errors import OmegaConfBaseException
25
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
26
+
27
+ from .aliases import PathOrStr
28
+ from .exceptions import OLMoConfigurationError
29
+ from .util import StrEnum
30
+
31
+ __all__ = [
32
+ "ActivationType",
33
+ "ActivationCheckpointingStrategy",
34
+ "BlockType",
35
+ "LayerNormType",
36
+ "InitFnType",
37
+ "ModelConfig",
38
+ "OptimizerType",
39
+ "OptimizerConfig",
40
+ "SchedulerType",
41
+ "SchedulerConfig",
42
+ "DataConfig",
43
+ "InstanceFilterConfig",
44
+ "EvaluatorConfig",
45
+ "TokenizerConfig",
46
+ "TrainConfig",
47
+ "PaddingDirection",
48
+ "TruncationDirection",
49
+ "SpeedMonitorConfig",
50
+ "WandbConfig",
51
+ "CompilerConfig",
52
+ "WandbConfig",
53
+ "DDPConfig",
54
+ "DistributedStrategy",
55
+ "DDPGradSyncMode",
56
+ "FSDPPrecision",
57
+ "FSDPWrapStrategy",
58
+ "FSDPConfig",
59
+ "SingleGPUConfig",
60
+ "CheckpointType",
61
+ ]
62
+
63
+ C = TypeVar("C", bound="BaseConfig")
64
+ D = TypeVar("D", bound="DictConfig|ListConfig")
65
+
66
+
67
+ class BaseConfig:
68
+ @classmethod
69
+ def _register_resolvers(cls, validate_paths: bool = True):
70
+ # Expands path globs into a list.
71
+ def path_glob(*paths) -> List[str]:
72
+ out = []
73
+ for path in paths:
74
+ matches = sorted(glob(path))
75
+ if not matches and validate_paths:
76
+ raise FileNotFoundError(f"{path} does not match any files or dirs")
77
+ out.extend(matches)
78
+ return out
79
+
80
+ # Chooses the first path in the arguments that exists.
81
+ def path_choose(*paths) -> str:
82
+ from .util import is_url
83
+
84
+ for path in paths:
85
+ if is_url(path) or Path(path).exists():
86
+ return path
87
+ if validate_paths:
88
+ raise FileNotFoundError(", ".join(paths))
89
+ else:
90
+ return ""
91
+
92
+ # Finds the latest checkpoint in a folder.
93
+ def path_last_checkpoint(path) -> str:
94
+ from .util import find_latest_checkpoint
95
+
96
+ latest_checkpoint = find_latest_checkpoint(path)
97
+ if latest_checkpoint is None:
98
+ if validate_paths:
99
+ raise FileNotFoundError(f"Could not find a latest checkpoint at {path}")
100
+ else:
101
+ return ""
102
+ else:
103
+ return str(latest_checkpoint)
104
+
105
+ om.register_new_resolver("path.glob", path_glob, replace=True)
106
+ om.register_new_resolver("path.choose", path_choose, replace=True)
107
+ om.register_new_resolver("path.last_checkpoint", path_last_checkpoint, replace=True)
108
+
109
+ @classmethod
110
+ def update_legacy_settings(cls, config: D) -> D:
111
+ """
112
+ Update the legacy config settings whose schemas have undergone backwards-incompatible changes.
113
+ """
114
+ return config
115
+
116
+ @classmethod
117
+ def new(cls: Type[C], **kwargs) -> C:
118
+ cls._register_resolvers()
119
+ conf = om.structured(cls)
120
+ try:
121
+ if kwargs:
122
+ conf = om.merge(conf, kwargs)
123
+ return cast(C, om.to_object(conf))
124
+ except OmegaConfBaseException as e:
125
+ raise OLMoConfigurationError(str(e))
126
+
127
+ @classmethod
128
+ def load(
129
+ cls: Type[C],
130
+ path: PathOrStr,
131
+ overrides: Optional[List[str]] = None,
132
+ key: Optional[str] = None,
133
+ validate_paths: bool = True,
134
+ ) -> C:
135
+ """Load from a YAML file."""
136
+ cls._register_resolvers(validate_paths=validate_paths)
137
+ schema = om.structured(cls)
138
+ try:
139
+ raw = om.load(str(path))
140
+ if key is not None:
141
+ raw = raw[key] # type: ignore
142
+ raw = cls.update_legacy_settings(raw)
143
+ conf = om.merge(schema, raw)
144
+ if overrides:
145
+ conf = om.merge(conf, om.from_dotlist(overrides))
146
+ return cast(C, om.to_object(conf))
147
+ except OmegaConfBaseException as e:
148
+ raise OLMoConfigurationError(str(e))
149
+
150
+ def save(self, path: PathOrStr) -> None:
151
+ """Save to a YAML file."""
152
+ om.save(config=self, f=str(path))
153
+
154
+ def asdict(self, exclude: Optional[Iterable[str]] = None) -> Dict[str, Any]:
155
+ out = asdict(self) # type: ignore
156
+ if exclude is not None:
157
+ for name in exclude:
158
+ if name in out:
159
+ del out[name]
160
+ return out
161
+
162
+ def update_with(self, **kwargs):
163
+ result = deepcopy(self)
164
+ for key, value in kwargs.items():
165
+ setattr(result, key, value)
166
+ return result
167
+
168
+
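A short, hypothetical usage sketch of the BaseConfig machinery above; the YAML path and dotlist overrides are made up, and TrainConfig is defined later in this file.

cfg = TrainConfig.load(
    "configs/my-run.yaml",
    overrides=["optimizer.learning_rate=2e-4", "model.flash_attention=true"],
    validate_paths=False,   # skip glob/path resolution when just inspecting a config
)
cfg.save("run_dir/config.yaml")
resumed = cfg.update_with(run_name="my-run-2")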
169
+ class LayerNormType(StrEnum):
170
+ default = "default"
171
+ """
172
+ The default LayerNorm implementation, equivalent to PyTorch's built-in version.
173
+ """
174
+
175
+ low_precision = "low_precision"
176
+ """
177
+ A low-precision version of the default LayerNorm.
178
+ """
179
+
180
+ rms = "rms"
181
+ """
182
+ An RMSNorm implementation. When using ``torch.compile`` this is
183
+ probably the fastest implementation.
184
+ """
185
+
186
+
187
+ class ActivationType(StrEnum):
188
+ gelu = "gelu"
189
+ relu = "relu"
190
+ swiglu = "swiglu"
191
+
192
+
193
+ class BlockType(StrEnum):
194
+ sequential = "sequential"
195
+
196
+ llama = "llama"
197
+ """
198
+ A block similar to the sequential block with slightly different
199
+ implementations of operations like attention to imitate the behavior of Llama.
200
+ """
201
+
202
+
203
+ class InitFnType(StrEnum):
204
+ mitchell = "mitchell"
205
+ """
206
+ The strategy suggested to us by Mitchell Wortsman from UW.
207
+ This uses a truncated normal distribution with an adaptive standard deviation that depends
208
+ on the size of the weights as well as the depth of the layer.
209
+ """
210
+
211
+ normal = "normal"
212
+ """
213
+ All weights are initialized from the same normal distribution.
214
+ """
215
+
216
+ kaiming_normal = "kaiming_normal"
217
+ """
218
+ All weights are initialized with the Kaiming method from a normal distribution.
219
+ Note this currently won't work with FSDP.
220
+ """
221
+
222
+ fan_in = "fan_in"
223
+ """
224
+ "Fan-in variance scaling", i.e. normal with a standard deviation of ``1/sqrt(d_in)`` where ``d_in``
225
+ is the input dimensionality of the kernel.
226
+ """
227
+
228
+ full_megatron = "full_megatron"
229
+ """
230
+ This is what metaseq calls "full megatron init". It is the init used for Llama 2.
231
+ """
232
+
233
+
234
+ @dataclass
235
+ class ModelConfig(BaseConfig):
236
+ """
237
+ OLMo (model) configuration.
238
+ """
239
+
240
+ # Note that the defaults for these attributes are equivalent to the base GPT2 model.
241
+
242
+ d_model: int = 768
243
+ """
244
+ The hidden size of the model.
245
+ """
246
+
247
+ n_heads: int = 12
248
+ """
249
+ The number of self-attention heads.
250
+ """
251
+
252
+ n_kv_heads: Optional[int] = None
253
+ """
254
+ The number of heads to use for keys and values. Defaults to `n_heads`.
255
+ Set this to ``None`` or ``n_heads`` for normal multi-head attention.
256
+ Set this to 1 for multi-query attention.
257
+ Set it to some in-between value for Llama2-style grouped query attention.
258
+ """
259
+
260
+ clip_qkv: Optional[float] = None
261
+ """
262
+ Clip QKV to this value when set.
263
+ """
264
+
265
+ n_layers: int = 12
266
+ """
267
+ The number of layers/blocks.
268
+ """
269
+
270
+ mlp_ratio: int = 4
271
+ """
272
+ The ratio of the inner MLP dimensionality to ``d_model``.
273
+ This is only used when ``mlp_hidden_size`` is not set.
274
+ """
275
+
276
+ mlp_hidden_size: Optional[int] = None
277
+ """
278
+ Set the exact hidden size for the MLP. Otherwise the inner MLP hidden size will be set to `mlp_ratio * d_model`.
279
+ """
280
+
281
+ activation_type: ActivationType = ActivationType.swiglu
282
+ """
283
+ The activation function to use within the MLP layers.
284
+ """
285
+
286
+ block_type: BlockType = BlockType.sequential
287
+ """
288
+ The transformer block implementation.
289
+ """
290
+
291
+ block_group_size: int = 1
292
+ """
293
+ The number of blocks to group together into a single parent block.
294
+ This has no effect on the number of parameters in the model and is only used to wrap groups
295
+ of blocks together with a single FSDP wrapper during training.
296
+ """
297
+
298
+ alibi: bool = False
299
+ """
300
+ If ``True``, use ALiBi embeddings. Mutually exclusive with ``rope``.
301
+ """
302
+
303
+ alibi_bias_max: float = 8.0
304
+ """
305
+ Maximum absolute value of ALiBi bias.
306
+ """
307
+
308
+ rope: bool = False
309
+ """
310
+ Use rotary positional embeddings (RoPE). Mutually exclusive with ``alibi``.
311
+ """
312
+
313
+ rope_full_precision: bool = True
314
+ """
315
+ If ``True``, apply RoPE embeddings at full precision regardless of the input type. Otherwise,
316
+ apply RoPE at the precision of the input.
317
+ """
318
+
319
+ rope_theta: int = 10_000
320
+ """
321
+ The theta setting for RoPE.
322
+ """
323
+
324
+ flash_attention: bool = False
325
+ """
326
+ If ``True``, use ``FlashAttention``.
327
+ """
328
+
329
+ attention_dropout: float = 0.1
330
+ """
331
+ The dropout probability within the attention modules.
332
+ """
333
+
334
+ multi_query_attention: Optional[bool] = None
335
+ """
336
+ Deprecated. Use n_kv_heads instead.
337
+ """
338
+
339
+ attention_layer_norm: bool = False
340
+ """
341
+ Apply layer norm to the keys and queries within the attention mechanism.
342
+ This can help stabilize training.
343
+ """
344
+
345
+ residual_dropout: float = 0.1
346
+ """
347
+ The dropout probability for the MLP and attention output within each block.
348
+ """
349
+
350
+ embedding_dropout: float = 0.1
351
+ """
352
+ The dropout probability for embeddings.
353
+ """
354
+
355
+ embedding_layer_norm: bool = False
356
+ """
357
+ Apply layer norm directly to the embeddings.
358
+ """
359
+
360
+ layer_norm_type: LayerNormType = LayerNormType.default
361
+ """
362
+ The layernorm implementation to use.
363
+ """
364
+
365
+ layer_norm_with_affine: bool = True
366
+ """
367
+ Whether to include bias and weight parameters for the layer norms.
368
+ This only affects layer norms that are immediately followed by a linear layer in the forward pass,
369
+ so everything except QK-norms. To turn off affines for QK norms as well, set :attr:`attention_layer_norm_with_affine`
370
+ to ``False``.
371
+ """
372
+
373
+ layer_norm_eps: float = 1e-05
374
+
375
+ attention_layer_norm_with_affine: bool = True
376
+ """
377
+ Toggle affine transform for the QK norms.
378
+ """
379
+
380
+ max_sequence_length: int = 1024
381
+ """
382
+ The maximum input sequence length supported by the model.
383
+ """
384
+
385
+ include_bias: bool = True
386
+ """
387
+ Whether or not to include bias parameters in linear layers.
388
+ In PaLM, they got rid of all bias terms because they found that large
389
+ models tend to have near 0 bias terms anyway.
390
+ """
391
+
392
+ bias_for_layer_norm: Optional[bool] = None
393
+ """
394
+ Whether or not to include bias parameters in layer norm.
395
+ This is separate from the include_bias parameter, because of a ROCm crash when biases are disabled in
396
+ layer norm.
397
+ When this is None (the default), it inherits the setting from include_bias.
398
+ """
399
+
400
+ scale_logits: bool = False
401
+ """
402
+ If ``True``, scale the output logits by ``1 / sqrt(d_model)``.
403
+ """
404
+
405
+ vocab_size: int = 50257
406
+ """
407
+ Vocabulary size of the model.
408
+ """
409
+
410
+ embedding_size: Optional[int] = 50304
411
+ """
412
+ The number of embeddings, i.e. the number of tokens. If set to ``None`` it will default
413
+ to ``vocab_size``. If ``vocab_size`` is not a multiple of 128, setting this to the
414
+ next multiple of 128 that's greater than ``vocab_size`` can improve throughput
415
+ substantially.
416
+ """
417
+
418
+ weight_tying: bool = True
419
+ """
420
+ Whether to tie output linear weights to the input embedding.
421
+ """
422
+
423
+ eos_token_id: int = 50256
424
+ """
425
+ The ID of the end-of-sentence special token.
426
+ """
427
+
428
+ pad_token_id: int = 50256
429
+ """
430
+ The ID of the token to use for padding. Defaults to the ID of the EOS token.
431
+ """
432
+
433
+ init_device: Optional[str] = None
434
+ """
435
+ The torch device to use when initializing the model parameters, e.g. "cpu", "cuda:0", "meta".
436
+ """
437
+
438
+ init_fn: InitFnType = InitFnType.normal
439
+ """
440
+ The weight initialization strategy.
441
+ """
442
+
443
+ init_std: float = 0.02
444
+ """
445
+ The standard deviation to use when initializing weights with a "fixed distribution" ``init_fn``, such
446
+ as "normal".
447
+ """
448
+
449
+ init_cutoff_factor: Optional[float] = None
450
+ """
451
+ A positive factor used to scale the cutoff values when initializing weights with a "fixed distribution" ``init_fn``, such
452
+ as "normal". Setting this to None means values are not cutoff.
453
+ """
454
+
455
+ precision: Optional[str] = None
456
+ """
457
+ Precision used to train/evaluate with. You shouldn't set this directly.
458
+ See :data:`TrainConfig.precision` instead.
459
+ """
460
+
461
+ scale_emb_init: bool = False
462
+ """
463
+ If ``True``, embeddings are scaled up by ``sqrt(d_model)`` during initialization.
464
+ Currently this is only used with `full_megatron` init when ``emb_init_std`` is unset.
465
+ """
466
+
467
+ emb_init_std: Optional[float] = None
468
+ """
469
+ Override the standard deviation to use when initializing the embedding weights.
470
+ """
471
+
472
+ norm_after: bool = False
473
+ """
474
+ Apply norm after the attention/feedforward layers rather than before, as introduced in the Swin transformer paper (Liu et al).
475
+ """
476
+
477
+ use_ATF: Optional[bool] = False
478
+
479
+ p_ratio: float = 0.25
480
+
481
+ attention_activation: Optional[str] = None
482
+
483
+ @property
484
+ def effective_n_kv_heads(self) -> int:
485
+ if self.n_kv_heads is None:
486
+ if self.multi_query_attention is True:
487
+ return 1
488
+ else:
489
+ return self.n_heads
490
+ else:
491
+ if self.multi_query_attention is None:
492
+ return self.n_kv_heads
493
+ if self.multi_query_attention:
494
+ n_kv_heads_should_be = 1
495
+ else:
496
+ n_kv_heads_should_be = self.n_heads
497
+ if self.n_kv_heads == n_kv_heads_should_be:
498
+ return n_kv_heads_should_be
499
+ else:
500
+ raise OLMoConfigurationError(
501
+ "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
502
+ )
503
+
504
+
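A few illustrative values for how `effective_n_kv_heads` resolves the deprecated `multi_query_attention` flag against `n_kv_heads` (these follow directly from the property above):

ModelConfig(n_heads=16).effective_n_kv_heads                               # 16 -> standard multi-head attention
ModelConfig(n_heads=16, multi_query_attention=True).effective_n_kv_heads   # 1  -> multi-query attention
ModelConfig(n_heads=16, n_kv_heads=4).effective_n_kv_heads                 # 4  -> grouped-query attention
ModelConfig(n_heads=16, n_kv_heads=4, multi_query_attention=True).effective_n_kv_heads  # raises OLMoConfigurationError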
505
+ class OptimizerType(StrEnum):
506
+ lionw = "lionw"
507
+ adamw = "adamw"
508
+
509
+
510
+ @dataclass
511
+ class OptimizerConfig(BaseConfig):
512
+ name: OptimizerType = OptimizerType.lionw
513
+ learning_rate: float = 1.0e-4
514
+ weight_decay: float = 0.01
515
+ betas: Tuple[float, float] = (0.9, 0.95)
516
+ eps: float = 1e-5
517
+
518
+ no_decay_norm_and_bias: Optional[bool] = None
519
+ """
520
+ Deprecated. Use ``decay_norm_and_bias`` and ``decay_embeddings`` instead.
521
+ """
522
+
523
+ selective_updates: bool = False
524
+ """
525
+ If ``True``, optimizer parameter and state updates are skipped when the corresponding gradient is 0.
526
+ """
527
+
528
+ decay_norm_and_bias: bool = False
529
+ decay_embeddings: bool = False
530
+ metrics_log_interval: Optional[int] = None
531
+ """
532
+ The interval with which to collect and log detailed parameter-specific metrics.
533
+ This only applies when logging to W&B, since these metrics won't be logged to the console.
534
+ If not set, defaults to the wandb `log_interval`.
535
+ """
536
+
537
+ record_update_metrics: bool = False
538
+ """
539
+ Whether to record detailed metrics about the optimizer's parameter updates, like the norm and max
540
+ of the update with AdamW.
541
+ """
542
+
543
+ def __post_init__(self):
544
+ self.betas = tuple(self.betas) # type: ignore[assignment]
545
+
546
+ @classmethod
547
+ def update_legacy_settings(cls, config: D) -> D:
548
+ new_config = config.copy()
549
+ if om.is_dict(new_config):
550
+ assert isinstance(new_config, DictConfig)
551
+
552
+ if hasattr(new_config, "name") and new_config.name == "decoupled_lionw":
553
+ new_config.name = "lionw"
554
+ if hasattr(new_config, "eps"):
555
+ del new_config.eps
556
+
557
+ return new_config
558
+
559
+
560
+ class SchedulerType(StrEnum):
561
+ cosine_with_warmup = "cosine_with_warmup"
562
+ linear_with_warmup = "linear_with_warmup"
563
+ inverse_sqrt_with_warmup = "inverse_sqrt_with_warmup"
564
+ max_scheduler = "max_scheduler"
565
+ constant = "constant"
566
+ cosine_linear_envelope = "cosine_linear_envelope"
567
+ constant_with_warmup = "constant_with_warmup"
568
+
569
+
570
+ class SchedulerUnits(StrEnum):
571
+ steps = "steps"
572
+ tokens = "tokens"
573
+
574
+
575
+ @dataclass
576
+ class SchedulerConfig(BaseConfig):
577
+ name: SchedulerType = SchedulerType.cosine_with_warmup
578
+ units: SchedulerUnits = SchedulerUnits.steps
579
+ t_warmup: Union[int, float] = 100
580
+ t_max: Optional[Union[int, float]] = None
581
+ alpha_f: float = 0.1
582
+
583
+ grad_clip_warmup_steps: Optional[Union[int, float]] = None
584
+ """
585
+ The warmup period for which the max grad norm (or norm ratio) will be set to its
586
+ warmup value of `max_grad_norm * grad_clip_warmup_factor`.
587
+ """
588
+
589
+ grad_clip_warmup_factor: Optional[float] = None
590
+ """
591
+ The ratio of the max allowed gradient norm (or norm ratio) for clipping during the warmup period
592
+ vs after the warmup period.
593
+ """
594
+
595
+ warmup_min_lr: Optional[float] = None
596
+ """
597
+ The starting LR during the warmup period. If not set this defaults to 10% of
598
+ the target LR.
599
+ """
600
+
601
+
602
+ class PaddingDirection(StrEnum):
603
+ right = "right"
604
+ left = "left"
605
+
606
+
607
+ @dataclass
608
+ class InstanceFilterConfig(BaseConfig):
609
+ repetition_max_period: int = 13
610
+ repetition_min_period: int = 1
611
+ repetition_max_count: int = 32
612
+
613
+
614
+ @dataclass
615
+ class DataConfig(BaseConfig):
616
+ paths: Optional[List[str]] = None
617
+ memmap_dtype: str = "uint16"
618
+ datasets: Optional[Dict[str, List[str]]] = None
619
+ label_mask_paths: Optional[List[str]] = None
620
+ pad_direction: PaddingDirection = PaddingDirection.right
621
+ generate_attention_mask: bool = False
622
+ generate_doc_lengths: bool = False
623
+ num_workers: int = 0
624
+ drop_last: bool = False
625
+ pin_memory: bool = False
626
+ prefetch_factor: Optional[int] = None
627
+ persistent_workers: bool = False
628
+ timeout: int = 0
629
+ seed: Optional[int] = None
630
+ instance_filter: Optional[InstanceFilterConfig] = None
631
+ custom_dataset: Optional[CustomDatasetConfig] = None
632
+
633
+ @property
634
+ def effective_memmap_dtype(self):
635
+ try:
636
+ # getattr will check this is part of numpy module, while np.dtype will check
637
+ # if this is a valid numpy dtype.
638
+ np.dtype(dtype := getattr(np, self.memmap_dtype))
639
+ except (AttributeError, TypeError) as e:
640
+ raise TypeError(f"Value {self.memmap_dtype} is not a valid numpy type") from e
641
+ return dtype
642
+
643
+
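Illustrative behavior of `effective_memmap_dtype` (the dtype strings below are made-up examples):

DataConfig(memmap_dtype="uint16").effective_memmap_dtype   # numpy.uint16 (the default)
DataConfig(memmap_dtype="int64").effective_memmap_dtype    # numpy.int64
DataConfig(memmap_dtype="float8").effective_memmap_dtype   # raises TypeError: not a valid numpy type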
644
+ @dataclass
645
+ class CustomDatasetCollatorConfig(BaseConfig):
646
+ input_id_field: str = "input_ids" #: The field in the dataset items that contains the input token IDs.
647
+ attention_mask_field: Optional[str] = None #: The field in the dataset items that contains the attention mask.
648
+ attention_bias_field: Optional[str] = None #: The field in the dataset items that contains the attention bias.
649
+ label_mask_field: Optional[str] = None #: The field in the dataset items that contains the label mask.
650
+ index_field: Optional[str] = None #: The field in the dataset items that contains the index of the item.
651
+ instance_mask_field: Optional[str] = None #: The field in the dataset items that contains the instance mask.
652
+ doc_lens_field: Optional[str] = None #: The field in the dataset items that contains the document lengths.
653
+ metadata_field: Optional[str] = None #: The field in the dataset items that contains the metadata.
654
+
655
+
656
+ @dataclass
657
+ class CustomDatasetConfig(BaseConfig):
658
+ name: str #: The name of the custom dataset class or function that will be used to load the dataset.
659
+ module: Optional[
660
+ str
661
+ ] = None #: The module where the custom dataset class is defined. If not set, the module will be inferred from the class name.
662
+ args: Optional[Dict[str, Any]] = None #: The arguments to pass to the custom dataset class or function
663
+ collate_fn: Optional[
664
+ str
665
+ ] = None #: The name of the collate function to use for the custom dataset. Assumes the collate function is defined in the same module as the custom dataset class unless specified otherwise using the full object path.
666
+ token_field: Optional[str] = None #: The field in the dataset items that contains the tokenized text.
667
+ collate_config: Optional[CustomDatasetCollatorConfig] = field(
668
+ default_factory=CustomDatasetCollatorConfig
669
+ ) #: The configuration for the collate function to use for the custom dataset.
670
+
671
+
672
+ class EvaluatorType(StrEnum):
673
+ downstream = "downstream"
674
+ lm = "lm"
675
+
676
+
677
+ @dataclass
678
+ class EvaluatorConfig(BaseConfig):
679
+ label: str
680
+ type: EvaluatorType = EvaluatorType.lm
681
+ data: DataConfig = field(default_factory=DataConfig)
682
+ device_eval_batch_size: Optional[int] = None
683
+ subset_num_batches: Optional[int] = None
684
+
685
+
686
+ class TruncationDirection(StrEnum):
687
+ right = "right"
688
+ left = "left"
689
+
690
+
691
+ @dataclass
692
+ class TokenizerConfig(BaseConfig):
693
+ identifier: str = "gpt2"
694
+ truncate_direction: TruncationDirection = TruncationDirection.right
695
+
696
+
697
+ @dataclass
698
+ class WandbConfig(BaseConfig):
699
+ project: Optional[str] = None
700
+ entity: Optional[str] = "ai2-llm"
701
+ group: Optional[str] = None
702
+ name: Optional[str] = None
703
+ tags: Optional[List[str]] = field(default_factory=lambda: ["watching"])
704
+ log_artifacts: bool = False
705
+ rank_zero_only: bool = True
706
+ log_interval: int = 1
707
+
708
+
709
+ @dataclass
710
+ class SpeedMonitorConfig(BaseConfig):
711
+ window_size: int = 100
712
+ gpu_flops_available: Optional[Union[float, int]] = None
713
+
714
+
715
+ @dataclass
716
+ class CompilerConfig(BaseConfig):
717
+ mode: Optional[str] = None
718
+ """
719
+ The mode to compile the model in. At the moment this can be "default",
720
+ "reduce-overhead" (useful for smaller models/batches), or "max-autotune"
721
+ (the fastest for larger models, but takes a long time to compile).
722
+ """
723
+
724
+ fullgraph: bool = False
725
+ """
726
+ Whether it is OK to break model into several subgraphs when compiling.
727
+ Note that this is not compatible with FSDP.
728
+ """
729
+
730
+ backend: str = "inductor"
731
+ """
732
+ The backend to use.
733
+ """
734
+
735
+ dynamic: Optional[bool] = None
736
+ """
737
+ From the torch docs:
738
+
739
+ Use dynamic shape tracing. When this is True, we will up-front attempt to generate a kernel that is as dynamic
740
+ as possible to avoid recompilations when sizes change. This may not always work as some
741
+ operations/optimizations will force specialization; use TORCH_LOGS=dynamic to debug overspecialization. When
742
+ this is False, we will NEVER generate dynamic kernels, we will always specialize. By default (None), we
743
+ automatically detect if dynamism has occurred and compile a more dynamic kernel upon recompile.
744
+ """
745
+
746
+
747
+ class DistributedStrategy(StrEnum):
748
+ ddp = "ddp"
749
+ """
750
+ Wrap OLMo in torch.nn.parallel.DistributedDataParallel to train across ranks.
751
+ """
752
+
753
+ fsdp = "fsdp"
754
+ """
755
+ Wrap OLMo in torch.distributed.fsdp.FullyShardedDataParallel to train across ranks.
756
+ """
757
+
758
+ single = "single"
759
+ """
760
+ Train on a single device, i.e., do not distribute training. For development and debugging.
761
+ """
762
+
763
+
764
+ class DDPGradSyncMode(StrEnum):
765
+ batch = "batch"
766
+ """
767
+ Synchronize gradients after computation at each bucket only at the last micro-batch.
768
+ This is slightly faster than gradient syncs across each micro-batch but will consume more memory.
769
+ This mode can only be used when `find_unused_params` is set to False.
770
+ """
771
+
772
+ micro_batch = "micro_batch"
773
+ """
774
+ Synchronize gradients after computation at each bucket per micro-batch.
775
+ This will be slightly slower than gradient sync at the last micro-batch, but will consume less memory.
776
+ This mode can be used with either setting of `find_unused_params`, but it is specifically recommended when `find_unused_params` is
777
+ set to True, to prevent errors.
778
+ """
779
+
780
+
781
+ @dataclass
782
+ class DDPConfig(BaseConfig):
783
+ grad_sync_mode: DDPGradSyncMode = DDPGradSyncMode.batch
784
+ """
785
+ Gradient sync mode for DDP
786
+
787
+ Note: When `find_unused_params` is set, set `grad_sync_mode` to `micro_batch` as different micro-batches might activate
788
+ different parts of the model, e.g. MoEs.
789
+ """
790
+
791
+ find_unused_params: bool = False
792
+ """
793
+ (from torch documentation)
794
+
795
+ This mode allows running backward on a subgraph of the model, and DDP finds out which parameters
796
+ are involved in the backward pass by traversing the autograd graph from the model output and marking
797
+ all unused parameters as ready for reduction. Note that traversing the autograd graph introduces extra overheads,
798
+ so applications should only set find_unused_parameters to True when necessary.
799
+ """
800
+
801
+
802
+ class FSDPWrapStrategy(StrEnum):
803
+ by_block = "by_block"
804
+ """
805
+ Wrap each OLMo block with its own FSDP instance.
806
+ """
807
+
808
+ by_block_and_size = "by_block_and_size"
809
+ """
810
+ Like 'by_block' but `wte` and `ff_out` will be wrapped separately as well.
811
+ """
812
+
813
+ by_block_group = "by_block_group"
814
+ """
815
+ Wrap each block group together into its own FSDP instance.
816
+ This requires :attr:`~ModelConfig.block_group_size` to be bigger than 1.
817
+ """
818
+
819
+ by_block_group_and_size = "by_block_group_and_size"
820
+ """
821
+ Like 'by_block_group' but `wte` and `ff_out` will be wrapped separately as well.
822
+ """
823
+
824
+ size_based = "size_based"
825
+ """
826
+ Use PyTorch's default size-based auto wrap policy.
827
+ """
828
+
829
+ one_in_two = "one_in_two"
830
+ one_in_three = "one_in_three"
831
+ one_in_four = "one_in_four"
832
+ one_in_five = "one_in_five"
833
+
834
+
835
+ class FSDPPrecision(StrEnum):
836
+ pure = "pure"
837
+ """
838
+ Equivalent to :class:`torch.distributed.fsdp.MixedPrecision` with ``param_dtype``, ``reduce_dtype``,
839
+ and ``buffer_dtype`` all set to the autocast precision data type.
840
+ """
841
+
842
+ mixed = "mixed"
843
+ """
844
+ Equivalent to :class:`torch.distributed.fsdp.MixedPrecision` with ``param_dtype``, and ``buffer_dtype``
845
+ set to the autocast precision data type, while ``reduce_dtype`` is set to fp32.
846
+ """
847
+
848
+
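Per the two docstrings above, and assuming an autocast precision of torch.bfloat16, the options roughly correspond to the following MixedPrecision settings (a sketch, not code from this file; MixedPrecision and torch are already imported at the top of this module):

pure_mp  = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, buffer_dtype=torch.bfloat16)
mixed_mp = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.bfloat16)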
849
+ @dataclass
850
+ class FSDPConfig(BaseConfig):
851
+ use_orig_params: bool = True
852
+ """
853
+ This must be ``True`` if using ``compile`` or if you want to track the parameter norm during training.
854
+ """
855
+
856
+ sharding_strategy: ShardingStrategy = ShardingStrategy.FULL_SHARD
857
+
858
+ wrapping_strategy: Optional[FSDPWrapStrategy] = None
859
+ """
860
+ The wrapping strategy to use. If ``None``, the default, the model is wrapped with a single top-level
861
+ FSDP instance.
862
+ """
863
+
864
+ precision: Optional[FSDPPrecision] = FSDPPrecision.pure
865
+
866
+ hybrid_sharding_num_model_replicas: Optional[int] = None
867
+ """
868
+ The number of model instances, when using a hybrid sharding strategy.
869
+ If not ``None``, this must divide the total number of nodes. If ``None``, the default,
870
+ a model instance is used per node (as determined by ``get_world_size() // get_local_world_size()``).
871
+ PyTorch's default HSDP behavior matches this default behavior.
872
+ """
873
+
874
+
875
+ @dataclass
876
+ class SingleGPUConfig(BaseConfig):
877
+ device: str = "auto"
878
+ """
879
+ Device to run single-device training.
880
+ """
881
+
882
+ def get_device(self):
883
+ if self.device == "auto":
884
+ if torch.backends.mps.is_available():
885
+ return torch.device("mps")
886
+ elif torch.cuda.is_available():
887
+ return torch.device("cuda")
888
+ else:
889
+ return torch.device("cpu")
890
+ elif self.device == "mps" and not torch.backends.mps.is_available():
891
+ raise OLMoConfigurationError("MPS not available.")
892
+ elif self.device == "cuda" and not torch.cuda.is_available():
893
+ raise OLMoConfigurationError("CUDA not available.")
894
+ else:
895
+ return torch.device(self.device)
896
+
897
+
898
+ class CheckpointType(StrEnum):
899
+ sharded = "sharded"
900
+ unsharded = "unsharded"
901
+ sharded_ephemeral = "sharded_ephemeral"
902
+
903
+
904
+ class ShardedCheckpointerType(StrEnum):
905
+ torch_new = "torch_new"
906
+ torch_legacy = "torch_legacy"
907
+ local = "local"
908
+ olmo_core = "olmo_core"
909
+
910
+
911
+ class ActivationCheckpointingStrategy(StrEnum):
912
+ whole_layer = "whole_layer"
913
+ """
914
+ Checkpoint every transformer layer.
915
+ """
916
+
917
+ one_in_two = "one_in_two"
918
+ """
919
+ Checkpoint one in two transformer layers.
920
+ """
921
+
922
+ one_in_three = "one_in_three"
923
+ """
924
+ Checkpoint one in three transformer layers.
925
+ """
926
+
927
+ one_in_four = "one_in_four"
928
+ """
929
+ Checkpoint one in four transformer layers.
930
+ """
931
+
932
+ one_in_eight = "one_in_eight"
933
+ """
934
+ Checkpoint one in eight transformer layers.
935
+ """
936
+
937
+ two_in_three = "two_in_three"
938
+ """
939
+ Checkpoint two out of every three transformer layers.
940
+ """
941
+
942
+ three_in_four = "three_in_four"
943
+ """
944
+ Checkpoint three out of every four transformer layers.
945
+ """
946
+
947
+ fine_grained = "fine_grained"
948
+ """
949
+ Focus checkpointing where recomputation is cheap and the memory savings are largest.
950
+ """
951
+
952
+
953
+ @dataclass
954
+ class TrainConfig(BaseConfig):
955
+ """
956
+ OLMo training configuration.
957
+ """
958
+
959
+ run_name: Optional[str] = None
960
+ """
961
+ The name of the run.
962
+ """
963
+
964
+ seed: int = 6198
965
+ """
966
+ Used to seed all initial RNG states.
967
+ """
968
+
969
+ epoch: Optional[int] = None
970
+ """
971
+ Increment this when starting a new epoch.
972
+ """
973
+
974
+ dry_run: bool = False
975
+ """
976
+ If ``True``, don't actually train.
977
+ """
978
+
979
+ model: ModelConfig = field(default_factory=ModelConfig)
980
+ """
981
+ OLMo Model configuration.
982
+ """
983
+
984
+ optimizer: OptimizerConfig = field(default_factory=OptimizerConfig)
985
+ """
986
+ Optimizer configuration.
987
+ """
988
+
989
+ scheduler: SchedulerConfig = field(default_factory=SchedulerConfig)
990
+ """
991
+ Learning rate scheduler configuration.
992
+ """
993
+
994
+ data: DataConfig = field(default_factory=DataConfig)
995
+ """
996
+ Training data configuration.
997
+ """
998
+
999
+ restore_dataloader: bool = True
1000
+ """
1001
+ When restarting, restore the data loader to where it left off.
1002
+ If you are restarting in order to train on a different dataset, set this to ``False``.
1003
+ """
1004
+
1005
+ fast_forward_batches: Optional[int] = None
1006
+ """
1007
+ When restarting, use this to fast-forward the dataloader beyond the last checkpoint.
1008
+ This can be useful when restarting due to a loss spike in order to skip the data that
1009
+ corresponded to the spike.
1010
+ """
1011
+
1012
+ evaluators: List[EvaluatorConfig] = field(default_factory=list)
1013
+ """
1014
+ Evaluation configurations.
1015
+ """
1016
+
1017
+ eval_interval: int = 1000
1018
+ """
1019
+ How often (in terms of batches) to run evaluations.
1020
+ """
1021
+
1022
+ tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)
1023
+ """
1024
+ Tokenizer configuration.
1025
+ """
1026
+
1027
+ save_folder: str = "./"
1028
+ """
1029
+ The directory to save checkpoints to.
1030
+ """
1031
+
1032
+ remote_save_folder: Optional[str] = None
1033
+ """
1034
+ A folder in a cloud bucket to upload saved checkpoints to.
1035
+ """
1036
+
1037
+ canceled_check_interval: int = 50
1038
+ """
1039
+ How often (in batches) to check if the run has been canceled or reached its time limit.
1040
+ """
1041
+
1042
+ save_interval: Optional[int] = 1000
1043
+ """
1044
+ How often (in terms of steps) to save sharded training state checkpoints.
1045
+ """
1046
+
1047
+ save_interval_unsharded: Optional[int] = None
1048
+ """
1049
+ How often (if at all) to save unsharded training state checkpoint.
1050
+ For large models it can be costly to save these, so it usually makes sense to save
1051
+ these less often than regular (sharded) training checkpoints.
1052
+ """
1053
+
1054
+ save_interval_ephemeral: Optional[int] = None
1055
+ """
1056
+ How often (if at all) to save ephemeral sharded checkpoints. These checkpoints are the same
1057
+ as those saved every `save_interval`, except that only the most recent one of them is kept.
1058
+ This is useful when you want to checkpoint often for restarts in case of failures, but don't
1059
+ want to keep the majority of these checkpoints.
1060
+
1061
+ For example, suppose you want to keep your checkpoints at every 1000 steps, but you also want to save
1062
+ a temporary checkpoint every 100 steps in case your job fails. In that case you would
1063
+ set `save_interval=1000` and `save_interval_ephemeral=100`.
1064
+ """
1065
+
1066
+ save_num_checkpoints_to_keep: int = -1
1067
+ """
1068
+ How many sharded checkpoints to keep.
1069
+ """
1070
+
1071
+ save_num_unsharded_checkpoints_to_keep: int = -1
1072
+ """
1073
+ How many unsharded checkpoints to keep.
1074
+ """
1075
+
1076
+ save_overwrite: bool = False
1077
+ """
1078
+ If ``True``, overwrite any conflicting checkpoint files.
1079
+ """
1080
+
1081
+ force_save_unsharded: bool = False
1082
+ """
1083
+ Save an unsharded checkpoint before training (even during a dry run).
1084
+ Use this option with `--load-path={PATH}` and `--dry_run` to convert a sharded
1085
+ checkpoint into an unsharded checkpoint.
1086
+ """
1087
+
1088
+ no_pre_train_checkpoint: bool = False
1089
+ """
1090
+ Skip saving pre-train checkpoint.
1091
+ """
1092
+
1093
+ load_path: Optional[str] = None
1094
+ """
1095
+ The path to a training checkpoint to restore/resume from. If not set, then training begins from scratch.
1096
+
1097
+ Note that you can make use of the "path.last_checkpoint" Omegaconfig YAML resolver here, which takes
1098
+ a local or remote directory and resolves to the latest checkpoint (sharded or unsharded) in that directory.
1099
+ For example,
1100
+
1101
+ ```bash
1102
+ --load_path='${path.last_checkpoint:s3://ai2-llm/checkpoints/7b/v1_5-mix-run-001}'
1103
+ ```
1104
+
1105
+ If `try_load_latest_save` is set and saved checkpoints exist, then `load_path` will be overridden
1106
+ by the latest saved checkpoint.
1107
+ """
1108
+
1109
+ load_path_sharded_checkpointer: Optional[ShardedCheckpointerType] = None
1110
+ """
1111
+ The sharded checkpointer type to use to load the initial checkpoint from ``load_path``.
1112
+ """
1113
+
1114
+ try_load_latest_save: bool = False
1115
+ """
1116
+ If set, then training will be resumed from the latest checkpoint in the local save folder, falling
1117
+ back to the latest checkpoint in the remote save folder if none exists. If there are no checkpoints
1118
+ in the local and remote save folders, then checkpoint loading will fall back to `load_path`.
1119
+ """
1120
+
1121
+ reset_optimizer_state: bool = False
1122
+ """
1123
+ When this is set, we restore the model from a checkpoint (if given), but we leave the optimizer uninitialized.
1124
+ We also set a new learning rate schedule that does a new warmup, such that it intercepts the original learning
1125
+ curve (according to the current learning rate schedule settings), and continues from there.
1126
+ """
1127
+
1128
+ reset_trainer_state: bool = False
1129
+ """
1130
+ When this is set we don't restore the trainer state from a checkpoint.
1131
+ """
1132
+
1133
+ sharded_checkpointer: ShardedCheckpointerType = ShardedCheckpointerType.torch_legacy
1134
+ """
1135
+ The name of the sharded checkpointer to use to save (sharded) checkpoints throughout training.
1136
+ """
1137
+
1138
+ new_style_checkpoints: Optional[bool] = None
1139
+ """
1140
+ Deprecated. Use ``sharded_checkpointer`` instead.
1141
+ """
1142
+
1143
+ max_duration: Union[int, str] = 10000
1144
+ """
1145
+ How long to train for.
1146
+
1147
+ If specified without a unit (the default), the units are assumed to be steps.
1148
+ You can also specify this in terms of tokens, for example: `max_duration="2e12T"` means train until
1149
+ 2 trillion tokens.
1150
+ """
1151
+
1152
+ global_train_batch_size: int = 512
1153
+ """
1154
+ The effective global batch size.
1155
+ """
1156
+
1157
+ device_train_batch_size: Optional[int] = None # calculated automatically
1158
+ """
1159
+ Don't set this manually. This will be set to ``global_train_batch_size // world_size``.
1160
+ """
1161
+
1162
+ device_train_microbatch_size: int = 16
1163
+ """
1164
+ The number of instances passed to the model in a single forward-backward pass. You should set
1165
+ this as large as you can based on available GPU memory.
1166
+ """
1167
+
1168
+ device_eval_batch_size: int = 16
1169
+ """
1170
+ The number of evaluation instances passed to the model in a single forward pass on each device.
1171
+ """
1172
+
1173
+ eval_subset_num_batches: int = -1
1174
+ """
1175
+ The number of batches to use for downstream evaluation from each dataset.
1176
+ """
1177
+
1178
+ eval_on_load: bool = False
1179
+ """
1180
+ When resuming from a checkpoint, run the evaluation loop right away.
1181
+ """
1182
+
1183
+ device_train_grad_accum: Optional[int] = None # calculated automatically
1184
+ """
1185
+ Don't set this manually. This will be set to ``device_train_batch_size // device_train_microbatch_size``.
1186
+ """
1187
+
1188
+ max_grad_norm: Optional[float] = None
1189
+ """
1190
+ Clip gradient norms to this value if set.
1191
+ """
1192
+
1193
+ max_grad_norm_ratio: Optional[float] = None
1194
+ """
1195
+ If set, gradient norms will be clipped to `max_grad_norm_ratio * exp_avg(norm(grad))`.
1196
+ This takes priority over `max_grad_norm` when set.
1197
+ """
1198
+
1199
+ precision: Optional[str] = None
1200
+ """
1201
+ Precision to train with (e.g. "amp_bf16", "amp_fp16", or "fp32").
1202
+ """
1203
+
1204
+ wandb: Optional[WandbConfig] = None
1205
+ """
1206
+ Weights & Biases configuration.
1207
+ """
1208
+
1209
+ speed_monitor: SpeedMonitorConfig = field(default_factory=SpeedMonitorConfig)
1210
+ """
1211
+ Speed monitor configuration.
1212
+ """
1213
+
1214
+ console_log_interval: int = 1
1215
+ """
1216
+ How often to log to the console.
1217
+ """
1218
+
1219
+ gen1_gc_interval: Optional[int] = 1
1220
+ """
1221
+ How often (in steps) to run generation 1 garbage collection.
1222
+ Set to ``None`` to use automatic garbage collection (i.e. we don't mess with it).
1223
+ """
1224
+
1225
+ compile: Optional[CompilerConfig] = None
1226
+ """
1227
+ Settings for compiling the model with ``torch.compile()``.
1228
+ """
1229
+
1230
+ distributed_strategy: Optional[DistributedStrategy] = DistributedStrategy.fsdp
1231
+ """
1232
+ Distributed strategy for the OLMo model (e.g. single GPU, DDP, FSDP).
1233
+ """
1234
+
1235
+ fsdp: Optional[FSDPConfig] = field(default_factory=FSDPConfig)
1236
+ """
1237
+ Fully sharded data parallel settings.
1238
+ """
1239
+
1240
+ ddp: Optional[DDPConfig] = None
1241
+ """
1242
+ DDP settings.
1243
+ """
1244
+
1245
+ single: SingleGPUConfig = field(default_factory=lambda: SingleGPUConfig(device="auto"))
1246
+ """
1247
+ Single device settings for GPU/CPU/MPS. Defaults to auto-detect the best device.
1248
+ """
1249
+
1250
+ softmax_auxiliary_loss: bool = False
1251
+ """
1252
+ If ``True``, we add the auxiliary loss function from PaLM that encourages the softmax
1253
+ normalizing term to be close to 0.
1254
+ """
1255
+
1256
+ auxiliary_loss_multiplier: Optional[float] = 1e-4
1257
+ """
1258
+ Used with `softmax_auxiliary_loss`. PaLM uses 1e-4, Chameleon uses 1e-5.
1259
+ """
1260
+
1261
+ time_limit: Optional[float] = None
1262
+ """
1263
+ The maximum amount of time to train for before saving a checkpoint and ending early.
1264
+ """
1265
+
1266
+ extra_steps_after_cancel: int = 10
1267
+ """
1268
+ Under certain conditions when a run is canceled we train for a few extra steps after saving
1269
+ the final checkpoint so that when the run is restarted from the latest checkpoint we have some
1270
+ overlap in metrics.
1271
+ """
1272
+
1273
+ early_stopping_factor: Optional[float] = None
1274
+
1275
+ save_data_indices: bool = True
1276
+ """
1277
+ Save training data indices from each batch for each worker.
1278
+ """
1279
+
1280
+ python_profiling: bool = False
1281
+ """
1282
+ Whether to run the Python profiler on batches 6, 7, and 8.
1283
+ """
1284
+
1285
+ torch_profiling: bool = False
1286
+ """
1287
+ Whether to run the PyTorch profiler on batches 6, 7, and 8.
1288
+ """
1289
+
1290
+ stop_at: Optional[int] = None
1291
+ """
1292
+ Stop at a specific step.
1293
+ """
1294
+
1295
+ stop_after: Optional[int] = None
1296
+ """
1297
+ Stop after a specific number of steps.
1298
+ """
1299
+
1300
+ activation_checkpointing: Optional[ActivationCheckpointingStrategy] = None
1301
+ """
1302
+ The activation checkpointing strategy to use.
1303
+ """
1304
+
1305
+ fused_loss: Optional[bool] = None
1306
+ """
1307
+ Whether to use the fused CE loss function from `flash-attn`.
1308
+ """
1309
+
1310
+ hf_datasets_cache_dir: Optional[str] = None
1311
+ """
1312
+ Deprecated, HF datasets are now stored in `olmo_data.hf_datasets`.
1313
+
1314
+ Path to cache directory of HF datasets saved with `datasets.save_to_disk`.
1315
+ """
1316
+
1317
+ module_outputs_save_steps: Optional[List[int]] = None
1318
+ """
1319
+ Outputs of model submodules are saved during the provided steps. Submodule outputs
1320
+ can be compared using `scripts/compare_module_outputs.py`.
1321
+ """
1322
+
1323
+ @property
1324
+ def autocast_precision(self) -> torch.dtype:
1325
+ if self.precision == "amp_bf16":
1326
+ return torch.bfloat16
1327
+ elif self.precision == "amp_fp16":
1328
+ return torch.float16
1329
+ elif self.precision == "fp32":
1330
+ return torch.float32
1331
+ else:
1332
+ raise ValueError(f"Unexpected precision type '{self.precision}'")
1333
+
1334
+ @property
1335
+ def fsdp_precision(self) -> Optional[MixedPrecision]:
1336
+ if self.fsdp is not None:
1337
+ if self.fsdp.precision is None:
1338
+ return None
1339
+ elif self.fsdp.precision == FSDPPrecision.pure:
1340
+ return MixedPrecision(
1341
+ param_dtype=self.autocast_precision,
1342
+ reduce_dtype=self.autocast_precision,
1343
+ buffer_dtype=self.autocast_precision,
1344
+ )
1345
+ elif self.fsdp.precision == FSDPPrecision.mixed:
1346
+ return MixedPrecision(
1347
+ param_dtype=self.autocast_precision,
1348
+ reduce_dtype=torch.float32,
1349
+ buffer_dtype=self.autocast_precision,
1350
+ )
1351
+ else:
1352
+ raise NotImplementedError(f"{self.fsdp.precision}")
1353
+ else:
1354
+ raise ValueError("self.fsdp is None!")
1355
+
1356
+ @classmethod
1357
+ def update_legacy_settings(cls, config: D) -> D:
1358
+ new_config = config.copy()
1359
+ if om.is_dict(new_config):
1360
+ assert isinstance(new_config, DictConfig)
1361
+
1362
+ if hasattr(new_config, "activation_checkpointing"):
1363
+ if new_config.activation_checkpointing is False:
1364
+ new_config.activation_checkpointing = None
1365
+ if new_config.activation_checkpointing is True:
1366
+ new_config.activation_checkpointing = ActivationCheckpointingStrategy.whole_layer
1367
+
1368
+ if hasattr(new_config, "optimizer"):
1369
+ new_config.optimizer = OptimizerConfig.update_legacy_settings(new_config.optimizer)
1370
+
1371
+ return new_config
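For readers skimming the diff, here is a minimal usage sketch of the configuration classes above. It is illustrative only and not part of the uploaded files; the flat `config` import path and default-constructible sub-configs are assumptions.

```python
import torch

from config import SingleGPUConfig, TrainConfig  # assumed flat module layout

# Build a config from defaults and inspect the derived properties defined above
# (assumes the nested sub-configs are constructible with their defaults).
cfg = TrainConfig(run_name="debug-run", precision="amp_bf16")
assert cfg.autocast_precision == torch.bfloat16

# SingleGPUConfig.get_device() prefers MPS, then CUDA, then falls back to CPU.
print(SingleGPUConfig(device="auto").get_device())
```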
exceptions.py ADDED
@@ -0,0 +1,50 @@
1
+ __all__ = [
2
+ "OLMoError",
3
+ "OLMoConfigurationError",
4
+ "OLMoCliError",
5
+ "OLMoEnvironmentError",
6
+ "OLMoNetworkError",
7
+ "OLMoCheckpointError",
8
+ ]
9
+
10
+
11
+ class OLMoError(Exception):
12
+ """
13
+ Base class for all custom OLMo exceptions.
14
+ """
15
+
16
+
17
+ class OLMoConfigurationError(OLMoError):
18
+ """
19
+ An error with a configuration file.
20
+ """
21
+
22
+
23
+ class OLMoCliError(OLMoError):
24
+ """
25
+ An error from incorrect CLI usage.
26
+ """
27
+
28
+
29
+ class OLMoEnvironmentError(OLMoError):
30
+ """
31
+ An error from incorrect environment variables.
32
+ """
33
+
34
+
35
+ class OLMoNetworkError(OLMoError):
36
+ """
37
+ An error with a network request.
38
+ """
39
+
40
+
41
+ class OLMoCheckpointError(OLMoError):
42
+ """
43
+ An error occurred reading or writing from a checkpoint.
44
+ """
45
+
46
+
47
+ class OLMoThreadError(Exception):
48
+ """
49
+ Raised when a thread fails.
50
+ """
initialization.py ADDED
@@ -0,0 +1,22 @@
1
+ from typing import Optional, Union
2
+
3
+ import torch.nn as nn
4
+
5
+ __all__ = ["init_normal"]
6
+
7
+
8
+ def init_normal(
9
+ module: Union[nn.Linear, nn.Embedding],
10
+ std: float,
11
+ init_cutoff_factor: Optional[float] = None,
12
+ ):
13
+ # weights
14
+ if init_cutoff_factor is not None:
15
+ cutoff_value = init_cutoff_factor * std
16
+ nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
17
+ else:
18
+ nn.init.normal_(module.weight, mean=0.0, std=std)
19
+
20
+ # biases
21
+ if isinstance(module, nn.Linear) and module.bias is not None:
22
+ nn.init.zeros_(module.bias)
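A small sketch of what `init_normal` does in the truncated case. This is illustrative only and not part of the uploaded files; the import path is an assumption.

```python
import torch.nn as nn

from initialization import init_normal  # assumed flat module layout

linear = nn.Linear(128, 256)
# With a cutoff factor, weights are drawn from a normal truncated to
# +/- (init_cutoff_factor * std); biases (when present) are always zeroed.
init_normal(linear, std=0.02, init_cutoff_factor=3.0)
assert float(linear.weight.abs().max()) <= 3.0 * 0.02
assert float(linear.bias.abs().sum()) == 0.0
```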
model.py ADDED
@@ -0,0 +1,1959 @@
1
+ """
2
+ Adapted from
3
+ [MosaiclML](https://github.com/mosaicml/examples.git) and
4
+ [minGPT](https://github.com/karpathy/minGPT.git)
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import math
11
+ import sys
12
+ from abc import abstractmethod
13
+ from collections import defaultdict
14
+ from functools import partial
15
+ from typing import (
16
+ Callable,
17
+ Dict,
18
+ Iterable,
19
+ List,
20
+ NamedTuple,
21
+ Optional,
22
+ Sequence,
23
+ Set,
24
+ Tuple,
25
+ cast,
26
+ )
27
+
28
+ import torch
29
+ import torch.backends.cuda
30
+ import torch.nn as nn
31
+ import torch.nn.functional as F
32
+ from torch import einsum
33
+
34
+ from .aliases import PathOrStr
35
+ from .beam_search import BeamSearch, Constraint, FinalSequenceScorer, Sampler
36
+ from .config import (
37
+ ActivationCheckpointingStrategy,
38
+ ActivationType,
39
+ BlockType,
40
+ CheckpointType,
41
+ FSDPWrapStrategy,
42
+ InitFnType,
43
+ LayerNormType,
44
+ ModelConfig,
45
+ ShardedCheckpointerType,
46
+ TrainConfig,
47
+ )
48
+ from .exceptions import OLMoConfigurationError
49
+ from .initialization import init_normal
50
+ from .torch_util import ensure_finite_, get_cumulative_document_lengths
51
+
52
+ if sys.version_info.minor > 8:
53
+ from collections.abc import MutableMapping
54
+ elif sys.version_info.minor == 8:
55
+ from typing import MutableMapping
56
+ else:
57
+ raise SystemExit("This script supports Python 3.8 or higher")
58
+
59
+ __all__ = [
60
+ "LayerNormBase",
61
+ "LayerNorm",
62
+ "RMSLayerNorm",
63
+ "RotaryEmbedding",
64
+ "Activation",
65
+ "GELU",
66
+ "ReLU",
67
+ "SwiGLU",
68
+ "OLMoBlock",
69
+ "OLMoSequentialBlock",
70
+ "OLMo",
71
+ "OLMoOutput",
72
+ "OLMoGenerateOutput",
73
+ ]
74
+
75
+ log = logging.getLogger(__name__)
76
+
77
+ class FANLayer(nn.Module):
78
+ """
79
+ FANLayer: The layer used in FAN (https://arxiv.org/abs/2410.02675).
80
+
81
+ Args:
82
+ input_dim (int): The number of input features.
83
+ output_dim (int): The number of output features.
84
+ p_ratio (float): The ratio of output dimensions used for cosine and sine parts (default: 0.25).
85
+ activation (str or callable): The activation function to apply to the g component. If a string is passed,
86
+ the corresponding activation from torch.nn.functional is used (default: 'gelu').
87
+ use_p_bias (bool): If True, include a bias in the linear transformation of the p component (default: True).
88
+ In our experiments, there is almost no difference between using and omitting this bias.
89
+ """
90
+
91
+ def __init__(self, input_dim, output_dim, p_ratio=0.25, activation='gelu', use_p_bias=True):
92
+ super(FANLayer, self).__init__()
93
+
94
+ # Ensure the p_ratio is within a valid range
95
+ assert 0 <= p_ratio <= 0.5, "p_ratio must be between 0 and 0.5"
96
+
97
+ self.p_ratio = p_ratio
98
+ p_output_dim = int(output_dim * self.p_ratio)
99
+ g_output_dim = output_dim - p_output_dim * 2 # Account for cosine and sine terms
100
+
101
+
102
+ self.input_linear = nn.Linear(input_dim, p_output_dim+g_output_dim, bias=use_p_bias)
103
+
104
+ self.fused_dims = (p_output_dim, g_output_dim)
105
+
106
+ # Set the activation function
107
+ if isinstance(activation, str):
108
+ self.activation = getattr(F, activation)
109
+ else:
110
+ self.activation = activation if activation else lambda x: x
111
+
112
+ def forward(self, src, norm_g=None):
113
+ """
114
+ Args:
115
+ src (Tensor): Input tensor of shape (batch_size, input_dim).
116
+
117
+ Returns:
118
+ Tensor: Output tensor of shape (batch_size, output_dim), after applying the FAN layer.
119
+ """
120
+ pg = self.input_linear(src)
121
+
122
+ p, g = pg.split(self.fused_dims, dim=-1)
123
+
124
+ # Concatenate cos(p), sin(p), and activated g along the last dimension
125
+ output = torch.cat((torch.cos(p), torch.sin(p), self.activation(g)), dim=-1)
126
+
127
+ return output
128
+
129
+ class FAN(nn.Module):
130
+ def __init__(self, input_dim, output_dim, config, activation='gelu'):
131
+ super(FAN, self).__init__()
132
+
133
+ self.fanlayer = FANLayer(input_dim, input_dim, config.p_ratio, activation)
134
+ self.linear = nn.Linear(input_dim, output_dim, bias=config.include_bias, device=config.init_device)
135
+
136
+ def forward(self, src):
137
+ return self.linear(self.fanlayer(src))
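To make the dimension bookkeeping in `FANLayer` concrete, here is a small sketch (illustrative only, not part of the uploaded file):

```python
import torch

# With output_dim=8 and the default p_ratio=0.25, FANLayer allocates
# p_output_dim = int(8 * 0.25) = 2 periodic features and
# g_output_dim = 8 - 2 * 2 = 4 gated features, so the concatenation
# [cos(p), sin(p), activation(g)] has 2 + 2 + 4 = 8 features again.
layer = FANLayer(input_dim=16, output_dim=8)
out = layer(torch.randn(4, 16))
assert out.shape == (4, 8)
```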
138
+
139
+
140
+ def activation_checkpoint_function(cfg: ModelConfig):
141
+ preserve_rng_state = not (
142
+ (cfg.attention_dropout == 0.0) and (cfg.embedding_dropout == 0.0) and (cfg.residual_dropout == 0.0)
143
+ )
144
+ from torch.utils.checkpoint import checkpoint
145
+
146
+ return partial(
147
+ checkpoint,
148
+ preserve_rng_state=preserve_rng_state,
149
+ use_reentrant=False,
150
+ )
151
+
152
+
153
+ def should_checkpoint_block(strategy: Optional[ActivationCheckpointingStrategy], block_idx: int) -> bool:
154
+ if strategy is None:
155
+ return False
156
+ elif (
157
+ (strategy == ActivationCheckpointingStrategy.whole_layer)
158
+ or (strategy == ActivationCheckpointingStrategy.one_in_two and block_idx % 2 == 0)
159
+ or (strategy == ActivationCheckpointingStrategy.one_in_three and block_idx % 3 == 0)
160
+ or (strategy == ActivationCheckpointingStrategy.one_in_four and block_idx % 4 == 0)
161
+ or (strategy == ActivationCheckpointingStrategy.one_in_eight and block_idx % 8 == 0)
162
+ or (strategy == ActivationCheckpointingStrategy.two_in_three and block_idx % 3 != 0)
163
+ or (strategy == ActivationCheckpointingStrategy.three_in_four and block_idx % 4 != 0)
164
+ ):
165
+ return True
166
+ else:
167
+ return False
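As a concrete reading of the modular arithmetic above (illustrative only, not part of the uploaded file):

```python
# one_in_four checkpoints block indices where block_idx % 4 == 0; three_in_four
# checkpoints the complementary set of indices.
one_in_four = [
    i for i in range(8) if should_checkpoint_block(ActivationCheckpointingStrategy.one_in_four, i)
]
three_in_four = [
    i for i in range(8) if should_checkpoint_block(ActivationCheckpointingStrategy.three_in_four, i)
]
assert one_in_four == [0, 4]
assert three_in_four == [1, 2, 3, 5, 6, 7]
```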
168
+
169
+
170
+ class BufferCache(dict, MutableMapping[str, torch.Tensor]):
171
+ """
172
+ Cache for attention biases and other things that would normally be stored as buffers.
173
+ We avoid using buffers because we've run into various issues doing so with FSDP.
174
+ In general it appears the way FSDP handles buffers is not well-defined.
175
+ It doesn't shard them but apparently it does synchronize them across processes, which we want to avoid
176
+ since (A) it isn't necessary, and (B) we sometimes have `-inf` in these biases which might get turned into
177
+ NaNs when they're synchronized due to casting or some other issue.
178
+ """
179
+
180
+
181
+ def _non_meta_init_device(config: ModelConfig) -> torch.device:
182
+ if config.init_device is not None and config.init_device != "meta":
183
+ return torch.device(config.init_device)
184
+ else:
185
+ if torch.backends.mps.is_available():
186
+ return torch.device("mps")
187
+ elif torch.cuda.is_available():
188
+ return torch.device("cuda")
189
+ else:
190
+ return torch.device("cpu")
191
+
192
+
193
+ class Dropout(nn.Dropout):
194
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
195
+ if self.p == 0.0:
196
+ return input
197
+ else:
198
+ return F.dropout(input, self.p, self.training, self.inplace)
199
+
200
+
201
+ class LayerNormBase(nn.Module):
202
+ def __init__(
203
+ self,
204
+ config: ModelConfig,
205
+ *,
206
+ size: Optional[int] = None,
207
+ elementwise_affine: Optional[bool] = True,
208
+ ):
209
+ super().__init__()
210
+ self.config = config
211
+ self.eps = config.layer_norm_eps
212
+ self.normalized_shape = (size or config.d_model,)
213
+ if elementwise_affine or (elementwise_affine is None and self.config.layer_norm_with_affine):
214
+ self.weight = nn.Parameter(torch.ones(self.normalized_shape, device=config.init_device))
215
+ use_bias = self.config.bias_for_layer_norm
216
+ if use_bias is None:
217
+ use_bias = self.config.include_bias
218
+ if use_bias:
219
+ self.bias = nn.Parameter(torch.zeros(self.normalized_shape, device=config.init_device))
220
+ else:
221
+ self.register_parameter("bias", None)
222
+ else:
223
+ self.register_parameter("bias", None)
224
+ self.register_parameter("weight", None)
225
+
226
+ @abstractmethod
227
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
228
+ raise NotImplementedError
229
+
230
+ @classmethod
231
+ def build(cls, config: ModelConfig, size: Optional[int] = None, **kwargs) -> LayerNormBase:
232
+ if config.layer_norm_type == LayerNormType.default:
233
+ return LayerNorm(config, size=size, low_precision=False, **kwargs)
234
+ elif config.layer_norm_type == LayerNormType.low_precision:
235
+ return LayerNorm(config, size=size, low_precision=True, **kwargs)
236
+ elif config.layer_norm_type == LayerNormType.rms:
237
+ return RMSLayerNorm(config, size=size, **kwargs)
238
+ else:
239
+ raise NotImplementedError(f"Unknown LayerNorm type: '{config.layer_norm_type}'")
240
+
241
+ def _cast_if_autocast_enabled(self, tensor: torch.Tensor, dtype: Optional[torch.dtype] = None) -> torch.Tensor:
242
+ # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
243
+ # `is_autocast_cpu_enabled()` for CPU autocast.
244
+ # See https://github.com/pytorch/pytorch/issues/110966.
245
+ if tensor.device.type == "cuda" and torch.is_autocast_enabled():
246
+ return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_gpu_dtype())
247
+ elif tensor.device.type == "cpu" and torch.is_autocast_cpu_enabled():
248
+ return tensor.to(dtype=dtype if dtype is not None else torch.get_autocast_cpu_dtype())
249
+ else:
250
+ return tensor
251
+
252
+ def reset_parameters(self):
253
+ if self.weight is not None:
254
+ torch.nn.init.ones_(self.weight) # type: ignore
255
+ if self.bias is not None:
256
+ torch.nn.init.zeros_(self.bias) # type: ignore
257
+
258
+
259
+ class LayerNorm(LayerNormBase):
260
+ """
261
+ The default :class:`LayerNorm` implementation which can optionally run in low precision.
262
+ """
263
+
264
+ def __init__(
265
+ self,
266
+ config: ModelConfig,
267
+ size: Optional[int] = None,
268
+ low_precision: bool = False,
269
+ elementwise_affine: Optional[bool] = None,
270
+ ):
271
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine)
272
+ self.low_precision = low_precision
273
+
274
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
275
+ if self.low_precision:
276
+ module_device = x.device
277
+ downcast_x = self._cast_if_autocast_enabled(x)
278
+ downcast_weight = (
279
+ self._cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
280
+ )
281
+ downcast_bias = self._cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
282
+ with torch.autocast(enabled=False, device_type=module_device.type):
283
+ return F.layer_norm(
284
+ downcast_x, self.normalized_shape, weight=downcast_weight, bias=downcast_bias, eps=self.eps
285
+ )
286
+ else:
287
+ return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
288
+
289
+
290
+ class RMSLayerNorm(LayerNormBase):
291
+ """
292
+ RMS layer norm, a simplified :class:`LayerNorm` implementation.
293
+ """
294
+
295
+ def __init__(
296
+ self,
297
+ config: ModelConfig,
298
+ size: Optional[int] = None,
299
+ elementwise_affine: Optional[bool] = None,
300
+ ):
301
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine)
302
+
303
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
304
+ with torch.autocast(enabled=False, device_type=x.device.type):
305
+ og_dtype = x.dtype
306
+ x = x.to(torch.float32)
307
+ variance = x.pow(2).mean(-1, keepdim=True)
308
+ x = x * torch.rsqrt(variance + self.eps)
309
+ x = x.to(og_dtype)
310
+
311
+ if self.weight is not None:
312
+ if self.bias is not None:
313
+ return self.weight * x + self.bias
314
+ else:
315
+ return self.weight * x
316
+ else:
317
+ return x
318
+
319
+
320
+ class RotaryEmbedding(nn.Module):
321
+ """
322
+ [Rotary positional embeddings (RoPE)](https://arxiv.org/abs/2104.09864).
323
+ """
324
+
325
+ def __init__(self, config: ModelConfig, cache: BufferCache):
326
+ super().__init__()
327
+ self.config = config
328
+ self.__cache = cache
329
+ # Warm up cache.
330
+ self.get_rotary_embedding(config.max_sequence_length, _non_meta_init_device(config))
331
+
332
+ def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
333
+ if (
334
+ (pos_sin := self.__cache.get("rope_pos_sin")) is not None
335
+ and (pos_cos := self.__cache.get("rope_pos_cos")) is not None
336
+ and pos_sin.shape[-2] >= seq_len
337
+ and pos_cos.shape[-2] >= seq_len
338
+ ):
339
+ if pos_sin.device != device:
340
+ pos_sin = pos_sin.to(device)
341
+ self.__cache["rope_pos_sin"] = pos_sin
342
+ if pos_cos.device != device:
343
+ pos_cos = pos_cos.to(device)
344
+ self.__cache["rope_pos_cos"] = pos_cos
345
+ return pos_sin[:, :, :seq_len, :], pos_cos[:, :, :seq_len, :]
346
+
347
+ with torch.autocast(device.type, enabled=False):
348
+ dim = self.config.d_model // self.config.n_heads
349
+ inv_freq = 1.0 / (
350
+ self.config.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim)
351
+ )
352
+ seq = torch.arange(seq_len, device=device, dtype=torch.float)
353
+ freqs = einsum("i , j -> i j", seq, inv_freq)
354
+ positions = torch.cat((freqs, freqs), dim=-1)
355
+ pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
356
+ self.__cache["rope_pos_sin"] = pos_sin
357
+ self.__cache["rope_pos_cos"] = pos_cos
358
+ return pos_sin, pos_cos
359
+
360
+ def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
361
+ B, nh, T, hs = x.size()
362
+ x = x.view(B, nh, T, 2, hs // 2)
363
+ x1, x2 = x.unbind(dim=-2)
364
+ return torch.cat((-x2, x1), dim=-1)
365
+
366
+ def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
367
+ return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
368
+
369
+ def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
370
+ if self.config.rope_full_precision:
371
+ q_, k_ = q.float(), k.float()
372
+ else:
373
+ q_, k_ = q, k
374
+
375
+ with torch.autocast(q.device.type, enabled=False):
376
+ query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
377
+ pos_sin, pos_cos = self.get_rotary_embedding(key_len, q_.device)
378
+ pos_sin = pos_sin.type_as(q_)
379
+ pos_cos = pos_cos.type_as(q_)
380
+ q_ = self.apply_rotary_pos_emb(
381
+ pos_sin[:, :, key_len - query_len : key_len, :],
382
+ pos_cos[:, :, key_len - query_len : key_len, :],
383
+ q_,
384
+ )
385
+ k_ = self.apply_rotary_pos_emb(pos_sin, pos_cos, k_)
386
+ return q_.type_as(q), k_.type_as(k)
387
+
388
+
389
+ class Activation(nn.Module):
390
+ def __init__(self, config: ModelConfig):
391
+ super().__init__()
392
+ self.config = config
393
+
394
+ @abstractmethod
395
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
396
+ raise NotImplementedError
397
+
398
+ @property
399
+ @abstractmethod
400
+ def output_multiplier(self) -> float:
401
+ raise NotImplementedError
402
+
403
+ @classmethod
404
+ def build(cls, config: ModelConfig) -> Activation:
405
+ if config.activation_type == ActivationType.gelu:
406
+ return cast(Activation, GELU(approximate="none"))
407
+ elif config.activation_type == ActivationType.relu:
408
+ return cast(Activation, ReLU(inplace=False))
409
+ elif config.activation_type == ActivationType.swiglu:
410
+ return SwiGLU(config)
411
+ else:
412
+ raise NotImplementedError(f"Unknown activation: '{config.activation_type}'")
413
+
414
+
415
+ class GELU(nn.GELU):
416
+ @property
417
+ def output_multiplier(self) -> float:
418
+ return 1.0
419
+
420
+
421
+ class ReLU(nn.ReLU):
422
+ @property
423
+ def output_multiplier(self) -> float:
424
+ return 1.0
425
+
426
+
427
+ class SwiGLU(Activation):
428
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
429
+ x, gate = x.chunk(2, dim=-1)
430
+ return F.silu(gate) * x
431
+
432
+ @property
433
+ def output_multiplier(self) -> float:
434
+ return 0.5
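A brief note on `output_multiplier` (illustrative only, not part of the uploaded file): SwiGLU halves the feature dimension because it chunks its input into a value and a gate, and `OLMoBlock` sizes `ff_out` with `int(act.output_multiplier * hidden_size)` to account for this.

```python
import torch

# SwiGLU chunks its input into (value, gate) halves, so a projection of width
# hidden_size yields hidden_size // 2 activated features; GELU/ReLU (multiplier
# 1.0) keep the width unchanged.
h = torch.randn(2, 5, 1024)          # pretend ff_proj output with hidden_size=1024
x, gate = h.chunk(2, dim=-1)
assert (torch.nn.functional.silu(gate) * x).shape == (2, 5, 512)
```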
435
+
436
+
437
+ def causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor:
438
+ att_bias = torch.triu(
439
+ torch.ones(seq_len, seq_len, device=device, dtype=torch.float),
440
+ diagonal=1,
441
+ )
442
+ att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min)
443
+ return att_bias.view(1, 1, seq_len, seq_len) # type: ignore
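For reference, the shape of the mask this produces (illustrative only, not part of the uploaded file):

```python
import torch

# For seq_len=3 the bias is (1, 1, 3, 3) with the strictly upper triangle set to
# the most negative finite float (an effective -inf before softmax), so query i
# can only attend to keys j <= i:
#   [[0, m, m],
#    [0, 0, m],
#    [0, 0, 0]]   with m = torch.finfo(torch.float).min
bias = causal_attention_bias(3, torch.device("cpu"))
assert bias.shape == (1, 1, 3, 3)
assert bias[0, 0, 0, 1] == torch.finfo(torch.float).min and bias[0, 0, 1, 0] == 0
```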
444
+
445
+
446
+ def get_causal_attention_bias(cache: BufferCache, seq_len: int, device: torch.device) -> torch.Tensor:
447
+ if (causal_bias := cache.get("causal_attention_bias")) is not None and causal_bias.shape[-1] >= seq_len:
448
+ if causal_bias.device != device:
449
+ causal_bias = causal_bias.to(device)
450
+ cache["causal_attention_bias"] = causal_bias
451
+ return causal_bias
452
+ with torch.autocast(device.type, enabled=False):
453
+ causal_bias = causal_attention_bias(seq_len, device)
454
+ cache["causal_attention_bias"] = causal_bias
455
+ return causal_bias
456
+
457
+
458
+ def alibi_attention_bias(seq_len: int, config: ModelConfig, device: torch.device) -> torch.FloatTensor:
459
+ alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, 1, seq_len)
460
+
461
+ # shape: (1, 1, seq_len, seq_len)
462
+ alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.float, device=device).view(1, 1, seq_len, 1)
463
+ alibi_bias.abs_().mul_(-1)
464
+
465
+ # shape: (n_heads,)
466
+ m = torch.arange(1, config.n_heads + 1, dtype=torch.float, device=device)
467
+ m.mul_(config.alibi_bias_max / config.n_heads)
468
+
469
+ # shape: (1, n_heads, seq_len, seq_len)
470
+ return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1))) # type: ignore
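And a note on the ALiBi slopes computed above (illustrative only, not part of the uploaded file):

```python
import torch

# Each head h (1-indexed) is scaled by 1 / 2 ** (h * alibi_bias_max / n_heads),
# so the bias grows more negative linearly with query/key distance at a per-head
# rate. With n_heads=8 and alibi_bias_max=8 the slopes are 1/2, 1/4, ..., 1/256,
# i.e. the geometric sequence used in the ALiBi paper.
m = torch.arange(1, 9, dtype=torch.float) * (8 / 8)
slopes = 1.0 / (2 ** m)
assert torch.allclose(slopes, torch.tensor([2.0 ** -i for i in range(1, 9)]))
```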
471
+
472
+
473
+ class OLMoBlock(nn.Module):
474
+ """
475
+ A base class for transformer block implementations.
476
+ """
477
+
478
+ def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
479
+ super().__init__()
480
+ self.layer_id = layer_id
481
+ self.config = config
482
+ self.hidden_size = (
483
+ config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
484
+ )
485
+ self.__cache = cache
486
+ assert config.d_model % config.n_heads == 0
487
+
488
+ self._activation_checkpoint_fn: Optional[Callable] = None
489
+
490
+ # Dropout.
491
+ self.dropout = Dropout(config.residual_dropout)
492
+
493
+ # Layer norms.
494
+ self.k_norm: Optional[LayerNormBase] = None
495
+ self.q_norm: Optional[LayerNormBase] = None
496
+ if config.attention_layer_norm:
497
+ assert config.effective_n_kv_heads is not None
498
+ self.k_norm = LayerNormBase.build(
499
+ config,
500
+ size=(config.d_model // config.n_heads) * config.effective_n_kv_heads,
501
+ elementwise_affine=config.attention_layer_norm_with_affine,
502
+ )
503
+ self.q_norm = LayerNormBase.build(config, elementwise_affine=config.attention_layer_norm_with_affine)
504
+
505
+ # Make sure QKV clip coefficient is positive, otherwise it's not well-defined.
506
+ if config.clip_qkv is not None:
507
+ assert config.clip_qkv > 0
508
+
509
+ # Activation function.
510
+ self.act = Activation.build(config)
511
+ assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
512
+
513
+ # Attention output projection.
514
+ self.attn_out = nn.Linear(
515
+ config.d_model, config.d_model, bias=config.include_bias, device=config.init_device
516
+ )
517
+
518
+ # Feed-forward output projection.
519
+ self.ff_out = nn.Linear(
520
+ int(self.act.output_multiplier * self.hidden_size),
521
+ config.d_model,
522
+ bias=config.include_bias,
523
+ device=config.init_device,
524
+ )
525
+ self.ff_out._is_residual = True # type: ignore
526
+
527
+ # Rotary embeddings.
528
+ if self.config.rope:
529
+ self.rotary_emb = RotaryEmbedding(config, self.__cache)
530
+
531
+ self.flash_attn_func = None
532
+ self.flash_attn_varlen_func = None
533
+ if config.flash_attention:
534
+ try:
535
+ from flash_attn import ( # type: ignore
536
+ flash_attn_func,
537
+ flash_attn_varlen_func,
538
+ )
539
+
540
+ self.flash_attn_func = flash_attn_func
541
+ self.flash_attn_varlen_func = flash_attn_varlen_func
542
+ except ModuleNotFoundError:
543
+ pass
544
+
545
+ def reset_parameters(self):
546
+ if self.k_norm is not None:
547
+ self.k_norm.reset_parameters()
548
+ if self.q_norm is not None:
549
+ self.q_norm.reset_parameters()
550
+
551
+ if self.config.init_fn == InitFnType.normal:
552
+ attn_out_std = ff_out_std = self.config.init_std
553
+ cutoff_factor = self.config.init_cutoff_factor
554
+
555
+ elif self.config.init_fn == InitFnType.mitchell:
556
+ attn_out_std = 1 / (math.sqrt(2 * self.config.d_model * (self.layer_id + 1)))
557
+ ff_out_std = 1 / (math.sqrt(2 * self.ff_out.in_features * (self.layer_id + 1)))
558
+ cutoff_factor = self.config.init_cutoff_factor or 3.0
559
+
560
+ elif self.config.init_fn == InitFnType.full_megatron:
561
+ attn_out_std = ff_out_std = self.config.init_std / math.sqrt(2.0 * self.config.n_layers)
562
+ cutoff_factor = self.config.init_cutoff_factor or 3.0
563
+
564
+ else:
565
+ raise NotImplementedError(self.config.init_fn)
566
+
567
+ init_normal(self.attn_out, std=attn_out_std, init_cutoff_factor=cutoff_factor)
568
+ init_normal(self.ff_out, std=ff_out_std, init_cutoff_factor=cutoff_factor)
569
+
570
+ def set_activation_checkpointing(
571
+ self, strategy: Optional[ActivationCheckpointingStrategy], checkpoint_func: Optional[Callable] = None
572
+ ):
573
+ if strategy == ActivationCheckpointingStrategy.fine_grained:
574
+ self._activation_checkpoint_fn = checkpoint_func or activation_checkpoint_function(self.config)
575
+ else:
576
+ self._activation_checkpoint_fn = None
577
+
578
+ @classmethod
579
+ def _cast_attn_bias(cls, bias: torch.Tensor, input_dtype: torch.dtype) -> torch.Tensor:
580
+ target_dtype = input_dtype
581
+ # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
582
+ # `is_autocast_cpu_enabled()` for CPU autocast.
583
+ # See https://github.com/pytorch/pytorch/issues/110966.
584
+ if bias.device.type == "cuda" and torch.is_autocast_enabled():
585
+ target_dtype = torch.get_autocast_gpu_dtype()
586
+ elif bias.device.type == "cpu" and torch.is_autocast_cpu_enabled():
587
+ target_dtype = torch.get_autocast_cpu_dtype()
588
+ elif bias.device.type == "mps":
589
+ target_dtype = torch.get_autocast_dtype("mps")
590
+ if bias.dtype != target_dtype:
591
+ bias = bias.to(target_dtype)
592
+ ensure_finite_(bias, check_neg_inf=True, check_pos_inf=False)
593
+ return bias
594
+
595
+ def _scaled_dot_product_attention(
596
+ self,
597
+ q: torch.Tensor,
598
+ k: torch.Tensor,
599
+ v: torch.Tensor,
600
+ attn_mask: Optional[torch.Tensor] = None,
601
+ dropout_p: float = 0.0,
602
+ is_causal: bool = False,
603
+ max_doc_len: Optional[int] = None,
604
+ cu_doc_lens: Optional[torch.Tensor] = None,
605
+ ) -> torch.Tensor:
606
+ """
607
+ Computes scaled dot product attention on query, key and value tensors, using an optional
608
+ attention mask if passed, and applying dropout if a probability greater than 0.0 is specified.
609
+ """
610
+ if max_doc_len is not None and cu_doc_lens is not None:
611
+ assert self.flash_attn_varlen_func is not None, "flash-attn is required for document masking"
612
+ assert attn_mask is None, "attn-mask is currently not supported with document masking"
613
+ B, T, D = q.size(0), q.size(2), q.size(3)
614
+ r = self.flash_attn_varlen_func(
615
+ q.transpose(1, 2).view(B * T, -1, D),
616
+ k.transpose(1, 2).view(B * T, -1, D),
617
+ v.transpose(1, 2).view(B * T, -1, D),
618
+ cu_doc_lens,
619
+ cu_doc_lens,
620
+ max_doc_len,
621
+ max_doc_len,
622
+ dropout_p=dropout_p,
623
+ causal=is_causal,
624
+ )
625
+ return r.view(B, T, -1, D).transpose(1, 2)
626
+ elif self.flash_attn_func is not None and attn_mask is None:
627
+ r = self.flash_attn_func(
628
+ q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p, causal=is_causal
629
+ )
630
+ return r.transpose(1, 2)
631
+ else:
632
+ # torch's sdpa doesn't support GQA, so we're doing this
633
+ assert k.size(1) == v.size(1)
634
+ num_kv_heads = k.size(1)
635
+ num_q_heads = q.size(1)
636
+ if num_q_heads != num_kv_heads:
637
+ assert num_q_heads % num_kv_heads == 0
638
+ k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
639
+ v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
640
+
641
+ return F.scaled_dot_product_attention(
642
+ q,
643
+ k,
644
+ v,
645
+ attn_mask=attn_mask,
646
+ dropout_p=dropout_p,
647
+ is_causal=is_causal,
648
+ )
649
+
650
+ def attention(
651
+ self,
652
+ q: torch.Tensor,
653
+ k: torch.Tensor,
654
+ v: torch.Tensor,
655
+ attention_bias: Optional[torch.Tensor] = None,
656
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
657
+ use_cache: bool = False,
658
+ max_doc_len: Optional[int] = None,
659
+ cu_doc_lens: Optional[torch.Tensor] = None,
660
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
661
+ B, T, C = q.size() # batch size, sequence length, d_model
662
+ dtype = k.dtype
663
+
664
+ # Optionally apply layer norm to keys and queries.
665
+ if self.q_norm is not None and self.k_norm is not None:
666
+ q = self.q_norm(q).to(dtype=dtype)
667
+ k = self.k_norm(k).to(dtype=dtype)
668
+
669
+ # Move head forward to be next to the batch dim.
670
+ # shape: (B, nh, T, hs)
671
+ q = q.view(B, T, self.config.n_heads, C // self.config.n_heads).transpose(1, 2)
672
+ # shape: (B, n_kv_h, T, hs)
673
+ k = k.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
674
+ # shape: (B, n_kv_h, T, hs)
675
+ v = v.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
676
+
677
+ if layer_past is not None:
678
+ past_key, past_value = layer_past
679
+ k = torch.cat((past_key, k), dim=-2)
680
+ v = torch.cat((past_value, v), dim=-2)
681
+
682
+ present = (k, v) if use_cache else None
683
+ query_len, key_len = q.shape[-2], k.shape[-2] # could be different if layer_past not None
684
+
685
+ if self.config.rope:
686
+ # Apply rotary embeddings.
687
+ q, k = self.rotary_emb(q, k)
688
+
689
+ if attention_bias is not None:
690
+ # Resize and cast attention bias.
691
+ # The current dtype of the attention bias might not match the dtype that the SDP attn function will
692
+ # run in if AMP is enabled, and this can be a problem if some tokens are masked out due to padding
693
+ # as down-casting the attention bias to the autocast precision will result in -infs, which will
694
+ # cause the SDP attn function to produce NaNs.
695
+ attention_bias = self._cast_attn_bias(
696
+ attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype
697
+ )
698
+
699
+ # Get the attention scores.
700
+ # shape: (B, nh, T, hs)
701
+ att = self._scaled_dot_product_attention(
702
+ q,
703
+ k,
704
+ v,
705
+ attn_mask=attention_bias,
706
+ dropout_p=0.0 if not self.training else self.config.attention_dropout,
707
+ is_causal=attention_bias is None,
708
+ max_doc_len=max_doc_len,
709
+ cu_doc_lens=cu_doc_lens,
710
+ )
711
+
712
+ # Re-assemble all head outputs side-by-side.
713
+ att = att.transpose(1, 2).contiguous().view(B, T, C)
714
+
715
+ # Apply output projection.
716
+ return self.attn_out(att), present
717
+
718
+ @abstractmethod
719
+ def forward(
720
+ self,
721
+ x: torch.Tensor,
722
+ attention_bias: Optional[torch.FloatTensor] = None,
723
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
724
+ use_cache: bool = False,
725
+ max_doc_len: Optional[int] = None,
726
+ cu_doc_lens: Optional[torch.Tensor] = None,
727
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
728
+ raise NotImplementedError
729
+
730
+ @classmethod
731
+ def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OLMoBlock:
732
+ if config.block_type == BlockType.sequential:
733
+ return OLMoSequentialBlock(layer_id, config, cache)
734
+ elif config.block_type == BlockType.llama:
735
+ return OLMoLlamaBlock(layer_id, config, cache)
736
+ else:
737
+ raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
738
+
739
+
740
+ class OLMoSequentialBlock(OLMoBlock):
741
+ """
742
+ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
743
+ (plus another skip connection). To compute it as ``LN(MLP(x + LN(Attention(x))))``,
744
+ use the flag `norm_after`.
745
+ """
746
+
747
+ def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
748
+ super().__init__(layer_id, config, cache)
749
+ # Attention input projection. Projects x -> (q, k, v)
750
+ self.use_ATF = config.use_ATF
751
+
752
+ head_dim = config.d_model // config.n_heads
753
+ self.fused_dims = (
754
+ config.d_model,
755
+ config.effective_n_kv_heads * head_dim,
756
+ config.effective_n_kv_heads * head_dim,
757
+ )
758
+
759
+
760
+ if self.use_ATF:
761
+ self.att_proj = FAN(config.d_model, sum(self.fused_dims), config, activation=config.attention_activation)
762
+ else:
763
+ self.att_proj = nn.Linear(
764
+ config.d_model, sum(self.fused_dims), bias=config.include_bias, device=config.init_device
765
+ )
766
+
767
+ # Feed-forward input projection.
768
+ self.ff_proj = nn.Linear(
769
+ config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
770
+ )
771
+
772
+ # Layer norms.
773
+ self.attn_norm = LayerNorm.build(config, size=config.d_model)
774
+ self.ff_norm = LayerNorm.build(config, size=config.d_model)
775
+
776
+ def reset_parameters(self):
777
+ super().reset_parameters()
778
+ self.attn_norm.reset_parameters()
779
+ self.ff_norm.reset_parameters()
780
+ # NOTE: the standard deviation for these weights does not depend on the layer.
781
+
782
+ if self.config.init_fn == InitFnType.normal:
783
+ std = self.config.init_std
784
+ cutoff_factor = self.config.init_cutoff_factor
785
+ elif self.config.init_fn == InitFnType.mitchell:
786
+ std = 1 / math.sqrt(self.config.d_model)
787
+ cutoff_factor = self.config.init_cutoff_factor or 3.0
788
+ elif self.config.init_fn == InitFnType.full_megatron:
789
+ std = self.config.init_std
790
+ cutoff_factor = self.config.init_cutoff_factor or 3.0
791
+ else:
792
+ raise NotImplementedError(self.config.init_fn)
793
+
794
+ if self.use_ATF:
795
+ init_normal(self.att_proj.fanlayer.input_linear, std, cutoff_factor)
796
+ init_normal(self.att_proj.linear, std, cutoff_factor)
797
+ else:
798
+ init_normal(self.att_proj, std, cutoff_factor)
799
+
800
+ init_normal(self.ff_proj, std, cutoff_factor)
801
+
802
+ def forward(
803
+ self,
804
+ x: torch.Tensor,
805
+ attention_bias: Optional[torch.Tensor] = None,
806
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
807
+ use_cache: bool = False,
808
+ max_doc_len: Optional[int] = None,
809
+ cu_doc_lens: Optional[torch.Tensor] = None,
810
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
811
+ # Get query, key, value projections.
812
+ # shape:
813
+ # - for regular attn q, k, v: (batch_size, seq_len, d_model)
814
+ # - for multi-query attn q: (batch_size, seq_len, d_model)
815
+ # k, v: (batch_size, seq_len, d_model // n_heads)
816
+ # - for group query attn q: (batch_size, seq_len, d_model)
817
+ # k, v: (batch_size, seq_len, d_model // n_kv_heads)
818
+
819
+ # apply norm before
820
+ if not self.config.norm_after:
821
+ if self._activation_checkpoint_fn is not None:
822
+ h = self._activation_checkpoint_fn(self.attn_norm, x)
823
+ else:
824
+ h = self.attn_norm(x)
825
+ else:
826
+ h = x
827
+
828
+ qkv = self.att_proj(h)
829
+
830
+ if self.config.clip_qkv is not None:
831
+ qkv.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
832
+
833
+ q, k, v = qkv.split(self.fused_dims, dim=-1)
834
+
835
+ # Get attention scores.
836
+ if self._activation_checkpoint_fn is not None:
837
+ att, cache = self._activation_checkpoint_fn( # type: ignore
838
+ self.attention,
839
+ q,
840
+ k,
841
+ v,
842
+ attention_bias,
843
+ layer_past=layer_past,
844
+ use_cache=use_cache,
845
+ max_doc_len=max_doc_len,
846
+ cu_doc_lens=cu_doc_lens,
847
+ )
848
+ else:
849
+ att, cache = self.attention(
850
+ q,
851
+ k,
852
+ v,
853
+ attention_bias,
854
+ layer_past=layer_past,
855
+ use_cache=use_cache,
856
+ max_doc_len=max_doc_len,
857
+ cu_doc_lens=cu_doc_lens,
858
+ )
859
+
860
+ if self.config.norm_after:
861
+ if self._activation_checkpoint_fn is not None:
862
+ att = self._activation_checkpoint_fn(self.attn_norm, att)
863
+ else:
864
+ att = self.attn_norm(att)
865
+
866
+ # Add attention scores.
867
+ # shape: (B, T, C)
868
+ x = x + self.dropout(att)
869
+
870
+ # Add feed-forward projection.
871
+ # shape: (batch_size, seq_len, d_model)
872
+ og_x = x
873
+
874
+ if not self.config.norm_after:
875
+ if self._activation_checkpoint_fn is not None:
876
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
877
+ else:
878
+ x = self.ff_norm(x)
879
+
880
+ x = self.ff_proj(x)
881
+
882
+ if self._activation_checkpoint_fn is not None:
883
+ x = self._activation_checkpoint_fn(self.act, x) # type: ignore
884
+ else:
885
+ x = self.act(x)
886
+ x = self.ff_out(x)
887
+
888
+ if self.config.norm_after:
889
+ if self._activation_checkpoint_fn is not None:
890
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
891
+ else:
892
+ x = self.ff_norm(x)
893
+
894
+ x = self.dropout(x)
895
+ x = og_x + x
896
+
897
+ return x, cache
898
+
899
+
900
+ class OLMoLlamaBlock(OLMoBlock):
901
+ """
902
+ This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
903
+ (plus another skip connection). This block is similar to `OLMoSequentialBlock`
904
+ but some operations have slightly different implementations to imitate the
905
+ behavior of Llama.
906
+ """
907
+
908
+ def __init__(self, layer_id: int, config: ModelConfig, cache: BufferCache):
909
+ super().__init__(layer_id, config, cache)
910
+ # Layer norms.
911
+ self.use_ATF = config.use_ATF
912
+ self.attn_norm = LayerNorm.build(config)
913
+ self.ff_norm = LayerNorm.build(config)
914
+ self.__cache = cache
915
+
916
+ # Attention input projection. Projects x -> (q, k, v)
917
+ if config.multi_query_attention:
918
+ q_proj_out_dim = config.d_model
919
+ k_proj_out_dim = config.d_model // config.n_heads
920
+ v_proj_out_dim = config.d_model // config.n_heads
921
+ else:
922
+ q_proj_out_dim = config.d_model
923
+ k_proj_out_dim = config.d_model
924
+ v_proj_out_dim = config.d_model
925
+
926
+ if self.use_ATF:
927
+ self.q_proj = FAN(config.d_model, q_proj_out_dim, config)
928
+ self.k_proj = FAN(config.d_model, k_proj_out_dim, config)
929
+ self.v_proj = FAN(config.d_model, v_proj_out_dim, config)
930
+ else:
931
+ self.q_proj = nn.Linear(
932
+ config.d_model, q_proj_out_dim, bias=config.include_bias, device=config.init_device
933
+ )
934
+ self.k_proj = nn.Linear(
935
+ config.d_model, k_proj_out_dim, bias=config.include_bias, device=config.init_device
936
+ )
937
+ self.v_proj = nn.Linear(
938
+ config.d_model, v_proj_out_dim, bias=config.include_bias, device=config.init_device
939
+ )
940
+
941
+ # Feed-forward input projection.
942
+ self.ff_proj = nn.Linear(
943
+ config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
944
+ )
945
+
946
+ def reset_parameters(self):
947
+ super().reset_parameters()
948
+ self.attn_norm.reset_parameters()
949
+ self.ff_norm.reset_parameters()
950
+ # NOTE: the standard deviation for these weights does not depend on the layer.
951
+
952
+ if self.config.init_fn == InitFnType.normal:
953
+ std = self.config.init_std
954
+ cutoff_factor = self.config.init_cutoff_factor
955
+ elif self.config.init_fn == InitFnType.mitchell:
956
+ std = 1 / math.sqrt(self.config.d_model)
957
+ cutoff_factor = self.config.init_cutoff_factor or 3.0
958
+ elif self.config.init_fn == InitFnType.full_megatron:
959
+ std = self.config.init_std
960
+ cutoff_factor = self.config.init_cutoff_factor or 3.0
961
+ else:
962
+ raise NotImplementedError(self.config.init_fn)
963
+
964
+ init_normal(self.q_proj, std, cutoff_factor)
965
+ init_normal(self.k_proj, std, cutoff_factor)
966
+ init_normal(self.v_proj, std, cutoff_factor)
967
+ init_normal(self.ff_proj, std, cutoff_factor)
968
+
969
+ def _scaled_dot_product_attention(
970
+ self,
971
+ q: torch.Tensor,
972
+ k: torch.Tensor,
973
+ v: torch.Tensor,
974
+ attn_mask: Optional[torch.Tensor] = None,
975
+ dropout_p: float = 0.0,
976
+ is_causal: bool = False,
977
+ max_doc_len: Optional[int] = None,
978
+ cu_doc_lens: Optional[torch.Tensor] = None,
979
+ ) -> torch.Tensor:
980
+ if max_doc_len is not None or cu_doc_lens is not None:
981
+ raise NotImplementedError(
982
+ f"attention document masking is not implemented for {self.__class__.__name__}"
983
+ )
984
+
985
+ attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
986
+
987
+ if is_causal:
988
+ assert attn_mask is None
989
+
990
+ query_len, key_len = q.shape[-2], k.shape[-2] # could be different if layer_past not None
991
+ attn_bias = get_causal_attention_bias(self.__cache, key_len, q.device)[:, :, :query_len, :key_len]
992
+ elif attn_mask is not None:
993
+ attn_bias = attn_mask.to(q.dtype)
994
+ else:
995
+ attn_bias = torch.zeros_like(attn_weights)
996
+
997
+ attn_weights += attn_bias
998
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1).to(q.dtype)
999
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout_p)
1000
+ return torch.matmul(attn_weights, v)
1001
+
1002
+ def forward(
1003
+ self,
1004
+ x: torch.Tensor,
1005
+ attention_bias: Optional[torch.Tensor] = None,
1006
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1007
+ use_cache: bool = False,
1008
+ max_doc_len: Optional[int] = None,
1009
+ cu_doc_lens: Optional[torch.Tensor] = None,
1010
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
1011
+ # Get query, key, value projections.
1012
+ # shape:
1013
+ # - for regular attn q, k, v: (batch_size, seq_len, d_model)
1014
+ # - for multi-query attn q: (batch_size, seq_len, d_model)
1015
+ # k, v: (batch_size, seq_len, d_model // n_heads)
1016
+ x_normed = self.attn_norm(x)
1017
+ q = self.q_proj(x_normed)
1018
+ k = self.k_proj(x_normed)
1019
+ v = self.v_proj(x_normed)
1020
+
1021
+ if self.config.clip_qkv is not None:
1022
+ q.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
1023
+ k.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
1024
+ v.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
1025
+
1026
+ # Get attention scores.
1027
+ att, cache = self.attention(
1028
+ q,
1029
+ k,
1030
+ v,
1031
+ attention_bias,
1032
+ layer_past=layer_past,
1033
+ use_cache=use_cache,
1034
+ max_doc_len=max_doc_len,
1035
+ cu_doc_lens=cu_doc_lens,
1036
+ )
1037
+
1038
+ # Add attention scores.
1039
+ # shape: (B, T, C)
1040
+ x = x + self.dropout(att)
1041
+
1042
+ # Add feed-forward projection.
1043
+ # shape: (batch_size, seq_len, d_model)
1044
+ og_x = x
1045
+ if self._activation_checkpoint_fn is not None:
1046
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
1047
+ else:
1048
+ x = self.ff_norm(x)
1049
+ x = self.ff_proj(x)
1050
+ if self._activation_checkpoint_fn is not None:
1051
+ x = self._activation_checkpoint_fn(self.act, x) # type: ignore
1052
+ else:
1053
+ x = self.act(x)
1054
+ x = self.ff_out(x)
1055
+ x = self.dropout(x)
1056
+ x = og_x + x
1057
+
1058
+ return x, cache
1059
+
1060
+
1061
+ class OLMoOutput(NamedTuple):
1062
+ logits: torch.FloatTensor
1063
+ """
1064
+ A tensor of shape `(batch_size, seq_len, vocab_size)` containing the un-normalized scores (logits)
1065
+ for the next token. Apply a (log) softmax over the last dimension to obtain (log) probabilities.
1066
+ """
1067
+
1068
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]]
1069
+ """
1070
+ Attention keys and values from each block.
1071
+ """
1072
+
1073
+ hidden_states: Optional[Tuple[torch.Tensor, ...]]
1074
+ """
1075
+ Hidden states from each block.
1076
+ """
1077
+
1078
+
1079
+ class OLMoGenerateOutput(NamedTuple):
1080
+ token_ids: torch.LongTensor
1081
+ """
1082
+ The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`.
1083
+ These do *not* include the original input IDs.
1084
+ """
1085
+
1086
+ scores: torch.FloatTensor
1087
+ """
1088
+ The scores of the generated sequences, a tensor of shape `(batch_size, beam_size)`.
1089
+ """
1090
+
1091
+
1092
+ class OLMoBlockGroup(nn.ModuleList):
1093
+ def __init__(self, config: ModelConfig, layer_offset: int, modules: Optional[Iterable[nn.Module]] = None):
1094
+ super().__init__(modules)
1095
+ self.config = config
1096
+ self.layer_offset = layer_offset
1097
+ self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
1098
+ self._activation_checkpoint_fn = activation_checkpoint_function(self.config)
1099
+
1100
+ def forward(
1101
+ self,
1102
+ x: torch.Tensor,
1103
+ attention_bias: Optional[torch.FloatTensor] = None,
1104
+ layers_past: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
1105
+ use_cache: bool = False,
1106
+ max_doc_len: Optional[int] = None,
1107
+ cu_doc_lens: Optional[torch.Tensor] = None,
1108
+ ) -> Tuple[torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]]]:
1109
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
1110
+ for block_idx, block in enumerate(self):
1111
+ layer_past = None if layers_past is None else layers_past[block_idx]
1112
+ block_idx += self.layer_offset
1113
+ if should_checkpoint_block(self.activation_checkpointing_strategy, block_idx):
1114
+ # shape: (batch_size, seq_len, d_model)
1115
+ x, cache = self._activation_checkpoint_fn( # type: ignore
1116
+ block,
1117
+ x,
1118
+ attention_bias=attention_bias,
1119
+ layer_past=layer_past,
1120
+ use_cache=use_cache,
1121
+ max_doc_len=max_doc_len,
1122
+ cu_doc_lens=cu_doc_lens,
1123
+ )
1124
+ else:
1125
+ # shape: (batch_size, seq_len, d_model)
1126
+ x, cache = block(
1127
+ x,
1128
+ attention_bias=attention_bias,
1129
+ layer_past=layer_past,
1130
+ use_cache=use_cache,
1131
+ max_doc_len=max_doc_len,
1132
+ cu_doc_lens=cu_doc_lens,
1133
+ )
1134
+ if attn_key_values is not None:
1135
+ assert cache is not None
1136
+ attn_key_values.append(cache)
1137
+ return x, attn_key_values
1138
+
1139
+ def reset_parameters(self):
1140
+ for block in self:
1141
+ block.reset_parameters()
1142
+
1143
+ def set_activation_checkpointing(
1144
+ self, strategy: Optional[ActivationCheckpointingStrategy], checkpoint_func: Optional[Callable] = None
1145
+ ):
1146
+ self.activation_checkpointing_strategy = strategy
1147
+ for block in self:
1148
+ block.set_activation_checkpointing(strategy, checkpoint_func=checkpoint_func)
1149
+
1150
+
1151
+ class OLMo(nn.Module):
1152
+ def __init__(self, config: ModelConfig, init_params: bool = True):
1153
+ super().__init__()
1154
+ self.config = config
1155
+ self.__cache = BufferCache()
1156
+
1157
+ # Validate config.
1158
+ if self.config.alibi and self.config.flash_attention:
1159
+ raise OLMoConfigurationError("ALiBi is currently not supported with FlashAttention")
1160
+
1161
+ if self.config.alibi and self.config.rope:
1162
+ raise OLMoConfigurationError("ALiBi and RoPE are mutually exclusive")
1163
+
1164
+ if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
1165
+ if self.config.embedding_size < self.config.vocab_size:
1166
+ raise OLMoConfigurationError("embedding size should be at least as big as vocab size")
1167
+ elif self.config.embedding_size % 128 != 0:
1168
+ import warnings
1169
+
1170
+ warnings.warn(
1171
+ "Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
1172
+ )
1173
+
1174
+ self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
1175
+ self._activation_checkpoint_fn: Callable = activation_checkpoint_function(self.config)
1176
+
1177
+ if not (
1178
+ 0 < self.config.block_group_size <= self.config.n_layers
1179
+ and self.config.n_layers % self.config.block_group_size == 0
1180
+ ):
1181
+ raise OLMoConfigurationError("n layers must be divisible by block group size")
1182
+
1183
+ torch.backends.cuda.enable_flash_sdp(True)
1184
+ torch.backends.cuda.enable_mem_efficient_sdp(False) # this is super slow so make sure torch won't use it
1185
+
1186
+ self.transformer = nn.ModuleDict(
1187
+ dict(
1188
+ wte=nn.Embedding(
1189
+ config.embedding_size or config.vocab_size, config.d_model, device=config.init_device
1190
+ ),
1191
+ emb_drop=Dropout(config.embedding_dropout),
1192
+ ln_f=LayerNorm.build(config),
1193
+ )
1194
+ )
1195
+
1196
+ blocks = [OLMoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
1197
+ if self.config.block_group_size > 1:
1198
+ block_groups = [
1199
+ OLMoBlockGroup(config, i, blocks[i : i + config.block_group_size])
1200
+ for i in range(0, config.n_layers, config.block_group_size)
1201
+ ]
1202
+ self.transformer.update({"block_groups": nn.ModuleList(block_groups)})
1203
+ else:
1204
+ self.transformer.update({"blocks": nn.ModuleList(blocks)})
1205
+
1206
+ if not (self.config.alibi or self.config.rope):
1207
+ self.transformer.update(
1208
+ {"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
1209
+ )
1210
+ if not config.weight_tying:
1211
+ self.transformer.update(
1212
+ {
1213
+ "ff_out": nn.Linear(
1214
+ config.d_model,
1215
+ config.embedding_size or config.vocab_size,
1216
+ bias=config.include_bias,
1217
+ device=config.init_device,
1218
+ )
1219
+ }
1220
+ )
1221
+ if config.embedding_layer_norm:
1222
+ self.transformer.update({"emb_norm": LayerNorm.build(config)})
1223
+
1224
+ # When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
1225
+ if init_params and self.config.init_device != "meta":
1226
+ self.reset_parameters()
1227
+ self.__num_fwd_flops: Optional[int] = None
1228
+ self.__num_bck_flops: Optional[int] = None
1229
+
1230
+ # Warm up cache.
1231
+ if self.config.alibi:
1232
+ get_causal_attention_bias(self.__cache, config.max_sequence_length, _non_meta_init_device(config))
1233
+ self.get_alibi_attention_bias(config.max_sequence_length, _non_meta_init_device(config))
1234
+
1235
+ def set_activation_checkpointing(
1236
+ self, strategy: Optional[ActivationCheckpointingStrategy], checkpoint_func: Optional[Callable] = None
1237
+ ):
1238
+ self.activation_checkpointing_strategy = strategy
1239
+ if self.config.block_group_size != 1:
1240
+ for block_group in self.transformer.block_groups:
1241
+ block_group.set_activation_checkpointing(strategy, checkpoint_func=checkpoint_func)
1242
+ else:
1243
+ for block in self.transformer.blocks:
1244
+ block.set_activation_checkpointing(strategy, checkpoint_func=checkpoint_func)
1245
+
1246
+ @property
1247
+ def device(self) -> torch.device:
1248
+ device: torch.device = self.transformer.wte.weight.device # type: ignore
1249
+ if device.type == "meta":
1250
+ return _non_meta_init_device(self.config)
1251
+ else:
1252
+ return device
1253
+
1254
+ def reset_parameters(self):
1255
+ log.info("Initializing model parameters...")
1256
+ # Top-level embeddings / linear layers.
1257
+
1258
+ if self.config.init_fn == InitFnType.normal:
1259
+ # Note: We may potentially want to multiply the std by a factor of sqrt(d) in case of `scale_logits`
1260
+ # and `weight_tying`. However, we are currently not using either, and may need to rethink the init logic
1261
+ # if/when we do want it.
1262
+ wte_std = self.config.emb_init_std or self.config.init_std
1263
+ wte_cutoff_factor = self.config.init_cutoff_factor
1264
+ elif self.config.init_fn == InitFnType.mitchell:
1265
+ wte_std = self.config.emb_init_std or 1.0 / math.sqrt(self.config.d_model)
1266
+ wte_cutoff_factor = self.config.init_cutoff_factor or 3.0
1267
+ elif self.config.init_fn == InitFnType.full_megatron:
1268
+ wte_std = self.config.init_std
1269
+ if self.config.emb_init_std is not None:
1270
+ wte_std = self.config.emb_init_std
1271
+ elif self.config.scale_emb_init:
1272
+ wte_std *= math.sqrt(self.config.d_model)
1273
+ wte_cutoff_factor = self.config.init_cutoff_factor or 3.0
1274
+ else:
1275
+ raise NotImplementedError(self.config.init_fn)
1276
+
1277
+ init_normal(self.transformer.wte, std=wte_std, init_cutoff_factor=wte_cutoff_factor)
1278
+
1279
+ if hasattr(self.transformer, "wpe"):
1280
+ if self.config.init_fn == InitFnType.normal:
1281
+ wpe_std = self.config.init_std
1282
+ wpe_cutoff_factor = self.config.init_cutoff_factor
1283
+ elif self.config.init_fn == InitFnType.mitchell:
1284
+ wpe_std = 1 / math.sqrt(self.config.d_model)
1285
+ wpe_cutoff_factor = self.config.init_cutoff_factor or 3.0
1286
+ elif self.config.init_fn == InitFnType.full_megatron:
1287
+ wpe_std = self.config.init_std
1288
+ wpe_cutoff_factor = self.config.init_cutoff_factor or 3.0
1289
+ else:
1290
+ raise NotImplementedError(self.config.init_fn)
1291
+
1292
+ init_normal(self.transformer.wpe, std=wpe_std, init_cutoff_factor=wpe_cutoff_factor)
1293
+
1294
+ # Top-level layer norm.
1295
+ self.transformer.ln_f.reset_parameters() # type: ignore
1296
+
1297
+ # Output weights.
1298
+ if hasattr(self.transformer, "ff_out"):
1299
+ if self.config.init_fn == InitFnType.normal:
1300
+ ff_out_std = self.config.init_std
1301
+ ff_out_cutoff_factor = self.config.init_cutoff_factor
1302
+ elif self.config.init_fn == InitFnType.mitchell:
1303
+ ff_out_std = 1 / math.sqrt(self.config.d_model)
1304
+ ff_out_cutoff_factor = self.config.init_cutoff_factor or 3.0
1305
+ elif self.config.init_fn == InitFnType.full_megatron:
1306
+ ff_out_std = 1 / math.sqrt(self.config.d_model)
1307
+ ff_out_cutoff_factor = self.config.init_cutoff_factor or 3.0
1308
+ else:
1309
+ raise NotImplementedError(self.config.init_fn)
1310
+
1311
+ init_normal(self.transformer.ff_out, ff_out_std, ff_out_cutoff_factor)
1312
+
1313
+ # Let the blocks handle themselves.
1314
+ if self.config.block_group_size == 1:
1315
+ for block in self.transformer.blocks:
1316
+ block.reset_parameters()
1317
+ else:
1318
+ for block_group in self.transformer.block_groups:
1319
+ block_group.reset_parameters()
1320
+
1321
+ def get_alibi_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor:
1322
+ if (alibi_bias := self.__cache.get("alibi_attention_bias")) is not None and alibi_bias.shape[
1323
+ -1
1324
+ ] >= seq_len:
1325
+ if alibi_bias.device != device:
1326
+ alibi_bias = alibi_bias.to(device)
1327
+ self.__cache["alibi_attention_bias"] = alibi_bias
1328
+ return alibi_bias
1329
+ with torch.autocast(device.type, enabled=False):
1330
+ alibi_bias = alibi_attention_bias(seq_len, self.config, device)
1331
+ self.__cache["alibi_attention_bias"] = alibi_bias
1332
+ return alibi_bias
1333
+
1334
+ def forward(
1335
+ self,
1336
+ input_ids: torch.LongTensor,
1337
+ input_embeddings: Optional[torch.FloatTensor] = None,
1338
+ attention_mask: Optional[torch.Tensor] = None,
1339
+ attention_bias: Optional[torch.Tensor] = None,
1340
+ past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
1341
+ use_cache: bool = False,
1342
+ last_logits_only: bool = False,
1343
+ output_hidden_states: Optional[bool] = None,
1344
+ doc_lens: Optional[torch.Tensor] = None,
1345
+ max_doc_lens: Optional[Sequence[int]] = None,
1346
+ ) -> OLMoOutput:
1347
+ """
1348
+ :param input_ids: A tensor of shape `(batch_size, seq_len)`.
1349
+ :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
1350
+ embeddings. When provided, it is treated as the output of the input embedding layer.
1351
+ :param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
1352
+ which input IDs are masked. A `1` value in the mask means that
1353
+ the corresponding input ID should *not* be ignored. A `0` means
1354
+ that the corresponding input ID is masked.
1355
+
1356
+ This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
1357
+ library.
1358
+ :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
1359
+ `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
1360
+ to introduce causal or other biases.
1361
+
1362
+ If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
1363
+ indicates that the i-th element in the sequence is allowed to attend to the j-th
1364
+ element in the sequence.
1365
+
1366
+ If the tensor is a float tensor, it will just be added to the attention
1367
+ scores before the softmax.
1368
+
1369
+ The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
1370
+ :param past_key_values: Pre-computed keys and values for each attention block.
1371
+ Can be used to speed up sequential decoding. The `input_ids` which have
1372
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
1373
+ :param use_cache: If `True`, return key and value tensors for each block.
1374
+ :param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
1375
+ This can speed up decoding when you only care about the next token.
1376
+ :param doc_lens: Document lengths to use in attention for intra-document masking.
1377
+ Shape `(batch_size, max_docs)`.
1378
+ :param max_doc_lens: Maximum document length for each instance in the batch.
1379
+ """
1380
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else False
1381
+
1382
+ if past_key_values:
1383
+ assert len(past_key_values) == self.config.n_layers
1384
+
1385
+ batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
1386
+ if past_key_values is None:
1387
+ past_length = 0
1388
+ else:
1389
+ past_length = past_key_values[0][0].size(-2)
1390
+
1391
+ max_doc_len: Optional[int] = None
1392
+ cu_doc_lens: Optional[torch.Tensor] = None
1393
+ if doc_lens is not None and max_doc_lens is not None:
1394
+ max_doc_len = max(max_doc_lens)
1395
+ cu_doc_lens = get_cumulative_document_lengths(doc_lens)
1396
+
1397
+ # Get embeddings of input.
1398
+ # shape: (batch_size, seq_len, d_model)
1399
+ x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore
1400
+
1401
+ # Apply embedding layer norm.
1402
+ if self.config.embedding_layer_norm:
1403
+ x = self.transformer.emb_norm(x)
1404
+
1405
+ if not (self.config.alibi or self.config.rope):
1406
+ # Get positional embeddings.
1407
+ # shape: (1, seq_len)
1408
+ pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
1409
+ # shape: (1, seq_len, d_model)
1410
+ pos_emb = self.transformer.wpe(pos) # type: ignore
1411
+ x = pos_emb + x
1412
+
1413
+ # Apply dropout.
1414
+ # shape: (batch_size, seq_len, d_model)
1415
+ x = self.transformer.emb_drop(x) # type: ignore
1416
+
1417
+ # Transform the attention mask into what the blocks expect.
1418
+ if attention_mask is not None:
1419
+ # shape: (batch_size, 1, 1, seq_len)
1420
+ attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
1421
+ attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min
1422
+
1423
+ # Merge attention mask with attention bias.
1424
+ if (
1425
+ attention_bias is not None
1426
+ or attention_mask is not None
1427
+ or self.config.alibi
1428
+ # NOTE (epwalsh): we need to initialize the attn bias in order for attn to work properly
1429
+ # with key+value cache. Otherwise `F.scaled_dot_product_attention()` doesn't seem to compute
1430
+ # scores correctly.
1431
+ or past_key_values is not None
1432
+ ):
1433
+ if attention_bias is None and self.config.alibi:
1434
+ attention_bias = get_causal_attention_bias(
1435
+ self.__cache, past_length + seq_len, x.device
1436
+ ) + self.get_alibi_attention_bias(past_length + seq_len, x.device)
1437
+ elif attention_bias is None:
1438
+ attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
1439
+ elif attention_bias.dtype in (torch.int8, torch.bool):
1440
+ attention_bias = attention_bias.to(dtype=torch.float)
1441
+ attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min)
1442
+
1443
+ # Transform to the right shape and data type.
1444
+ mask_len = seq_len
1445
+ if attention_mask is not None:
1446
+ mask_len = attention_mask.shape[-1]
1447
+ elif past_key_values is not None:
1448
+ mask_len = past_key_values[0][0].shape[-2] + seq_len
1449
+ attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float)
1450
+
1451
+ # Add in the masking bias.
1452
+ if attention_mask is not None:
1453
+ attention_bias = attention_bias + attention_mask
1454
+ # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf.
1455
+ # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead
1456
+ # it can produce NaNs.
1457
+ ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False)
1458
+
1459
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
1460
+
1461
+ # decoder layers
1462
+ all_hidden_states = []
1463
+
1464
+ # Apply blocks one-by-one.
1465
+ if self.config.block_group_size == 1:
1466
+ for block_idx, block in enumerate(self.transformer.blocks):
1467
+ if output_hidden_states:
1468
+ # add hidden states
1469
+ all_hidden_states.append(x)
1470
+
1471
+ layer_past = None if past_key_values is None else past_key_values[block_idx]
1472
+ if should_checkpoint_block(self.activation_checkpointing_strategy, block_idx):
1473
+ # shape: (batch_size, seq_len, d_model)
1474
+ x, cache = self._activation_checkpoint_fn(
1475
+ block,
1476
+ x,
1477
+ attention_bias=attention_bias,
1478
+ layer_past=layer_past,
1479
+ use_cache=use_cache,
1480
+ max_doc_len=max_doc_len,
1481
+ cu_doc_lens=cu_doc_lens,
1482
+ )
1483
+ else:
1484
+ # shape: (batch_size, seq_len, d_model)
1485
+ x, cache = block(
1486
+ x,
1487
+ attention_bias=attention_bias,
1488
+ layer_past=layer_past,
1489
+ use_cache=use_cache,
1490
+ max_doc_len=max_doc_len,
1491
+ cu_doc_lens=cu_doc_lens,
1492
+ )
1493
+
1494
+ if attn_key_values is not None:
1495
+ assert cache is not None
1496
+ attn_key_values.append(cache)
1497
+ else:
1498
+ for group_idx, block_group in enumerate(self.transformer.block_groups):
1499
+ if output_hidden_states:
1500
+ # add hidden states
1501
+ all_hidden_states.append(x)
1502
+
1503
+ layers_past = (
1504
+ None
1505
+ if past_key_values is None
1506
+ else past_key_values[
1507
+ group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size
1508
+ ]
1509
+ )
1510
+ x, cache = block_group(
1511
+ x,
1512
+ attention_bias=attention_bias,
1513
+ layers_past=layers_past,
1514
+ use_cache=use_cache,
1515
+ max_doc_len=max_doc_len,
1516
+ cu_doc_lens=cu_doc_lens,
1517
+ )
1518
+ if attn_key_values is not None:
1519
+ assert cache is not None
1520
+ attn_key_values.extend(cache)
1521
+
1522
+ if last_logits_only:
1523
+ # shape: (batch_size, 1, d_model)
1524
+ x = x[:, -1, :].unsqueeze(1)
1525
+
1526
+ # Apply final layer norm.
1527
+ # shape: (batch_size, seq_len or 1, d_model)
1528
+ x = self.transformer.ln_f(x) # type: ignore
1529
+ if output_hidden_states:
1530
+ # add final hidden state post-final-layernorm, following HuggingFace's convention
1531
+ all_hidden_states.append(x)
1532
+
1533
+ # Get logits.
1534
+ # shape: (batch_size, seq_len or 1, vocab_size)
1535
+ if self.config.weight_tying:
1536
+ logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore
1537
+ else:
1538
+ logits = self.transformer.ff_out(x) # type: ignore
1539
+ if self.config.scale_logits:
1540
+ logits.mul_(1 / math.sqrt(self.config.d_model))
1541
+
1542
+ return OLMoOutput(
1543
+ logits=logits,
1544
+ attn_key_values=attn_key_values,
1545
+ hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
1546
+ )
1547
+
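# A standalone sketch of the `attention_mask` handling in `forward` above: a HF-style 0/1 mask
# is turned into an additive bias, so masked positions receive a very large negative score
# before the softmax. The toy batch below is an assumption.
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]], dtype=torch.float)  # (batch_size, seq_len)
bias = (1.0 - attention_mask)[:, None, None, :] * torch.finfo(torch.float).min
print(bias.shape)     # torch.Size([1, 1, 1, 4])
print(bias[0, 0, 0])  # tensor([0., 0., 0., -3.4028e+38]) -- the padded position is suppressed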
1548
+ def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None):
1549
+ if wrap_strategy is None:
1550
+ return None
1551
+
1552
+ # The 'recurse' mode for the wrap function does not behave like you'd expect.
1553
+ # Even if we return False, it may still recurse because PyTorch does what it wants,
1554
+ # not what you want. This causes issues when, for example, we want to wrap 'ff_out' (a linear layer)
1555
+ # but not other linear layers within a block.
1556
+ # So we have to explicitly tell PyTorch which linear layers to wrap, and we also just
1557
+ # return True in 'recurse' mode for simplicity.
1558
+ size_based_module_to_wrap = {self.transformer.wte}
1559
+ if hasattr(self.transformer, "ff_out"):
1560
+ size_based_module_to_wrap.add(self.transformer.ff_out)
1561
+
1562
+ if wrap_strategy == FSDPWrapStrategy.by_block:
1563
+
1564
+ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
1565
+ del nonwrapped_numel
1566
+ wrap = isinstance(module, OLMoBlock)
1567
+ if recurse:
1568
+ return True
1569
+ else:
1570
+ return wrap
1571
+
1572
+ return fsdp_wrap_fn
1573
+ elif wrap_strategy == FSDPWrapStrategy.by_block_and_size:
1574
+
1575
+ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
1576
+ del nonwrapped_numel
1577
+ wrap = isinstance(module, (OLMoBlock,)) or module in size_based_module_to_wrap
1578
+ if recurse:
1579
+ return True
1580
+ else:
1581
+ return wrap
1582
+
1583
+ return fsdp_wrap_fn
1584
+ elif wrap_strategy == FSDPWrapStrategy.by_block_group:
1585
+ if self.config.block_group_size <= 1:
1586
+ raise OLMoConfigurationError(
1587
+ "'by_block_group' FSDP wrapping strategy requires block group size greater than 1"
1588
+ )
1589
+
1590
+ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
1591
+ del nonwrapped_numel
1592
+ wrap = isinstance(module, OLMoBlockGroup)
1593
+ if recurse:
1594
+ return True
1595
+ else:
1596
+ return wrap
1597
+
1598
+ return fsdp_wrap_fn
1599
+ elif wrap_strategy == FSDPWrapStrategy.by_block_group_and_size:
1600
+ if self.config.block_group_size <= 1:
1601
+ raise OLMoConfigurationError(
1602
+ "'by_block_group_and_size' FSDP wrapping strategy requires block group size greater than 1"
1603
+ )
1604
+
1605
+ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
1606
+ del nonwrapped_numel
1607
+ wrap = isinstance(module, (OLMoBlockGroup,)) or module in size_based_module_to_wrap
1608
+ if recurse:
1609
+ return True
1610
+ else:
1611
+ return wrap
1612
+
1613
+ return fsdp_wrap_fn
1614
+ elif wrap_strategy == FSDPWrapStrategy.size_based:
1615
+ from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
1616
+
1617
+ return size_based_auto_wrap_policy
1618
+ elif wrap_strategy in {
1619
+ FSDPWrapStrategy.one_in_two,
1620
+ FSDPWrapStrategy.one_in_three,
1621
+ FSDPWrapStrategy.one_in_four,
1622
+ FSDPWrapStrategy.one_in_five,
1623
+ }:
1624
+ c = {
1625
+ FSDPWrapStrategy.one_in_two: 2,
1626
+ FSDPWrapStrategy.one_in_three: 3,
1627
+ FSDPWrapStrategy.one_in_four: 4,
1628
+ FSDPWrapStrategy.one_in_five: 5,
1629
+ }[wrap_strategy]
1630
+
1631
+ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
1632
+ del nonwrapped_numel
1633
+ wrap = isinstance(module, OLMoBlock) and module.layer_id % c == 0
1634
+ if recurse:
1635
+ return True
1636
+ else:
1637
+ return wrap
1638
+
1639
+ return fsdp_wrap_fn
1640
+ else:
1641
+ raise NotImplementedError(wrap_strategy)
1642
+
1643
+ def num_params(self, include_embedding: bool = True) -> int:
1644
+ """
1645
+ Get the total number of parameters.
1646
+ """
1647
+ params = (np for np in self.named_parameters())
1648
+ if not include_embedding:
1649
+ params = filter( # type: ignore
1650
+ lambda np: ".wte." not in np[0] and ".wpe." not in np[0],
1651
+ params,
1652
+ )
1653
+ return sum(p.numel() for _, p in params)
1654
+
1655
+ @property
1656
+ def num_fwd_flops(self):
1657
+ if self.__num_fwd_flops:
1658
+ return self.__num_fwd_flops
1659
+
1660
+ # embedding table is just a lookup in the forward pass
1661
+ n_params = self.num_params(include_embedding=False)
1662
+ # the number of parameters is approximately the number of multiply-accumulates (MAC) in the network
1663
+ # each MAC takes 2 FLOPs, so we multiply by 2, i.e. FLOPs per token is approximately 2 * n_params
1664
+ # this gets us FLOPs / token
1665
+ params_flops_per_token = 2 * n_params
1666
+ # attention adds two matmuls per layer (A = Q @ K^T and out = A @ V) at 2 FLOPs per MAC, hence the 2 * 2 factor below
1667
+ attn_flops_per_token = (
1668
+ self.config.n_layers * 2 * 2 * (self.config.d_model * self.config.max_sequence_length)
1669
+ )
1670
+ self.__num_fwd_flops = params_flops_per_token + attn_flops_per_token
1671
+ return self.__num_fwd_flops
1672
+
1673
+ @property
1674
+ def num_bck_flops(self):
1675
+ if self.__num_bck_flops:
1676
+ return self.__num_bck_flops
1677
+
1678
+ n_params = self.num_params()
1679
+ params_flops_per_token = 4 * n_params
1680
+ attn_flops_per_token = self.config.n_layers * 8 * (self.config.d_model * self.config.max_sequence_length)
1681
+ self.__num_bck_flops = params_flops_per_token + attn_flops_per_token
1682
+ return self.__num_bck_flops
1683
+
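# A worked example of the per-token forward-FLOPs estimate above, using assumed
# 7B-scale hyperparameters rather than any particular OLMo config.
n_params = 6.9e9                      # non-embedding parameters (assumed)
n_layers, d_model, seq_len = 32, 4096, 2048

fwd_flops_per_token = 2 * n_params + n_layers * 2 * 2 * (d_model * seq_len)
print(f"{fwd_flops_per_token:.2e}")   # roughly 1.49e10 FLOPs per token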
1684
+ def generate(
1685
+ self,
1686
+ input_ids: torch.LongTensor,
1687
+ attention_mask: Optional[torch.Tensor] = None,
1688
+ attention_bias: Optional[torch.Tensor] = None,
1689
+ max_steps: int = 10,
1690
+ beam_size: int = 1,
1691
+ per_node_beam_size: Optional[int] = None,
1692
+ sampler: Optional[Sampler] = None,
1693
+ min_steps: Optional[int] = None,
1694
+ final_sequence_scorer: Optional[FinalSequenceScorer] = None,
1695
+ constraints: Optional[List[Constraint]] = None,
1696
+ ) -> OLMoGenerateOutput:
1697
+ """
1698
+ Generate token IDs using beam search.
1699
+
1700
+ Note that by default ``beam_size`` is set to 1, which is greedy decoding.
1701
+
1702
+ :param input_ids: A tensor of shape `(batch_size, seq_len)`.
1703
+ :param attention_mask: An optional tensor of shape `(batch_size, seq_len)`, the same
1704
+ as for the forward method.
1705
+ :param attention_bias: A tensor of shape
1706
+ `(batch_size, 1, seq_len + tokens_to_generate, seq_len + tokens_to_generate)`,
1707
+ the same as for the forward method, except that only this one shape is accepted here.
1708
+
1709
+ For an explanation of the other arguments, see :class:`BeamSearch`.
1710
+ """
1711
+ beam_search = BeamSearch(
1712
+ self.config.eos_token_id,
1713
+ max_steps=max_steps,
1714
+ beam_size=beam_size,
1715
+ per_node_beam_size=per_node_beam_size,
1716
+ sampler=sampler,
1717
+ min_steps=min_steps,
1718
+ final_sequence_scorer=final_sequence_scorer,
1719
+ constraints=constraints,
1720
+ )
1721
+
1722
+ # Validate inputs.
1723
+ batch_size, seq_len = input_ids.shape
1724
+ if attention_mask is not None:
1725
+ assert attention_mask.shape == (batch_size, seq_len)
1726
+ if attention_bias is not None:
1727
+ assert len(attention_bias.shape) == 4
1728
+ assert attention_bias.shape[:2] == (batch_size, 1)
1729
+ assert (
1730
+ seq_len + beam_search.max_steps
1731
+ <= attention_bias.shape[2]
1732
+ == attention_bias.shape[3]
1733
+ <= self.config.max_sequence_length
1734
+ )
1735
+
1736
+ tokens_generated = 0
1737
+
1738
+ def flatten_past_key_values(
1739
+ past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
1740
+ ) -> Dict[str, torch.Tensor]:
1741
+ out = {}
1742
+ for i, (key, value) in enumerate(past_key_values):
1743
+ out[f"past_key_{i}"] = key
1744
+ out[f"past_value_{i}"] = value
1745
+ return out
1746
+
1747
+ def unflatten_past_key_values(
1748
+ past_key_values: Dict[str, torch.Tensor],
1749
+ ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
1750
+ out = []
1751
+ for i in range(self.config.n_layers):
1752
+ past_key = past_key_values[f"past_key_{i}"]
1753
+ past_value = past_key_values[f"past_value_{i}"]
1754
+ out.append((past_key, past_value))
1755
+ return out
1756
+
1757
+ def step(
1758
+ last_predictions: torch.Tensor, state: dict[str, torch.Tensor]
1759
+ ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
1760
+ nonlocal tokens_generated
1761
+
1762
+ attention_mask = state.get("attention_mask")
1763
+ attention_bias = state.get("attention_bias")
1764
+
1765
+ if tokens_generated > 0:
1766
+ past_key_values = unflatten_past_key_values(state)
1767
+ input_ids = last_predictions.unsqueeze(1)
1768
+ if attention_mask is not None:
1769
+ group_size = input_ids.shape[0]
1770
+ attention_mask = torch.cat((attention_mask, attention_mask.new_ones((group_size, 1))), dim=-1)
1771
+ else:
1772
+ past_key_values = None
1773
+ input_ids = state["input_ids"]
1774
+
1775
+ tokens_generated += 1
1776
+
1777
+ # Run forward pass of model to get logits, then normalize to get log probs.
1778
+ output = self(
1779
+ input_ids,
1780
+ attention_mask=attention_mask,
1781
+ attention_bias=attention_bias,
1782
+ past_key_values=past_key_values,
1783
+ use_cache=True,
1784
+ last_logits_only=True,
1785
+ )
1786
+ log_probs = F.log_softmax(output.logits[:, -1, :], dim=-1)
1787
+
1788
+ # Create new state.
1789
+ state = flatten_past_key_values(output.attn_key_values)
1790
+ if attention_mask is not None:
1791
+ state["attention_mask"] = attention_mask
1792
+ if attention_bias is not None:
1793
+ state["attention_bias"] = attention_bias
1794
+
1795
+ return log_probs, state
1796
+
1797
+ initial_preds = input_ids.new_zeros((batch_size,)) # This is arbitrary, we won't use this.
1798
+ state: dict[str, torch.Tensor] = {"input_ids": input_ids}
1799
+ if attention_mask is not None:
1800
+ state["attention_mask"] = attention_mask
1801
+ if attention_bias is not None:
1802
+ state["attention_bias"] = attention_bias
1803
+ with torch.no_grad():
1804
+ token_ids, scores = beam_search.search(initial_preds, state, step)
1805
+
1806
+ return OLMoGenerateOutput(
1807
+ token_ids=token_ids, # type: ignore[arg-type]
1808
+ scores=scores, # type: ignore[arg-type]
1809
+ )
1810
+
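# A hedged usage sketch for `generate` above: `model` is assumed to be an
# already-instantiated OLMo, and the token ids are placeholders from a tokenizer.
import torch

input_ids = torch.tensor([[1, 2, 3, 4]])       # (batch_size=1, seq_len=4)
attention_mask = torch.ones_like(input_ids)
out = model.generate(input_ids, attention_mask=attention_mask, max_steps=16, beam_size=4)
print(out.token_ids.shape)  # (batch_size, beam_size, n_generated_steps), per the docstring
print(out.scores.shape)     # (batch_size, beam_size)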
1811
+ @classmethod
1812
+ def from_checkpoint(
1813
+ cls, checkpoint_dir: PathOrStr, device: str = "cpu", checkpoint_type: Optional[CheckpointType] = None
1814
+ ) -> OLMo:
1815
+ """
1816
+ Load an OLMo model from a checkpoint.
1817
+ """
1818
+ from .util import resource_path
1819
+
1820
+ # Guess checkpoint type.
1821
+ if checkpoint_type is None:
1822
+ try:
1823
+ if resource_path(checkpoint_dir, "model.pt").is_file():
1824
+ checkpoint_type = CheckpointType.unsharded
1825
+ else:
1826
+ checkpoint_type = CheckpointType.sharded
1827
+ except FileNotFoundError:
1828
+ checkpoint_type = CheckpointType.sharded
1829
+
1830
+ # Load config.
1831
+ config_path = resource_path(checkpoint_dir, "config.yaml")
1832
+ model_config = ModelConfig.load(config_path, key="model", validate_paths=False)
1833
+
1834
+ if checkpoint_type == CheckpointType.unsharded:
1835
+ # Initialize model (always on CPU to start with so we don't run out of GPU memory).
1836
+ model_config.init_device = "cpu"
1837
+ model = OLMo(model_config)
1838
+
1839
+ # Load state dict directly to target device.
1840
+ state_dict_path = resource_path(checkpoint_dir, "model.pt")
1841
+ state_dict = torch.load(state_dict_path, map_location="cpu")
1842
+ model.load_state_dict(model._make_state_dict_compatible(state_dict)[0])
1843
+ model = model.to(torch.device(device))
1844
+ else:
1845
+ train_config = TrainConfig.load(config_path)
1846
+ if train_config.sharded_checkpointer == ShardedCheckpointerType.olmo_core:
1847
+ from olmo_core.distributed.checkpoint import ( # type: ignore
1848
+ load_model_and_optim_state,
1849
+ )
1850
+
1851
+ model_config.init_device = device
1852
+ model = OLMo(model_config)
1853
+ load_model_and_optim_state(checkpoint_dir, model)
1854
+ else:
1855
+ # train_config.sharded_checkpointer == ShardedCheckpointerType.torch_new
1856
+ from .checkpoint import load_model_state
1857
+
1858
+ # Initialize model on target device. In this case the state dict is loaded in-place
1859
+ # so it's not necessary to start on CPU if the target device is a GPU.
1860
+ model_config.init_device = device
1861
+ model = OLMo(model_config)
1862
+
1863
+ # Load state dict in place.
1864
+ load_model_state(checkpoint_dir, model)
1865
+
1866
+ return model.eval()
1867
+
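# A hedged loading sketch for `from_checkpoint` above; the directory is a placeholder
# path and greedy decoding (beam_size=1) is assumed.
import torch

model = OLMo.from_checkpoint("/path/to/unsharded/checkpoint", device="cpu")  # placeholder path
input_ids = torch.tensor([[1, 2, 3, 4]])  # placeholder ids from a matching tokenizer
with torch.no_grad():
    out = model.generate(input_ids, max_steps=8, beam_size=1)
print(out.token_ids[0, 0])  # greedy continuation of the prompt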
1868
+ def _make_state_dict_compatible(
1869
+ self, state_dict: Dict[str, torch.Tensor]
1870
+ ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Set[str]]]:
1871
+ """
1872
+ Handles some cases where the state dict is valid yet may need to be transformed in order to
1873
+ be loaded.
1874
+
1875
+ This modifies the state dict in-place and also returns it, along with a mapping of original key
1876
+ names to new key names in cases where the keys were simply renamed. That mapping can be used
1877
+ to make a corresponding optimizer state dict compatible as well.
1878
+ """
1879
+ import re
1880
+ from fnmatch import fnmatch
1881
+
1882
+ new_keys_to_og_keys: Dict[str, str] = {}
1883
+
1884
+ # Remove "_fsdp_wrapped_module." prefix from all keys. We don't want this prefix when the model is
1885
+ # not wrapped in FSDP. And when the model is wrapped in FSDP, loading this state dict will still work
1886
+ # fine without the prefixes. This also simplifies the other steps below.
1887
+ for key in list(state_dict.keys()):
1888
+ state_dict[(new_key := key.replace("_fsdp_wrapped_module.", ""))] = state_dict.pop(key)
1889
+ new_keys_to_og_keys[new_key] = key
1890
+
1891
+ # For backwards compatibility prior to fixing https://github.com/allenai/LLM/issues/222
1892
+ if self.config.block_type == BlockType.sequential:
1893
+ for key in list(state_dict.keys()):
1894
+ if fnmatch(key, "transformer.*.norm.weight"):
1895
+ tensor = state_dict.pop(key)
1896
+ state_dict[(new_key := key.replace("norm.weight", "attn_norm.weight"))] = tensor
1897
+ new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
1898
+ state_dict[(new_key := key.replace("norm.weight", "ff_norm.weight"))] = tensor.clone()
1899
+ new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
1900
+ del new_keys_to_og_keys[key]
1901
+ elif fnmatch(key, "transformer.*.norm.bias"):
1902
+ tensor = state_dict.pop(key)
1903
+ state_dict[(new_key := key.replace("norm.bias", "attn_norm.bias"))] = tensor
1904
+ new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
1905
+ state_dict[(new_key := key.replace("norm.bias", "ff_norm.bias"))] = tensor.clone()
1906
+ new_keys_to_og_keys[new_key] = new_keys_to_og_keys[key]
1907
+ del new_keys_to_og_keys[key]
1908
+
1909
+ # For loading a state dict that was saved with a different `block_group_size`.
1910
+ if "transformer.block_groups.0.0.attn_out.weight" in state_dict.keys():
1911
+ state_dict_block_group_size = len(
1912
+ [k for k in state_dict.keys() if fnmatch(k, "transformer.block_groups.0.*.attn_out.weight")]
1913
+ )
1914
+ else:
1915
+ state_dict_block_group_size = 1
1916
+ if self.config.block_group_size != state_dict_block_group_size:
1917
+ log.info(
1918
+ f"Regrouping state dict blocks from group size {state_dict_block_group_size} to "
1919
+ f"group size {self.config.block_group_size}"
1920
+ )
1921
+ # For simplicity we're first going to flatten out the block groups in the state dict (if necessary)
1922
+ # and then (re-)group them into the right block sizes.
1923
+ if state_dict_block_group_size > 1:
1924
+ for key in list(state_dict.keys()):
1925
+ if (m := re.match(r"transformer.block_groups\.(\d+)\.(\d+)\..*", key)) is not None:
1926
+ group_idx, group_block_idx = int(m.group(1)), int(m.group(2))
1927
+ block_idx = (group_idx * state_dict_block_group_size) + group_block_idx
1928
+ state_dict[
1929
+ (
1930
+ new_key := key.replace(
1931
+ f"block_groups.{group_idx}.{group_block_idx}.", f"blocks.{block_idx}."
1932
+ )
1933
+ )
1934
+ ] = state_dict.pop(key)
1935
+ new_keys_to_og_keys[new_key] = new_keys_to_og_keys.pop(key)
1936
+
1937
+ if self.config.block_group_size > 1:
1938
+ # Group the state dict blocks into the right block size.
1939
+ for key in list(state_dict.keys()):
1940
+ if (m := re.match(r"transformer.blocks\.(\d+)\..*", key)) is not None:
1941
+ block_idx = int(m.group(1))
1942
+ group_idx, group_block_idx = (
1943
+ block_idx // self.config.block_group_size,
1944
+ block_idx % self.config.block_group_size,
1945
+ )
1946
+ state_dict[
1947
+ (
1948
+ new_key := key.replace(
1949
+ f"blocks.{block_idx}.", f"block_groups.{group_idx}.{group_block_idx}."
1950
+ )
1951
+ )
1952
+ ] = state_dict.pop(key)
1953
+ new_keys_to_og_keys[new_key] = new_keys_to_og_keys.pop(key)
1954
+
1955
+ og_keys_to_new: Dict[str, Set[str]] = defaultdict(set)
1956
+ for new_key, og_key in new_keys_to_og_keys.items():
1957
+ og_keys_to_new[og_key].add(new_key)
1958
+
1959
+ return state_dict, og_keys_to_new
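# A standalone sketch of the block regrouping performed in `_make_state_dict_compatible`
# above: flat "blocks.N." keys are mapped to "block_groups.G.I." keys for an assumed
# group size of 2; the key names below are illustrative.
import re

block_group_size = 2
keys = ["transformer.blocks.0.attn_out.weight", "transformer.blocks.3.ff_out.weight"]
for key in keys:
    match = re.match(r"transformer\.blocks\.(\d+)\..*", key)
    block_idx = int(match.group(1))
    group_idx, group_block_idx = divmod(block_idx, block_group_size)
    new_key = key.replace(f"blocks.{block_idx}.", f"block_groups.{group_idx}.{group_block_idx}.")
    print(key, "->", new_key)
# transformer.blocks.0.attn_out.weight -> transformer.block_groups.0.0.attn_out.weight
# transformer.blocks.3.ff_out.weight -> transformer.block_groups.1.1.ff_out.weight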
modeling_fan.py ADDED
@@ -0,0 +1,271 @@
1
+ import logging
2
+ from dataclasses import fields
3
+ from typing import Callable, Dict, Iterable, List, NamedTuple, Optional, Sequence, Set, Tuple, Union, cast
4
+
5
+ import torch
6
+ from transformers import PreTrainedModel
7
+ from transformers.cache_utils import Cache
8
+ from transformers.modeling_outputs import CausalLMOutputWithPast
9
+ from transformers.models.auto import AutoModelForCausalLM
10
+
11
+ from .config import ActivationCheckpointingStrategy, ModelConfig
12
+ from .model import OLMo
13
+
14
+ from .configuration_olmo import OLMoConfig
15
27
+ log = logging.getLogger(__name__)
28
+
29
+
30
+ def create_model_config_from_pretrained_config(config: OLMoConfig):
31
+ """
32
+ Utility function that builds an OLMo `ModelConfig` from a Hugging Face `OLMoConfig` by copying the matching fields and resolving the attention implementation.
33
+ """
34
+
35
+ kwargs = {}
36
+ for field in fields(ModelConfig):
37
+ kwargs[field.name] = getattr(config, field.name)
38
+
39
+ model_config = ModelConfig(**kwargs)
40
+
41
+ # Handle flash attention settings
42
+ if config._attn_implementation == "flash_attention_2":
43
+ model_config.flash_attention = True
44
+ elif config._attn_implementation in ("eager", "sdpa"):
45
+ model_config.flash_attention = False
46
+ else:
47
+ raise ValueError(f"Unexpected _attn_implementation {config._attn_implementation}")
48
+
49
+ return model_config
50
+
51
+
52
+ class OLMoForCausalLM(PreTrainedModel):
53
+ """
54
+ Extremely barebones HF model wrapper.
55
+ """
56
+
57
+ config_class = OLMoConfig
58
+ base_model_prefix = "model"
59
+ _no_split_modules = ["OLMoBlock"]
60
+ _supports_flash_attn_2 = True
61
+ _supports_sdpa = True
62
+ supports_gradient_checkpointing = True
63
+
64
+ def __init__(self, config: OLMoConfig, model: Optional[OLMo] = None, init_params: bool = False):
65
+ super().__init__(config)
66
+ self._gradient_checkpointing_func: Optional[Callable] = None
67
+ self._gradient_checkpointing = False
68
+
69
+ if not model:
70
+ model_config = create_model_config_from_pretrained_config(config)
71
+ # Initialize model (always on CPU to start with so we don't run out of GPU memory).
72
+ model_config.init_device = "cpu"
73
+ self.model = OLMo(model_config, init_params=init_params)
74
+ else:
75
+ self.model = model
76
+
77
+ @property
78
+ def gradient_checkpointing(self) -> bool:
79
+ return self._gradient_checkpointing
80
+
81
+ @gradient_checkpointing.setter
82
+ def gradient_checkpointing(self, enabled: bool):
83
+ if self._gradient_checkpointing == enabled:
84
+ return
85
+
86
+ # HF does not specify a way to pass checkpointing strategies, so we pick
87
+ # whole layer as our strategy. We can make this configurable later if needed.
88
+ checkpointing_strategy = ActivationCheckpointingStrategy.whole_layer if enabled else None
89
+ self.model.set_activation_checkpointing(
90
+ checkpointing_strategy, checkpoint_func=self._gradient_checkpointing_func
91
+ )
92
+ self._gradient_checkpointing = enabled
93
+
94
+ def forward(
95
+ self,
96
+ input_ids: torch.LongTensor,
97
+ inputs_embeds: Optional[torch.FloatTensor] = None,
98
+ attention_mask: Optional[torch.Tensor] = None,
99
+ attention_bias: Optional[torch.Tensor] = None,
100
+ # past_key_values: Optional[List[torch.FloatTensor]] = None,
101
+ past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
102
+ labels: Optional[torch.LongTensor] = None,
103
+ use_cache: Optional[bool] = None,
104
+ output_attentions: Optional[bool] = None,
105
+ output_hidden_states: Optional[bool] = None,
106
+ return_dict: Optional[bool] = None,
107
+ cache_position: Optional[
108
+ Cache
109
+ ] = None, # This is a hack mitigation of an issue in transformers `4.39.x` https://github.com/huggingface/transformers/issues/29426
110
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
111
+ if use_cache is None:
112
+ use_cache = self.config.use_cache
113
+
114
+ if output_attentions:
115
+ raise ValueError("output_attentions is not yet supported in OLMo")
116
+
117
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
118
+
119
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
120
+ outputs = self.model.forward(
121
+ input_ids=input_ids,
122
+ input_embeddings=inputs_embeds,
123
+ attention_mask=attention_mask,
124
+ attention_bias=attention_bias,
125
+ past_key_values=past_key_values,
126
+ use_cache=use_cache,
127
+ output_hidden_states=output_hidden_states,
128
+ )
129
+
130
+ logits = outputs.logits
131
+ hidden_states = outputs.hidden_states
132
+
133
+ loss = None
134
+ if labels is not None:
135
+ # Shift so that tokens < n predict n
136
+ shift_logits = logits[..., :-1, :].contiguous()
137
+ shift_labels = labels[..., 1:].contiguous()
138
+ # Flatten the tokens
139
+ loss_fct = torch.nn.CrossEntropyLoss()
140
+ shift_logits = shift_logits.view(-1, self.config.embedding_size)
141
+ shift_labels = shift_labels.view(-1)
142
+ # Enable model parallelism
143
+ shift_labels = shift_labels.to(shift_logits.device)
144
+ loss = loss_fct(shift_logits, shift_labels)
145
+
146
+ if not return_dict:
147
+ output = (logits,) + outputs[1:]
148
+ return (loss,) + output if loss is not None else output
149
+
150
+ return CausalLMOutputWithPast(
151
+ loss=loss,
152
+ logits=logits,
153
+ past_key_values=outputs.attn_key_values,
154
+ hidden_states=hidden_states,
155
+ )
156
+
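# A standalone sketch of the causal-LM label shift used in `forward` above: the logits
# at position t are scored against the token at position t + 1 (toy shapes, random data).
import torch

vocab_size = 10
logits = torch.randn(2, 5, vocab_size)         # (batch_size, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (2, 5))  # (batch_size, seq_len)

shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)
loss = torch.nn.CrossEntropyLoss()(shift_logits, shift_labels)
print(loss.item())  # scalar cross-entropy over the 2 * 4 shifted positions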
157
+ def can_generate(self) -> bool:
158
+ return True
159
+
160
+ def prepare_inputs_for_generation(
161
+ self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
162
+ ):
163
+ if past_key_values:
164
+ # With a KV cache present, only the last generated token needs to be processed by the model.
165
+ input_ids = input_ids[:, -1:]
166
+ model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}
167
+
168
+ model_inputs.update(kwargs)
169
+ model_inputs["use_cache"] = kwargs.pop("use_cache", self.config.use_cache)
170
+ return model_inputs
171
+
172
+ # TODO: these are required to make the implementation complete.
173
+ # def resize_position_embeddings(self, new_num_position_embeddings: int):
174
+ # pass
175
+ #
176
+ # def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
177
+ # pass
178
+ #
179
+ # def _reorder_cache(self, past_key_values, beam_idx):
180
+ # pass
181
+
182
+ def get_input_embeddings(self) -> torch.nn.Module:
183
+ return self.model.transformer.wte
184
+
185
+ def set_input_embeddings(self, value: torch.nn.Module):
186
+ self.model.transformer.wte = value
187
+
188
+ def get_output_embeddings(self):
189
+ if self.config.weight_tying:
190
+ return self.model.transformer.wte
191
+ else:
192
+ return self.model.transformer.ff_out
193
+
194
+ def set_output_embeddings(self, value: torch.nn.Module):
195
+ if self.config.weight_tying:
196
+ self.model.transformer.wte = value
197
+ else:
198
+ self.model.transformer.ff_out = value
199
+
200
+ def tie_weights(self):
201
+ """
202
+ This function is intentionally left as a no-op.
203
+
204
+ Weight tying is handled as follows:
205
+ - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
206
+ See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
207
+ - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
208
+ See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
209
+
210
+ Therefore, there is no need to explicitly tie the weights in this function.
211
+ """
212
+ pass
213
+
214
+ def resize_token_embeddings(
215
+ self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
216
+ ) -> torch.nn.Embedding:
217
+ """
218
+ Resizes input token embeddings matrix of the model if `new_num_tokens != config.embedding_size`.
219
+
220
+ Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
221
+
222
+ Arguments:
223
+ new_num_tokens (`int`, *optional*):
224
+ The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
225
+ vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
226
+ returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
227
+ pad_to_multiple_of (`int`, *optional*):
228
+ If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
229
+ `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
230
+
231
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
232
+ `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
233
+ details about this, or help on choosing the correct value for resizing, refer to this guide:
234
+ https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
235
+
236
+ Return:
237
+ `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
238
+
239
+ Note:
240
+ This method differs from the base class implementation by resizing the `embedding_size` attribute of the
241
+ model configuration instead of the `vocab_size`. It also includes a warning if the resized `embedding_size`
242
+ is less than the `vocab_size`. In OLMo, `embedding_size` refers to the dimensionality of the model's token
243
+ embeddings, while `vocab_size` refers to the number of unique tokens in the vocabulary.
244
+ """
245
+ model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
246
+ if new_num_tokens is None and pad_to_multiple_of is None:
247
+ return model_embeds
248
+
249
+ # Update base model and current model config
250
+ self.config.embedding_size = model_embeds.weight.shape[0]
251
+ self.model.config.embedding_size = model_embeds.weight.shape[0]
252
+
253
+ # Check if the embedding size is less than the vocab size
254
+ if self.config.embedding_size < self.config.vocab_size:
255
+ warning_message = (
256
+ f"Resizing token embeddings to size {self.config.embedding_size}, which is less than the vocab size "
257
+ f"{self.config.vocab_size} defined in the model configuration. Make sure your tokenizer's vocabulary "
258
+ "size is less than or equal to the new token embedding size."
259
+ )
260
+ log.warning(warning_message)
261
+
262
+ # Tie weights again if needed
263
+ self.tie_weights()
264
+
265
+ return model_embeds
266
+
267
+
268
+ # Register the model so that it is available for transformer pipelines, auto-loading, etc.
269
+ # OLMo is integrated directly in transformers from v4.40.0 onwards, but the version in transformers
270
+ # may not support the newest architectures we create.
271
+ AutoModelForCausalLM.register(OLMoConfig, OLMoForCausalLM)
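# A hedged usage sketch: because the registration call above maps OLMoConfig to
# OLMoForCausalLM, a checkpoint repo that ships this code can be loaded through the
# auto classes. The repo id below is a placeholder, not a real model name.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("your-org/your-olmo-checkpoint", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("your-org/your-olmo-checkpoint", trust_remote_code=True)
inputs = tokenizer("Language modeling is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))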
optim.py ADDED
@@ -0,0 +1,1040 @@
1
+ import logging
2
+ from abc import ABCMeta, abstractmethod
3
+ from dataclasses import dataclass, replace
4
+ from math import cos, pi, sqrt
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ import torch.nn as nn
10
+ from torch.distributed.fsdp import FullyShardedDataParallel
11
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
12
+ from torch.optim.optimizer import Optimizer as OptimizerBase
13
+
14
+ from . import LayerNormBase
15
+ from .config import OptimizerType, SchedulerConfig, SchedulerType, TrainConfig
16
+ from .torch_util import get_default_device, is_distributed
17
+
18
+ __all__ = [
19
+ "Optimizer",
20
+ "LionW",
21
+ "AdamW",
22
+ "Scheduler",
23
+ "CosWithWarmup",
24
+ "LinearWithWarmup",
25
+ "InvSqrtWithWarmup",
26
+ "MaxScheduler",
27
+ "ConstantScheduler",
28
+ "CosLinearEnvelope",
29
+ "BoltOnWarmupScheduler",
30
+ "build_optimizer",
31
+ "build_scheduler",
32
+ ]
33
+
34
+
35
+ log = logging.getLogger(__name__)
36
+
37
+
38
+ class Optimizer(OptimizerBase):
39
+ def __init__(self, *args, record_update_metrics: bool = False, selective_updates: bool = False, **kwargs):
40
+ super().__init__(*args, **kwargs)
41
+ self._record_update_metrics = record_update_metrics
42
+ self._collecting_metrics = False
43
+ self._selective_updates = selective_updates
44
+
45
+ def _clean_param_name(self, name: str) -> str:
46
+ return name.replace("_fsdp_wrapped_module.", "")
47
+
48
+ @torch.no_grad()
49
+ def clip_grads_and_collect_metrics(
50
+ self,
51
+ global_step: int,
52
+ collect_param_metrics: bool = True,
53
+ process_group: Optional[dist.ProcessGroup] = None,
54
+ device: Optional[torch.device] = None,
55
+ ) -> Dict[str, torch.Tensor]:
56
+ """
57
+ Clips gradients for every group that has the field `max_grad_norm`.
58
+ At the same time, collects metrics for each parameter and its gradient.
59
+ """
60
+ self._collecting_metrics = collect_param_metrics
61
+ device = get_default_device() if device is None else device
62
+
63
+ # NOTE (epwalsh): during distributed training we're making an assumption that the order of
64
+ # the param groups and the params within each group are the same across all ranks.
65
+ # This is justified since we initialize the parameter groups in every rank by iterating over
66
+ # `module.parameters()` or `module.named_modules()` / `module.named_parameters()`, each of which
67
+ # provides a consistent order.
68
+ # For each parameter (with a gradient) we'll collect:
69
+ # - min, max, avg, norm of the param itself
70
+ # - min, max, avg, norm of the param's gradient
71
+ # - min, max, avg, norm of any additional per-parameter optimizer state metrics returned from
72
+ # `self.get_state_for_param()`.
73
+ # Afterwards we'll reduce these all over all ranks.
74
+ per_param_min_metrics: List[torch.Tensor] = []
75
+ per_param_max_metrics: List[torch.Tensor] = []
76
+ per_param_sum_metrics: List[torch.Tensor] = []
77
+ per_param_norm_metrics: List[torch.Tensor] = []
78
+ per_param_numel_metrics: List[torch.Tensor] = []
79
+
80
+ per_param_min_metric_names: List[str] = []
81
+ per_param_max_metric_names: List[str] = []
82
+ per_param_avg_metric_names: List[str] = []
83
+ per_param_norm_metric_names: List[str] = []
84
+
85
+ dst_rank = 0
86
+ if process_group is not None:
87
+ dst_rank = dist.get_global_rank(process_group, 0)
88
+
89
+ #######################################################################
90
+ # part 1: collect metrics locally
91
+ #######################################################################
92
+ for group in self.param_groups:
93
+ for name, p in zip(group["param_names"], group["params"]):
94
+ name = self._clean_param_name(name)
95
+ # Always need to collect the norm of gradients for clipping, even if we're not collecting
96
+ # other metrics.
97
+ tensors: List[Optional[torch.Tensor]] = [p.grad]
98
+ prefixes: List[str] = [f"grad/{name}"]
99
+ if collect_param_metrics:
100
+ state = self.get_state_for_param(p)
101
+ sorted_state_keys = sorted([k for k in state.keys()])
102
+ tensors.extend([p] + [state[key] for key in sorted_state_keys])
103
+ prefixes.extend([f"param/{name}"] + [f"{key}/{name}" for key in sorted_state_keys])
104
+ assert len(tensors) == len(prefixes)
105
+
106
+ # Get min, max, avg, and norm for all `tensors` associated with the parameter.
107
+ for x, prefix in zip(tensors, prefixes):
108
+ # grad or state tensors could be none for params that have their shards completely on
109
+ # other ranks.
110
+ if x is not None and x.numel() > 0:
111
+ if collect_param_metrics:
112
+ x_abs = x.abs()
113
+ per_param_min_metrics.append(x_abs.min().unsqueeze(0).to(dtype=torch.float32))
114
+ per_param_max_metrics.append(x_abs.max().unsqueeze(0).to(dtype=torch.float32))
115
+ per_param_sum_metrics.append(x.sum().unsqueeze(0).to(dtype=torch.float32))
116
+ per_param_numel_metrics.append(
117
+ torch.tensor([x.numel()], device=device, dtype=torch.float32)
118
+ )
119
+ per_param_norm_metrics.append(
120
+ torch.linalg.vector_norm(x, 2.0, dtype=torch.float32).unsqueeze(0)
121
+ )
122
+ else:
123
+ if collect_param_metrics:
124
+ per_param_min_metrics.append(
125
+ torch.tensor([float("inf")], device=device, dtype=torch.float32)
126
+ )
127
+ per_param_max_metrics.append(torch.tensor([0.0], device=device, dtype=torch.float32))
128
+ per_param_sum_metrics.append(torch.tensor([0.0], device=device, dtype=torch.float32))
129
+ per_param_numel_metrics.append(torch.tensor([0.0], device=device, dtype=torch.float32))
130
+ per_param_norm_metrics.append(torch.tensor([0.0], device=device, dtype=torch.float32))
131
+ if collect_param_metrics:
132
+ per_param_min_metric_names.append(f"{prefix}.min")
133
+ per_param_max_metric_names.append(f"{prefix}.max")
134
+ per_param_avg_metric_names.append(f"{prefix}.avg")
135
+ per_param_norm_metric_names.append(f"{prefix}.norm")
136
+
137
+ assert (
138
+ len(per_param_min_metrics)
139
+ == len(per_param_min_metric_names)
140
+ == len(per_param_max_metrics)
141
+ == len(per_param_max_metric_names)
142
+ == len(per_param_sum_metrics)
143
+ == len(per_param_numel_metrics)
144
+ == len(per_param_avg_metric_names)
145
+ )
146
+ assert len(per_param_norm_metrics) == len(per_param_norm_metric_names)
147
+
148
+ def is_grad_norm_metric(metric_name: str) -> bool:
149
+ return metric_name.startswith("grad/") and metric_name.endswith(".norm")
150
+
151
+ #######################################################################
152
+ # part 2: reduce metrics over ranks
153
+ #######################################################################
154
+ param_group_sharded = False
155
+ for group in self.param_groups:
156
+ param_group_sharded = param_group_sharded or group.get("sharded", False)
157
+
158
+ total_grad_norm: torch.Tensor
159
+ per_param_avg_metrics: List[torch.Tensor] = []
160
+ if is_distributed() and param_group_sharded:
161
+ # Reduce metrics across all ranks. Note that we can use a `reduce` for most cases
162
+ # instead of an `all_reduce`, but we need `all_reduce` for norms so that all ranks
163
+ # get the right value for gradient norms so they can clip correctly.
164
+ # Reduce mins.
165
+ if per_param_min_metrics:
166
+ all_mins = torch.cat(per_param_min_metrics).to(device)
167
+ dist.reduce(all_mins, dst_rank, op=dist.ReduceOp.MIN, group=process_group)
168
+ per_param_min_metrics = all_mins.split(1)
169
+ # Reduce maxs.
170
+ if per_param_max_metrics:
171
+ all_maxs = torch.cat(per_param_max_metrics).to(device)
172
+ dist.reduce(all_maxs, dst_rank, op=dist.ReduceOp.MAX, group=process_group)
173
+ per_param_max_metrics = all_maxs.split(1)
174
+ # Reduce sums or just norms.
175
+ all_norms = torch.cat(per_param_norm_metrics).to(device) ** 2.0
176
+ if per_param_sum_metrics and per_param_numel_metrics:
177
+ all_sums = torch.cat(per_param_sum_metrics).to(device)
178
+ all_numels = torch.cat(per_param_numel_metrics).to(device)
179
+ all_sums_norms_numels = torch.cat(
180
+ [all_sums.unsqueeze(0), all_norms.unsqueeze(0), all_numels.unsqueeze(0)], dim=0
181
+ )
182
+ dist.all_reduce(all_sums_norms_numels, op=dist.ReduceOp.SUM, group=process_group)
183
+ all_sums, all_norms, all_numels = all_sums_norms_numels.split(1)
184
+ # Get averages.
185
+ # NOTE: could get infs for non-rank0 processes but that's okay.
186
+ per_param_avg_metrics = (all_sums / all_numels).squeeze(0).split(1)
187
+ else:
188
+ dist.all_reduce(all_norms, op=dist.ReduceOp.SUM, group=process_group)
189
+ grad_norm_metric_mask = torch.tensor(
190
+ [float(is_grad_norm_metric(n)) for n in per_param_norm_metric_names], device=all_norms.device
191
+ )
192
+ total_grad_norm = (all_norms * grad_norm_metric_mask).sum() ** 0.5
193
+ per_param_norm_metrics = (all_norms ** (0.5)).squeeze(0).split(1)
194
+ else:
195
+ total_grad_norm = (
196
+ torch.cat(
197
+ [
198
+ m
199
+ for m, n in zip(per_param_norm_metrics, per_param_norm_metric_names)
200
+ if is_grad_norm_metric(n)
201
+ ]
202
+ )
203
+ ** 2.0
204
+ ).sum() ** 0.5
205
+ per_param_avg_metrics = [x / n for x, n in zip(per_param_sum_metrics, per_param_numel_metrics)]
206
+
207
+ assert len(per_param_avg_metrics) == len(per_param_avg_metric_names)
208
+
209
+ # Collect all metrics into a single dict.
210
+ all_metrics: Dict[str, torch.Tensor] = {}
211
+ if collect_param_metrics:
212
+ for metric_name, metric in zip(per_param_min_metric_names, per_param_min_metrics):
213
+ all_metrics[metric_name] = metric.squeeze(0)
214
+ for metric_name, metric in zip(per_param_max_metric_names, per_param_max_metrics):
215
+ all_metrics[metric_name] = metric.squeeze(0)
216
+ for metric_name, metric in zip(per_param_avg_metric_names, per_param_avg_metrics):
217
+ all_metrics[metric_name] = metric.squeeze(0)
218
+
219
+ for metric_name, metric in zip(per_param_norm_metric_names, per_param_norm_metrics):
220
+ all_metrics[metric_name] = metric.squeeze(0)
221
+ all_metrics["total_grad_norm"] = total_grad_norm
222
+
223
+ #######################################################################
224
+ # part 3: clip grads
225
+ #######################################################################
226
+ num_grads_clipped = 0
227
+ num_eligible_grads = 0
228
+ for group in self.param_groups:
229
+ if (max_norm_ratio := group.get("max_grad_norm_ratio")) is not None:
230
+ num_clipped = self._do_adaptive_clipping(
231
+ group, max_norm_ratio, global_step, all_metrics, collect_param_metrics=collect_param_metrics
232
+ )
233
+ elif (max_norm := group.get("max_grad_norm")) is not None:
234
+ num_clipped = self._do_global_fixed_clipping(
235
+ group, max_norm, all_metrics, collect_param_metrics=collect_param_metrics
236
+ )
237
+ else:
238
+ # No clipping needed.
239
+ continue
240
+ num_eligible_grads += len(group["params"])
241
+ if num_clipped is not None:
242
+ num_grads_clipped += num_clipped
243
+
244
+ if collect_param_metrics:
245
+ if num_eligible_grads > 0:
246
+ clipping_rate = torch.tensor(num_grads_clipped / num_eligible_grads, device="cpu")
247
+ else:
248
+ clipping_rate = torch.tensor(0.0, device="cpu")
249
+ all_metrics["clipping_rate"] = clipping_rate
250
+
251
+ # total_grad_norm is computed at all steps, even when collect_param_metrics is set to False
252
+ return all_metrics
253
+
254
+ @torch.no_grad()
255
+ def _do_adaptive_clipping(
256
+ self,
257
+ group: Dict[str, Any],
258
+ max_norm_ratio: float,
259
+ global_step: int,
260
+ all_metrics: Dict[str, torch.Tensor],
261
+ collect_param_metrics: bool = True,
262
+ device: Optional[torch.device] = None,
263
+ ) -> Optional[int]:
264
+ """
265
+ Do adaptive gradient clipping on a param group.
266
+
267
+ If ``collect_param_metrics`` is ``True`` this will return the total number of gradients clipped.
268
+ """
269
+ device = get_default_device() if device is None else device
270
+ num_grads_clipped = 0
271
+ # We'll use the bigger of beta1 and beta2 to update the exponential average of the norm of
272
+ # the gradient (a scalar), not to be confused with the exponential average of the gradient.
273
+ # TODO (epwalsh): handle optimizers that don't have betas.
274
+ beta1, beta2 = group["betas"]
275
+ beta = max(beta1, beta2)
276
+ for name, p in zip(group["param_names"], group["params"]):
277
+ name = self._clean_param_name(name)
278
+ grad_norm = all_metrics.get(f"grad/{name}.norm")
279
+ if grad_norm is None:
280
+ continue
281
+
282
+ # Get or initialize the exponential average of grad norm.
283
+ # TODO: The way we have it right now, every rank tracks the `grad_norm_exp_avg` of every parameter,
284
+ # even parameters for which the corresponding local shard is empty. This has the potential to
285
+ # cause some issues with the optimizer, as we ran into with https://github.com/allenai/LLM/pull/372.
286
+ # So we should consider changing how we do this at some point so that we don't add any state
287
+ # to parameters for which the local shard is empty. That would probably add extra distributed
288
+ # communication, at least on steps where we have to log (i.e. when `collect_param_metrics=True`).
289
+ state = self.state[p]
290
+ grad_norm_exp_avg = state.get("grad_norm_exp_avg")
291
+ if grad_norm_exp_avg is None:
292
+ grad_norm_exp_avg = grad_norm.clone().to(device)
293
+ # We don't want to add anything to `state` until `state` has been initialized, otherwise
294
+ # this will crash some optimizers which rely on checking `len(state)`. The downside here
295
+ # is that we won't start tracking `grad_norm_exp_avg` until the 2nd training step.
296
+ if global_step > 1:
297
+ state["grad_norm_exp_avg"] = grad_norm_exp_avg
298
+
299
+ max_allowed_norm = max_norm_ratio * grad_norm_exp_avg
300
+ clip_coef = max_allowed_norm / (grad_norm + 1e-6)
301
+
302
+ # Clip the gradients and update the exponential average.
303
+ # Note that multiplying by the clamped coefficient is meaningless when it is
304
+ # equal to 1, but it avoids the host-device sync that would result from `if clip_coef_clamped < 1`.
305
+ clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
306
+ if p.grad is not None:
307
+ # p.grad could be none for some ranks when using FSDP.
308
+ p.grad.detach().mul_(clip_coef_clamped.to(p.grad.device, p.grad.dtype))
309
+
310
+ # Update the exponential average of the norm of the gradient with the clipped norm of the gradient.
311
+ grad_norm_exp_avg.lerp_((grad_norm * clip_coef_clamped).to(grad_norm_exp_avg.device), 1 - beta)
312
+ # Alternative: update with the *unclipped* norm of the gradient.
313
+ # grad_norm_exp_avg.lerp_(grad_norm.to(grad_norm_exp_avg.device), 1 - beta)
314
+
315
+ if collect_param_metrics:
316
+ # Can't avoid host-device sync here.
317
+ if clip_coef_clamped < 1.0:
318
+ num_grads_clipped += 1
319
+ all_metrics[f"grad_norm_exp_avg/{name}"] = grad_norm_exp_avg
320
+ return num_grads_clipped if collect_param_metrics else None
321
+
322
+ @torch.no_grad()
323
+ def _do_global_fixed_clipping(
324
+ self,
325
+ group: Dict[str, Any],
326
+ max_norm: float,
327
+ all_metrics: Dict[str, torch.Tensor],
328
+ collect_param_metrics: bool = True,
329
+ device: Optional[torch.device] = None,
330
+ ) -> Optional[int]:
331
+ """
332
+ Do global fixed gradient clipping on a param group.
333
+
334
+ If ``collect_param_metrics`` is ``True`` this will return the total number of gradients clipped.
335
+ """
336
+ device = get_default_device() if device is None else device
337
+ total_grad_norm = all_metrics["total_grad_norm"]
338
+ clip_coef = max_norm / (total_grad_norm.to(device) + 1e-6)
339
+ clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
340
+ num_grads_clipped: Optional[int] = None
341
+ if collect_param_metrics:
342
+ # Can't avoid host-device sync here.
343
+ if clip_coef_clamped < 1.0:
344
+ num_grads_clipped = len(group["params"])
345
+ for p in group["params"]:
346
+ # Clip the gradients.
347
+ # Note that multiplying by the clamped coefficient is meaningless when it is
348
+ # equal to 1, but it avoids the host-device sync that would result from `if clip_coef_clamped < 1`.
349
+ if p.grad is not None:
350
+ # p.grad could be none for some ranks when using FSDP.
351
+ p.grad.detach().mul_(clip_coef_clamped.to(p.grad.device, p.grad.dtype))
352
+ return num_grads_clipped
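# A minimal numeric sketch of the fixed-clipping rule above (illustrative only):
# with max_norm=1.0 and a total gradient norm of 4.0 the clip coefficient is
# roughly 0.25, so every gradient is scaled down by about 4x; when the total norm
# is already below max_norm the coefficient clamps to 1.0 and gradients pass
# through unchanged.
_example_total_grad_norm = torch.tensor(4.0)
_example_clip_coef = 1.0 / (_example_total_grad_norm + 1e-6)
_example_clip_coef_clamped = torch.clamp(_example_clip_coef, max=1.0)  # ~0.25 here, 1.0 when norm <= max_norm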
353
+
354
+ def get_post_step_metrics(
355
+ self, module: nn.Module, process_group: Optional[dist.ProcessGroup] = None
356
+ ) -> Dict[str, torch.Tensor]:
357
+ del module, process_group
358
+ return {}
359
+
360
+ def get_state_for_param(self, param: nn.Parameter) -> Dict[str, Optional[torch.Tensor]]:
361
+ del param
362
+ return {}
363
+
364
+
365
+ class LionW(Optimizer):
366
+ """
367
+ Adapted from https://github.com/google/automl/blob/master/lion/lion_pytorch.py
368
+ """
369
+
370
+ def __init__(
371
+ self,
372
+ params,
373
+ lr: float = 1e-4,
374
+ betas: Tuple[float, float] = (0.9, 0.99),
375
+ weight_decay: float = 0.0,
376
+ record_update_metrics: bool = False,
377
+ selective_updates: bool = False,
378
+ device: Optional[torch.device] = None,
379
+ ):
380
+ assert lr > 0.0
381
+ assert all([0.0 <= beta <= 1.0 for beta in betas])
382
+ defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay)
383
+ super().__init__(
384
+ params, defaults, record_update_metrics=record_update_metrics, selective_updates=selective_updates
385
+ )
386
+ for group in self.param_groups:
387
+ group["initial_lr"] = group["lr"]
388
+ self._update_total_dot_prod: Optional[torch.Tensor] = None
389
+ self._update_total_norm: Optional[torch.Tensor] = None
390
+ self._signed_update_total_norm: Optional[torch.Tensor] = None
391
+ self._device: Optional[torch.device] = device
392
+
393
+ def get_post_step_metrics(
394
+ self, module: nn.Module, process_group: Optional[dist.ProcessGroup] = None
395
+ ) -> Dict[str, torch.Tensor]:
396
+ assert isinstance(
397
+ module, FSDP
398
+ ), "`get_post_step_metrics` expects module to be FSDP and will not work with other `distributed_strategy`."
399
+
400
+ update_total_dot_prod = self._update_total_dot_prod
401
+ update_total_norm = self._update_total_norm
402
+ signed_update_total_norm = self._signed_update_total_norm
403
+ if update_total_dot_prod is None or update_total_norm is None or signed_update_total_norm is None:
404
+ return {}
405
+
406
+ self._update_total_dot_prod = None
407
+ self._update_total_norm = None
408
+ self._signed_update_total_norm = None
409
+
410
+ if is_distributed() and isinstance(module, FullyShardedDataParallel):
411
+ # Reduce total dot prod and norms across all ranks.
412
+ update_total_norm = update_total_norm**2.0
413
+ signed_update_total_norm = signed_update_total_norm**2.0
414
+ # Reduce all together to avoid multiple communication calls.
415
+ all_together = torch.stack([update_total_dot_prod, update_total_norm, signed_update_total_norm])
416
+ # Only need the final result on rank0, since that's where we log from.
417
+ dist.reduce(
418
+ all_together,
419
+ 0 if process_group is None else dist.get_global_rank(process_group, 0),
420
+ group=process_group,
421
+ )
422
+ update_total_dot_prod, update_total_norm, signed_update_total_norm = all_together
423
+ update_total_norm = update_total_norm**0.5
424
+ signed_update_total_norm = signed_update_total_norm**0.5
425
+
426
+ update_cos_sim = update_total_dot_prod / torch.max(
427
+ update_total_norm * signed_update_total_norm,
428
+ torch.tensor(1e-8, device=get_default_device() if self._device is None else self._device),
429
+ )
430
+ return {"update_cos_sim": update_cos_sim}
431
+
432
+ @torch.no_grad()
433
+ def step(self, closure=None) -> None:
434
+ if closure is not None:
435
+ with torch.enable_grad():
436
+ closure()
437
+
438
+ update_total_dot_prod: Optional[torch.Tensor] = None
439
+ update_norms: Optional[List[torch.Tensor]] = None
440
+ signed_update_norms: Optional[List[torch.Tensor]] = None
441
+ if self._collecting_metrics and self._record_update_metrics:
442
+ update_total_dot_prod = torch.tensor(0.0, dtype=torch.float32)
443
+ update_norms = []
444
+ signed_update_norms = []
445
+
446
+ for group in self.param_groups:
447
+ for p in group["params"]:
448
+ grad = p.grad
449
+ if grad is None:
450
+ continue
451
+
452
+ state = self.state[p]
453
+
454
+ # Perform step weight decay
455
+ mask: Union[torch.Tensor, int] = grad != 0 if self._selective_updates else 1
456
+ p.data.mul_(1 - mask * (group["lr"] * group["weight_decay"]))
457
+
458
+ # State initialization
459
+ if len(state) == 0:
460
+ # Exponential moving average of gradient values
461
+ state["exp_avg"] = torch.zeros_like(p)
462
+
463
+ exp_avg = state["exp_avg"]
464
+ beta1, beta2 = group["betas"]
465
+
466
+ # Weight update
467
+ update = exp_avg * beta1 + grad * (1 - beta1)
468
+ if isinstance(mask, torch.Tensor):
469
+ # When mask isn't a tensor it's just a literal `1` (python int), so there's
470
+ # no point in calling this op.
471
+ update.mul_(mask)
472
+ signed_update = torch.sign(update)
473
+ p.add_(signed_update, alpha=-group["lr"])
474
+
475
+ # Decay the momentum running average coefficient
476
+ exp_avg.mul_(1 - mask * (1 - beta2)).add_(grad, alpha=1 - beta2)
477
+
478
+ # Track dot product and norms of update vs signed update in order to calculate
479
+ # their cosine similarity.
480
+ if (
481
+ update_total_dot_prod is not None
482
+ and update_norms is not None
483
+ and signed_update_norms is not None
484
+ ):
485
+ update_total_dot_prod = update_total_dot_prod.to(update.device)
486
+ update_total_dot_prod += torch.tensordot(update, signed_update, dims=len(update.shape))
487
+ update_norms.append(torch.linalg.vector_norm(update, 2.0, dtype=torch.float32))
488
+ signed_update_norms.append(torch.linalg.vector_norm(signed_update, 2.0, dtype=torch.float32))
489
+
490
+ # Compute cosine similarity between update and signed update.
491
+ if update_total_dot_prod is not None and update_norms is not None and signed_update_norms is not None:
492
+ device = get_default_device() if self._device is None else self._device
493
+ self._update_total_dot_prod = update_total_dot_prod.to(device)
494
+ self._update_total_norm = torch.linalg.vector_norm(
495
+ torch.stack(update_norms),
496
+ 2.0,
497
+ dtype=torch.float32,
498
+ ).to(device)
499
+ self._signed_update_total_norm = torch.linalg.vector_norm(
500
+ torch.stack(signed_update_norms),
501
+ 2.0,
502
+ dtype=torch.float32,
503
+ ).to(device)
504
+
505
+
506
+ class AdamW(torch.optim.AdamW, Optimizer):
507
+ def __init__(self, *args, record_update_metrics: bool = False, selective_updates: bool = False, **kwargs):
508
+ super().__init__(*args, **kwargs)
509
+
510
+ # Need to set these here just like in our base `Optimizer` class since our `Optimizer.__init__`
511
+ # won't be called.
512
+ self._record_update_metrics = record_update_metrics
513
+ self._collecting_metrics = False
514
+ self._selective_updates = selective_updates
515
+
516
+ self._step_size_param_names: Optional[List[str]] = None
517
+ self._step_size_norms: Optional[List[torch.Tensor]] = None
518
+ self._step_size_maxs: Optional[List[torch.Tensor]] = None
519
+
520
+ @torch.no_grad()
521
+ def step(self, closure=None) -> None:
522
+ if not (self._record_update_metrics and self._collecting_metrics) and not self._selective_updates:
523
+ return super().step(closure=closure)
524
+
525
+ device = get_default_device()
526
+ param_names = []
527
+ step_size_norms = []
528
+ step_size_maxs = []
529
+ for group in self.param_groups:
530
+ beta1, beta2 = group["betas"]
531
+ lr = group["lr"]
532
+ weight_decay = group["weight_decay"]
533
+ eps = group["eps"]
534
+ amsgrad = group["amsgrad"]
535
+ for name, param in zip(group["param_names"], group["params"]):
536
+ name = self._clean_param_name(name)
537
+ param_names.append(name)
538
+ grad = param.grad
539
+ if grad is None:
540
+ step_size_norms.append(torch.tensor([0.0], device=device))
541
+ step_size_maxs.append(torch.tensor([0.0], device=device))
542
+ continue
543
+
544
+ state = self.state[param]
545
+ # init state if needed
546
+ if len(state) == 0:
547
+ state["step"] = (
548
+ torch.zeros((), dtype=torch.float32, device=param.device)
549
+ if group["capturable"] or group["fused"]
550
+ else torch.tensor(0.0, dtype=torch.float32)
551
+ )
552
+ # Exponential moving average of gradient values
553
+ state["exp_avg"] = torch.zeros_like(param, memory_format=torch.preserve_format)
554
+ # Exponential moving average of squared gradient values
555
+ state["exp_avg_sq"] = torch.zeros_like(param, memory_format=torch.preserve_format)
556
+ if amsgrad:
557
+ # Maintains max of all exp. moving avg. of sq. grad. values
558
+ state["max_exp_avg_sq"] = torch.zeros_like(param, memory_format=torch.preserve_format)
559
+
560
+ exp_avg = state["exp_avg"]
561
+ exp_avg_sq = state["exp_avg_sq"]
562
+ step_t = state["step"]
563
+
564
+ # Update step.
565
+ step_t += 1
566
+
567
+ # Perform step weight decay.
568
+ mask: Union[torch.Tensor, int] = grad != 0 if self._selective_updates else 1
569
+ param.mul_(1 - mask * (lr * weight_decay))
570
+
571
+ # Decay the first and second moment running average coefficient.
572
+ exp_avg.lerp_(grad, mask * (1 - beta1))
573
+ exp_avg_sq.mul_(1 - mask * (1 - beta2)).addcmul_(grad, grad, value=1 - beta2)
574
+
575
+ step = step_t.item()
576
+
577
+ bias_correction1 = 1 - beta1**step
578
+ bias_correction2 = 1 - beta2**step
579
+
580
+ step_size = lr / bias_correction1
581
+
582
+ bias_correction2_sqrt = sqrt(bias_correction2)
583
+
584
+ if amsgrad:
585
+ max_exp_avg_sq = state["max_exp_avg_sq"]
586
+ # Maintains the maximum of all 2nd moment running avg. till now
587
+ torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
588
+
589
+ # Use the max. for normalizing running avg. of gradient
590
+ denom = (max_exp_avg_sq.sqrt() / bias_correction2_sqrt).add_(eps)
591
+ else:
592
+ denom = (exp_avg_sq.sqrt() / bias_correction2_sqrt).add_(eps)
593
+
594
+ update = -step_size * torch.div(exp_avg, denom)
595
+ if isinstance(mask, torch.Tensor):
596
+ # When mask isn't a tensor it's just a literal `1` (python int), so there's
597
+ # no point in calling this op.
598
+ update.mul_(mask)
599
+ param.add_(update)
600
+ step_size_norms.append(torch.linalg.vector_norm(update, 2.0, dtype=torch.float32).unsqueeze(0))
601
+ step_size_maxs.append(update.abs().max().unsqueeze(0))
602
+
603
+ self._step_size_param_names = param_names
604
+ self._step_size_norms = step_size_norms
605
+ self._step_size_maxs = step_size_maxs
606
+
607
+ def get_state_for_param(self, param: nn.Parameter) -> Dict[str, Optional[torch.Tensor]]:
608
+ return {key: self.state[param].get(key) for key in ("exp_avg", "exp_avg_sq")} # type: ignore
609
+
610
+ def get_post_step_metrics(
611
+ self, module: nn.Module, process_group: Optional[dist.ProcessGroup] = None
612
+ ) -> Dict[str, torch.Tensor]:
613
+ if not (self._record_update_metrics and self._collecting_metrics):
614
+ return {}
615
+ else:
616
+ device = get_default_device()
617
+ dst_rank = 0
618
+ if process_group is not None:
619
+ dst_rank = dist.get_global_rank(process_group, 0)
620
+ param_names = self._step_size_param_names
621
+ step_size_norms = self._step_size_norms
622
+ step_size_maxs = self._step_size_maxs
623
+ assert param_names is not None
624
+ assert step_size_norms is not None
625
+ assert step_size_maxs is not None
626
+
627
+ # Reduce metrics if needed.
628
+ if is_distributed() and isinstance(module, FullyShardedDataParallel):
629
+ # Reduce norms.
630
+ all_norms = torch.cat(step_size_norms).to(device) ** 2.0
631
+ dist.reduce(all_norms, dst_rank, op=dist.ReduceOp.SUM, group=process_group)
632
+ step_size_norms = (all_norms ** (0.5)).squeeze(0).split(1)
633
+
634
+ # Reduce maxs.
635
+ all_maxs = torch.cat(step_size_maxs).to(device)
636
+ dist.reduce(all_maxs, dst_rank, op=dist.ReduceOp.MAX, group=process_group)
637
+ step_size_maxs = all_maxs.split(1)
638
+
639
+ metrics = {}
640
+ for param_name, step_size_norm, step_size_max in zip(param_names, step_size_norms, step_size_maxs): # type: ignore[arg-type]
641
+ metrics[f"step/{param_name}.norm"] = step_size_norm.squeeze(0)
642
+ metrics[f"step/{param_name}.max"] = step_size_max.squeeze(0)
643
+
644
+ self._step_size_param_names = None
645
+ self._step_size_norms = None
646
+ self._step_size_maxs = None
647
+ return metrics
648
+
649
+
650
+ @dataclass
651
+ class Scheduler(metaclass=ABCMeta):
652
+ # NOTE: these fields are not given default values because otherwise dataclasses complains
653
+ # about how the scheduler subclasses are defined.
654
+ grad_clip_warmup_steps: Optional[int]
655
+ grad_clip_warmup_factor: Optional[float]
656
+ warmup_min_lr: Optional[float]
657
+
658
+ @abstractmethod
659
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
660
+ raise NotImplementedError
661
+
662
+ def _get_max_grad_norm_coeff(
663
+ self, initial_value: Optional[float], step: int, max_steps: int
664
+ ) -> Optional[float]:
665
+ del max_steps # might need this in the future, but for now I just wanted to match the API of `get_lr()`.
666
+ if initial_value is None:
667
+ return None
668
+ elif (
669
+ self.grad_clip_warmup_steps is None
670
+ or self.grad_clip_warmup_factor is None
671
+ or step > self.grad_clip_warmup_steps
672
+ ):
673
+ return initial_value
674
+ else:
675
+ return self.grad_clip_warmup_factor * initial_value
676
+
677
+ def get_max_grad_norm(
678
+ self, initial_max_grad_norm: Optional[float], step: int, max_steps: int
679
+ ) -> Optional[float]:
680
+ return self._get_max_grad_norm_coeff(initial_max_grad_norm, step, max_steps)
681
+
682
+ def get_max_grad_norm_ratio(
683
+ self, initial_max_grad_norm_ratio: Optional[float], step: int, max_steps: int
684
+ ) -> Optional[float]:
685
+ return self._get_max_grad_norm_coeff(initial_max_grad_norm_ratio, step, max_steps)
686
+
687
+ def _linear_warmup(self, initial_lr: float, step: int, warmup_steps: int = 2000) -> float:
688
+ warmup_min_lr = self.warmup_min_lr if self.warmup_min_lr is not None else initial_lr * 0.10
689
+ assert 0 <= warmup_min_lr < initial_lr
690
+ return warmup_min_lr + (initial_lr - warmup_min_lr) * min(step, warmup_steps) / warmup_steps
691
+
692
+
693
+ @dataclass
694
+ class CosWithWarmup(Scheduler):
695
+ warmup_steps: int
696
+ alpha_f: float = 0.1
697
+ t_max: Optional[int] = None
698
+
699
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
700
+ max_steps = max_steps if self.t_max is None else self.t_max
701
+ eta_min = initial_lr * self.alpha_f
702
+ if step < self.warmup_steps:
703
+ return self._linear_warmup(initial_lr, step, self.warmup_steps)
704
+ elif step >= max_steps:
705
+ return eta_min
706
+ else:
707
+ step = step - self.warmup_steps
708
+ max_steps = max_steps - self.warmup_steps
709
+ return eta_min + (initial_lr - eta_min) * (1 + cos(pi * step / max_steps)) / 2
710
+
711
+
712
+ @dataclass
713
+ class LinearWithWarmup(Scheduler):
714
+ warmup_steps: int
715
+ alpha_f: float = 0.1
716
+ t_max: Optional[int] = None
717
+
718
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
719
+ max_steps = max_steps if self.t_max is None else self.t_max
720
+ eta_min = initial_lr * self.alpha_f
721
+ if step < self.warmup_steps:
722
+ return self._linear_warmup(initial_lr, step, self.warmup_steps)
723
+ elif step >= max_steps:
724
+ return eta_min
725
+ else:
726
+ step = step - self.warmup_steps
727
+ max_steps = max_steps - self.warmup_steps
728
+ return initial_lr - (initial_lr - eta_min) * (step / max_steps)
729
+
730
+
731
+ @dataclass
732
+ class InvSqrtWithWarmup(Scheduler):
733
+ warmup_steps: int
734
+
735
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
736
+ if step < self.warmup_steps:
737
+ return self._linear_warmup(initial_lr, step, self.warmup_steps)
738
+ del max_steps
739
+ return initial_lr * sqrt(self.warmup_steps / max(self.warmup_steps, step))
740
+
741
+
742
+ @dataclass
743
+ class MaxScheduler(Scheduler):
744
+ sched1: Scheduler
745
+ sched2: Scheduler
746
+
747
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
748
+ return max(
749
+ self.sched1.get_lr(initial_lr, step, max_steps), self.sched2.get_lr(initial_lr, step, max_steps)
750
+ )
751
+
752
+
753
+ @dataclass
754
+ class BoltOnWarmupScheduler(Scheduler):
755
+ inner: Scheduler
756
+ warmup_start: int
757
+ warmup_end: int
758
+
759
+ @classmethod
760
+ def wrap(cls, scheduler: Scheduler, warmup_start: int, warmup_end: int) -> "BoltOnWarmupScheduler":
761
+ return cls(
762
+ grad_clip_warmup_steps=None,
763
+ grad_clip_warmup_factor=None,
764
+ inner=scheduler,
765
+ warmup_start=warmup_start,
766
+ warmup_end=warmup_end,
767
+ warmup_min_lr=None,
768
+ )
769
+
770
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
771
+ if step < self.warmup_start:
772
+ return 0.0
773
+ if step < self.warmup_end:
774
+ lr_at_intercept = self.inner.get_lr(initial_lr, self.warmup_end, max_steps)
775
+ return lr_at_intercept * (step - self.warmup_start) / (self.warmup_end - self.warmup_start)
776
+ else:
777
+ return self.inner.get_lr(initial_lr, step, max_steps)
778
+
779
+ def _get_max_grad_norm_coeff(
780
+ self, initial_value: Optional[float], step: int, max_steps: int
781
+ ) -> Optional[float]:
782
+ return self.inner._get_max_grad_norm_coeff(initial_value, step, max_steps)
783
+
784
+
785
+ @dataclass
786
+ class ConstantScheduler(Scheduler):
787
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
788
+ del step, max_steps
789
+ return initial_lr
790
+
791
+
792
+ @dataclass
793
+ class CosLinearEnvelope(Scheduler):
794
+ "Pointwise product of cosine schedule and linear decay; useful during annealing."
795
+ warmup_steps: int
796
+ alpha_f: float = 0.1
797
+ t_max: Optional[int] = None
798
+
799
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
800
+ max_steps = max_steps if self.t_max is None else self.t_max
801
+ eta_min = initial_lr * self.alpha_f
802
+
803
+ if step < self.warmup_steps:
804
+ return self._linear_warmup(initial_lr, step, self.warmup_steps)
805
+ if step >= max_steps:
806
+ return eta_min
807
+ else:
808
+ step = step - self.warmup_steps
809
+ max_steps = max_steps - self.warmup_steps
810
+ linear_envelope = 1 - (step / max_steps)
811
+ cosine_schedule = (initial_lr - eta_min) * (1 + cos(pi * step / max_steps)) / 2
812
+ return eta_min + linear_envelope * cosine_schedule
813
+
814
+
815
+ @dataclass
816
+ class ConstantWithWarmupScheduler(Scheduler):
817
+ warmup_steps: int
818
+
819
+ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
820
+ if step < self.warmup_steps:
821
+ return self._linear_warmup(initial_lr, step, self.warmup_steps)
822
+ del max_steps
823
+ return initial_lr
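# A quick sketch of how the warmup/decay schedulers above behave (illustrative values only):
_example_sched = CosWithWarmup(
    grad_clip_warmup_steps=None,
    grad_clip_warmup_factor=None,
    warmup_min_lr=None,
    warmup_steps=100,
    alpha_f=0.1,
)
# The LR ramps linearly from 10% of the peak LR to the peak over the first 100 steps
# (since warmup_min_lr is None), then follows a cosine decay down to alpha_f * initial_lr
# at max_steps.
_example_lrs = [_example_sched.get_lr(initial_lr=3e-4, step=s, max_steps=1000) for s in (0, 50, 100, 550, 1000)]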
824
+
825
+
826
+ PARAM_GROUP_FIELDS = ("sharded", "max_grad_norm", "max_grad_norm_ratio", "param_names")
827
+
828
+
829
+ def get_param_groups(cfg: TrainConfig, model: nn.Module) -> List[Dict[str, Any]]:
830
+ """
831
+ Separate parameters into weight decay and non weight decay groups.
832
+ """
833
+ param_groups: List[Dict[str, Any]]
834
+ param_group_defaults = {
835
+ "sharded": isinstance(model, FullyShardedDataParallel),
836
+ "max_grad_norm": cfg.max_grad_norm,
837
+ "max_grad_norm_ratio": cfg.max_grad_norm_ratio,
838
+ }
839
+
840
+ # Separate out parameters that we don't want to apply weight decay to, like norms and biases.
841
+ decay = set()
842
+ no_decay = set()
843
+ all_params = {}
844
+ for mn, m in model.named_modules():
845
+ for pn, p in m.named_parameters():
846
+ # NOTE: because named_modules and named_parameters are recursive
847
+ # we will see the same tensors p many times, but doing it this way
848
+ # allows us to know which parent module any tensor p belongs to...
849
+ if not p.requires_grad:
850
+ continue
851
+
852
+ fpn = f"{mn}.{pn}" if mn else pn
853
+ all_params[fpn] = p
854
+
855
+ if pn.endswith("bias"):
856
+ if cfg.optimizer.decay_norm_and_bias:
857
+ decay.add(fpn)
858
+ else:
859
+ no_decay.add(fpn)
860
+ elif pn.endswith("weight") and isinstance(m, nn.Linear):
861
+ decay.add(fpn)
862
+ elif pn.endswith("weight") and isinstance(m, (LayerNormBase, nn.LayerNorm)):
863
+ if cfg.optimizer.decay_norm_and_bias:
864
+ decay.add(fpn)
865
+ else:
866
+ no_decay.add(fpn)
867
+ elif pn.endswith("weight") and isinstance(m, nn.Embedding):
868
+ if cfg.optimizer.decay_embeddings:
869
+ decay.add(fpn)
870
+ else:
871
+ no_decay.add(fpn)
872
+
873
+ # Validate that we've considered every parameter
874
+ inter_params = decay & no_decay
875
+ union_params = decay | no_decay
876
+ assert len(inter_params) == 0, f"parameters {inter_params} made it into both decay/no_decay sets!"
877
+ assert (
878
+ len(all_params.keys() - union_params) == 0
879
+ ), f"parameters {all_params.keys() - union_params} were not separated into either decay/no_decay set!"
880
+
881
+ # Create the pytorch optimizer groups.
882
+ decay_sorted = sorted(list(decay))
883
+ no_decay_sorted = sorted(list(no_decay))
884
+ param_groups = []
885
+ if len(decay_sorted) > 0:
886
+ param_groups.append(
887
+ {
888
+ "params": [all_params[pn] for pn in decay_sorted],
889
+ "param_names": decay_sorted,
890
+ **param_group_defaults,
891
+ }
892
+ )
893
+ if len(no_decay_sorted) > 0:
894
+ param_groups.append(
895
+ {
896
+ "params": [all_params[pn] for pn in no_decay_sorted],
897
+ "param_names": no_decay_sorted,
898
+ "weight_decay": 0.0,
899
+ **param_group_defaults,
900
+ }
901
+ )
902
+
903
+ # Validate fields.
904
+ for group in param_groups:
905
+ for key in PARAM_GROUP_FIELDS:
906
+ assert key in group
907
+
908
+ return param_groups
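# A minimal sketch of how the grouping above plays out on a toy model. The
# SimpleNamespace config below is a hypothetical stand-in for a real TrainConfig,
# so this is kept as an illustrative comment rather than module code:
#
#   from types import SimpleNamespace
#   _cfg = SimpleNamespace(
#       max_grad_norm=1.0,
#       max_grad_norm_ratio=None,
#       optimizer=SimpleNamespace(decay_norm_and_bias=False, decay_embeddings=False),
#   )
#   _model = nn.Sequential(nn.Embedding(10, 8), nn.Linear(8, 8), nn.LayerNorm(8))
#   _groups = get_param_groups(_cfg, _model)  # type: ignore[arg-type]
#   # -> two groups: the Linear weight lands in the weight-decay group; the Embedding
#   #    and LayerNorm weights plus all biases land in the group with weight_decay=0.0.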
909
+
910
+
911
+ def fix_optim_state_dict(optimizer: Optimizer, state_dict: Dict[str, Any]) -> Dict[str, Any]:
912
+ """
913
+ Make sure old optim state dicts are compatible with new versions.
914
+ """
915
+ if len(state_dict["param_groups"]) == 1 and len(optimizer.param_groups) == 2:
916
+ assert optimizer.param_groups[1]["weight_decay"] == 0.0
917
+
918
+ # Decay
919
+ decay_param_group = {k: v for k, v in state_dict["param_groups"][0].items() if k != "params"}
920
+ decay_param_group["params"] = optimizer.state_dict()["param_groups"][0]["params"]
921
+
922
+ # No decay.
923
+ no_decay_param_group = {k: v for k, v in state_dict["param_groups"][0].items() if k != "params"}
924
+ no_decay_param_group["weight_decay"] = 0.0
925
+ no_decay_param_group["params"] = optimizer.state_dict()["param_groups"][1]["params"]
926
+
927
+ state_dict["param_groups"] = [decay_param_group, no_decay_param_group]
928
+
929
+ assert len(optimizer.param_groups) == len(state_dict["param_groups"])
930
+
931
+ # Make sure:
932
+ # - All required fields are included in the state dict,
933
+ # - And that the values of those fields don't change from what's currently set in the optimizer,
934
+ # since we might have changed those fields on purpose after a restart.
935
+ for group, sd_group in zip(optimizer.param_groups, state_dict["param_groups"]):
936
+ for key in PARAM_GROUP_FIELDS:
937
+ sd_group[key] = group[key]
938
+
939
+ return state_dict
940
+
941
+
942
+ def build_optimizer(cfg: TrainConfig, model: nn.Module) -> Optimizer:
943
+ param_groups = get_param_groups(cfg, model)
944
+ log.info(f"Constructing optimizer with {len(param_groups)} param groups")
945
+ if cfg.optimizer.name == OptimizerType.lionw:
946
+ return LionW(
947
+ param_groups,
948
+ lr=cfg.optimizer.learning_rate,
949
+ betas=cfg.optimizer.betas,
950
+ weight_decay=cfg.optimizer.weight_decay,
951
+ record_update_metrics=cfg.optimizer.record_update_metrics,
952
+ selective_updates=cfg.optimizer.selective_updates,
953
+ )
954
+ elif cfg.optimizer.name == OptimizerType.adamw:
955
+ return AdamW(
956
+ param_groups,
957
+ lr=cfg.optimizer.learning_rate,
958
+ betas=cfg.optimizer.betas,
959
+ weight_decay=cfg.optimizer.weight_decay,
960
+ record_update_metrics=cfg.optimizer.record_update_metrics,
961
+ selective_updates=cfg.optimizer.selective_updates,
962
+ eps=cfg.optimizer.eps,
963
+ )
964
+ else:
965
+ raise NotImplementedError
966
+
967
+
968
+ def build_scheduler(cfg: TrainConfig, sched_cfg: Optional[SchedulerConfig] = None) -> Scheduler:
969
+ sched_cfg = sched_cfg if sched_cfg is not None else cfg.scheduler
970
+ if sched_cfg.name == SchedulerType.cosine_with_warmup:
971
+ return CosWithWarmup(
972
+ grad_clip_warmup_steps=(
973
+ None if sched_cfg.grad_clip_warmup_steps is None else int(sched_cfg.grad_clip_warmup_steps)
974
+ ),
975
+ grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
976
+ warmup_steps=int(sched_cfg.t_warmup),
977
+ alpha_f=sched_cfg.alpha_f,
978
+ t_max=None if sched_cfg.t_max is None else int(sched_cfg.t_max),
979
+ warmup_min_lr=sched_cfg.warmup_min_lr,
980
+ )
981
+ elif sched_cfg.name == SchedulerType.linear_with_warmup:
982
+ return LinearWithWarmup(
983
+ grad_clip_warmup_steps=(
984
+ None if sched_cfg.grad_clip_warmup_steps is None else int(sched_cfg.grad_clip_warmup_steps)
985
+ ),
986
+ grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
987
+ warmup_steps=int(sched_cfg.t_warmup),
988
+ alpha_f=sched_cfg.alpha_f,
989
+ t_max=None if sched_cfg.t_max is None else int(sched_cfg.t_max),
990
+ warmup_min_lr=sched_cfg.warmup_min_lr,
991
+ )
992
+ elif sched_cfg.name == SchedulerType.inverse_sqrt_with_warmup:
993
+ return InvSqrtWithWarmup(
994
+ grad_clip_warmup_steps=(
995
+ None if sched_cfg.grad_clip_warmup_steps is None else int(sched_cfg.grad_clip_warmup_steps)
996
+ ),
997
+ grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
998
+ warmup_steps=int(sched_cfg.t_warmup),
999
+ warmup_min_lr=sched_cfg.warmup_min_lr,
1000
+ )
1001
+ elif sched_cfg.name == SchedulerType.max_scheduler:
1002
+ return MaxScheduler(
1003
+ grad_clip_warmup_steps=(
1004
+ None if sched_cfg.grad_clip_warmup_steps is None else int(sched_cfg.grad_clip_warmup_steps)
1005
+ ),
1006
+ grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
1007
+ sched1=build_scheduler(cfg, replace(sched_cfg, name=SchedulerType.cosine_with_warmup)),
1008
+ sched2=build_scheduler(cfg, replace(sched_cfg, name=SchedulerType.inverse_sqrt_with_warmup)),
1009
+ warmup_min_lr=sched_cfg.warmup_min_lr,
1010
+ )
1011
+ elif sched_cfg.name == SchedulerType.constant:
1012
+ return ConstantScheduler(
1013
+ grad_clip_warmup_steps=(
1014
+ None if sched_cfg.grad_clip_warmup_steps is None else int(sched_cfg.grad_clip_warmup_steps)
1015
+ ),
1016
+ grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
1017
+ warmup_min_lr=sched_cfg.warmup_min_lr,
1018
+ )
1019
+ elif sched_cfg.name == SchedulerType.cosine_linear_envelope:
1020
+ return CosLinearEnvelope(
1021
+ grad_clip_warmup_steps=(
1022
+ None if sched_cfg.grad_clip_warmup_steps is None else int(sched_cfg.grad_clip_warmup_steps)
1023
+ ),
1024
+ grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
1025
+ warmup_steps=int(sched_cfg.t_warmup),
1026
+ alpha_f=sched_cfg.alpha_f,
1027
+ t_max=None if sched_cfg.t_max is None else int(sched_cfg.t_max),
1028
+ warmup_min_lr=sched_cfg.warmup_min_lr,
1029
+ )
1030
+ elif sched_cfg.name == SchedulerType.constant_with_warmup:
1031
+ return ConstantWithWarmupScheduler(
1032
+ grad_clip_warmup_steps=(
1033
+ None if sched_cfg.grad_clip_warmup_steps is None else int(sched_cfg.grad_clip_warmup_steps)
1034
+ ),
1035
+ grad_clip_warmup_factor=sched_cfg.grad_clip_warmup_factor,
1036
+ warmup_min_lr=sched_cfg.warmup_min_lr,
1037
+ warmup_steps=int(sched_cfg.t_warmup),
1038
+ )
1039
+ else:
1040
+ raise NotImplementedError
safetensors_util.py ADDED
@@ -0,0 +1,81 @@
1
+ import base64
2
+ import pickle
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Optional, Tuple
5
+
6
+ import safetensors.torch
7
+ import torch
8
+
9
+ from olmo.aliases import PathOrStr
10
+
11
+ __all__ = [
12
+ "state_dict_to_safetensors_file",
13
+ "safetensors_file_to_state_dict",
14
+ ]
15
+
16
+
17
+ @dataclass(eq=True, frozen=True)
18
+ class STKey:
19
+ keys: Tuple
20
+ value_is_pickled: bool
21
+
22
+
23
+ def encode_key(key: STKey) -> str:
24
+ b = pickle.dumps((key.keys, key.value_is_pickled))
25
+ b = base64.urlsafe_b64encode(b)
26
+ return str(b, "ASCII")
27
+
28
+
29
+ def decode_key(key: str) -> STKey:
30
+ b = base64.urlsafe_b64decode(key)
31
+ keys, value_is_pickled = pickle.loads(b)
32
+ return STKey(keys, value_is_pickled)
33
+
34
+
35
+ def flatten_dict(d: Dict) -> Dict[STKey, torch.Tensor]:
36
+ result = {}
37
+ for key, value in d.items():
38
+ if isinstance(value, torch.Tensor):
39
+ result[STKey((key,), False)] = value
40
+ elif isinstance(value, dict):
41
+ value = flatten_dict(value)
42
+ for inner_key, inner_value in value.items():
43
+ result[STKey((key,) + inner_key.keys, inner_key.value_is_pickled)] = inner_value
44
+ else:
45
+ pickled = bytearray(pickle.dumps(value))
46
+ pickled_tensor = torch.frombuffer(pickled, dtype=torch.uint8)
47
+ result[STKey((key,), True)] = pickled_tensor
48
+ return result
49
+
50
+
51
+ def unflatten_dict(d: Dict[STKey, torch.Tensor]) -> Dict:
52
+ result: Dict = {}
53
+
54
+ for key, value in d.items():
55
+ if key.value_is_pickled:
56
+ value = pickle.loads(value.numpy().data)
57
+
58
+ target_dict = result
59
+ for k in key.keys[:-1]:
60
+ new_target_dict = target_dict.get(k)
61
+ if new_target_dict is None:
62
+ new_target_dict = {}
63
+ target_dict[k] = new_target_dict
64
+ target_dict = new_target_dict
65
+ target_dict[key.keys[-1]] = value
66
+
67
+ return result
68
+
69
+
70
+ def state_dict_to_safetensors_file(state_dict: Dict, filename: PathOrStr):
71
+ state_dict = flatten_dict(state_dict)
72
+ state_dict = {encode_key(k): v for k, v in state_dict.items()}
73
+ safetensors.torch.save_file(state_dict, filename)
74
+
75
+
76
+ def safetensors_file_to_state_dict(filename: PathOrStr, map_location: Optional[str] = None) -> Dict:
77
+ if map_location is None:
78
+ map_location = "cpu"
79
+ state_dict = safetensors.torch.load_file(filename, device=map_location)
80
+ state_dict = {decode_key(k): v for k, v in state_dict.items()}
81
+ return unflatten_dict(state_dict)
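A short round-trip sketch for the two helpers above (assumes they are imported from this module; the file name is arbitrary):

import torch

nested = {"model": {"w": torch.zeros(2, 2)}, "step": 7}  # non-tensor leaves get pickled into uint8 tensors
state_dict_to_safetensors_file(nested, "example.safetensors")
restored = safetensors_file_to_state_dict("example.safetensors")
assert torch.equal(restored["model"]["w"], nested["model"]["w"]) and restored["step"] == 7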
torch_util.py ADDED
@@ -0,0 +1,158 @@
1
+ import gc
2
+ import os
3
+ from typing import Optional, TypeVar
4
+
5
+ import torch
6
+ import torch.distributed as dist
7
+
8
+ T = TypeVar("T")
9
+
10
+
11
+ def seed_all(seed: int):
12
+ """Seed all rng objects."""
13
+ import random
14
+
15
+ import numpy as np
16
+
17
+ if seed < 0 or seed > 2**32 - 1:
18
+ raise ValueError(f"Seed {seed} is invalid. It must be on [0; 2^32 - 1]")
19
+ random.seed(seed)
20
+ np.random.seed(seed)
21
+ torch.manual_seed(seed)
22
+ # torch.manual_seed may call manual_seed_all but calling it again here
23
+ # to make sure it gets called at least once
24
+ torch.cuda.manual_seed_all(seed)
25
+
26
+
27
+ def is_distributed() -> bool:
28
+ return dist.is_available() and dist.is_initialized()
29
+
30
+
31
+ def get_node_rank() -> int:
32
+ return int(os.environ.get("NODE_RANK") or (get_global_rank() - get_local_rank()) // get_local_world_size())
33
+
34
+
35
+ def get_world_size() -> int:
36
+ if is_distributed():
37
+ return dist.get_world_size()
38
+ else:
39
+ return 1
40
+
41
+
42
+ def get_local_world_size() -> int:
43
+ return int(os.environ.get("LOCAL_WORLD_SIZE") or 1)
44
+
45
+
46
+ def get_global_rank() -> int:
47
+ if is_distributed():
48
+ return int(os.environ.get("RANK") or dist.get_rank())
49
+ else:
50
+ return 0
51
+
52
+
53
+ def get_local_rank() -> int:
54
+ return int(os.environ.get("LOCAL_RANK") or 0)
55
+
56
+
57
+ def get_fs_local_rank() -> int:
58
+ """Get the local rank per filesystem, meaning that, regardless of the number of nodes,
59
+ if all ranks share the same filesystem then `get_fs_local_rank()` will be equivalent to `get_global_rank()`,
60
+ but if nodes do not share the same filesystem then `get_fs_local_rank()` will be equivalent to `get_local_rank()`.
61
+ """
62
+ if os.environ.get("OLMO_SHARED_FS"):
63
+ return int(os.environ.get("FS_LOCAL_RANK") or get_global_rank())
64
+ else:
65
+ return int(os.environ.get("FS_LOCAL_RANK") or get_local_rank())
66
+
67
+
68
+ def move_to_device(o: T, device: torch.device) -> T:
69
+ if isinstance(o, torch.Tensor):
70
+ return o.to(device) # type: ignore[return-value]
71
+ elif isinstance(o, dict):
72
+ return {k: move_to_device(v, device) for k, v in o.items()} # type: ignore[return-value]
73
+ elif isinstance(o, list):
74
+ return [move_to_device(x, device) for x in o] # type: ignore[return-value]
75
+ elif isinstance(o, tuple):
76
+ return tuple((move_to_device(x, device) for x in o)) # type: ignore[return-value]
77
+ else:
78
+ return o
79
+
80
+
81
+ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
82
+ """
83
+ Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
84
+ is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``.
85
+ """
86
+ if check_neg_inf:
87
+ x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min)
88
+ if check_pos_inf:
89
+ x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)
90
+
91
+
92
+ def get_default_device() -> torch.device:
93
+ if torch.cuda.is_available() and torch.cuda.is_initialized():
94
+ return torch.device("cuda")
95
+ else:
96
+ return torch.device("cpu")
97
+
98
+
99
+ def barrier() -> None:
100
+ if is_distributed():
101
+ dist.barrier()
102
+
103
+
104
+ def peak_gpu_memory(reset: bool = False) -> Optional[float]:
105
+ """
106
+ Get the peak GPU memory usage in MB across all ranks.
107
+ Only rank 0 will get the final result.
108
+ """
109
+ if not torch.cuda.is_available():
110
+ return None
111
+
112
+ device = torch.device("cuda")
113
+ peak_mb = torch.cuda.max_memory_allocated(device) / 1000000
114
+ if is_distributed():
115
+ peak_mb_tensor = torch.tensor(peak_mb, device=device)
116
+ dist.reduce(peak_mb_tensor, 0, dist.ReduceOp.MAX)
117
+ peak_mb = peak_mb_tensor.item()
118
+
119
+ if reset:
120
+ # Reset peak stats.
121
+ torch.cuda.reset_max_memory_allocated(device)
122
+
123
+ return peak_mb
124
+
125
+
126
+ V = TypeVar("V", bool, int, float)
127
+
128
+
129
+ def synchronize_value(value: V, device: torch.device) -> V:
130
+ if dist.is_available() and dist.is_initialized():
131
+ value_tensor = torch.tensor(value, device=device)
132
+ dist.broadcast(value_tensor, 0)
133
+ return value_tensor.item() # type: ignore
134
+ else:
135
+ return value
136
+
137
+
138
+ def synchronize_flag(flag: bool, device: torch.device) -> bool:
139
+ return synchronize_value(flag, device)
140
+
141
+
142
+ def gc_cuda():
143
+ gc.collect()
144
+ if torch.cuda.is_available():
145
+ torch.cuda.empty_cache()
146
+
147
+
148
+ def get_cumulative_document_lengths(doc_lens: torch.Tensor) -> torch.Tensor:
149
+ """
150
+ Transform a batched tensor of document lengths into a 1D tensor of cumulative document
151
+ lengths for the whole batch.
152
+ """
153
+ return torch.cat(
154
+ [
155
+ torch.tensor([0], dtype=torch.int32, device=doc_lens.device),
156
+ torch.cumsum(doc_lens.masked_select(doc_lens != 0), 0, dtype=torch.int32),
157
+ ]
158
+ )
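A small usage sketch for `get_cumulative_document_lengths` (illustrative values):

import torch

doc_lens = torch.tensor([[3, 5, 0], [2, 4, 6]])  # zero entries are padding and get dropped
get_cumulative_document_lengths(doc_lens)
# -> tensor([ 0,  3,  8, 10, 14, 20], dtype=torch.int32)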
train.py ADDED
@@ -0,0 +1,1384 @@
1
+ from __future__ import annotations
2
+
3
+ import cProfile
4
+ import functools
5
+ import gc
6
+ import logging
7
+ import math
8
+ import os
9
+ import random
10
+ import shutil
11
+ import time
12
+ from collections import deque
13
+ from contextlib import nullcontext
14
+ from dataclasses import dataclass, field
15
+ from itertools import islice
16
+ from pathlib import Path
17
+ from pstats import SortKey
18
+ from typing import Any, Callable, Deque, Dict, List, Optional, TextIO, Tuple, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+ import torch.distributed as dist
23
+ import torch.nn.functional as F
24
+ import torch.utils
25
+ import torch.utils.hooks
26
+ import wandb
27
+ from packaging import version
28
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
29
+ from torch.nn.parallel import DistributedDataParallel as DDP
30
+ from torch.utils.data import DataLoader
31
+
32
+ from .aliases import PathOrStr
33
+ from .checkpoint import Checkpointer, FullCheckpointer, build_sharded_checkpointer
34
+ from .config import (
35
+ CheckpointType,
36
+ DDPGradSyncMode,
37
+ DistributedStrategy,
38
+ SchedulerUnits,
39
+ ShardedCheckpointerType,
40
+ SpeedMonitorConfig,
41
+ TrainConfig,
42
+ )
43
+ from .data import IterableDataset
44
+ from .eval import Evaluator
45
+ from .exceptions import OLMoConfigurationError
46
+ from .model import OLMo
47
+ from .optim import Optimizer, Scheduler
48
+ from .torch_util import (
49
+ barrier,
50
+ gc_cuda,
51
+ get_fs_local_rank,
52
+ get_global_rank,
53
+ get_world_size,
54
+ move_to_device,
55
+ peak_gpu_memory,
56
+ synchronize_flag,
57
+ synchronize_value,
58
+ )
59
+ from .util import upload
60
+
61
+ __all__ = ["SpeedMonitor", "LRMonitor", "Trainer"]
62
+
63
+ log = logging.getLogger(__name__)
64
+
65
+
66
+ @dataclass
67
+ class SpeedMonitor:
68
+ cfg: SpeedMonitorConfig
69
+ start_times: Deque[float] = field(default_factory=lambda: deque([]))
70
+ global_total_tokens: int = 0
71
+ total_training_Gflops: float = 0
72
+ device_interval_tokens: Deque[int] = field(default_factory=lambda: deque([]))
73
+
74
+ def batch_start(
75
+ self,
76
+ global_total_tokens: int,
77
+ device_batch_num_tokens: int,
78
+ num_fwd_flops: int,
79
+ num_bck_flops: int,
80
+ record: bool = True,
81
+ ) -> None:
82
+ self.global_total_tokens = global_total_tokens
83
+ # num_fwd_flops and num_bck_flops from the OLMo model compute FLOPs per token
84
+ # converting to GFLOPs here prevents numerical issues while logging
85
+ self.total_training_Gflops = (num_fwd_flops + num_bck_flops) * global_total_tokens / 1e9
86
+
87
+ if record:
88
+ if len(self.start_times) >= self.cfg.window_size:
89
+ self.start_times.popleft()
90
+ self.device_interval_tokens.popleft()
91
+ self.start_times.append(time.monotonic())
92
+ self.device_interval_tokens.append(device_batch_num_tokens)
93
+
94
+ def reset(self) -> None:
95
+ self.start_times.clear()
96
+ self.device_interval_tokens.clear()
97
+
98
+ def check(self) -> Dict[str, float]:
99
+ metrics: Dict[str, float] = {"throughput/total_tokens": self.global_total_tokens}
100
+
101
+ # plot flops related metrics
102
+ metrics["throughput/total_training_Gflops"] = self.total_training_Gflops
103
+ metrics["throughput/total_training_log_Gflops"] = math.log(self.total_training_Gflops)
104
+
105
+ if self.start_times:
106
+ interval_seconds = time.monotonic() - self.start_times[0]
107
+ interval_batches = len(self.start_times)
108
+ interval_tokens = sum(self.device_interval_tokens)
109
+ metrics["throughput/device/tokens_per_second"] = interval_tokens / interval_seconds
110
+ metrics["throughput/device/batches_per_second"] = interval_batches / interval_seconds
111
+ return metrics
112
+
113
+
114
+ @dataclass
115
+ class LRMonitor:
116
+ optim: torch.optim.Optimizer
117
+
118
+ def check(self) -> Dict[str, float]:
119
+ lrs = [group["lr"] for group in self.optim.param_groups]
120
+ return {f"optim/learning_rate_group{idx}": lr for idx, lr in enumerate(lrs)}
121
+
122
+
123
+ def cross_entropy_loss(
124
+ logits,
125
+ labels,
126
+ ignore_index: int = -100,
127
+ reduction: str = "mean",
128
+ compute_z_loss: bool = False,
129
+ z_loss_multiplier: float = 1e-4,
130
+ ):
131
+ loss = F.cross_entropy(logits, labels, ignore_index=ignore_index, reduction=reduction)
132
+
133
+ if not compute_z_loss:
134
+ return loss, None
135
+
136
+ z_squared = logits.logsumexp(-1).pow(2)
137
+ if reduction == "mean":
138
+ z_squared = (z_squared * (labels != ignore_index)).mean()
139
+ elif reduction == "sum":
140
+ z_squared = (z_squared * (labels != ignore_index)).sum()
141
+
142
+ z_loss = z_loss_multiplier * z_squared
143
+
144
+ return loss, z_loss
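# A tiny illustration of the z-loss term above (hypothetical shapes): the auxiliary
# loss penalizes large log-partition values, nudging the logits toward staying
# normalized; the combined objective is the CE loss plus the z-loss when enabled.
_example_logits = torch.randn(4, 32)           # 4 positions over a 32-way vocabulary
_example_labels = torch.randint(0, 32, (4,))
_example_ce, _example_z = cross_entropy_loss(_example_logits, _example_labels, compute_z_loss=True)
_example_total = _example_ce + _example_z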
145
+
146
+
147
+ fused_loss_fn: Optional[Callable]
148
+
149
+ try:
150
+ import flash_attn
151
+ from flash_attn.ops.triton.cross_entropy import (
152
+ cross_entropy_loss as flash_cross_entropy_loss, # type: ignore
153
+ )
154
+
155
+ def fused_loss_fn(
156
+ logits,
157
+ labels,
158
+ ignore_index: int = -100,
159
+ reduction: str = "mean",
160
+ compute_z_loss: bool = False,
161
+ z_loss_multiplier: float = 1e-4,
162
+ ):
163
+ # The `ignored_index` parameter of `cross_entropy_loss` was changed to `ignore_index` in v2.5.8 with commit https://github.com/Dao-AILab/flash-attention/commit/ec6d22143b5d375e253b2ebfc563b26a43f43684
164
+ ce_loss_use_ignore_index_param = version.parse(flash_attn.__version__) >= version.parse("2.5.8")
165
+
166
+ if ce_loss_use_ignore_index_param:
167
+ ignore_index_kwarg = {"ignore_index": ignore_index}
168
+ else:
169
+ ignore_index_kwarg = {"ignored_index": ignore_index}
170
+
171
+ loss, z_loss = flash_cross_entropy_loss(
172
+ logits,
173
+ labels,
174
+ label_smoothing=0.0,
175
+ logit_scale=1.0,
176
+ lse_square_scale=z_loss_multiplier,
177
+ inplace_backward=False,
178
+ process_group=None,
179
+ **ignore_index_kwarg,
180
+ )
181
+
182
+ mask = labels != ignore_index
183
+
184
+ if reduction == "mean":
185
+ loss = loss.sum() / mask.sum()
186
+ elif reduction == "sum":
187
+ loss = loss.sum()
188
+ else:
189
+ loss = loss
190
+
191
+ if not compute_z_loss:
192
+ return loss, None
193
+
194
+ if reduction == "mean":
195
+ z_loss = z_loss.sum() / mask.sum()
196
+ elif reduction == "sum":
197
+ z_loss = z_loss.sum()
198
+ else:
199
+ z_loss = z_loss
200
+
201
+ return loss, z_loss
202
+
203
+ except ImportError:
204
+ fused_loss_fn = None
205
+
206
+
207
+ @dataclass
208
+ class Trainer:
209
+ cfg: TrainConfig
210
+ model: OLMo
211
+ dist_model: Union[DDP, FSDP]
212
+ optim: Optimizer
213
+ scheduler: Scheduler
214
+ train_loader: DataLoader
215
+ device: torch.device
216
+ evaluators: List[Evaluator]
217
+ epoch: Optional[int] = None
218
+ global_step: int = 0
219
+ global_train_examples_seen_this_epoch: int = 0
220
+ """Tracks the global number of training examples seen in the current epoch for the purpose of restoring
221
+ the data loader position on restarts."""
222
+ global_train_tokens_seen: int = 0
223
+ """Tracks the global total number of tokens trained on."""
224
+ checkpoints: List[Path] = field(default_factory=list)
225
+ unsharded_checkpoints: List[Path] = field(default_factory=list)
226
+ ephemeral_checkpoints: List[Path] = field(default_factory=list)
227
+ min_train_loss: float = float("inf")
228
+ cur_train_loss: float = float("inf")
229
+ indices_file: Optional[TextIO] = None
230
+ _start_time: float = 0.0
231
+ _gc_init_state: bool = True
232
+ loss_fn: Callable[..., torch.Tensor] = field(default_factory=lambda: cross_entropy_loss) # type: ignore
233
+ last_sharded_checkpoint_step: Optional[int] = None
234
+ last_unsharded_checkpoint_step: Optional[int] = None
235
+
236
+ def __post_init__(self):
237
+ if self.cfg.fused_loss:
238
+ if fused_loss_fn is not None:
239
+ self.loss_fn = fused_loss_fn
240
+ else:
241
+ raise NameError("`fused_loss_fn` is not defined. Please ensure that `flash_attn` is installed.")
242
+
243
+ @property
244
+ def dataset(self) -> IterableDataset:
245
+ assert isinstance(self.train_loader.dataset, IterableDataset)
246
+ return self.train_loader.dataset
247
+
248
+ @property
249
+ def tokens_per_batch(self) -> int:
250
+ return self.cfg.global_train_batch_size * self.cfg.model.max_sequence_length
251
+
252
+ @property
253
+ def batches_per_epoch(self) -> int:
254
+ return self.dataset.total_size // self.cfg.global_train_batch_size
255
+
256
+ @property
257
+ def max_epochs(self) -> int:
258
+ return math.ceil(self.max_steps / self.batches_per_epoch)
259
+
260
+ @property
261
+ def max_steps(self) -> int:
262
+ if isinstance(self.cfg.max_duration, int):
263
+ return self.cfg.max_duration
264
+ elif isinstance(self.cfg.max_duration, str):
265
+ if self.cfg.max_duration.endswith("T"):
266
+ # convert to float *first* to handle scientific notation
267
+ max_tokens = int(float(self.cfg.max_duration[:-1].strip()))
268
+ tokens_remaining = max(max_tokens - self.global_train_tokens_seen, 0)
269
+ steps_remaining = math.ceil(tokens_remaining / self.tokens_per_batch)
270
+ return self.global_step + steps_remaining
271
+ elif self.cfg.max_duration.endswith("ep"):
272
+ max_epochs = int(self.cfg.max_duration[:-2].strip())
273
+ return max_epochs * self.batches_per_epoch
274
+ else:
275
+ # convert to float *first* to handle scientific notation
276
+ return int(float(self.cfg.max_duration))
277
+ else:
278
+ raise TypeError(f"expected int or str for 'max_duration', found {type(self.cfg.max_duration)}")
279
+
280
+ @property
281
+ def max_tokens(self) -> int:
282
+ if isinstance(self.cfg.max_duration, int):
283
+ return (
284
+ self.global_train_tokens_seen
285
+ + max(self.cfg.max_duration - self.global_step, 0) * self.tokens_per_batch
286
+ )
287
+ elif isinstance(self.cfg.max_duration, str):
288
+ if self.cfg.max_duration.endswith("T"):
289
+ # convert to float *first* to handle scientific notation
290
+ return int(float(self.cfg.max_duration[:-1].strip()))
291
+ elif self.cfg.max_duration.endswith("ep"):
292
+ max_epochs = int(self.cfg.max_duration[:-2].strip())
293
+ return max_epochs * self.batches_per_epoch * self.tokens_per_batch
294
+ else:
295
+ # convert to float *first* to handle scientific notation
296
+ return (
297
+ self.global_train_tokens_seen
298
+ + max(int(float(self.cfg.max_duration)) - self.global_step, 0) * self.tokens_per_batch
299
+ )
300
+ else:
301
+ raise TypeError(f"expected int or str for 'max_duration', found {type(self.cfg.max_duration)}")
302
+
303
+ @property
304
+ def scheduler_current(self) -> int:
305
+ if self.cfg.scheduler.units == SchedulerUnits.steps:
306
+ return self.global_step
307
+ elif self.cfg.scheduler.units == SchedulerUnits.tokens:
308
+ return self.global_train_tokens_seen
309
+ else:
310
+ raise NotImplementedError(self.cfg.scheduler.units)
311
+
312
+ @property
313
+ def scheduler_max(self) -> int:
314
+ if self.cfg.scheduler.units == SchedulerUnits.steps:
315
+ return self.max_steps
316
+ elif self.cfg.scheduler.units == SchedulerUnits.tokens:
317
+ return self.max_tokens
318
+ else:
319
+ raise NotImplementedError(self.cfg.scheduler.units)
320
+
321
+ def trainer_state_dict(self) -> Dict[str, Any]:
322
+ return {
323
+ "epoch": self.epoch or 0,
324
+ "global_step": self.global_step,
325
+ "global_train_examples_seen_this_epoch": self.global_train_examples_seen_this_epoch,
326
+ "global_train_tokens_seen": self.global_train_tokens_seen,
327
+ "world_size": get_world_size(),
328
+ "checkpoints": self.checkpoints,
329
+ "unsharded_checkpoints": self.unsharded_checkpoints,
330
+ "ephemeral_checkpoints": self.ephemeral_checkpoints,
331
+ "rng": {
332
+ "python": random.getstate(),
333
+ "numpy": np.random.get_state(),
334
+ "torch": torch.random.get_rng_state(),
335
+ "cuda": torch.cuda.get_rng_state(),
336
+ },
337
+ }
338
+
339
+ def load_trainer_state_dict(self, state_dict: Dict[str, Any]) -> None:
340
+ # Checkpoint paths.
341
+ self.checkpoints = [
342
+ path
343
+ for path in state_dict["checkpoints"]
344
+ if path.is_dir() and path.resolve().parent == Path(self.cfg.save_folder).resolve()
345
+ ]
346
+ self.unsharded_checkpoints = [
347
+ path
348
+ for path in state_dict["unsharded_checkpoints"]
349
+ if path.is_dir() and path.resolve().parent == Path(self.cfg.save_folder).resolve()
350
+ ]
351
+ self.ephemeral_checkpoints = [
352
+ path
353
+ for path in state_dict.get("ephemeral_checkpoints", [])
354
+ if path.is_dir() and path.resolve().parent == Path(self.cfg.save_folder).resolve()
355
+ ]
356
+
357
+ # Dataset / dataloader position.
358
+ checkpoint_epoch = state_dict.get("epoch") or 0
359
+ self.global_step = state_dict["global_step"]
360
+ self.global_train_examples_seen_this_epoch = state_dict.get(
361
+ "global_train_examples_seen_this_epoch",
362
+ state_dict.get( # for backwards compatibility
363
+ "global_train_examples_seen",
364
+ state_dict.get("global_data_step", self.global_step) * self.cfg.global_train_batch_size,
365
+ ),
366
+ )
367
+ self.global_train_tokens_seen = state_dict.get(
368
+ "global_train_tokens_seen",
369
+ state_dict.get("global_data_step", self.global_step) # for backwards compatibility
370
+ * self.cfg.global_train_batch_size
371
+ * self.cfg.model.max_sequence_length,
372
+ )
373
+
374
+ if not self.cfg.restore_dataloader:
375
+ self.epoch = 0
376
+ self.global_step = 0
377
+ self.global_train_tokens_seen = 0
378
+ self.global_train_examples_seen_this_epoch = 0
379
+ elif self.epoch is None:
380
+ self.epoch = checkpoint_epoch
381
+ elif checkpoint_epoch != self.epoch:
382
+ log.info(f"Starting new epoch (epoch = {self.epoch})")
383
+ self.global_train_examples_seen_this_epoch = 0
384
+
385
+ assert self.epoch is not None
386
+ # Reshuffle dataset if needed.
387
+ if self.dataset.epoch != self.epoch:
388
+ log.info(f"Reshuffling data loader for epoch {self.epoch}...")
389
+ self.dataset.reshuffle(self.epoch)
390
+
391
+ if self.cfg.fast_forward_batches:
392
+ log.info(f"Fast-forwarding data loader by {self.cfg.fast_forward_batches:,d} steps")
393
+ # Technically we don't "see" these batches that we fast-forward through, but we use
394
+ # this variable to update the position of the dataset so we need to include them here.
395
+ self.global_train_examples_seen_this_epoch += (
396
+ self.cfg.fast_forward_batches * self.cfg.global_train_batch_size
397
+ )
398
+ # NOTE: on the other hand we don't add anything to 'self.global_train_tokens_seen' here because
399
+ # that variable is meant to track the actual number of tokens trained on.
400
+
401
+ if self.global_train_examples_seen_this_epoch > 0:
402
+ assert isinstance(self.dataset, IterableDataset)
403
+ log.info(f"Data loader will start at instance index {self.global_train_examples_seen_this_epoch:,d}")
404
+ self.dataset.start_index = self.global_train_examples_seen_this_epoch
405
+
406
+ # Reset learning rate and weight decay to the values from the config, not the checkpoint.
407
+ log.info("Resetting learning rate...")
408
+ new_learning_rate = self.scheduler.get_lr(
409
+ self.cfg.optimizer.learning_rate, self.scheduler_current, self.scheduler_max
410
+ )
411
+ for group in self.optim.param_groups:
412
+ group["lr"] = new_learning_rate
413
+ group["initial_lr"] = self.cfg.optimizer.learning_rate
414
+ if "weight_decay" in group and group["weight_decay"] > 0.0:
415
+ group["weight_decay"] = self.cfg.optimizer.weight_decay
416
+
417
+ # RNG states.
418
+ if "rng" in state_dict and state_dict.get("world_size", get_world_size()) == get_world_size():
419
+ log.info("Restoring RNG states...")
420
+ rng_state = state_dict["rng"]
421
+ self.restore_rng_state(rng_state)
422
+ else:
423
+ log.warning(
424
+ "Trainer will not restore RNG states since the RNG states in the checkpoint are missing or invalid. "
425
+ "This typically happens when restoring from an unsharded checkpoint or a checkpoint that was saved "
426
+ "with a different world size. If that's the case you can safely ignore this warning."
427
+ )
428
+
429
+ def restore_rng_state(self, rng_state: Dict[str, Any]) -> None:
430
+ random.setstate(rng_state["python"])
431
+ np.random.set_state(rng_state["numpy"])
432
+ torch.set_rng_state(rng_state["torch"])
433
+ torch.cuda.set_rng_state(rng_state["cuda"])
434
+
435
+ def _save_checkpoint(
436
+ self, checkpointer: Checkpointer, checkpoint_type: CheckpointType
437
+ ) -> Tuple[PathOrStr, Optional[PathOrStr]]:
438
+ if checkpoint_type == CheckpointType.sharded:
439
+ suffix = ""
440
+ current_checkpoints = self.checkpoints
441
+ link_latest = get_fs_local_rank() == 0
442
+ num_checkpoints_to_keep = self.cfg.save_num_checkpoints_to_keep
443
+ elif checkpoint_type == CheckpointType.unsharded:
444
+ suffix = "-unsharded"
445
+ current_checkpoints = self.unsharded_checkpoints
446
+ link_latest = get_global_rank() == 0
447
+ num_checkpoints_to_keep = self.cfg.save_num_unsharded_checkpoints_to_keep
448
+ elif checkpoint_type == CheckpointType.sharded_ephemeral:
449
+ suffix = ""
450
+ current_checkpoints = self.ephemeral_checkpoints
451
+ link_latest = get_fs_local_rank() == 0
452
+ num_checkpoints_to_keep = 1
453
+ else:
454
+ raise NotImplementedError(checkpoint_type)
455
+
456
+ # Zero-gradients to avoid gathering them.
457
+ self.optim.zero_grad(set_to_none=True)
458
+
459
+ # Flush data indices file.
460
+ # TODO: upload the indices files?
461
+ if self.indices_file is not None:
462
+ self.indices_file.flush()
463
+
464
+ checkpoint_dir = Path(self.cfg.save_folder) / f"step{self.global_step}{suffix}"
465
+ remote_checkpoint_dir: Optional[str] = None
466
+ if self.cfg.remote_save_folder is not None:
467
+ remote_checkpoint_dir = f"{self.cfg.remote_save_folder.rstrip('/')}/{checkpoint_dir.name}"
468
+ current_checkpoints.append(checkpoint_dir)
469
+
470
+ # Save the checkpoint.
471
+ try:
472
+ checkpointer.save_checkpoint(
473
+ checkpoint_dir,
474
+ self.dist_model,
475
+ self.optim,
476
+ self.trainer_state_dict(),
477
+ upload_to=remote_checkpoint_dir,
478
+ )
479
+ except FileExistsError:
480
+ raise OLMoConfigurationError(
481
+ f"Checkpoint for step {self.global_step} already exists, use --save_overwrite to overwrite it"
482
+ )
483
+
484
+ if link_latest:
485
+ # Link to 'latest'.
486
+ latest_path = Path(self.cfg.save_folder) / f"latest{suffix}"
487
+ latest_path.unlink(missing_ok=True)
488
+ try:
489
+ latest_path.symlink_to(checkpoint_dir.name, target_is_directory=True)
490
+ except FileExistsError:
491
+ # Same as above, caught when another (file-system) local rank 0 has already made the 'latest' symlink.
492
+ # This can happen when nodes are saving to a common NFS drive but otherwise have distinct
493
+ # file-systems.
494
+ if latest_path.resolve().name != checkpoint_dir.name:
495
+ raise
496
+
497
+ # Remove old checkpoints.
498
+ # For DDP, checkpoint_type being passed to remove_checkpoint is always `unsharded`.
499
+ if num_checkpoints_to_keep > 0:
500
+ while len(current_checkpoints) > num_checkpoints_to_keep:
501
+ self.remove_checkpoint(0, checkpoint_type)
502
+
503
+ barrier()
504
+
505
+ if remote_checkpoint_dir is not None:
506
+ return remote_checkpoint_dir, checkpoint_dir
507
+ else:
508
+ return checkpoint_dir, None
509
+
510
+ def save_sharded_checkpoint(self) -> Tuple[PathOrStr, Optional[PathOrStr]]:
511
+ checkpointer = build_sharded_checkpointer(self.cfg)
512
+ result = self._save_checkpoint(checkpointer, CheckpointType.sharded)
513
+ self.last_sharded_checkpoint_step = self.global_step
514
+ return result
515
+
516
+ def save_ephemeral_checkpoint(self) -> Tuple[PathOrStr, Optional[PathOrStr]]:
517
+ checkpointer = build_sharded_checkpointer(self.cfg)
518
+ result = self._save_checkpoint(checkpointer, CheckpointType.sharded_ephemeral)
519
+ self.last_sharded_checkpoint_step = self.global_step
520
+ return result
521
+
522
+ def _remove_sharded_checkpoint(self, idx: int, checkpoints: List[Path]):
523
+ oldest_checkpoint = checkpoints.pop(idx)
524
+ barrier()
525
+ if get_fs_local_rank() == 0 and oldest_checkpoint.is_dir():
526
+ shutil.rmtree(oldest_checkpoint, ignore_errors=True)
527
+ latest_path = Path(self.cfg.save_folder) / "latest"
528
+ if latest_path.resolve() == oldest_checkpoint.resolve():
529
+ latest_path.unlink()
530
+ barrier()
531
+
532
+ def remove_sharded_checkpoint(self, idx: int = 0):
533
+ self._remove_sharded_checkpoint(idx, self.checkpoints)
534
+
535
+ def remove_ephemeral_checkpoint(self, idx: int = 0):
536
+ self._remove_sharded_checkpoint(idx, self.ephemeral_checkpoints)
537
+
538
+ def restore_sharded_checkpoint(
539
+ self,
540
+ load_path: PathOrStr,
541
+ local_cache: Optional[PathOrStr] = None,
542
+ *,
543
+ load_optimizer_state: bool = True,
544
+ load_trainer_state: bool = True,
545
+ sharded_checkpointer: Optional[ShardedCheckpointerType] = None,
546
+ ):
547
+ # Zero-gradients to avoid gathering them.
548
+ self.optim.zero_grad(set_to_none=True)
549
+ checkpointer = build_sharded_checkpointer(self.cfg, name=sharded_checkpointer)
550
+ trainer_state = checkpointer.restore_checkpoint(
551
+ load_path,
552
+ self.dist_model,
553
+ self.optim,
554
+ local_cache=local_cache,
555
+ load_optimizer_state=load_optimizer_state,
556
+ )
557
+ if load_trainer_state:
558
+ self.load_trainer_state_dict(trainer_state)
559
+ barrier()
560
+
561
+ def save_unsharded_checkpoint(self) -> Tuple[PathOrStr, Optional[PathOrStr]]:
562
+ checkpointer = FullCheckpointer(self.cfg)
563
+ result = self._save_checkpoint(checkpointer, CheckpointType.unsharded)
564
+ self.last_unsharded_checkpoint_step = self.global_step
565
+ return result
566
+
567
+ def remove_unsharded_checkpoint(self, idx: int = 0):
568
+ barrier()
569
+ oldest_checkpoint = self.unsharded_checkpoints.pop(idx)
570
+ if get_global_rank() == 0 and oldest_checkpoint.is_dir():
571
+ shutil.rmtree(oldest_checkpoint, ignore_errors=True)
572
+ latest_path = Path(self.cfg.save_folder) / "latest-unsharded"
573
+ if latest_path.resolve() == oldest_checkpoint.resolve():
574
+ latest_path.unlink()
575
+ barrier()
576
+
577
+ def restore_unsharded_checkpoint(
578
+ self,
579
+ load_path: PathOrStr,
580
+ local_cache: Optional[PathOrStr] = None,
581
+ *,
582
+ load_optimizer_state: bool = True,
583
+ load_trainer_state: bool = True,
584
+ ):
585
+ # Zero-gradients to avoid gathering them.
586
+ self.optim.zero_grad(set_to_none=True)
587
+ checkpointer = FullCheckpointer(self.cfg)
588
+ trainer_state = checkpointer.restore_checkpoint(
589
+ load_path,
590
+ self.dist_model,
591
+ self.optim,
592
+ local_cache=local_cache,
593
+ load_optimizer_state=load_optimizer_state,
594
+ )
595
+ if load_trainer_state:
596
+ self.load_trainer_state_dict(trainer_state)
597
+ barrier()
598
+
599
+ def save_checkpoint(
600
+ self, checkpoint_type: CheckpointType = CheckpointType.sharded
601
+ ) -> Tuple[PathOrStr, Optional[PathOrStr]]:
602
+ result: Tuple[PathOrStr, Optional[PathOrStr]]
603
+ if checkpoint_type == CheckpointType.sharded:
604
+ result = self.save_sharded_checkpoint()
605
+ elif checkpoint_type == CheckpointType.unsharded:
606
+ result = self.save_unsharded_checkpoint()
607
+ elif checkpoint_type == CheckpointType.sharded_ephemeral:
608
+ result = self.save_ephemeral_checkpoint()
609
+ else:
610
+ raise NotImplementedError(checkpoint_type)
611
+
612
+ gc_cuda()
613
+ return result
614
+
615
+ def restore_checkpoint(
616
+ self,
617
+ load_path: PathOrStr,
618
+ *,
619
+ checkpoint_type: Optional[CheckpointType] = None,
620
+ local_cache: Optional[PathOrStr] = None,
621
+ load_optimizer_state: bool = True,
622
+ load_trainer_state: bool = True,
623
+ sharded_checkpointer: Optional[ShardedCheckpointerType] = None,
624
+ ):
625
+ if checkpoint_type == CheckpointType.unsharded or (
626
+ checkpoint_type is None and str(load_path).rstrip("/").endswith("-unsharded")
627
+ ):
628
+ self.restore_unsharded_checkpoint(
629
+ load_path,
630
+ local_cache=local_cache,
631
+ load_optimizer_state=load_optimizer_state,
632
+ load_trainer_state=load_trainer_state,
633
+ )
634
+ elif checkpoint_type == CheckpointType.sharded or checkpoint_type is None:
635
+ self.restore_sharded_checkpoint(
636
+ load_path,
637
+ local_cache=local_cache,
638
+ load_optimizer_state=load_optimizer_state,
639
+ load_trainer_state=load_trainer_state,
640
+ sharded_checkpointer=sharded_checkpointer,
641
+ )
642
+ elif checkpoint_type is not None:
643
+ raise NotImplementedError(checkpoint_type)
644
+
645
+ gc_cuda()
646
+
647
+ def remove_checkpoint(self, idx: int = 0, checkpoint_type: CheckpointType = CheckpointType.sharded):
648
+ if checkpoint_type == CheckpointType.sharded:
649
+ self.remove_sharded_checkpoint(idx=idx)
650
+ elif checkpoint_type == CheckpointType.unsharded:
651
+ self.remove_unsharded_checkpoint(idx=idx)
652
+ elif checkpoint_type == CheckpointType.sharded_ephemeral:
653
+ self.remove_ephemeral_checkpoint(idx=idx)
654
+ else:
655
+ raise NotImplementedError(checkpoint_type)
656
+
657
+ def _setup_module_output_save_hooks(self, micro_batch_idx: int) -> List[torch.utils.hooks.RemovableHandle]:
658
+ if (
659
+ self.cfg.module_outputs_save_steps is None
660
+ or self.global_step not in self.cfg.module_outputs_save_steps
661
+ ):
662
+ return []
663
+
664
+ if micro_batch_idx != 0 or get_global_rank() != 0:
665
+ # Hook is currently only used on the first microbatch of rank 0
666
+ return []
667
+
668
+ trace_save_folder = Path(self.cfg.save_folder) / f"traces/step{self.global_step}"
669
+ if trace_save_folder.exists():
670
+ if self.cfg.save_overwrite:
671
+ shutil.rmtree(trace_save_folder)
672
+ else:
673
+ raise OLMoConfigurationError(
674
+ f"Attempting to overwrite traces at step {self.global_step} without --save_overwrite"
675
+ )
676
+ trace_save_folder.mkdir(parents=True)
677
+
678
+ def trace_outputs_hook(
679
+ module_name: str, _: torch.nn.Module, args: Tuple[torch.Tensor, ...], output: torch.Tensor
680
+ ) -> None:
681
+ if len(args) == 0:
682
+ log.info("No input args for module %s, output %s", module_name, output)
683
+
684
+ module_input = args[0] if len(args) > 0 else torch.tensor(())
685
+ trace_save_folder = Path(self.cfg.save_folder) / f"traces/step{self.global_step}"
686
+ trace_save_folder.mkdir(parents=True, exist_ok=True)
687
+
688
+ module_occurence_num = 0
689
+ while (
690
+ module_input_filepath := trace_save_folder / f"{module_name}_{module_occurence_num}_input.pt"
691
+ ).exists():
692
+ module_occurence_num += 1
693
+ torch.save(module_input, module_input_filepath)
694
+
695
+ module_output_filepath = trace_save_folder / f"{module_name}_{module_occurence_num}_output.pt"
696
+ torch.save(output, module_output_filepath)
697
+
698
+ output_hooks = []
699
+ for module_name, module in self.model.named_modules(prefix="model"):
700
+ output_hooks.append(module.register_forward_hook(functools.partial(trace_outputs_hook, module_name)))
701
+
702
+ return output_hooks
703
+
704
+ def get_labels(self, batch: Dict[str, Any]) -> torch.Tensor:
705
+ # Labels are just input IDs shifted to the left (first item is ignored).
706
+ labels, label_mask, attention_mask, instance_mask = (
707
+ batch["input_ids"].clone(),
708
+ batch.get("label_mask"),
709
+ batch.get("attention_mask"),
710
+ batch.get("instance_mask"),
711
+ )
712
+ if label_mask is not None:
713
+ labels.masked_fill_(~label_mask, -100)
714
+ if attention_mask is not None:
715
+ labels.masked_fill_(attention_mask == 0.0, -100)
716
+ if instance_mask is not None:
717
+ labels.masked_fill_(~instance_mask.unsqueeze(-1), value=-100)
718
+ return labels[..., 1:].contiguous()
719
+
720
+ def model_forward(
721
+ self, batch: Dict[str, Any], loss_reduction: str = "mean", compute_z_loss: bool = False
722
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
723
+ # shape: (batch_size, seq_len, vocab_size)
724
+ logits = self.dist_model(
725
+ input_ids=batch["input_ids"],
726
+ attention_mask=batch.get("attention_mask"),
727
+ attention_bias=batch.get("attention_bias"),
728
+ doc_lens=batch.get("doc_lens"),
729
+ max_doc_lens=batch.get("max_doc_lens"),
730
+ ).logits
731
+ logits_for_loss = logits[..., :-1, :].contiguous()
732
+ # shape: (batch_size * seq_len, vocab_size)
733
+ logits_for_loss = logits_for_loss.view(-1, logits_for_loss.size(-1))
734
+ # shape: (batch_size, seq_len)
735
+ labels = self.get_labels(batch)
736
+ # shape: (batch_size * seq_len,)
737
+ labels = labels.view(-1)
738
+ ce_loss, z_loss = self.loss_fn(
739
+ logits_for_loss, labels, ignore_index=-100, reduction=loss_reduction, compute_z_loss=compute_z_loss
740
+ )
741
+ if loss_reduction == "none":
742
+ # Reshape (batch_size * seq_len,) -> (batch_size, seq_len)
743
+ ce_loss = ce_loss.view(batch["input_ids"].shape[0], -1)
744
+ if z_loss is not None:
745
+ z_loss = z_loss.view(batch["input_ids"].shape[0], -1)
746
+ return ce_loss, z_loss, logits
747
+
748
+ def train_micro_batch(
749
+ self, micro_batch: Dict[str, Any], batch_size_in_tokens: int
750
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
751
+ ce_loss, z_loss, logits = self.model_forward(
752
+ micro_batch, compute_z_loss=self.cfg.softmax_auxiliary_loss, loss_reduction="sum"
753
+ )
754
+ ce_loss = ce_loss / batch_size_in_tokens
755
+
756
+ # In case this helps with memory utilization.
757
+ del micro_batch
758
+
759
+ # Get loss to optimize for.
760
+ if self.cfg.softmax_auxiliary_loss:
761
+ assert z_loss is not None
762
+ z_loss = z_loss / batch_size_in_tokens
763
+ loss = ce_loss + z_loss
764
+ else:
765
+ loss = ce_loss
766
+
767
+ del logits
768
+
769
+ return loss, ce_loss, z_loss
770
+
771
+ def train_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
772
+ # Split into micro-batches.
773
+ micro_batches = self.split_batch(batch)
774
+ batch_size_in_tokens = batch["input_ids"].numel()
775
+
776
+ # In case this helps with memory utilization.
777
+ del batch
778
+
779
+ ce_batch_loss = torch.tensor(0.0, device=self.device)
780
+ z_batch_loss = None if not self.cfg.softmax_auxiliary_loss else torch.tensor(0.0, device=self.device)
781
+ num_micro_batches = len(micro_batches)
782
+
783
+ for micro_batch_idx, micro_batch in enumerate(micro_batches):
784
+ # setup sync context for DDP for all micro-batches except the last
785
+ grad_sync_context = nullcontext
786
+ if (
787
+ self.cfg.distributed_strategy == DistributedStrategy.ddp
788
+ and self.cfg.ddp is not None
789
+ and self.cfg.ddp.grad_sync_mode == DDPGradSyncMode.batch
790
+ ):
791
+ if micro_batch_idx != num_micro_batches - 1:
792
+ grad_sync_context = self.dist_model.no_sync
793
+
794
+ # Register output hooks
795
+ output_hooks: List[torch.utils.hooks.RemovableHandle] = []
796
+ output_hooks += self._setup_module_output_save_hooks(micro_batch_idx)
797
+
798
+ with grad_sync_context():
799
+ with torch.autocast("cuda", enabled=True, dtype=self.cfg.autocast_precision):
800
+ # Run forward pass.
801
+ loss, ce_loss, z_loss = self.train_micro_batch(micro_batch, batch_size_in_tokens)
802
+
803
+ # Update overall CE batch loss.
804
+ ce_batch_loss += ce_loss.detach()
805
+
806
+ # Update overall Z batch loss.
807
+ if z_loss is not None:
808
+ assert z_batch_loss is not None
809
+ z_batch_loss += z_loss.detach()
810
+
811
+ # Run backward pass.
812
+ loss.backward()
813
+
814
+ # Remove output hooks
815
+ for hook in output_hooks:
816
+ hook.remove()
817
+
818
+ return ce_batch_loss, z_batch_loss
819
+
820
+ def train_step(self, batch: Dict[str, Any], reduce_global_loss: bool = True) -> Dict[str, float]:
821
+ metrics: Dict[str, float] = {}
822
+
823
+ # Write data-indices to file.
824
+ if self.indices_file is not None and "index" in batch:
825
+ indices = "\t".join(str(int(i)) for i in batch["index"])
826
+ self.indices_file.write(f"{self.global_step}\t{indices}\n")
827
+
828
+ # Record how many instances are going to be skipped (masked out).
829
+ if (instance_mask := batch.get("instance_mask")) is not None:
830
+ metrics["train/masked_instances_local_rank"] = (~instance_mask).sum().item()
831
+
832
+ # Zero-gradients.
833
+ self.optim.zero_grad(set_to_none=True)
834
+
835
+ # Move tensors to the right device.
836
+ batch = move_to_device(batch, self.device)
837
+
838
+ # Run forward-backward pass.
839
+ ce_batch_loss, z_batch_loss = self.train_batch(batch)
840
+
841
+ # Collect loss, potentially reducing over all ranks.
842
+ if reduce_global_loss:
843
+ dist.reduce(ce_batch_loss, 0)
844
+ ce_batch_loss.div_(get_world_size())
845
+ if z_batch_loss is not None:
846
+ dist.reduce(z_batch_loss, 0)
847
+ z_batch_loss.div_(get_world_size())
848
+
849
+ # Clip gradient norms and collect param/gradient/optim metrics.
850
+ should_log_optim_metrics_this_step = self.should_log_optim_metrics_this_step()
851
+ optim_metrics = self.optim.clip_grads_and_collect_metrics(
852
+ self.global_step,
853
+ collect_param_metrics=should_log_optim_metrics_this_step,
854
+ # passing this process group here ensures metrics are reduced correctly when we're using
855
+ # HYBRID sharding.
856
+ process_group=self.dist_model.process_group,
857
+ )
858
+
859
+ # Adjust the learning rate.
860
+ for group in self.optim.param_groups:
861
+ # TODO (epwalsh): if we want to enable different LRs or gradient clipping settings per group
862
+ # we should pass `group["initial_lr"]` or `group["initial_max_grad_norm"]` here instead of
863
+ # the corresponding values from `self.cfg`.
864
+ group["lr"] = self.scheduler.get_lr(
865
+ self.cfg.optimizer.learning_rate, self.scheduler_current, self.scheduler_max
866
+ )
867
+ group["max_grad_norm"] = self.scheduler.get_max_grad_norm(
868
+ self.cfg.max_grad_norm, self.scheduler_current, self.scheduler_max
869
+ )
870
+ group["max_grad_norm_ratio"] = self.scheduler.get_max_grad_norm(
871
+ self.cfg.max_grad_norm_ratio, self.scheduler_current, self.scheduler_max
872
+ )
873
+
874
+ # Optimizer step.
875
+ self.optim.step()
876
+
877
+ # Collect metrics and check for NaN loss.
878
+ # NOTE: this involves a bunch of host-device syncs so we wait until the last moment to do this.
879
+ if torch.isnan(ce_batch_loss):
880
+ raise ValueError("nan loss encountered")
881
+ if z_batch_loss is not None and torch.isnan(z_batch_loss):
882
+ raise ValueError("nan loss encountered")
883
+ for key, value in optim_metrics.items():
884
+ metrics[f"optim/{key}"] = value.item()
885
+ self.cur_train_loss = ce_batch_loss.item()
886
+ self.min_train_loss = min(self.min_train_loss, self.cur_train_loss)
887
+ metrics["train/CrossEntropyLoss"] = self.cur_train_loss
888
+ metrics["train/Perplexity"] = math.exp(self.cur_train_loss)
889
+ if z_batch_loss is not None:
890
+ metrics["train/ZLoss"] = z_batch_loss.item()
891
+
892
+ # Maybe collect post-step optimizer-specific metrics.
893
+ if should_log_optim_metrics_this_step:
894
+ optim_metrics = self.optim.get_post_step_metrics(
895
+ self.dist_model, process_group=self.dist_model.process_group
896
+ )
897
+ for key, value in optim_metrics.items():
898
+ metrics[f"optim/{key}"] = value.item()
899
+
900
+ return metrics
901
+
902
+ def eval_batch(self, batch: Dict[str, Any]) -> Tuple[torch.Tensor, torch.Tensor]:
903
+ with torch.autocast("cuda", enabled=True, dtype=self.cfg.autocast_precision):
904
+ ce_loss, _, logits = self.model_forward(batch, loss_reduction="none")
905
+ return ce_loss.mean(dim=-1), logits
906
+
907
+ def eval_step(self, batch: Dict[str, Any], evaluator: Evaluator) -> None:
908
+ # Move tensors to the right device.
909
+ batch = move_to_device(batch, self.device)
910
+
911
+ # Run forward pass.
912
+ with torch.no_grad(): # NOTE: 'torch.inference_mode()' doesn't work with 'torch.compile()'.
913
+ ce_loss, logits = self.eval_batch(batch)
914
+
915
+ # Update metrics.
916
+ evaluator.update_metrics(
917
+ batch, ce_loss, logits
918
+ ) # batch includes all keys that the downstream evaluation needs
919
+
920
+ barrier()
921
+
922
+ def split_batch(self, batch: Dict[str, Any]) -> List[Dict[str, Any]]:
923
+ microbatch_size = self.cfg.device_train_microbatch_size
924
+ batch_size = batch["input_ids"].shape[0]
925
+ if batch_size <= microbatch_size:
926
+ return [batch]
927
+ else:
928
+ micro_batches = {}
929
+ for key, value in batch.items():
930
+ if isinstance(value, torch.Tensor):
931
+ micro_batches[key] = value.split(microbatch_size, dim=0)
932
+ elif isinstance(value, list):
933
+ micro_batches[key] = [
934
+ value[microbatch_size * i : microbatch_size * i + microbatch_size]
935
+ for i in range(math.ceil(batch_size / microbatch_size))
936
+ ]
937
+ else:
938
+ raise ValueError(f"unexpected item in batch: '{key}={value}'")
939
+ return [
940
+ {key: value[i] for key, value in micro_batches.items()} # type: ignore
941
+ for i in range(len(micro_batches["input_ids"]))
942
+ ]
943
+
944
+ def system_metrics(self) -> Dict[str, float]:
945
+ metrics = {}
946
+ if self.global_step < 3 or self.global_step % 10 == 0:
947
+ peak_gpu_mb = peak_gpu_memory()
948
+ if peak_gpu_mb is not None:
949
+ metrics["System/Peak GPU Memory (MB)"] = peak_gpu_mb
950
+ return metrics
951
+
952
+ def log_metrics_to_console(self, prefix: str, metrics: Dict[str, float]):
953
+ def format_float(value: float) -> str:
954
+ if value < 0.0001:
955
+ return str(value) # scientific notation
956
+ elif value > 1000:
957
+ return f"{int(value):,d}"
958
+ elif value > 100:
959
+ return f"{value:.1f}"
960
+ elif value > 10:
961
+ return f"{value:.2f}"
962
+ elif value > 1:
963
+ return f"{value:.3f}"
964
+ else:
965
+ return f"{value:.4f}"
966
+
967
+ log.info(
968
+ f"{prefix}\n"
969
+ + "\n".join(
970
+ [
971
+ f" {name}={format_float(value)}"
972
+ for name, value in metrics.items()
973
+ if name == "optim/total_grad_norm"
974
+ or not name.startswith("optim/") # there's too many optimizer metrics
975
+ ]
976
+ )
977
+ )
978
+
979
+ def should_log_optim_metrics_this_step(self) -> bool:
980
+ if self.cfg.wandb is None:
981
+ # We only log optimizer-specific metrics to W&B, since there are usually too many metrics
982
+ # to log to the console.
983
+ return False
984
+ optim_log_interval = self.cfg.optimizer.metrics_log_interval
985
+ if optim_log_interval is None:
986
+ optim_log_interval = self.cfg.wandb.log_interval
987
+ else:
988
+ optim_log_interval = max(optim_log_interval, self.cfg.wandb.log_interval)
989
+ return self.global_step % optim_log_interval == 0
990
+
991
+ def should_log_this_step(self) -> bool:
992
+ if self.global_step % self.cfg.console_log_interval == 0:
993
+ return True
994
+ elif self.cfg.wandb is not None and self.global_step % self.cfg.wandb.log_interval == 0:
995
+ return True
996
+ else:
997
+ return False
998
+
999
+ def eval(self) -> Dict[str, Any]:
1000
+ # Zero gradients and set model to 'eval' mode.
1001
+ self.optim.zero_grad(set_to_none=True)
1002
+ self.dist_model.eval()
1003
+
1004
+ eval_metrics = {}
1005
+ for evaluator in self.evaluators:
1006
+ log.info(f"Running evaluation for '{evaluator.label}'...")
1007
+
1008
+ # Reset metrics.
1009
+ evaluator.reset_metrics()
1010
+
1011
+ # Initialize data loader iterator.
1012
+ eval_batches = iter(evaluator.eval_loader)
1013
+
1014
+ # Adjust how many batches to evaluate on.
1015
+ num_eval_batches = (
1016
+ evaluator.subset_num_batches
1017
+ if evaluator.subset_num_batches is not None
1018
+ else self.cfg.eval_subset_num_batches
1019
+ )
1020
+ if num_eval_batches > 0:
1021
+ num_eval_batches = min(num_eval_batches, len(evaluator.eval_loader))
1022
+ eval_batches = islice(eval_batches, num_eval_batches)
1023
+
1024
+ # Run model over batches.
1025
+ for eval_step, eval_batch in enumerate(eval_batches):
1026
+ self.eval_step(eval_batch, evaluator)
1027
+
1028
+ # Log to console.
1029
+ if eval_step + 1 == num_eval_batches or (eval_step + 1) % self.cfg.console_log_interval == 0:
1030
+ log.info(f"[eval_step={eval_step + 1}/{num_eval_batches}]")
1031
+
1032
+ # Get final metrics.
1033
+ metrics = evaluator.compute_metrics()
1034
+ eval_metrics.update(metrics)
1035
+ self.log_metrics_to_console(f"{evaluator.label}", metrics)
1036
+
1037
+ del eval_batches
1038
+
1039
+ # Eval compiles a bunch more versions, and the result is terrible. This way we get back to zero.
1040
+ if self.cfg.compile is not None:
1041
+ torch.compiler.reset()
1042
+
1043
+ return eval_metrics
1044
+
1045
+ def check_if_cancelled(self) -> Tuple[bool, int]:
1046
+ should_cancel = False
1047
+ cancel_reason: Optional[str] = None
1048
+ extra_steps = 0
1049
+ if get_global_rank() == 0:
1050
+ if self.cfg.time_limit is not None and time.time() - self._start_time >= self.cfg.time_limit:
1051
+ # First check if we've reached the training time limit.
1052
+ should_cancel = True
1053
+ cancel_reason = "time limit reached"
1054
+ extra_steps = self.cfg.extra_steps_after_cancel
1055
+ elif (
1056
+ self.cfg.early_stopping_factor is not None
1057
+ and self.global_step > self.cfg.scheduler.t_warmup
1058
+ and self.cur_train_loss > self.cfg.early_stopping_factor * self.min_train_loss
1059
+ ):
1060
+ # Next check if early stopping loss criteria is met.
1061
+ should_cancel = True
1062
+ cancel_reason = "early stopping from loss increase"
1063
+ elif wandb.run is not None and (api_key := os.environ.get("WANDB_API_KEY")) is not None:
1064
+ # Finally, check if someone canceled the run from W&B by adding the 'cancel' / 'canceled' tag.
1065
+ # We won't see it in the run object. So we have to use the import/export API to check.
1066
+ from requests.exceptions import RequestException
1067
+ from wandb.errors import CommError
1068
+
1069
+ try:
1070
+ api = wandb.Api(api_key=api_key)
1071
+ run = api.run(wandb.run.path)
1072
+ for tag in run.tags or []:
1073
+ if tag.lower() in {"cancel", "canceled", "cancelled"}:
1074
+ should_cancel = True
1075
+ cancel_reason = "Weights & Biases tag"
1076
+ extra_steps = self.cfg.extra_steps_after_cancel
1077
+ break
1078
+ except (RequestException, CommError):
1079
+ log.info("Failed to check if W&B run is cancelled, continuing run.")
1080
+
1081
+ run_canceled = synchronize_flag(should_cancel, self.device)
1082
+ if run_canceled:
1083
+ extra_steps = synchronize_value(extra_steps, self.device)
1084
+ if cancel_reason is None:
1085
+ if extra_steps > 0:
1086
+ log.warning(f"Run canceled, stopping in {extra_steps} more steps...")
1087
+ else:
1088
+ log.warning("Run canceled")
1089
+ else:
1090
+ if extra_steps > 0:
1091
+ log.warning(f"Run canceled due to {cancel_reason}, stopping in {extra_steps} more steps...")
1092
+ else:
1093
+ log.warning(f"Run canceled due to {cancel_reason}")
1094
+
1095
+ return run_canceled, extra_steps
1096
+
1097
+ def fit(self):
1098
+ if self.cfg.stop_after is not None:
1099
+ if self.cfg.stop_at is None:
1100
+ self.cfg.stop_at = self.global_step + self.cfg.stop_after
1101
+ else:
1102
+ self.cfg.stop_at = min(self.cfg.stop_at, self.global_step + self.cfg.stop_after)
1103
+ if self.cfg.stop_at is None:
1104
+ self.cfg.stop_at = self.max_steps + 10
1105
+
1106
+ self._start_time = time.time()
1107
+ self._gc_init_state = gc.isenabled() # cache if garbage collection is enabled, reset on close.
1108
+
1109
+ # Disable automatic garbage collection, FSDP doesn't work well with it.
1110
+ if self.cfg.gen1_gc_interval is not None:
1111
+ gc.disable()
1112
+
1113
+ if self.cfg.load_path is not None and self.global_step > 0 and self.cfg.eval_on_load:
1114
+ eval_metrics = self.eval()
1115
+ if wandb.run is not None:
1116
+ wandb.log(eval_metrics, step=self.global_step)
1117
+
1118
+ # Set model to 'train' mode.
1119
+ self.dist_model.train()
1120
+
1121
+ # Initialize monitors.
1122
+ assert self.cfg.device_train_batch_size is not None
1123
+ speed_monitor = SpeedMonitor(self.cfg.speed_monitor)
1124
+ lr_monitor = LRMonitor(self.optim)
1125
+
1126
+ # Log system metrics at the start of training.
1127
+ sys_metrics = self.system_metrics()
1128
+ if sys_metrics:
1129
+ self.log_metrics_to_console("Pre-train system metrics", sys_metrics)
1130
+ if wandb.run is not None:
1131
+ wandb.log(sys_metrics, step=0)
1132
+
1133
+ # Python Profiler stuff
1134
+ if self.cfg.python_profiling:
1135
+ python_profiler = cProfile.Profile()
1136
+ else:
1137
+ python_profiler = None
1138
+
1139
+ # PyTorch Profiler stuff
1140
+ if self.cfg.torch_profiling and get_global_rank() == 0:
1141
+ from torch.profiler import schedule
1142
+
1143
+ profiling_schedule = schedule(wait=1, warmup=5, active=3, repeat=1)
1144
+
1145
+ def on_trace_ready(p):
1146
+ profiler_output_dir = Path(self.cfg.save_folder) / "profiler"
1147
+ profiler_output_dir.mkdir(exist_ok=True)
1148
+
1149
+ output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=32)
1150
+ log.info(f"Profile by total GPU time at step {p.step_num}:\n{output}")
1151
+ output = p.key_averages().table(sort_by="self_cpu_time_total", row_limit=32)
1152
+ log.info(f"Profile by total CPU time at step {p.step_num}:\n{output}")
1153
+
1154
+ p.export_chrome_trace(
1155
+ str(trace_path := (profiler_output_dir / f"{p.step_num}.chrome_trace.json.gz"))
1156
+ )
1157
+ if self.cfg.remote_save_folder is not None:
1158
+ upload_folder = f"{self.cfg.remote_save_folder.rstrip('/')}/profiler"
1159
+ log.info(f"Tracing complete, uploading results to '{upload_folder}'...")
1160
+ upload(trace_path, f"{upload_folder}/{trace_path.name}")
1161
+
1162
+ from torch.profiler import ProfilerActivity
1163
+
1164
+ torch_profiler = torch.profiler.profile(
1165
+ activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
1166
+ record_shapes=False,
1167
+ profile_memory=False,
1168
+ with_stack=True,
1169
+ schedule=profiling_schedule,
1170
+ on_trace_ready=on_trace_ready,
1171
+ )
1172
+ del profiling_schedule
1173
+ else:
1174
+ import contextlib
1175
+
1176
+ torch_profiler = contextlib.nullcontext()
1177
+
1178
+ # Train.
1179
+ first_batch: bool = True
1180
+ cancel_initiated: bool = False
1181
+ stop_at: int = self.cfg.stop_at
1182
+ save_checkpoints: bool = True
1183
+
1184
+ with torch_profiler as p:
1185
+ for epoch in range(self.epoch or 0, self.max_epochs):
1186
+ for batch in self.train_loader:
1187
+ # Bookkeeping.
1188
+ # NOTE: To track the global batch size / number of tokens per batch we make the assumption that all
1189
+ # batches see the same number of tokens, which should be the case for language model pre-training
1190
+ # (at least when drop_last=True).
1191
+ # Alternatively we'd have to use a distributed all reduce over seq_len here, but I don't want that
1192
+ # overhead. So for now I'm putting these assertions here so if the assumption is violated it will
1193
+ # fail loudly.
1194
+ batch_size, seq_len = batch["input_ids"].shape
1195
+ assert seq_len == self.cfg.model.max_sequence_length
1196
+ assert batch_size == self.cfg.device_train_batch_size
1197
+ global_batch_size = batch_size * get_world_size() # assumes batch size equal across ranks
1198
+ self.global_step += 1
1199
+ self.global_train_examples_seen_this_epoch += global_batch_size
1200
+ self.global_train_tokens_seen += global_batch_size * seq_len
1201
+ speed_monitor.batch_start(
1202
+ global_total_tokens=self.global_train_tokens_seen,
1203
+ device_batch_num_tokens=batch_size * seq_len, # num tokens in batch for this device
1204
+ # We start monitoring speed after the first batch since the first
1205
+ # batch might be an outlier due to compiling and other initialization overhead.
1206
+ num_fwd_flops=self.model.num_fwd_flops, # this is per token
1207
+ num_bck_flops=self.model.num_bck_flops, # this is per token
1208
+ record=not first_batch,
1209
+ )
1210
+
1211
+ should_log_this_step = self.should_log_this_step()
1212
+
1213
+ # Run train step on batch.
1214
+ metrics = self.train_step(batch, reduce_global_loss=should_log_this_step)
1215
+
1216
+ # Maybe collect other metrics.
1217
+ if should_log_this_step:
1218
+ # Speed metrics.
1219
+ metrics.update(speed_monitor.check())
1220
+ # System metrics.
1221
+ metrics.update(self.system_metrics())
1222
+ # Learning rate metrics.
1223
+ metrics.update(lr_monitor.check())
1224
+
1225
+ # Log metrics to console.
1226
+ if self.global_step % self.cfg.console_log_interval == 0:
1227
+ if get_global_rank() == 0:
1228
+ self.log_metrics_to_console(
1229
+ f"[step={self.global_step}/{self.max_steps},epoch={epoch}]",
1230
+ metrics,
1231
+ )
1232
+ else:
1233
+ log.info(f"[step={self.global_step}/{self.max_steps},epoch={epoch}]")
1234
+
1235
+ # Log metrics to W&B.
1236
+ if (
1237
+ wandb.run is not None
1238
+ and self.cfg.wandb is not None
1239
+ and self.global_step % self.cfg.wandb.log_interval == 0
1240
+ ):
1241
+ wandb.log(metrics, step=self.global_step)
1242
+
1243
+ # Check if/when run should be canceled.
1244
+ if not cancel_initiated and self.global_step % self.cfg.canceled_check_interval == 0:
1245
+ cancel_initiated, extra_steps = self.check_if_cancelled()
1246
+ if cancel_initiated:
1247
+ stop_at = min(stop_at, self.global_step + extra_steps)
1248
+
1249
+ # Maybe save sharded checkpoint.
1250
+ if self.cfg.distributed_strategy != DistributedStrategy.ddp:
1251
+ if save_checkpoints and (
1252
+ cancel_initiated
1253
+ or (
1254
+ self.cfg.save_interval is not None
1255
+ and self.global_step % self.cfg.save_interval == 0
1256
+ and self.cfg.save_num_checkpoints_to_keep != 0
1257
+ )
1258
+ ):
1259
+ log.info("Saving checkpoint...")
1260
+ checkpoint_path, _ = self.save_checkpoint(CheckpointType.sharded)
1261
+ log.info(f"Checkpoint saved to {checkpoint_path}")
1262
+
1263
+ # Remove any ephemeral checkpoints.
1264
+ while self.ephemeral_checkpoints:
1265
+ self.remove_ephemeral_checkpoint()
1266
+
1267
+ # Reset speed monitor so that we don't count the time taken to save checkpoints.
1268
+ speed_monitor.reset()
1269
+
1270
+ # If the run was just canceled this will be the final checkpoint.
1271
+ if cancel_initiated:
1272
+ save_checkpoints = False
1273
+ elif (
1274
+ self.cfg.save_interval_ephemeral is not None
1275
+ and self.global_step % self.cfg.save_interval_ephemeral == 0
1276
+ ):
1277
+ log.info("Saving ephemeral checkpoint...")
1278
+ checkpoint_path, _ = self.save_checkpoint(CheckpointType.sharded_ephemeral)
1279
+ log.info(f"Checkpoint saved to {checkpoint_path}")
1280
+
1281
+ # Reset speed monitor so that we don't count the time taken to save checkpoints.
1282
+ speed_monitor.reset()
1283
+
1284
+ # Maybe save unsharded checkpoint.
1285
+ # This code snippet should always execute when running DDP.
1286
+ if (
1287
+ save_checkpoints
1288
+ and self.cfg.save_interval_unsharded is not None
1289
+ and self.global_step % self.cfg.save_interval_unsharded == 0
1290
+ and self.cfg.save_num_unsharded_checkpoints_to_keep != 0
1291
+ ):
1292
+ log.info("Saving unsharded checkpoint...")
1293
+ checkpoint_path, _ = self.save_checkpoint(CheckpointType.unsharded)
1294
+ log.info(f"Unsharded checkpoint saved to {checkpoint_path}")
1295
+
1296
+ # Reset speed monitor so that we don't count the time taken to save checkpoints.
1297
+ speed_monitor.reset()
1298
+
1299
+ # Maybe run evaluations.
1300
+ if not cancel_initiated and (
1301
+ self.global_step % self.cfg.eval_interval == 0 or self.global_step >= stop_at
1302
+ ):
1303
+ eval_metrics = self.eval()
1304
+
1305
+ # Log metrics to W&B.
1306
+ if wandb.run is not None:
1307
+ wandb.log(eval_metrics, step=self.global_step)
1308
+
1309
+ # Reset speed monitor so that we don't count the time taken to run evaluations.
1310
+ speed_monitor.reset()
1311
+
1312
+ # Reset model to 'train' mode.
1313
+ self.dist_model.train()
1314
+
1315
+ # End of batch.
1316
+ first_batch = False
1317
+ if p is not None:
1318
+ p.step()
1319
+
1320
+ if self.global_step >= stop_at:
1321
+ break
1322
+
1323
+ # Run generation 1 garbage collection.
1324
+ if self.cfg.gen1_gc_interval is not None and self.global_step % self.cfg.gen1_gc_interval == 0:
1325
+ gc.collect(1)
1326
+
1327
+ # Python Profiler stuff
1328
+ # We do this now, at the bottom of this loop, so we capture the work of getting the next batch.
1329
+ if python_profiler is not None:
1330
+ if self.global_step == 5:
1331
+ python_profiler.enable()
1332
+ elif self.global_step == 8:
1333
+ python_profiler.disable()
1334
+ python_profiler.print_stats(sort=SortKey.CUMULATIVE)
1335
+ python_profiler = None
1336
+ else:
1337
+ log.info("Training epoch complete")
1338
+ self.epoch = epoch + 1
1339
+ self.global_train_examples_seen_this_epoch = 0
1340
+ self.dataset.start_index = 0
1341
+ if self.epoch < self.max_epochs:
1342
+ log.info(f"Reshuffling data loader for epoch {self.epoch}...")
1343
+ self.dataset.reshuffle(self.epoch)
1344
+ continue
1345
+
1346
+ break
1347
+
1348
+ # Save final checkpoint.
1349
+ if save_checkpoints:
1350
+ if (
1351
+ self.cfg.save_interval_unsharded is not None
1352
+ and self.last_unsharded_checkpoint_step != self.global_step
1353
+ ):
1354
+ log.info("Saving final unsharded model checkpoint...")
1355
+ checkpoint_path, _ = self.save_checkpoint(CheckpointType.unsharded)
1356
+ log.info(f"Unsharded checkpoint saved to {checkpoint_path}")
1357
+ elif (
1358
+ self.cfg.save_num_checkpoints_to_keep != 0
1359
+ and self.last_sharded_checkpoint_step != self.global_step
1360
+ and self.cfg.distributed_strategy == DistributedStrategy.fsdp
1361
+ ):
1362
+ log.info("Saving final checkpoint...")
1363
+ checkpoint_path, _ = self.save_checkpoint(CheckpointType.sharded)
1364
+ log.info(f"Checkpoint saved to {checkpoint_path}")
1365
+
1366
+ def close(self, exit_code: int = 0) -> None:
1367
+ gc_cuda()
1368
+
1369
+ if self.indices_file is not None:
1370
+ self.indices_file.flush()
1371
+ self.indices_file.close()
1372
+ if self._gc_init_state:
1373
+ gc.enable()
1374
+ else:
1375
+ gc.disable()
1376
+ if wandb.run is not None:
1377
+ wandb.finish(exit_code=exit_code, quiet=True)
1378
+
1379
+ def __enter__(self) -> Trainer:
1380
+ return self
1381
+
1382
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
1383
+ del exc_val, exc_tb
1384
+ self.close(0 if exc_type is None else 1)
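A short, self-contained sketch of the max_duration convention parsed by the max_steps and max_tokens properties above: a plain integer means optimizer steps, a trailing "T" means a total token budget, and a trailing "ep" means epochs. The helper and numbers below are illustrative assumptions and ignore the resume offsets (global_step, global_train_tokens_seen) that the real properties account for.

import math

def duration_to_steps(value, tokens_per_batch: int, batches_per_epoch: int) -> int:
    # Mirrors the branching in Trainer.max_steps, minus resume bookkeeping.
    if isinstance(value, int):
        return value                               # already a step count
    if value.endswith("T"):                        # token budget, e.g. "2e12T"
        return math.ceil(int(float(value[:-1])) / tokens_per_batch)
    if value.endswith("ep"):                       # epoch budget, e.g. "2ep"
        return int(value[:-2]) * batches_per_epoch
    return int(float(value))                       # step count given as a string

assert duration_to_steps("2ep", tokens_per_batch=4096, batches_per_epoch=100) == 200
assert duration_to_steps("1e6T", tokens_per_batch=4096, batches_per_epoch=100) == 245  # ceil(1e6 / 4096)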
util.py ADDED
@@ -0,0 +1,929 @@
1
+ import gzip
2
+ import io
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ import socket
8
+ import sys
9
+ import time
10
+ import warnings
11
+ from datetime import datetime
12
+ from enum import Enum
13
+ from itertools import cycle, islice
14
+ from pathlib import Path
15
+ from queue import Queue
16
+ from threading import Thread
17
+ from typing import Any, Callable, Dict, MutableMapping, Optional, Tuple, Union
18
+
19
+ import boto3
20
+ import botocore.exceptions as boto_exceptions
21
+ import datasets
22
+ import requests
23
+ import rich
24
+ from botocore.config import Config
25
+ from cached_path.schemes import SchemeClient, add_scheme_client
26
+ from google.api_core.retry import Retry as GCSRetry
27
+ from google.api_core.retry import if_transient_error as gcs_is_transient_error
28
+ from rich.console import Console, ConsoleRenderable
29
+ from rich.highlighter import NullHighlighter
30
+ from rich.progress import Progress
31
+ from rich.text import Text
32
+ from rich.traceback import Traceback
33
+
34
+ from olmo_data.data import get_data_path
35
+
36
+ from .aliases import PathOrStr
37
+ from .exceptions import (
38
+ OLMoCliError,
39
+ OLMoEnvironmentError,
40
+ OLMoError,
41
+ OLMoNetworkError,
42
+ OLMoThreadError,
43
+ )
44
+ from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed
45
+
46
+ try:
47
+ from functools import cache
48
+ except ImportError:
49
+ from functools import lru_cache as cache
50
+
51
+
52
+ class StrEnum(str, Enum):
53
+ """
54
+ This is equivalent to Python's :class:`enum.StrEnum` since version 3.11.
55
+ We include this here for compatibility with older versions of Python.
56
+ """
57
+
58
+ def __str__(self) -> str:
59
+ return self.value
60
+
61
+ def __repr__(self) -> str:
62
+ return f"'{str(self)}'"
63
+
64
+
65
+ _log_extra_fields: Dict[str, Any] = {}
66
+ log = logging.getLogger(__name__)
67
+
68
+
69
+ class LogFilterType(StrEnum):
70
+ rank0_only = "rank0_only"
71
+ local_rank0_only = "local_rank0_only"
72
+ all_ranks = "all_ranks"
73
+
74
+
75
+ def log_extra_field(field_name: str, field_value: Any) -> None:
76
+ global _log_extra_fields
77
+ if field_value is None:
78
+ if field_name in _log_extra_fields:
79
+ del _log_extra_fields[field_name]
80
+ else:
81
+ _log_extra_fields[field_name] = field_value
82
+
83
+
84
+ def setup_logging(log_filter_type: LogFilterType = LogFilterType.rank0_only) -> None:
85
+ """
86
+ :param log_filter_type: determines which ranks emit INFO-and-below log messages (rank 0 only, local rank 0 only, or all ranks).
87
+ """
88
+ log_extra_field("hostname", socket.gethostname())
89
+ if is_distributed():
90
+ log_extra_field("node_rank", get_node_rank())
91
+ log_extra_field("local_rank", get_local_rank())
92
+ log_extra_field("global_rank", get_global_rank())
93
+ else:
94
+ log_extra_field("node_rank", 0)
95
+ log_extra_field("local_rank", 0)
96
+ log_extra_field("global_rank", 0)
97
+
98
+ old_log_record_factory = logging.getLogRecordFactory()
99
+
100
+ def log_record_factory(*args, **kwargs) -> logging.LogRecord:
101
+ record = old_log_record_factory(*args, **kwargs)
102
+ for field_name, field_value in _log_extra_fields.items():
103
+ setattr(record, field_name, field_value)
104
+ return record
105
+
106
+ logging.setLogRecordFactory(log_record_factory)
107
+
108
+ handler: logging.Handler
109
+ if (
110
+ os.environ.get("OLMo_NONINTERACTIVE", False)
111
+ or os.environ.get("DEBIAN_FRONTEND", None) == "noninteractive"
112
+ or not sys.stdout.isatty()
113
+ ):
114
+ handler = logging.StreamHandler(sys.stdout)
115
+ formatter = logging.Formatter(
116
+ "%(asctime)s\t%(hostname)s:%(local_rank)s\t%(name)s:%(lineno)s\t%(levelname)s\t%(message)s"
117
+ )
118
+ formatter.default_time_format = "%Y-%m-%d %H:%M:%S"
119
+ formatter.default_msec_format = "%s.%03d"
120
+ handler.setFormatter(formatter)
121
+ else:
122
+ handler = RichHandler()
123
+
124
+ def rank0_filter(record: logging.LogRecord) -> int:
125
+ if record.levelno > logging.INFO:
126
+ return 1
127
+ if getattr(record, "global_rank", 0) == 0:
128
+ return 1
129
+ else:
130
+ return 0
131
+
132
+ def local_rank0_filter(record: logging.LogRecord) -> int:
133
+ if record.levelno > logging.INFO:
134
+ return 1
135
+ if getattr(record, "local_rank", 0) == 0:
136
+ return 1
137
+ else:
138
+ return 0
139
+
140
+ if log_filter_type == LogFilterType.rank0_only:
141
+ filter = rank0_filter
142
+ elif log_filter_type == LogFilterType.local_rank0_only:
143
+ filter = local_rank0_filter # type: ignore
144
+ elif log_filter_type == LogFilterType.all_ranks:
145
+ filter = None
146
+ else:
147
+ raise ValueError(log_filter_type)
148
+
149
+ if filter is not None:
150
+ handler.addFilter(filter) # type: ignore
151
+ logging.basicConfig(handlers=[handler], level=logging.INFO)
152
+
153
+ logging.captureWarnings(True)
154
+ logging.getLogger("urllib3").setLevel(logging.ERROR)
155
+
156
+
157
+ def excepthook(exctype, value, traceback):
158
+ """
159
+ Used to patch `sys.excepthook` in order to log exceptions.
160
+ """
161
+ if issubclass(exctype, KeyboardInterrupt):
162
+ sys.__excepthook__(exctype, value, traceback)
163
+ elif issubclass(exctype, OLMoCliError):
164
+ rich.get_console().print(f"[yellow]{value}[/]", highlight=False)
165
+ elif issubclass(exctype, OLMoError):
166
+ rich.get_console().print(Text(f"{exctype.__name__}:", style="red"), value, highlight=False)
167
+ else:
168
+ log.critical("Uncaught %s: %s", exctype.__name__, value, exc_info=(exctype, value, traceback))
169
+
170
+
171
+ def install_excepthook():
172
+ sys.excepthook = excepthook
173
+
174
+
175
+ def filter_warnings():
176
+ # Filter internal deprecation warnings from torch
177
+ warnings.filterwarnings(
178
+ action="ignore",
179
+ category=UserWarning,
180
+ message="torch.distributed.*_base is a private function and will be deprecated.*",
181
+ )
182
+ warnings.filterwarnings(
183
+ action="ignore",
184
+ category=UserWarning,
185
+ message="TypedStorage is deprecated.*",
186
+ )
187
+ warnings.filterwarnings(
188
+ action="ignore",
189
+ category=UserWarning,
190
+ message="Please use DTensor instead.*",
191
+ )
192
+ # Torchvision warnings. We don't actually use torchvision.
193
+ warnings.filterwarnings(
194
+ action="ignore",
195
+ message="failed to load.*",
196
+ module="torchvision.io.image",
197
+ )
198
+
199
+
200
+ def set_env_variables():
201
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
202
+
203
+
204
+ def prepare_cli_environment(log_filter_type: Optional[LogFilterType] = None):
205
+ if log_filter_type is None:
206
+ log_filter_type = LogFilterType(os.environ.get("LOG_FILTER_TYPE", "rank0_only"))
207
+ rich.reconfigure(width=max(rich.get_console().width, 180), soft_wrap=True)
208
+ setup_logging(log_filter_type=log_filter_type)
209
+ install_excepthook()
210
+ filter_warnings()
211
+ set_env_variables()
212
+
213
+
214
+ def clean_opt(arg: str) -> str:
215
+ if "=" not in arg:
216
+ arg = f"{arg}=True"
217
+ name, val = arg.split("=", 1)
218
+ name = name.strip("-").replace("-", "_")
219
+ return f"{name}={val}"
220
+
221
+
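A quick usage example of clean_opt above, which normalizes command-line overrides before they reach the config loader; the option names here are hypothetical.

print(clean_opt("--save-overwrite"))                 # -> "save_overwrite=True"
print(clean_opt("--optimizer.learning_rate=3e-4"))   # -> "optimizer.learning_rate=3e-4"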
222
+ class RichHandler(logging.Handler):
223
+ """
224
+ A simplified version of rich.logging.RichHandler from
225
+ https://github.com/Textualize/rich/blob/master/rich/logging.py
226
+ """
227
+
228
+ def __init__(
229
+ self,
230
+ *,
231
+ level: Union[int, str] = logging.NOTSET,
232
+ console: Optional[Console] = None,
233
+ markup: bool = False,
234
+ ) -> None:
235
+ super().__init__(level=level)
236
+ self.console = console or rich.get_console()
237
+ self.highlighter = NullHighlighter()
238
+ self.markup = markup
239
+
240
+ def emit(self, record: logging.LogRecord) -> None:
241
+ try:
242
+ if hasattr(record.msg, "__rich__") or hasattr(record.msg, "__rich_console__"):
243
+ self.console.print(record.msg)
244
+ else:
245
+ msg: Any = record.msg
246
+ if isinstance(record.msg, str):
247
+ msg = self.render_message(record=record, message=record.getMessage())
248
+ renderables = [
249
+ self.get_time_text(record),
250
+ self.get_level_text(record),
251
+ self.get_location_text(record),
252
+ msg,
253
+ ]
254
+ if record.exc_info is not None:
255
+ tb = Traceback.from_exception(*record.exc_info) # type: ignore
256
+ renderables.append(tb)
257
+ self.console.print(*renderables)
258
+ except Exception:
259
+ self.handleError(record)
260
+
261
+ def render_message(self, *, record: logging.LogRecord, message: str) -> ConsoleRenderable:
262
+ use_markup = getattr(record, "markup", self.markup)
263
+ message_text = Text.from_markup(message) if use_markup else Text(message)
264
+
265
+ highlighter = getattr(record, "highlighter", self.highlighter)
266
+ if highlighter:
267
+ message_text = highlighter(message_text)
268
+
269
+ return message_text
270
+
271
+ def get_time_text(self, record: logging.LogRecord) -> Text:
272
+ log_time = datetime.fromtimestamp(record.created)
273
+ time_str = log_time.strftime("[%Y-%m-%d %X]")
274
+ return Text(time_str, style="log.time", end=" ")
275
+
276
+ def get_level_text(self, record: logging.LogRecord) -> Text:
277
+ level_name = record.levelname
278
+ level_text = Text.styled(level_name.ljust(8), f"logging.level.{level_name.lower()}")
279
+ level_text.style = "log.level"
280
+ level_text.end = " "
281
+ return level_text
282
+
283
+ def get_location_text(self, record: logging.LogRecord) -> Text:
284
+ name_and_line = f"{record.name}:{record.lineno}" if record.name != "root" else "root"
285
+ text = f"[{name_and_line}, rank={record.local_rank}]" # type: ignore
286
+ return Text(text, style="log.path")
287
+
288
+
289
+ def wait_for(condition: Callable[[], bool], description: str, timeout: float = 10.0):
290
+ """Wait for the condition function to return True."""
291
+ start_time = time.monotonic()
292
+ while not condition():
293
+ time.sleep(0.5)
294
+ if time.monotonic() - start_time > timeout:
295
+ raise TimeoutError(f"{description} timed out")
296
+
297
+
298
+ def is_url(path: PathOrStr) -> bool:
299
+ return re.match(r"[a-z0-9]+://.*", str(path)) is not None
300
+
301
+
302
+ def dir_is_empty(dir: PathOrStr) -> bool:
303
+ dir = Path(dir)
304
+ if not dir.is_dir():
305
+ return True
306
+ try:
307
+ next(dir.glob("*"))
308
+ return False
309
+ except StopIteration:
310
+ return True
311
+
312
+
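# Quick reference for the two predicates above (derived from the regex / glob logic,
# included here purely as documentation):
#   is_url("s3://bucket/checkpoints/step1000")   -> True
#   is_url("gs://bucket/run/step500-unsharded")  -> True
#   is_url("/local/path/to/checkpoint")          -> False
#   dir_is_empty("does-not-exist")               -> True   (a missing dir counts as empty)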
313
+ def get_progress_bar() -> Progress:
314
+ from cached_path import get_download_progress
315
+
316
+ return get_download_progress()
317
+
318
+
319
+ def resource_path(
320
+ folder: PathOrStr, fname: str, local_cache: Optional[PathOrStr] = None, progress: Optional[Progress] = None
321
+ ) -> Path:
322
+ if local_cache is not None and (local_path := Path(local_cache) / fname).is_file():
323
+ log.info(f"Found local cache of {fname} at {local_path}")
324
+ return local_path
325
+ else:
326
+ from cached_path import cached_path
327
+
328
+ return cached_path(f"{str(folder).rstrip('/')}/{fname}", progress=progress)
329
+
330
+
331
+ def file_size(path: PathOrStr) -> int:
332
+ """
333
+ Get the size of a local or remote file in bytes.
334
+ """
335
+ if is_url(path):
336
+ from urllib.parse import urlparse
337
+
338
+ parsed = urlparse(str(path))
339
+ if parsed.scheme == "gs":
340
+ return _gcs_file_size(parsed.netloc, parsed.path.strip("/"))
341
+ elif parsed.scheme in ("s3", "r2", "weka"):
342
+ return _s3_file_size(parsed.scheme, parsed.netloc, parsed.path.strip("/"))
343
+ elif parsed.scheme in ("http", "https"):
344
+ return _http_file_size(parsed.scheme, parsed.netloc, parsed.path.strip("/"))
345
+ elif parsed.scheme == "file":
346
+ return file_size(str(path).replace("file://", "", 1))
347
+ else:
348
+ raise NotImplementedError(f"file size not implemented for '{parsed.scheme}' files")
349
+ else:
350
+ return os.stat(path).st_size
351
+
352
+
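# Illustrative sketch (not part of the original module): `file_size` and `get_bytes_range`
# (defined further below) share the same scheme dispatch, so the same call works for local
# paths as well as gs://, s3://, r2://, weka://, http(s):// and file:// URLs. The function
# name and the object path are hypothetical.
def _example_remote_read() -> None:
    path = "s3://my-bucket/data/part-000.npy"  # hypothetical object
    total = file_size(path)
    header = get_bytes_range(path, 0, min(16, total))
    log.info("%s is %d bytes; first bytes: %r", path, total, header)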
353
+ def upload(source: PathOrStr, target: str, save_overwrite: bool = False):
354
+ """Upload source file to a target location on GCS or S3."""
355
+ from urllib.parse import urlparse
356
+
357
+ source = Path(source)
358
+ assert source.is_file()
359
+ parsed = urlparse(target)
360
+ if parsed.scheme == "gs":
361
+ _gcs_upload(source, parsed.netloc, parsed.path.strip("/"), save_overwrite=save_overwrite)
362
+ elif parsed.scheme in ("s3", "r2", "weka"):
363
+ _s3_upload(source, parsed.scheme, parsed.netloc, parsed.path.strip("/"), save_overwrite=save_overwrite)
364
+ else:
365
+ raise NotImplementedError(f"Upload not implemented for '{parsed.scheme}' scheme")
366
+
367
+
368
+ def get_bytes_range(source: PathOrStr, bytes_start: int, num_bytes: int) -> bytes:
369
+ if is_url(source):
370
+ from urllib.parse import urlparse
371
+
372
+ parsed = urlparse(str(source))
373
+ if parsed.scheme == "gs":
374
+ return _gcs_get_bytes_range(parsed.netloc, parsed.path.strip("/"), bytes_start, num_bytes)
375
+ elif parsed.scheme in ("s3", "r2", "weka"):
376
+ return _s3_get_bytes_range(
377
+ parsed.scheme, parsed.netloc, parsed.path.strip("/"), bytes_start, num_bytes
378
+ )
379
+ elif parsed.scheme in ("http", "https"):
380
+ return _http_get_bytes_range(
381
+ parsed.scheme, parsed.netloc, parsed.path.strip("/"), bytes_start, num_bytes
382
+ )
383
+ elif parsed.scheme == "file":
384
+ return get_bytes_range(str(source).replace("file://", "", 1), bytes_start, num_bytes)
385
+ else:
386
+ raise NotImplementedError(f"get bytes range not implemented for '{parsed.scheme}' files")
387
+ else:
388
+ with open(source, "rb") as f:
389
+ f.seek(bytes_start)
390
+ return f.read(num_bytes)
391
+
392
+
393
+ def find_latest_checkpoint(dir: PathOrStr) -> Optional[PathOrStr]:
394
+ if is_url(dir):
395
+ from urllib.parse import urlparse
396
+
397
+ parsed = urlparse(str(dir))
398
+ if parsed.scheme == "gs":
399
+ return _gcs_find_latest_checkpoint(parsed.netloc, parsed.path.strip("/"))
400
+ elif parsed.scheme in ("s3", "r2", "weka"):
401
+ return _s3_find_latest_checkpoint(parsed.scheme, parsed.netloc, parsed.path.strip("/"))
402
+ elif parsed.scheme == "file":
403
+ return find_latest_checkpoint(str(dir).replace("file://", "", 1))
404
+ else:
405
+ raise NotImplementedError(f"find_latest_checkpoint not implemented for '{parsed.scheme}' files")
406
+ else:
407
+ latest_step = 0
408
+ latest_checkpoint: Optional[Path] = None
409
+ for path in Path(dir).glob("step*"):
410
+ if path.is_dir():
411
+ try:
412
+ step = int(path.name.replace("step", "").replace("-unsharded", ""))
413
+ except ValueError:
414
+ continue
415
+ # We prioritize sharded checkpoints over unsharded checkpoints.
416
+ if step > latest_step or (step == latest_step and not path.name.endswith("-unsharded")):
417
+ latest_step = step
418
+ latest_checkpoint = path
419
+ return latest_checkpoint
420
+
421
+
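# Illustrative sketch (not part of the original module): resolving the most recent
# checkpoint in a run directory. Checkpoint dirs are expected to be named `step<N>` or
# `step<N>-unsharded`, and on ties the sharded variant wins. Works for local paths as
# well as gs://, s3://, r2:// and weka:// prefixes. The run directory is hypothetical.
def _example_resume() -> None:
    run_dir = "s3://my-bucket/checkpoints/my-run"  # hypothetical location
    latest = find_latest_checkpoint(run_dir)
    if latest is None:
        log.info("no checkpoint found under %s", run_dir)
    else:
        log.info("resuming from %s", latest)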
422
+ # Google Storage API is unhinged and requires you to specify the retry policy on every single call you make.
423
+ def _gcs_is_retriable(exception: Exception) -> bool:
424
+ if gcs_is_transient_error(exception):
425
+ return True
426
+ if isinstance(exception, requests.exceptions.ReadTimeout):
427
+ return True
428
+ return False
429
+
430
+
431
+ _gcs_retry = GCSRetry(predicate=_gcs_is_retriable, initial=1.0, maximum=10.0, multiplier=2.0, timeout=600.0)
432
+
433
+
434
+ def _gcs_upload(source: Path, bucket_name: str, key: str, save_overwrite: bool = False):
435
+ storage_client = _get_gcs_client()
436
+ bucket = storage_client.bucket(bucket_name)
437
+ blob = bucket.blob(key)
438
+ if not save_overwrite and blob.exists():
439
+ raise FileExistsError(f"gs://{bucket_name}/{key} already exists. Use save_overwrite to overwrite it.")
440
+ blob.upload_from_filename(source, retry=_gcs_retry)
441
+
442
+
443
+ def _gcs_file_size(bucket_name: str, key: str) -> int:
444
+ from google.api_core.exceptions import NotFound
445
+
446
+ storage_client = _get_gcs_client()
447
+ bucket = storage_client.bucket(bucket_name)
448
+ blob = bucket.blob(key)
449
+ try:
450
+ blob.reload(retry=_gcs_retry)
451
+ except NotFound:
452
+ raise FileNotFoundError(f"gs://{bucket_name}/{key}")
453
+ assert blob.size is not None
454
+ return blob.size
455
+
456
+
457
+ def _gcs_get_bytes_range(bucket_name: str, key: str, bytes_start: int, num_bytes: int) -> bytes:
458
+ from google.api_core.exceptions import NotFound
459
+
460
+ storage_client = _get_gcs_client()
461
+ bucket = storage_client.bucket(bucket_name)
462
+ blob = bucket.blob(key)
463
+ try:
464
+ return blob.download_as_bytes(start=bytes_start, end=bytes_start + num_bytes - 1, retry=_gcs_retry)
465
+ except NotFound:
466
+ raise FileNotFoundError(f"gs://{bucket_name}/{key}")
467
+
468
+
469
+ @cache
470
+ def _get_gcs_client():
471
+ from google.cloud import storage as gcs
472
+
473
+ return gcs.Client()
474
+
475
+
476
+ def _gcs_find_latest_checkpoint(bucket_name: str, prefix: str) -> Optional[str]:
477
+ if not prefix.endswith("/"):
478
+ prefix = f"{prefix}/"
479
+
480
+ storage_client = _get_gcs_client()
481
+ bucket = storage_client.bucket(bucket_name)
482
+ suffix = "/config.yaml"
483
+ latest_step: Optional[int] = None
484
+ latest_checkpoint: Optional[str] = None
485
+ for blob in bucket.list_blobs(prefix=prefix, match_glob=f"**{suffix}"):
486
+ # Disregard checkpoints that have an empty config file.
487
+ if blob.size <= 0:
488
+ continue
489
+
490
+ name = blob.name[len(prefix) : -len(suffix)]
491
+
492
+ if "/" in name:
493
+ # We're not considering checkpoints in subdirectories.
494
+ continue
495
+
496
+ if not name.startswith("step"):
497
+ continue
498
+ name = name[4:]
499
+
500
+ if name.endswith("-unsharded"):
501
+ name = name[: -len("-unsharded")]
502
+
503
+ try:
504
+ step = int(name)
505
+ except ValueError:
506
+ continue
507
+
508
+ # we prefer sharded checkpoints to unsharded ones
509
+ if (
510
+ latest_step is None
511
+ or step > latest_step
512
+ or (step == latest_step and latest_checkpoint is not None and latest_checkpoint.endswith("-unsharded"))
513
+ ):
514
+ latest_step = step
515
+ latest_checkpoint = f"gs://{bucket_name}/{blob.name[:-len(suffix)]}"
516
+
517
+ return latest_checkpoint
518
+
519
+
520
+ def _get_s3_profile_name(scheme: str) -> Optional[str]:
521
+ if scheme == "s3":
522
+ # For backwards compatibility, we assume S3 uses the default profile if S3_PROFILE is not set.
523
+ return os.environ.get("S3_PROFILE")
524
+ if scheme == "r2":
525
+ profile_name = os.environ.get("R2_PROFILE")
526
+ if profile_name is None:
527
+ raise OLMoEnvironmentError(
528
+ "R2 profile name is not set. Did you forget to set the 'R2_PROFILE' env var?"
529
+ )
530
+
531
+ return profile_name
532
+ if scheme == "weka":
533
+ profile_name = os.environ.get("WEKA_PROFILE")
534
+ if profile_name is None:
535
+ raise OLMoEnvironmentError(
536
+ "Weka profile name is not set. Did you forget to set the 'WEKA_PROFILE' env var?"
537
+ )
538
+
539
+ return profile_name
540
+
541
+ raise NotImplementedError(f"Cannot get profile name for scheme {scheme}")
542
+
543
+
544
+ def _get_s3_endpoint_url(scheme: str) -> Optional[str]:
545
+ if scheme == "s3":
546
+ return None
547
+ if scheme == "r2":
548
+ r2_endpoint_url = os.environ.get("R2_ENDPOINT_URL")
549
+ if r2_endpoint_url is None:
550
+ raise OLMoEnvironmentError(
551
+ "R2 endpoint url is not set. Did you forget to set the 'R2_ENDPOINT_URL' env var?"
552
+ )
553
+
554
+ return r2_endpoint_url
555
+ if scheme == "weka":
556
+ weka_endpoint_url = os.environ.get("WEKA_ENDPOINT_URL")
557
+ if weka_endpoint_url is None:
558
+ raise OLMoEnvironmentError(
559
+ "Weka endpoint url is not set. Did you forget to set the 'WEKA_ENDPOINT_URL' env var?"
560
+ )
561
+
562
+ return weka_endpoint_url
563
+
564
+ raise NotImplementedError(f"Cannot get endpoint url for scheme {scheme}")
565
+
566
+
567
+ @cache
568
+ def _get_s3_client(scheme: str):
569
+ session = boto3.Session(profile_name=_get_s3_profile_name(scheme))
570
+ return session.client(
571
+ "s3",
572
+ endpoint_url=_get_s3_endpoint_url(scheme),
573
+ config=Config(retries={"max_attempts": 10, "mode": "standard"}),
574
+ use_ssl=not int(os.environ.get("OLMO_NO_SSL", "0")),
575
+ )
576
+
577
+
578
+ def _wait_before_retry(attempt: int):
579
+ time.sleep(min(0.5 * 2**attempt, 3.0))
580
+
581
+
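# For reference, the delay above grows exponentially and is capped at 3 seconds:
# attempt 1 -> 1.0s, attempt 2 -> 2.0s, attempt 3 and beyond -> 3.0s.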
582
+ def _s3_upload(
583
+ source: Path, scheme: str, bucket_name: str, key: str, save_overwrite: bool = False, max_attempts: int = 3
584
+ ):
585
+ err: Optional[Exception] = None
586
+ if not save_overwrite:
587
+ for attempt in range(1, max_attempts + 1):
588
+ try:
589
+ _get_s3_client(scheme).head_object(Bucket=bucket_name, Key=key)
590
+ raise FileExistsError(
591
+ f"s3://{bucket_name}/{key} already exists. Use save_overwrite to overwrite it."
592
+ )
593
+ except boto_exceptions.ClientError as e:
594
+ if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
595
+ err = None
596
+ break
597
+ err = e
598
+
599
+ if attempt < max_attempts:
600
+ log.warning("%s failed attempt %d with retriable error: %s", _s3_upload.__name__, attempt, err)
601
+ _wait_before_retry(attempt)
602
+
603
+ if err is not None:
604
+ raise OLMoNetworkError(f"Failed to check object existence during {scheme} upload") from err
605
+
606
+ try:
607
+ _get_s3_client(scheme).upload_file(source, bucket_name, key)
608
+ except boto_exceptions.ClientError as e:
609
+ raise OLMoNetworkError(f"Failed to upload to {scheme}") from e
610
+
611
+
612
+ def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3) -> int:
613
+ err: Optional[Exception] = None
614
+ for attempt in range(1, max_attempts + 1):
615
+ try:
616
+ return _get_s3_client(scheme).head_object(Bucket=bucket_name, Key=key)["ContentLength"]
617
+ except boto_exceptions.ClientError as e:
618
+ if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
619
+ raise FileNotFoundError(f"s3://{bucket_name}/{key}") from e
620
+ err = e
621
+
622
+ if attempt < max_attempts:
623
+ log.warning("%s failed attempt %d with retriable error: %s", _s3_file_size.__name__, attempt, err)
624
+ _wait_before_retry(attempt)
625
+
626
+ raise OLMoNetworkError(f"Failed to get {scheme} file size") from err
627
+
628
+
629
+ def _s3_get_bytes_range(
630
+ scheme: str, bucket_name: str, key: str, bytes_start: int, num_bytes: int, max_attempts: int = 3
631
+ ) -> bytes:
632
+ err: Optional[Exception] = None
633
+ for attempt in range(1, max_attempts + 1):
634
+ try:
635
+ return (
636
+ _get_s3_client(scheme)
637
+ .get_object(
638
+ Bucket=bucket_name, Key=key, Range=f"bytes={bytes_start}-{bytes_start + num_bytes - 1}"
639
+ )["Body"]
640
+ .read()
641
+ )
642
+ except boto_exceptions.ClientError as e:
643
+ if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
644
+ raise FileNotFoundError(f"{scheme}://{bucket_name}/{key}") from e
645
+ err = e
646
+ except (boto_exceptions.HTTPClientError, boto_exceptions.ConnectionError) as e:
647
+ # ResponseStreamingError (subclass of HTTPClientError) can happen as
648
+ # a result of a failed read from the stream (http.client.IncompleteRead).
649
+ # Retrying can help in this case.
650
+ err = e
651
+
652
+ if attempt < max_attempts:
653
+ log.warning(
654
+ "%s failed attempt %d with retriable error: %s", _s3_get_bytes_range.__name__, attempt, err
655
+ )
656
+ _wait_before_retry(attempt)
657
+
658
+ # When torch's DataLoader intercepts exceptions, it may try to re-raise them
659
+ # by recalling their constructor with a single message arg. Torch has some
660
+ # logic to deal with the absence of a single-parameter constructor, but it
661
+ # doesn't gracefully handle other possible failures in calling such a constructor.
662
+ # This can cause an irrelevant exception (e.g. KeyError: 'error'), resulting
663
+ # in us losing the true exception info. To avoid this, we change the exception
664
+ # to a type that has a single-parameter constructor.
665
+ raise OLMoNetworkError(f"Failed to get bytes range from {scheme}") from err
666
+
667
+
668
+ def _s3_find_latest_checkpoint(scheme: str, bucket_name: str, prefix: str) -> Optional[str]:
669
+ if not prefix.endswith("/"):
670
+ prefix = f"{prefix}/"
671
+ response = _get_s3_client(scheme).list_objects(Bucket=bucket_name, Prefix=prefix, Delimiter="/")
672
+ assert not response["IsTruncated"] # need to handle this if it happens
673
+ latest_step = 0
674
+ latest_checkpoint: Optional[str] = None
675
+ for item in response.get("CommonPrefixes", []):
676
+ prefix = item["Prefix"].strip("/")
677
+ checkpoint_name = os.path.split(prefix)[-1]
678
+ if not checkpoint_name.startswith("step"):
679
+ continue
680
+ try:
681
+ step = int(checkpoint_name.replace("step", "").replace("-unsharded", ""))
682
+ except ValueError:
683
+ continue
684
+ # Make sure the checkpoint dir contains a config, otherwise the checkpoint is incomplete
685
+ # (upload might have failed partway through).
686
+ try:
687
+ _s3_file_size(scheme, bucket_name, f"{prefix}/config.yaml")
688
+ except FileNotFoundError:
689
+ continue
690
+ # We prioritize sharded checkpoints over unsharded ones.
691
+ if step > latest_step or (step == latest_step and not checkpoint_name.endswith("-unsharded")):
692
+ latest_step = step
693
+ latest_checkpoint = f"{scheme}://{bucket_name}/{prefix}"
694
+ return latest_checkpoint
695
+
696
+
697
+ def _http_file_size(scheme: str, host_name: str, path: str) -> int:
698
+ import requests
699
+
700
+ response = requests.head(f"{scheme}://{host_name}/{path}", allow_redirects=True)
701
+ content_length = response.headers.get("content-length")
+ assert content_length is not None, f"no content-length header for {scheme}://{host_name}/{path}"
+ return int(content_length)
702
+
703
+
704
+ def _http_get_bytes_range(scheme: str, host_name: str, path: str, bytes_start: int, num_bytes: int) -> bytes:
705
+ import requests
706
+
707
+ response = requests.get(
708
+ f"{scheme}://{host_name}/{path}", headers={"Range": f"bytes={bytes_start}-{bytes_start+num_bytes-1}"}
709
+ )
710
+ result = response.content
711
+ assert (
712
+ len(result) == num_bytes
713
+ ), f"expected {num_bytes} bytes, got {len(result)}" # Some web servers silently ignore range requests and send everything
714
+ return result
715
+
716
+
717
+ def save_hf_dataset_to_disk(
718
+ dataset: datasets.DatasetDict | datasets.Dataset,
719
+ hf_path: str,
720
+ name: Optional[str],
721
+ split: str,
722
+ datasets_dir: PathOrStr,
723
+ ):
724
+ """
725
+ Saves a HF dataset to disk under the `datasets_dir`. It can be used to add a HF dataset
726
+ to `olmo_data` as follows:
727
+
728
+ ```
729
+ import datasets
730
+
731
+ from olmo.util import save_hf_dataset_to_disk
732
+
733
+ path, name, split = ...
734
+
735
+ dataset = datasets.load_dataset(path, name=name, split=split)
736
+ save_hf_dataset_to_disk(dataset, path, name, split, "olmo_data/hf_datasets")
737
+ ```
738
+ """
739
+ dataset_path = Path(datasets_dir) / hf_path / (name or "none") / split
740
+ return dataset.save_to_disk(str(dataset_path))
741
+
742
+
743
+ def load_hf_dataset(path: str, name: Optional[str], split: str):
744
+ """
745
+ Loads a HuggingFace dataset. The dataset is assumed to have been saved with
746
+ `save_hf_dataset_to_disk` and to live under `olmo_data/hf_datasets`.
747
+ """
748
+ dataset_rel_path = os.path.join("hf_datasets", path, name or "none", split)
749
+ with get_data_path(dataset_rel_path) as dataset_path:
750
+ if not dataset_path.is_dir():
751
+ raise NotADirectoryError(
752
+ f"HF dataset {path} name {name} split {split} not found in directory {dataset_rel_path}"
753
+ )
754
+ return datasets.load_from_disk(str(dataset_path))
755
+
756
+
757
+ def load_oe_eval_requests(path: str, name: Optional[str] = None, split: Optional[str] = None):
758
+ """
759
+ Loads an oe-eval request file from `olmo_data/oe_eval_tasks`.
760
+ TODO: Add support for loading from S3 instead?
761
+ """
762
+ dataset_rel_path = os.path.join("oe_eval_tasks", path)
763
+ if name is not None:
764
+ dataset_rel_path = os.path.join(dataset_rel_path, name)
765
+ with get_data_path(dataset_rel_path) as dataset_path:
766
+ if not dataset_path.is_dir():
767
+ raise NotADirectoryError(f"OE Eval dataset not found in directory {dataset_rel_path}")
768
+ data_file = dataset_path / "requests.jsonl.gz"
769
+ if not data_file.is_file():
770
+ data_file = dataset_path / "requests.jsonl"
771
+ if not data_file.is_file():
772
+ raise FileNotFoundError(
773
+ f"OE Eval dataset file requests-{split}.jsonl(.gz) missing in directory {dataset_rel_path}"
774
+ )
775
+ requests = []
776
+ if data_file.suffix == ".gz":
777
+ with gzip.open(data_file, "r") as file:
778
+ for line in file:
779
+ requests.append(json.loads(line.decode("utf-8").strip()))
780
+ else:
781
+ with open(data_file, "r") as file:
782
+ for line2 in file:
783
+ requests.append(json.loads(line2.strip()))
784
+ config = None
785
+ config_file = dataset_path / "config.json"
786
+ if config_file.is_file():
787
+ with open(config_file, "r") as file:
788
+ config = json.load(file)
789
+ return config, requests
790
+
791
+
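# Illustrative sketch (not part of the original module): loading a packaged oe-eval task
# and inspecting its requests. The task path and name below are hypothetical.
def _example_load_oe_eval() -> None:
    task_config, reqs = load_oe_eval_requests("arc_challenge", name="rc_0shot")
    log.info("loaded %d requests (config present: %s)", len(reqs), task_config is not None)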
792
+ def default_thread_count() -> int:
793
+ return int(os.environ.get("OLMO_NUM_THREADS") or min(32, (os.cpu_count() or 1) + 4))
794
+
795
+
796
+ def pass_through_fn(fn, *args, **kwargs):
797
+ return fn(*args, **kwargs)
798
+
799
+
800
+ def threaded_generator(g, maxsize: int = 16, thread_name: Optional[str] = None):
801
+ q: Queue = Queue(maxsize=maxsize)
802
+
803
+ sentinel = object()
804
+
805
+ def fill_queue():
806
+ try:
807
+ for value in g:
808
+ q.put(value)
809
+ except Exception as e:
810
+ q.put(e)
811
+ finally:
812
+ q.put(sentinel)
813
+
814
+ thread_name = thread_name or repr(g)
815
+ thread = Thread(name=thread_name, target=fill_queue, daemon=True)
816
+ thread.start()
817
+
818
+ for x in iter(q.get, sentinel):
819
+ if isinstance(x, Exception):
820
+ raise OLMoThreadError(f"generator thread {thread_name} failed") from x
821
+ else:
822
+ yield x
823
+
824
+
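# Illustrative sketch (not part of the original module): `threaded_generator` runs a slow
# producer on a daemon thread and hands items over through a bounded queue, so the consumer
# overlaps its own work with the producer's I/O. The function names are hypothetical.
def _example_threaded_generator() -> None:
    def slow_producer():
        for i in range(10):
            time.sleep(0.1)  # stand-in for slow I/O
            yield i

    for item in threaded_generator(slow_producer(), maxsize=4, thread_name="slow_producer"):
        log.info("got %s", item)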
825
+ def roundrobin(*iterables):
826
+ """
827
+ Call the given iterables in a round-robin fashion. For example:
828
+ ``roundrobin('ABC', 'D', 'EF') --> A D E B F C``
829
+ """
830
+ # Adapted from https://docs.python.org/3/library/itertools.html#itertools-recipes
831
+ num_active = len(iterables)
832
+ nexts = cycle(iter(it).__next__ for it in iterables)
833
+ while num_active:
834
+ try:
835
+ for next in nexts:
836
+ yield next()
837
+ except StopIteration:
838
+ # Remove the iterator we just exhausted from the cycle.
839
+ num_active -= 1
840
+ nexts = cycle(islice(nexts, num_active))
841
+
842
+
843
+ def add_cached_path_clients():
844
+ add_scheme_client(WekaClient)
845
+
846
+
847
+ class WekaClient(SchemeClient):
848
+ recoverable_errors = SchemeClient.recoverable_errors + (
849
+ boto_exceptions.HTTPClientError,
850
+ boto_exceptions.ConnectionError,
851
+ )
852
+
853
+ scheme = "weka"
854
+
855
+ def __init__(self, resource: str) -> None:
856
+ SchemeClient.__init__(self, resource)
857
+ self.bucket_name, self.path = WekaClient._split_cloud_path(resource, "weka")
858
+ self.s3 = _get_s3_client("weka")
859
+ self.object_info = None
860
+
861
+ @staticmethod
862
+ def _split_cloud_path(url: str, provider: str) -> Tuple[str, str]:
863
+ """Split a full s3 path into the bucket name and path."""
864
+ from urllib.parse import urlparse
865
+
866
+ parsed = urlparse(url)
867
+ if not parsed.netloc or not parsed.path:
868
+ raise ValueError("bad {} path {}".format(provider, url))
869
+ bucket_name = parsed.netloc
870
+ provider_path = parsed.path
871
+ # Remove '/' at beginning of path.
872
+ if provider_path.startswith("/"):
873
+ provider_path = provider_path[1:]
874
+ return bucket_name, provider_path
875
+
876
+ def _ensure_object_info(self):
877
+ if self.object_info is None:
878
+ try:
879
+ self.object_info = self.s3.head_object(Bucket=self.bucket_name, Key=self.path)
880
+ except boto_exceptions.ClientError as e:
881
+ if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
882
+ raise FileNotFoundError(f"weka://{self.bucket_name}/{self.path}") from e
883
+ raise e
884
+
885
+ def get_etag(self) -> Optional[str]:
886
+ self._ensure_object_info()
887
+ assert self.object_info is not None
888
+ return self.object_info.get("ETag")
889
+
890
+ def get_size(self) -> Optional[int]:
891
+ self._ensure_object_info()
892
+ assert self.object_info is not None
893
+ return self.object_info.get("ContentLength")
894
+
895
+ def get_resource(self, temp_file: io.BufferedWriter) -> None:
896
+ self.s3.download_fileobj(Fileobj=temp_file, Bucket=self.bucket_name, Key=self.path)
897
+
898
+ def get_bytes_range(self, index: int, length: int) -> bytes:
899
+ response = self.s3.get_object(
900
+ Bucket=self.bucket_name, Key=self.path, Range=f"bytes={index}-{index+length-1}"
901
+ )
902
+ return response["Body"].read()
903
+
904
+
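# Illustrative sketch (not part of the original module): once `add_cached_path_clients()`
# has registered `WekaClient`, `weka://` URLs can be resolved with cached_path like any
# other supported scheme. The bucket and key below are hypothetical.
def _example_weka_cached_path() -> None:
    from cached_path import cached_path

    add_cached_path_clients()
    local_file = cached_path("weka://my-bucket/datasets/tokens.npy")  # hypothetical URL
    log.info("cached to %s", local_file)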
905
+ def flatten_dict(dictionary, parent_key="", separator=".", include_lists=False):
906
+ """
907
+ Flatten a nested dictionary into a single-level dictionary.
908
+
909
+ Args:
910
+ dictionary (dict): The nested dictionary to be flattened.
911
+ parent_key (str, optional): The parent key to be prepended to the keys of the flattened dictionary. Defaults to "".
912
+ separator (str, optional): The separator to be used between the parent key and the keys of the flattened dictionary. Defaults to ".".
913
+ include_lists (bool, optional): Whether to convert lists to dictionaries with integer keys. Defaults to False.
914
+
915
+ Returns:
916
+ dict: The flattened dictionary.
917
+
918
+ """
919
+ d: Dict[str, Any] = {}
920
+ for key, value in dictionary.items():
921
+ new_key = parent_key + separator + key if parent_key else key
922
+ # convert lists to dict with key <int>
923
+ if isinstance(value, list) and include_lists:
924
+ value = {f"{i}": v for i, v in enumerate(value)}
925
+ if isinstance(value, MutableMapping):
926
+ d.update(**flatten_dict(value, new_key, separator=separator, include_lists=include_lists))
927
+ else:
928
+ d[new_key] = value
929
+ return d
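# Illustrative examples (not part of the original module) of `flatten_dict` behavior:
#   flatten_dict({"optimizer": {"lr": 1e-4}, "layers": [8, 16]})
#     -> {"optimizer.lr": 1e-4, "layers": [8, 16]}
#   flatten_dict({"layers": [8, 16]}, include_lists=True)
#     -> {"layers.0": 8, "layers.1": 16}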
version.py ADDED
@@ -0,0 +1,11 @@
1
+ _MAJOR = "0"
2
+ _MINOR = "6"
3
+ # On main and in a nightly release the patch should be one ahead of the last
4
+ # released build.
5
+ _PATCH = "0"
6
+ # This is mainly for nightly builds which have the suffix ".dev$DATE". See
7
+ # https://semver.org/#is-v123-a-semantic-version for the semantics.
8
+ _SUFFIX = ""
9
+
10
+ VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
11
+ VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)