feat: support pod (#139)
- src/dalle_mini/data.py +37 -2
- src/dalle_mini/model/modeling.py +25 -15
- src/dalle_mini/model/utils.py +0 -6
- tools/inference/inference_pipeline.ipynb +15 -9
- tools/train/config/medium/config.json +0 -1
- tools/train/config/mega/config.json +8 -10
- tools/train/config/micro/config.json +6 -8
- tools/train/config/mini/config.json +0 -1
- tools/train/scalable_shampoo/README.md +7 -0
- tools/train/{distributed_shampoo.py → scalable_shampoo/distributed_shampoo.py} +67 -170
- tools/train/scalable_shampoo/quantization_utils.py +124 -0
- tools/train/scalable_shampoo/sm3.py +176 -0
- tools/train/scalable_shampoo/symmetric_matrices/symmetric_matrices.py +211 -0
- tools/train/train.py +197 -128
src/dalle_mini/data.py
CHANGED

@@ -27,6 +27,7 @@ class Dataset:
     do_eval: bool = True
     seed_dataset: int = None
     shard_by_host: bool = False
+    blank_caption_prob: float = 0.0
     train_dataset: Dataset = field(init=False)
     eval_dataset: Dataset = field(init=False)
     rng_dataset: jnp.ndarray = field(init=False)

@@ -34,6 +35,11 @@ class Dataset:

     def __post_init__(self):
         self.multi_hosts = jax.process_count() > 1
+        # feed blank captions only in streaming mode for now
+        if self.blank_caption_prob:
+            assert (
+                self.streaming is True
+            ), "blank_caption_prob can only be used in streaming mode"
         # define data_files
         if self.train_file is not None or self.validation_file is not None:
             # accept braceexpand notation

@@ -101,6 +107,25 @@ class Dataset:
             self.seed_dataset = np.random.get_state()[1][0]
         self.rng_dataset = jax.random.PRNGKey(self.seed_dataset)

+        # blank captions
+        if self.blank_caption_prob:
+            partial_blank_caption_function = partial(
+                blank_caption_function,
+                text_column=self.text_column,
+                blank_caption_prob=self.blank_caption_prob,
+            )
+            if hasattr(self, "train_dataset"):
+                self.train_dataset = (
+                    self.train_dataset.map(partial_blank_caption_function)
+                    if self.streaming
+                    else self.train_dataset.map(
+                        partial_blank_caption_function,
+                        num_proc=self.preprocessing_num_workers,
+                        load_from_cache_file=False,
+                        desc="Blanking some captions",
+                    )
+                )
+
         # normalize text
         if normalize_text:
             text_normalizer = TextNormalizer()

@@ -144,6 +169,10 @@ class Dataset:
                 getattr(self, ds).map(
                     partial_preprocess_function,
                     batched=True,
+                    remove_columns=[
+                        self.text_column,
+                        self.encoding_column,
+                    ],
                 )
                 if self.streaming
                 else getattr(self, ds).map(

@@ -193,8 +222,8 @@ class Dataset:
         while (self.multi_hosts and split == "train") or first_loop:
             # in multi-host, we run forever (no epoch) as hosts need to stop
             # at the same time and training data may not be split equally
-            # For validation data we put the entire
-            #
+            # For validation data we put the entire batch on each host and then
+            # keep only the one specific to each host (could be improved but not necessary)
             if epoch is not None:
                 assert split == "train"
                 # reshuffle training data at each epoch

@@ -252,6 +281,12 @@ def shift_tokens_right(input_ids: np.array, decoder_start_token_id: int):
     return shifted_input_ids


+def blank_caption_function(example, text_column, blank_caption_prob):
+    if blank_caption_prob and np.random.rand() < blank_caption_prob:
+        example[text_column] = ""
+    return example
+
+
 def normalize_function(example, text_column, text_normalizer):
     example[text_column] = text_normalizer(example[text_column])
     return example
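The new `blank_caption_prob` option randomly replaces a fraction of training captions with an empty string (streaming mode only for now). A minimal standalone sketch of what `blank_caption_function` does when mapped over examples; the "caption" column name and 50% probability are purely illustrative, not taken from the PR:

import numpy as np
from functools import partial

def blank_caption_function(example, text_column, blank_caption_prob):
    # with probability blank_caption_prob, drop the caption entirely
    if blank_caption_prob and np.random.rand() < blank_caption_prob:
        example[text_column] = ""
    return example

fn = partial(blank_caption_function, text_column="caption", blank_caption_prob=0.5)
examples = [{"caption": f"sample {i}"} for i in range(4)]
print([fn(dict(e))["caption"] for e in examples])  # roughly half of the captions become ""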
src/dalle_mini/model/modeling.py
CHANGED

@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2021 The Fairseq Authors and The Google Flax Team Authors And The HuggingFace Inc. team and
+# Copyright 2021-2022 The Fairseq Authors and The Google Flax Team Authors And The HuggingFace Inc. team and & DALL·E Mini team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -328,6 +328,7 @@ class FlaxBartPreTrainedModel(FlaxBartPreTrainedModel):
         dtype: jnp.dtype = jnp.float32,
         abstract_init: bool = False,
         load_on_cpu: bool = False,
+        init_weights: bool = True,
         **kwargs,
     ):
         module = self.module_class(config=config, dtype=dtype, **kwargs)

@@ -347,25 +348,34 @@ class FlaxBartPreTrainedModel(FlaxBartPreTrainedModel):
         self.key = PRNGKey(seed)
         self.dtype = dtype

+        if init_weights:
+            # get shape of params only
+            random_params = self.init_weights(
+                self.key,
+                input_shape,
+                abstract_init=abstract_init,
+                load_on_cpu=load_on_cpu,
+            )
+
+            # save required_params as set
+            self._required_params = set(flatten_dict(unfreeze(random_params)).keys())
+            self.params = random_params
+
+    def init_weights(
+        self, rng=None, input_shape=(1, 1), abstract_init=False, load_on_cpu=False
+    ):
+        if rng is None:
+            rng = self.key
+        init_fn = super().init_weights
+        if load_on_cpu:
+            init_fn = jax.jit(init_fn, static_argnums=(1,), backend="cpu")
         if abstract_init:
             # only set shape and dtype, load parameters separately
             init_fn = partial(init_fn, input_shape=input_shape)
+            params = jax.eval_shape(init_fn, rng)
         else:
-            # save required_params as set
-            self._required_params = set(flatten_dict(unfreeze(random_params)).keys())
-            self.params = random_params
+            params = init_fn(rng, input_shape)
+        return params

     @property
     def num_params(self):
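The constructor now delegates to an overridable `init_weights` method that can either return abstract shapes (via `jax.eval_shape`) or materialize parameters on the host (via a CPU-backed `jax.jit`). A small self-contained sketch of those two JAX tricks, using a generic init function rather than the DalleBart API:

import jax
import jax.numpy as jnp

def init_fn(rng, input_shape):
    # stand-in for a model's parameter initialization
    return {"w": jax.random.normal(rng, input_shape)}

rng = jax.random.PRNGKey(0)

# abstract init: only shapes/dtypes are computed, no device memory is allocated
abstract_params = jax.eval_shape(lambda r: init_fn(r, (1, 256)), rng)
print(abstract_params["w"].shape, abstract_params["w"].dtype)

# load_on_cpu-style init: materialize parameters on the host instead of the accelerator
cpu_init = jax.jit(init_fn, static_argnums=(1,), backend="cpu")
real_params = cpu_init(rng, (1, 256))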
src/dalle_mini/model/utils.py
CHANGED

@@ -23,12 +23,6 @@ class PretrainedFromWandbMixin:
         else:
             artifact = wandb.Api().artifact(pretrained_model_name_or_path)
             pretrained_model_name_or_path = artifact.download(tmp_dir)
-            if artifact.metadata.get("bucket_path"):
-                pretrained_model_name_or_path = artifact.metadata["bucket_path"]
-
-        if pretrained_model_name_or_path.startswith("gs://"):
-            copy_blobs(pretrained_model_name_or_path, tmp_dir)
-            pretrained_model_name_or_path = tmp_dir

         return super(PretrainedFromWandbMixin, cls).from_pretrained(
             pretrained_model_name_or_path, *model_args, **kwargs
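With the bucket_path / gs:// branch removed, a model referenced by a wandb artifact is always loaded from the locally downloaded artifact directory. A hedged sketch of that remaining path; the artifact reference string is hypothetical, not from the PR:

import tempfile
import wandb

artifact_ref = "dalle-mini/dalle-mini/model-run:latest"  # hypothetical artifact reference
with tempfile.TemporaryDirectory() as tmp_dir:
    artifact = wandb.Api().artifact(artifact_ref)
    local_path = artifact.download(tmp_dir)
    # model = DalleBart.from_pretrained(local_path)  # as PretrainedFromWandbMixin now does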
tools/inference/inference_pipeline.ipynb
CHANGED

@@ -83,7 +83,7 @@
 "VQGAN_COMMIT_ID = \"e93a26e7707683d349bf5d5c41c5b0ef69b677a9\"\n",
 "\n",
 "# CLIP model\n",
-"CLIP_REPO = \"openai/clip-vit-
+"CLIP_REPO = \"openai/clip-vit-large-patch14\"\n",
 "CLIP_COMMIT_ID = None"
 ]
 },

@@ -129,7 +129,6 @@
 "from dalle_mini.model import DalleBart, DalleBartTokenizer\n",
 "from vqgan_jax.modeling_flax_vqgan import VQModel\n",
 "from transformers import CLIPProcessor, FlaxCLIPModel\n",
-"import wandb\n",
 "\n",
 "# Load dalle-mini\n",
 "model = DalleBart.from_pretrained(\n",

@@ -168,9 +167,9 @@
 "if dtype == jnp.bfloat16:\n",
 "    model.params = model.to_bf16(model.params)\n",
 "\n",
+"model._params = replicate(model.params)\n",
+"vqgan._params = replicate(vqgan.params)\n",
+"clip._params = replicate(clip.params)"
 ]
 },
 {

@@ -292,7 +291,7 @@
 },
 "outputs": [],
 "source": [
-"prompt = \"
+"prompt = \"view of the beach during sunset\""
 ]
 },
 {

@@ -414,12 +413,12 @@
 "    key, subkey = jax.random.split(key)\n",
 "    # generate images\n",
 "    encoded_images = p_generate(\n",
-"        tokenized_prompt, shard_prng_key(subkey),
+"        tokenized_prompt, shard_prng_key(subkey), model.params, gen_top_k, gen_top_p\n",
 "    )\n",
 "    # remove BOS\n",
 "    encoded_images = encoded_images.sequences[..., 1:]\n",
 "    # decode images\n",
-"    decoded_images = p_decode(encoded_images,
+"    decoded_images = p_decode(encoded_images, vqgan.params)\n",
 "    decoded_images = decoded_images.clip(0.0, 1.0).reshape((-1, 256, 256, 3))\n",
 "    for img in decoded_images:\n",
 "        images.append(Image.fromarray(np.asarray(img * 255, dtype=np.uint8)))"

@@ -453,7 +452,7 @@
 "    max_length=77,\n",
 "    truncation=True,\n",
 ").data\n",
-"logits = p_clip(shard(clip_inputs),
+"logits = p_clip(shard(clip_inputs), clip.params)\n",
 "logits = logits.squeeze().flatten()"
 ]
 },

@@ -479,6 +478,13 @@
 "    display(images[idx])\n",
 "    print(f\"Score: {logits[idx]:.2f}\\n\")"
 ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
 }
 ],
 "metadata": {
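The notebook now replicates parameters explicitly (stashed on `_params`) and passes them as arguments to the pmapped functions instead of relying on implicitly replicated model state. A generic sketch of that pattern, with placeholder function and parameter names standing in for p_generate / p_decode / p_clip:

import jax
import jax.numpy as jnp
from flax.jax_utils import replicate

@jax.pmap
def p_apply(batch, params):
    # toy computation standing in for the pmapped inference functions
    return batch @ params["w"]

params = {"w": jnp.ones((4, 4))}
replicated_params = replicate(params)  # one copy of the params per local device
batch = jnp.ones((jax.local_device_count(), 2, 4))
out = p_apply(batch, replicated_params)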
tools/train/config/medium/config.json
CHANGED

@@ -28,6 +28,5 @@
   "pad_token_id": 16385,
   "scale_embedding": false,
   "tie_word_embeddings": false,
-  "transformers_version": "4.13.0.dev0",
   "use_cache": true
 }
tools/train/config/mega/config.json
CHANGED

@@ -5,21 +5,20 @@
   "bos_token_id": 16385,
   "classifier_dropout": 0.0,
   "d_model": 2048,
-  "decoder_attention_heads":
+  "decoder_attention_heads": 32,
-  "decoder_ffn_dim":
+  "decoder_ffn_dim": 8192,
   "decoder_layerdrop": 0.0,
-  "decoder_layers":
+  "decoder_layers": 24,
   "decoder_start_token_id": 16384,
-  "dropout": 0.
+  "dropout": 0.0,
-  "encoder_attention_heads":
+  "encoder_attention_heads": 32,
-  "encoder_ffn_dim":
+  "encoder_ffn_dim": 8192,
   "encoder_layerdrop": 0.0,
-  "encoder_layers":
+  "encoder_layers": 24,
   "encoder_vocab_size": 50264,
   "eos_token_id": 16385,
-  "gradient_checkpointing": false,
   "image_length": 256,
-  "image_vocab_size":
+  "image_vocab_size": 16391,
   "init_std": 0.01,
   "is_encoder_decoder": true,
   "max_text_length": 64,

@@ -28,6 +27,5 @@
   "pad_token_id": 16385,
   "scale_embedding": false,
   "tie_word_embeddings": false,
-  "transformers_version": "4.13.0.dev0",
   "use_cache": true
 }
tools/train/config/micro/config.json
CHANGED

@@ -4,22 +4,21 @@
   "attention_dropout": 0.0,
   "bos_token_id": 16385,
   "classifier_dropout": 0.0,
-  "d_model":
+  "d_model": 256,
-  "decoder_attention_heads":
+  "decoder_attention_heads": 2,
-  "decoder_ffn_dim":
+  "decoder_ffn_dim": 256,
   "decoder_layerdrop": 0.0,
   "decoder_layers": 2,
   "decoder_start_token_id": 16384,
   "dropout": 0.0,
-  "encoder_attention_heads":
+  "encoder_attention_heads": 2,
-  "encoder_ffn_dim":
+  "encoder_ffn_dim": 256,
   "encoder_layerdrop": 0.0,
   "encoder_layers": 2,
   "encoder_vocab_size": 50264,
   "eos_token_id": 16385,
-  "gradient_checkpointing": false,
   "image_length": 256,
-  "image_vocab_size":
+  "image_vocab_size": 16391,
   "init_std": 0.02,
   "is_encoder_decoder": true,
   "max_text_length": 64,

@@ -28,6 +27,5 @@
   "pad_token_id": 16385,
   "scale_embedding": false,
   "tie_word_embeddings": false,
-  "transformers_version": "4.13.0.dev0",
   "use_cache": true
 }
tools/train/config/mini/config.json
CHANGED

@@ -28,6 +28,5 @@
   "pad_token_id": 16385,
   "scale_embedding": false,
   "tie_word_embeddings": false,
-  "transformers_version": "4.13.0.dev0",
   "use_cache": true
 }
tools/train/scalable_shampoo/README.md
ADDED

# Notes

Files copied from [google-research/scalable_shampoo/optax](https://github.com/google-research/google-research/tree/master/scalable_shampoo/optax).

Imports have been modified to be relative.

This will be replaced with `optax-shampoo` package eventually.
tools/train/{distributed_shampoo.py → scalable_shampoo/distributed_shampoo.py}
RENAMED

@@ -1,5 +1,3 @@
-# file from: https://github.com/google-research/google-research/blob/master/scalable_shampoo/optax/distributed_shampoo.py
-
 # coding=utf-8
 # Copyright 2022 The Google Research Authors.
 #

@@ -44,107 +42,12 @@ import optax
 from flax import struct
 from jax import lax

+from .quantization_utils import QuantizedValue
+
+# Dtype for inverse-pth root routine
+# Switch to f64 if you have hardware that supports it. Enable the jax flag
+# jax_enable_x64 for this to work, otherwise it will default to float32.
+_MAT_INV_PTH_ROOT_DTYPE = jnp.float64
-# (removed here: the QuantizedValue dataclass and its quantize/to_float methods,
-#  now moved to scalable_shampoo/quantization_utils.py, added below)


 @struct.dataclass

@@ -193,24 +96,21 @@ class LocalShardedParameterStats:


 def init_training_metrics(num_statistics):
-    if
-    else
+    # Since the downstream apis expect a jnp.array - we create a dummy one if
+    # num_statistics=0.
+    n = 1 if not num_statistics else num_statistics
+    return TrainingMetrics(jnp.zeros([n], jnp.float32))


 def init_training_metrics_shapes(num_statistics):
-    if
-    else
+    # Since the downstream apis expect a jnp.array - we create a dummy one if
+    # num_statistics=0.
+    n = 1 if not num_statistics else num_statistics
+    return TrainingMetrics([[n], jnp.float32])


-def init_training_metrics_pspec(
-        return TrainingMetrics(pjit.PartitionSpec())
-    else:
-        return TrainingMetrics(None)
+def init_training_metrics_pspec():
+    return TrainingMetrics(pjit.PartitionSpec())


 class ShardedShampooStats(NamedTuple):

@@ -296,6 +196,30 @@ def power_iteration(
     return v_out, s_out


+def mat_power(mat_m, p, precision=lax.Precision.HIGHEST):
+    """A simple matrix power method. M^p where p can be TracedValue."""
+    power = jnp.eye(mat_m.shape[0], dtype=_MAT_INV_PTH_ROOT_DTYPE)
+
+    def _iter_condition(state):
+        i, _, _ = state
+        return i > 0
+
+    def _iter_body(state):
+        i, power, mat = state
+
+        power = jax.lax.cond(
+            i % 2 == 1,
+            lambda: jnp.matmul(mat, power, precision=precision),
+            lambda: power,
+        )
+        i //= 2
+        mat = jnp.matmul(mat, mat, precision=precision)
+        return i, power, mat
+
+    _, result, _ = lax.while_loop(_iter_condition, _iter_body, (p, power, mat_m))
+    return result
+
+
 def matrix_inverse_pth_root(
     matrix,
     p,

@@ -332,57 +256,19 @@ def matrix_inverse_pth_root(

     assert matrix.shape[0] == matrix.shape[1]

-    # We use
-    # Switch to f64 if you have hardware that supports it.
+    # We use _MAT_INV_PTH_ROOT_DTYPE for the matrix inverse pth root.
+    # Switch to f64 if you have hardware that supports it. Enable the jax flag
+    # jax_enable_x64 for this to work.
     matrix_size = matrix.shape[0]
+    orig_dtype = matrix.dtype
+    matrix = matrix.astype(_MAT_INV_PTH_ROOT_DTYPE)
+    alpha = jnp.asarray(-1.0 / p, _MAT_INV_PTH_ROOT_DTYPE)
+    identity = jnp.eye(matrix_size, dtype=_MAT_INV_PTH_ROOT_DTYPE)
     _, max_ev = power_iteration(
         matrix=matrix, num_iters=100, error_tolerance=1e-6, precision=precision
     )
     ridge_epsilon = ridge_epsilon * jnp.maximum(max_ev, 1e-6)

-    def _unrolled_mat_pow_1(mat_m):
-        """Computes mat_m^1."""
-        return mat_m
-
-    def _unrolled_mat_pow_2(mat_m):
-        """Computes mat_m^2."""
-        return jnp.matmul(mat_m, mat_m, precision=precision)
-
-    def _unrolled_mat_pow_4(mat_m):
-        """Computes mat_m^4."""
-        mat_pow_2 = _unrolled_mat_pow_2(mat_m)
-        return jnp.matmul(mat_pow_2, mat_pow_2, precision=precision)
-
-    def _unrolled_mat_pow_8(mat_m):
-        """Computes mat_m^4."""
-        mat_pow_4 = _unrolled_mat_pow_4(mat_m)
-        return jnp.matmul(mat_pow_4, mat_pow_4, precision=precision)
-
-    def mat_power(mat_m, p):
-        """Computes mat_m^p, for p == 1, 2, 4 or 8.
-
-        Args:
-          mat_m: a square matrix
-          p: a positive integer
-
-        Returns:
-          mat_m^p
-        """
-        # We unrolled the loop for performance reasons.
-        exponent = jnp.round(jnp.log2(p))
-        return lax.switch(
-            jnp.asarray(exponent, jnp.int32),
-            [
-                _unrolled_mat_pow_1,
-                _unrolled_mat_pow_2,
-                _unrolled_mat_pow_4,
-                _unrolled_mat_pow_8,
-            ],
-            (mat_m),
-        )
-
     def _iter_condition(state):
         (i, unused_mat_m, unused_mat_h, unused_old_mat_h, error, run_step) = state
         error_above_threshold = jnp.logical_and(error > error_tolerance, run_step)

@@ -412,10 +298,10 @@ def matrix_inverse_pth_root(
     _, mat_m, mat_h, old_mat_h, error, convergence = lax.while_loop(
         _iter_condition, _iter_body, init_state
     )
-    error = jnp.max(jnp.abs(mat_m - identity))
+    error = jnp.max(jnp.abs(mat_m - identity)).astype(jnp.float32)
     is_converged = jnp.asarray(convergence, old_mat_h.dtype)
     resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
-    resultant_mat_h = jnp.asarray(resultant_mat_h,
+    resultant_mat_h = jnp.asarray(resultant_mat_h, orig_dtype)
     return resultant_mat_h, error

@@ -433,6 +319,9 @@ def merge_small_dims(shape_to_merge, max_dim):
     Returns:
       Merged shape.
     """
+    if shape_to_merge and np.all(np.array(shape_to_merge) == 1):
+        return [1]
+
     resulting_shape = []
     product = 1
     for d in shape_to_merge:

@@ -975,16 +864,22 @@ def distributed_shampoo(
         )

         local_stats = jax.tree_unflatten(treedef, local_stats_flat)
+        to_pad = -len(padded_statistics) % num_devices_for_pjit
+        if max_size == 0:
+            to_pad = num_devices_for_pjit
+            max_size = block_size
+            stat_dtype = jnp.float32
+        else:
+            stat_dtype = padded_statistics[0].dtype
         # Pad the statistics and preconditioner matrices to be a multiple of
         # num devices.
         # TODO(rohananil): Relax to only the size of the mesh axis where the dim
         # is split on.
-        to_pad = -len(padded_statistics) % num_devices_for_pjit
         padded_statistics.extend(
-            [jnp.eye(max_size, dtype=
+            [jnp.eye(max_size, dtype=stat_dtype) for _ in range(to_pad)]
         )
         padded_preconditioners.extend(
-            [jnp.eye(max_size, dtype=
+            [jnp.eye(max_size, dtype=stat_dtype) for _ in range(to_pad)]
         )
         exponents.extend([1 for _ in range(to_pad)])
         global_stats = GlobalShardedParameterStats(

@@ -1016,7 +911,7 @@ def distributed_shampoo(
         if pspec and len(pspec) > 1:
             return pjit.PartitionSpec(*pspec[1:])
         else:
-            return
+            return []

     def sharded_init_partition_spec_fn(
         params, params_partition_spec, partition_spec_for_statistics

@@ -1102,7 +997,7 @@ def distributed_shampoo(
                 False,
                 list(param.shape),
             ),
-            init_training_metrics_pspec(
+            init_training_metrics_pspec(),
             index_start,
             sizes,
         )

@@ -1209,6 +1104,9 @@ def distributed_shampoo(
         max_statistics_size = _max_statistics_size_from_params(params_flat)
         to_pad = -num_statistics % num_devices_for_pjit
         num_statistics += to_pad
+        if num_statistics == 0:
+            num_statistics = num_devices_for_pjit
+            max_statistics_size = block_size
         statistics_shape = [num_statistics, max_statistics_size, max_statistics_size]
         global_stats = GlobalShardedParameterStats(
             [statistics_shape, jnp.float32],

@@ -2069,7 +1967,7 @@ def distributed_shampoo(

         scaled_grad = grad
         if graft_type == GraftingType.ADAGRAD_NORMALIZED:
-            scaled_grad = grad / jnp.linalg.norm(grad)
+            scaled_grad = grad / (jnp.linalg.norm(grad) + 1e-16)

         new_diagonal_statistics = state.diagonal_statistics.to_float() + jnp.square(
             scaled_grad

@@ -2085,7 +1983,7 @@ def distributed_shampoo(

         scaled_grad = grad
         if graft_type == GraftingType.RMSPROP_NORMALIZED:
-            scaled_grad = grad / jnp.linalg.norm(grad)
+            scaled_grad = grad / (jnp.linalg.norm(grad) + 1e-16)

         w1 = beta2
         w2 = beta2 if beta2 == 1.0 else (1.0 - beta2)

@@ -2212,7 +2110,6 @@ def distributed_shampoo(
         new_stats_flat = _compute_preconditioners(
             new_stats_flat, params_flat, state.count
         )
-
         outputs = jax.tree_multimap(
             lambda g, s, p: _transform_grad(g, s, p, state.count),
             grads_flat,
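The new `mat_power` replaces the unrolled `lax.switch` version with exponentiation by squaring inside a `lax.while_loop`, so the exponent can be a traced value. The same loop in plain Python/NumPy, as a reference for what the while_loop computes:

import numpy as np

def mat_power_reference(mat, p):
    # exponentiation by squaring: O(log p) matrix multiplies
    power = np.eye(mat.shape[0])
    while p > 0:
        if p % 2 == 1:
            power = mat @ power
        p //= 2
        mat = mat @ mat
    return power

m = np.array([[2.0, 0.0], [0.0, 3.0]])
assert np.allclose(mat_power_reference(m, 5), np.linalg.matrix_power(m, 5))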
tools/train/scalable_shampoo/quantization_utils.py
ADDED

# coding=utf-8
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helper routines for quantization."""

from typing import Any

import chex
import jax.numpy as jnp
from flax import struct


# pylint:disable=no-value-for-parameter
@struct.dataclass
class QuantizedValue:
    """State associated with quantized value."""

    quantized: chex.Array
    diagonal: chex.Array  # Diagonal (if extract_diagonal is set)
    bucket_size: chex.Array
    quantized_dtype: jnp.dtype = struct.field(
        pytree_node=False
    )  # Dtype for the quantized value.
    extract_diagonal: bool = struct.field(pytree_node=False)  # In case its centered.
    shape: Any = struct.field(pytree_node=False)  # Shape of the tensor.

    @classmethod
    def from_float_value(cls, fvalue, quantized_dtype, extract_diagonal=False):
        if isinstance(fvalue, list) and not fvalue:
            return QuantizedValue([], [], [], quantized_dtype, extract_diagonal, [])
        quantized, diagonal_fvalue, bucket_size = QuantizedValue.quantize(
            fvalue, quantized_dtype, extract_diagonal
        )
        return QuantizedValue(
            quantized,
            diagonal_fvalue,
            bucket_size,
            quantized_dtype,
            extract_diagonal,
            list(quantized.shape),
        )

    # Quantization is from Lingvo JAX optimizers.
    # We extend it for int16 quantization of PSD matrices.
    @classmethod
    def quantize(cls, fvalue, quantized_dtype, extract_diagonal=False):
        """Returns quantized value and the bucket."""
        if quantized_dtype == jnp.float32:
            return fvalue, [], []
        elif quantized_dtype == jnp.bfloat16:
            return fvalue.astype(jnp.bfloat16), [], []

        float_dtype = fvalue.dtype
        if quantized_dtype == jnp.int8:
            # value -128 is not used.
            num_buckets = jnp.array(127.0, dtype=float_dtype)
        elif quantized_dtype == jnp.int16:
            # value -32768 is not used.
            num_buckets = jnp.array(32767.0, dtype=float_dtype)
        else:
            raise ValueError(f"Quantized dtype {quantized_dtype} not supported.")
        # max value is mapped to num_buckets

        if extract_diagonal and fvalue.ndim != 2:
            raise ValueError(
                f"Input array {fvalue} must be 2D to work with extract_diagonal."
            )

        diagonal_fvalue = []
        if extract_diagonal:
            diagonal_fvalue = jnp.diag(fvalue)
            # Remove the diagonal entries.
            fvalue = fvalue - jnp.diag(diagonal_fvalue)

        # TODO(rohananil): Extend this by making use of information about the blocks
        # SM3 style which will be useful for diagonal statistics
        # We first decide the scale.
        if fvalue.ndim < 1:
            raise ValueError(
                f"Input array {fvalue} must have a strictly positive number of "
                "dimensions."
            )

        max_abs = jnp.max(jnp.abs(fvalue), axis=0)
        bucket_size = max_abs / num_buckets
        bs_expanded = bucket_size[jnp.newaxis, Ellipsis]
        # To avoid divide by 0.0
        bs_nonzero = jnp.where(
            bs_expanded > 0.0, bs_expanded, jnp.ones_like(bs_expanded)
        )
        ratio = fvalue / bs_nonzero
        # We use rounding to remove bias.
        quantized = jnp.round(ratio)
        return quantized.astype(quantized_dtype), diagonal_fvalue, bucket_size

    def to_float(self):
        """Returns the float value."""
        if isinstance(self.quantized, list) and not self.quantized:
            return self.quantized

        if self.quantized_dtype == jnp.float32:
            return self.quantized

        if self.quantized_dtype == jnp.bfloat16:
            return self.quantized.astype(jnp.float32)

        float_dtype = self.bucket_size.dtype
        bucket_size = self.bucket_size[jnp.newaxis, Ellipsis]
        val = self.quantized.astype(float_dtype) * bucket_size
        if self.extract_diagonal:
            val += jnp.diag(self.diagonal)
        return val
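A quick round trip through the int8 path of `QuantizedValue`: values are scaled into at most 127 buckets per column and recovered approximately by `to_float()`. The import path below assumes the file is importable as a package module, which may differ in practice:

import jax.numpy as jnp
from tools.train.scalable_shampoo.quantization_utils import QuantizedValue  # path assumed

x = jnp.array([[0.1, -2.0], [1.5, 0.3]])
q = QuantizedValue.from_float_value(x, jnp.int8)
print(q.quantized.dtype)                   # int8
print(jnp.max(jnp.abs(q.to_float() - x)))  # small quantization error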
tools/train/scalable_shampoo/sm3.py
ADDED

# coding=utf-8
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# An implementation of SM3 from:
#
# Memory-Efficient Adaptive Optimization, https://arxiv.org/pdf/1901.11150.pdf
# Rohan Anil, Vineet Gupta, Tomer Koren, Yoram Singer
#
# Author: Rohan Anil (rohananil at google dot com)
#

"""SM3 Implementation."""

import functools
from typing import Any, NamedTuple

import chex
import jax
import jax.numpy as jnp
import optax

from .quantization_utils import QuantizedValue


class SM3State(NamedTuple):
    count: chex.Array
    stats: Any


# Per parameter optimizer state used in data-parallel training.
class ParameterStats(NamedTuple):
    """State associated to each parameter of the model being trained."""

    diagonal_statistics: chex.Array  # Accumulator for diagonal preconditioner
    diagonal_momentum: QuantizedValue  # Momentum for the diagonal preconditioner


def sm3(
    learning_rate, beta1=0.9, beta2=0.999, diagonal_epsilon=1e-10, normalize_grads=False
):
    """SM3 optimizer.

    Memory-Efficient Adaptive Optimization, Rohan Anil, Vineet Gupta, Tomer Koren,
      Yoram Singer

    https://arxiv.org/abs/1901.11150

    Args:
      learning_rate: the step size used to update the parameters.
      beta1: momentum parameter.
      beta2: second moment averaging parameter.
      diagonal_epsilon: epsilon for sm3
      normalize_grads: Whether to normalize grads. Author finds it useful when
        grads are high variance.

    Returns:
      a GradientTransformation.
    """

    def _quantize_momentum(momentum_statistics):
        return QuantizedValue.from_float_value(momentum_statistics, jnp.int8)

    def init_fn(params):
        """Initialise the optimiser's state."""

        def _init(param):
            accumulators = [jnp.zeros([s]) for s in param.shape]
            momentum = _quantize_momentum(jnp.zeros_like(param))
            return ParameterStats(accumulators, momentum)

        return SM3State(
            count=jnp.zeros([], jnp.int32), stats=jax.tree_map(_init, params)
        )

    def _get_expanded_shape(shape, i):
        rank = len(shape)
        # Replaces a `shape` of [M, N, K] with 1 in all dimensions except for i.
        # For eg: i = 1 returns [1, N, 1].
        return [1] * i + [shape[i]] + [1] * (rank - i - 1)

    def _moving_averages(grad, accumulators):
        w = (1.0 - beta2) if beta2 != 1.0 else 1.0
        if grad.ndim < 2:
            return beta2 * accumulators[0] + w * grad**2
        else:
            min_accumulator = functools.reduce(jnp.minimum, accumulators)
            return beta2 * min_accumulator + w * grad**2

    def _moving_averages_momentum(grad, momentum):
        w = (1.0 - beta1) if beta1 != 1.0 else 1.0
        return beta1 * momentum.to_float() + w * grad

    def _sketch_diagonal_statistics(grad, updated_diagonal_statistics):
        all_diagonal_statistics = []
        for i in range(grad.ndim):
            axes = list(range(i)) + list(range(i + 1, grad.ndim))
            dim_diagonal_statistics = jnp.max(updated_diagonal_statistics, axis=axes)
            all_diagonal_statistics.append(dim_diagonal_statistics)
        if grad.ndim == 1:
            all_diagonal_statistics[0] = updated_diagonal_statistics
        return all_diagonal_statistics

    def update_fn(updates, state, params=None):
        del params
        stats = state.stats
        if normalize_grads:
            updates = jax.tree_map(lambda g: g / (jnp.linalg.norm(g) + 1e-16), updates)
        # Reshape all vectors into N-d tensors to compute min over them.
        # [n], [m] -> [n, 1], [1, m]
        expanded_diagonal_statistics = jax.tree_multimap(
            lambda grad, state: [  # pylint:disable=g-long-lambda
                jnp.reshape(
                    state.diagonal_statistics[i], _get_expanded_shape(grad.shape, i)
                )
                for i in range(grad.ndim)
            ],
            updates,
            stats,
        )

        # Compute new diagonal statistics
        new_diagonal_statistics = jax.tree_multimap(
            _moving_averages, updates, expanded_diagonal_statistics
        )

        # Compute preconditioners (1/sqrt(s)) where s is the statistics.
        new_preconditioners = jax.tree_map(
            lambda t: 1.0 / jnp.sqrt(t + diagonal_epsilon), new_diagonal_statistics
        )
        preconditioned_grads = jax.tree_multimap(
            lambda g, p: g * p, updates, new_preconditioners
        )

        # Compute updated momentum (also handle quantization)
        updated_momentum = jax.tree_multimap(
            lambda preconditioned_grad, state: _moving_averages_momentum(  # pylint:disable=g-long-lambda
                preconditioned_grad, state.diagonal_momentum
            ),
            preconditioned_grads,
            stats,
        )

        # Update diagonal statistics.
        updated_diagonal_statistics = jax.tree_multimap(
            _sketch_diagonal_statistics, updates, new_diagonal_statistics
        )

        # Update momentum.
        new_sm3_stats = jax.tree_multimap(
            lambda momentum, diagonal_stats: ParameterStats(  # pylint:disable=g-long-lambda
                diagonal_stats, _quantize_momentum(momentum)
            ),
            updated_momentum,
            updated_diagonal_statistics,
        )

        lr = learning_rate
        if callable(learning_rate):
            lr = learning_rate(state.count)

        new_updates = jax.tree_map(lambda pg: -lr * pg, updated_momentum)
        return new_updates, SM3State(count=state.count + 1, stats=new_sm3_stats)

    return optax.GradientTransformation(init_fn, update_fn)
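A minimal usage sketch of the `sm3` GradientTransformation on a toy quadratic. The import path is assumed, and this relies on a JAX version that still provides `jax.tree_multimap`, as the file above does:

import jax
import jax.numpy as jnp
import optax
from tools.train.scalable_shampoo.sm3 import sm3  # path assumed

params = {"w": jnp.ones((3, 4))}
opt = sm3(learning_rate=0.1)
opt_state = opt.init(params)

loss = lambda p: jnp.sum(p["w"] ** 2)     # toy loss
grads = jax.grad(loss)(params)
updates, opt_state = opt.update(grads, opt_state)
params = optax.apply_updates(params, updates)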
tools/train/scalable_shampoo/symmetric_matrices/symmetric_matrices.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2022 The Google Research Authors.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
|
| 16 |
+
"""JAX Ops for symmetric matrices used by the Shampoo optimizer."""
|
| 17 |
+
|
| 18 |
+
import functools
|
| 19 |
+
from typing import List, Union
|
| 20 |
+
|
| 21 |
+
import jax
|
| 22 |
+
import jax.numpy as jnp
|
| 23 |
+
from flax import struct
|
| 24 |
+
from jax import lax
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@struct.dataclass
|
| 28 |
+
class SlicedSymmetricMatrix:
|
| 29 |
+
"""A symmetric matrix represented by lower-triangular block row slices.
|
| 30 |
+
|
| 31 |
+
For example, the symmetric matrix M = [[a, b^T], [b, c]] would be represented
|
| 32 |
+
by the block rows a and [b, c].
|
| 33 |
+
|
| 34 |
+
The matrix may be batched, in which case each entry of block_rows may have
|
| 35 |
+
dimension greater than 2. The last two dimensions represent the rows and cols.
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
block_rows: List[jnp.ndarray]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def product_with_transpose(
|
| 42 |
+
mat1,
|
| 43 |
+
mat2,
|
| 44 |
+
precision=lax.Precision.DEFAULT,
|
| 45 |
+
):
|
| 46 |
+
"""Returns mat1 * mat2^T for two matrices (possibly batched).
|
| 47 |
+
|
| 48 |
+
The rows and columns are the last two dimensions for each matrix.
|
| 49 |
+
|
| 50 |
+
Args:
|
| 51 |
+
mat1: First matrix.
|
| 52 |
+
mat2: Second matrix.
|
| 53 |
+
precision: JAX precision to use for the multiplication.
|
| 54 |
+
"""
|
| 55 |
+
return jnp.einsum("...ij,...kj->...ik", mat1, mat2, precision=precision)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@functools.partial(jax.jit, static_argnames=("block_size", "precision"))
|
| 59 |
+
def sliced_transposed_product(
|
| 60 |
+
mat,
|
| 61 |
+
block_size,
|
| 62 |
+
precision=lax.Precision.DEFAULT,
|
| 63 |
+
):
|
| 64 |
+
"""Returns the blocked slices representing a symmetric matrix mat*mat^T.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
mat: The matrix for which we will compute mat*mat^T. It does not need to be
|
| 68 |
+
square, and may be batched.
|
| 69 |
+
block_size: The size of row blocks to compute.
|
| 70 |
+
precision: The precision to use in each computation.
|
| 71 |
+
|
| 72 |
+
Raises:
|
| 73 |
+
ValueError: Raised when the specified block size does not evenly divide
|
| 74 |
+
the number of rows of the input mat.
|
| 75 |
+
"""
|
| 76 |
+
num_rows = mat.shape[-2]
|
| 77 |
+
if num_rows % block_size != 0:
|
| 78 |
+
raise ValueError(
|
| 79 |
+
"The row dimension must be divisible by block_size. "
|
| 80 |
+
f"Instead got row dimension={num_rows} and block_size={block_size}."
|
| 81 |
+
)
|
| 82 |
+
block_rows = [
|
| 83 |
+
product_with_transpose(
|
| 84 |
+
mat[Ellipsis, i * block_size : (i + 1) * block_size, :],
|
| 85 |
+
mat[Ellipsis, 0 : (i + 1) * block_size, :],
|
| 86 |
+
precision,
|
| 87 |
+
)
|
| 88 |
+
for i in range(num_rows // block_size)
|
| 89 |
+
]
|
| 90 |
+
return SlicedSymmetricMatrix(block_rows=block_rows)
|
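A minimal usage sketch, assuming tools/train is on PYTHONPATH so this new package imports the same way train.py imports scalable_shampoo: block row i of the result has shape (block_size, (i + 1) * block_size).

```python
import jax.numpy as jnp

from scalable_shampoo.symmetric_matrices.symmetric_matrices import (
    sliced_transposed_product,
)

mat = jnp.ones((4, 3))  # 4 rows, block_size=2 -> two block rows
sliced = sliced_transposed_product(mat, block_size=2)
# block row i has shape (block_size, (i + 1) * block_size)
assert [b.shape for b in sliced.block_rows] == [(2, 2), (2, 4)]
```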
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@functools.partial(jax.jit, static_argnames=("block_size", "precision"))
|
| 94 |
+
def sliced_transposed_product_concat(
|
| 95 |
+
mat,
|
| 96 |
+
block_size,
|
| 97 |
+
precision=lax.Precision.DEFAULT,
|
| 98 |
+
):
|
| 99 |
+
"""Returns the concatenated slices representing mat*mat^T.
|
| 100 |
+
|
| 101 |
+
Args:
|
| 102 |
+
mat: The matrix for which we will compute mat*mat^T. It does not need to be
|
| 103 |
+
square, and may be batched.
|
| 104 |
+
block_size: The size of row blocks to compute.
|
| 105 |
+
precision: The precision to use in each computation.
|
| 106 |
+
|
| 107 |
+
Raises:
|
| 108 |
+
ValueError: Raised when the specified block size does not evenly divide
|
| 109 |
+
the number of rows of the input mat.
|
| 110 |
+
"""
|
| 111 |
+
sliced_symmetric_matrix = sliced_transposed_product(
|
| 112 |
+
mat=mat, block_size=block_size, precision=precision
|
| 113 |
+
)
|
| 114 |
+
return jnp.concatenate(sliced_symmetric_matrix.block_rows, axis=-1)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@jax.jit
|
| 118 |
+
def materialize_matrix(symmetric_matrix):
|
| 119 |
+
"""Returns a materialized symmetric matrix.
|
| 120 |
+
|
| 121 |
+
Args:
|
| 122 |
+
symmetric_matrix: the matrix represented by lower-triangular block slices.
|
| 123 |
+
"""
|
| 124 |
+
block_rows = symmetric_matrix.block_rows
|
| 125 |
+
block_size = block_rows[0].shape[-2]
|
| 126 |
+
num_blocks = len(block_rows)
|
| 127 |
+
|
| 128 |
+
# Slice the lower-triangular and diagonal blocks into blocks.
|
| 129 |
+
blocks = [
|
| 130 |
+
[
|
| 131 |
+
block_row[Ellipsis, i * block_size : (i + 1) * block_size]
|
| 132 |
+
for i in range(k + 1)
|
| 133 |
+
]
|
| 134 |
+
for k, block_row in enumerate(block_rows)
|
| 135 |
+
]
|
| 136 |
+
|
| 137 |
+
# Generate the (off-diagonal) upper-triangular blocks.
|
| 138 |
+
off_diags = [[] for _ in range(num_blocks - 1)]
|
| 139 |
+
for k, block_row in enumerate(block_rows[1:]):
|
| 140 |
+
for i in range(k + 1):
|
| 141 |
+
off_diags[i].append(
|
| 142 |
+
jnp.swapaxes(
|
| 143 |
+
a=block_row[Ellipsis, i * block_size : (i + 1) * block_size],
|
| 144 |
+
axis1=-1,
|
| 145 |
+
axis2=-2,
|
| 146 |
+
)
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
return jnp.block(
|
| 150 |
+
[row + row_t for row, row_t in zip(blocks[:-1], off_diags)] + [blocks[-1]]
|
| 151 |
+
)
|
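A round-trip sketch under the same import assumption as above: materializing the sliced product of x recovers the full Gram matrix x @ x.T.

```python
import jax
import jax.numpy as jnp

from scalable_shampoo.symmetric_matrices.symmetric_matrices import (
    materialize_matrix,
    sliced_transposed_product,
)

x = jax.random.normal(jax.random.PRNGKey(0), (6, 5))
sliced = sliced_transposed_product(x, block_size=3)
# materializing the block rows recovers the full Gram matrix x @ x.T
assert jnp.allclose(materialize_matrix(sliced), x @ x.T, atol=1e-4)
```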
| 152 |
+
|
| 153 |
+
|
| 154 |
+
@functools.partial(jax.jit, static_argnames=("num_blocks"))
|
| 155 |
+
def materialize_matrix_from_concat(
|
| 156 |
+
block_rows_concat,
|
| 157 |
+
num_blocks,
|
| 158 |
+
):
|
| 159 |
+
"""Returns a materialized symmetric matrix from concatenated slices.
|
| 160 |
+
|
| 161 |
+
Args:
|
| 162 |
+
block_rows_concat: The matrix represented as the concatenated
|
| 163 |
+
lower-triangular blocks.
|
| 164 |
+
num_blocks: The number of block-rows used to represent the symmetric matrix.
|
| 165 |
+
"""
|
| 166 |
+
block_size = block_rows_concat.shape[-2]
|
| 167 |
+
|
| 168 |
+
block_rows = [
|
| 169 |
+
block_rows_concat[
|
| 170 |
+
Ellipsis,
|
| 171 |
+
(k * (k + 1))
|
| 172 |
+
// 2
|
| 173 |
+
* block_size : (((k + 1) * (k + 2)) // 2 + 1)
|
| 174 |
+
* block_size,
|
| 175 |
+
]
|
| 176 |
+
for k in range(num_blocks)
|
| 177 |
+
]
|
| 178 |
+
|
| 179 |
+
return materialize_matrix(SlicedSymmetricMatrix(block_rows=block_rows))
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
@functools.partial(jax.jit, static_argnames=("alpha", "beta"))
|
| 183 |
+
def update_sliced_rows(
|
| 184 |
+
symmetric_matrix,
|
| 185 |
+
mat,
|
| 186 |
+
alpha,
|
| 187 |
+
beta,
|
| 188 |
+
):
|
| 189 |
+
"""Implements the blocked equivalent of SYRK.
|
| 190 |
+
|
| 191 |
+
Specifically, the symmetric matrix (represented using lower-triangular block
|
| 192 |
+
rows) is updated using the sliced product of mat.
|
| 193 |
+
|
| 194 |
+
Args:
|
| 195 |
+
symmetric_matrix: The symmetric matrix to update.
|
| 196 |
+
mat: The matrix to use for the update = mat * mat^T. The number of rows
|
| 197 |
+
should match that of symmetric_matrix.
|
| 198 |
+
alpha: The weight for the update.
|
| 199 |
+
beta: The weight for the original symmetric matrix.
|
| 200 |
+
|
| 201 |
+
Returns:
|
| 202 |
+
The updated rows of alpha * mat * mat^T + beta * symmetric_matrix.
|
| 203 |
+
"""
|
| 204 |
+
block_size = symmetric_matrix.block_rows[0].shape[-2]
|
| 205 |
+
sym_prod = sliced_transposed_product(mat=mat, block_size=block_size)
|
| 206 |
+
return SlicedSymmetricMatrix(
|
| 207 |
+
block_rows=[
|
| 208 |
+
update * alpha + row * beta
|
| 209 |
+
for update, row in zip(sym_prod.block_rows, symmetric_matrix.block_rows)
|
| 210 |
+
]
|
| 211 |
+
)
|
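A sketch of the SYRK-style update under the same import assumption: the materialized result of update_sliced_rows matches alpha * g @ g.T + beta * S.

```python
import jax
import jax.numpy as jnp

from scalable_shampoo.symmetric_matrices.symmetric_matrices import (
    materialize_matrix,
    sliced_transposed_product,
    update_sliced_rows,
)

k1, k2 = jax.random.split(jax.random.PRNGKey(0))
s_src = jax.random.normal(k1, (4, 7))  # only used to build a valid symmetric S
g = jax.random.normal(k2, (4, 3))      # "gradient" matrix driving the update
alpha, beta = 0.1, 0.9

s_sliced = sliced_transposed_product(s_src, block_size=2)
updated = update_sliced_rows(s_sliced, g, alpha=alpha, beta=beta)
expected = alpha * (g @ g.T) + beta * materialize_matrix(s_sliced)
assert jnp.allclose(materialize_matrix(updated), expected, atol=1e-4)
```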
tools/train/train.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
#!/usr/bin/env python
|
| 2 |
# coding=utf-8
|
| 3 |
-
# Copyright 2021-2022 The HuggingFace & DALL·E Mini
|
| 4 |
#
|
| 5 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 6 |
# you may not use this file except in compliance with the License.
|
|
@@ -37,7 +37,6 @@ import optax
|
|
| 37 |
import transformers
|
| 38 |
import wandb
|
| 39 |
from datasets import Dataset
|
| 40 |
-
from distributed_shampoo import GraftingType, distributed_shampoo
|
| 41 |
from flax.core.frozen_dict import FrozenDict, freeze
|
| 42 |
from flax.serialization import from_bytes, to_bytes
|
| 43 |
from flax.training import train_state
|
|
@@ -46,6 +45,7 @@ from google.cloud import storage
|
|
| 46 |
from jax.experimental import PartitionSpec, maps
|
| 47 |
from jax.experimental.compilation_cache import compilation_cache as cc
|
| 48 |
from jax.experimental.pjit import pjit, with_sharding_constraint
|
|
|
|
| 49 |
from tqdm import tqdm
|
| 50 |
from transformers import HfArgumentParser
|
| 51 |
|
|
@@ -57,7 +57,7 @@ from dalle_mini.model import (
|
|
| 57 |
set_partitions,
|
| 58 |
)
|
| 59 |
|
| 60 |
-
cc.initialize_cache("./jax_cache", max_cache_size_bytes=
|
| 61 |
|
| 62 |
logger = logging.getLogger(__name__)
|
| 63 |
|
|
@@ -203,6 +203,12 @@ class DataTrainingArguments:
|
|
| 203 |
"help": "Whether to shard data files by host in multi-host environments."
|
| 204 |
},
|
| 205 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
max_train_samples: Optional[int] = field(
|
| 207 |
default=None,
|
| 208 |
metadata={
|
|
@@ -314,10 +320,6 @@ class TrainingArguments:
|
|
| 314 |
default=1024,
|
| 315 |
metadata={"help": "Chunked size for large layers with Distributed Shampoo."},
|
| 316 |
)
|
| 317 |
-
start_preconditioning_step: int = field(
|
| 318 |
-
default=100,
|
| 319 |
-
metadata={"help": "Number of steps before starting to update preconditioner."},
|
| 320 |
-
)
|
| 321 |
preconditioning_compute_steps: int = field(
|
| 322 |
default=10, metadata={"help": "Number of steps to update preconditioner."}
|
| 323 |
)
|
|
@@ -325,6 +327,12 @@ class TrainingArguments:
|
|
| 325 |
default=4096,
|
| 326 |
metadata={"help": "Max size for preconditioning with Distributed Shampoo."},
|
| 327 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
optim_quantized: bool = field(
|
| 329 |
default=False,
|
| 330 |
metadata={
|
|
@@ -413,11 +421,28 @@ class TrainingArguments:
|
|
| 413 |
dp_devices: int = field(init=False)
|
| 414 |
|
| 415 |
def __post_init__(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
assert self.optim in [
|
| 417 |
"distributed_shampoo",
|
| 418 |
"adam",
|
| 419 |
"adafactor",
|
| 420 |
], f"Selected optimizer not supported: {self.optim}"
|
| 421 |
if self.per_device_eval_batch_size is None:
|
| 422 |
self.per_device_eval_batch_size = self.per_device_train_batch_size
|
| 423 |
if (
|
|
@@ -430,6 +455,9 @@ class TrainingArguments:
|
|
| 430 |
f"Output directory ({self.output_dir}) already exists and is not empty."
|
| 431 |
"Use --overwrite_output_dir to overcome."
|
| 432 |
)
|
|
|
|
|
|
|
|
|
|
| 433 |
assert (
|
| 434 |
jax.device_count() % self.mp_devices == 0
|
| 435 |
), f"Number of available devices ({jax.device_count()} must be divisible by number of devices used for model parallelism ({self.mp_devices})."
|
|
@@ -514,10 +542,6 @@ def main():
|
|
| 514 |
|
| 515 |
logger.info(f"Local TPUs: {jax.local_device_count()}")
|
| 516 |
logger.info(f"Global TPUs: {jax.device_count()}")
|
| 517 |
-
if training_args.assert_TPU_available:
|
| 518 |
-
assert (
|
| 519 |
-
jax.local_device_count() == 8
|
| 520 |
-
), "TPUs in use, please check running processes"
|
| 521 |
|
| 522 |
# Set up wandb run
|
| 523 |
if jax.process_index() == 0:
|
|
@@ -544,8 +568,7 @@ def main():
|
|
| 544 |
config=config,
|
| 545 |
seed=training_args.seed_model,
|
| 546 |
dtype=getattr(jnp, model_args.dtype),
|
| 547 |
-
abstract_init=True,
|
| 548 |
-
load_on_cpu=True,
|
| 549 |
# initializing params with gradient checkpointing creates issues
|
| 550 |
# we correctly set it later per training_args
|
| 551 |
gradient_checkpointing=False,
|
|
@@ -555,29 +578,23 @@ def main():
|
|
| 555 |
config,
|
| 556 |
seed=training_args.seed_model,
|
| 557 |
dtype=getattr(jnp, model_args.dtype),
|
| 558 |
-
|
| 559 |
)
|
| 560 |
|
| 561 |
-
#
|
| 562 |
-
|
| 563 |
-
# This is still considered correctly during training as function is pjitted
|
| 564 |
-
model.config.gradient_checkpointing = training_args.gradient_checkpointing
|
| 565 |
-
|
| 566 |
if training_args.gradient_checkpointing:
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
eval_config,
|
| 572 |
seed=training_args.seed_model,
|
| 573 |
dtype=getattr(jnp, model_args.dtype),
|
| 574 |
-
|
| 575 |
-
load_on_cpu=True,
|
| 576 |
)
|
| 577 |
-
|
| 578 |
-
eval_fn = eval_model.__call__
|
| 579 |
else:
|
| 580 |
-
|
| 581 |
|
| 582 |
# get model metadata
|
| 583 |
model_metadata = model_args.get_metadata()
|
|
@@ -620,7 +637,7 @@ def main():
|
|
| 620 |
eval_batch_size_per_step = eval_batch_size_per_node * jax.process_count()
|
| 621 |
len_train_dataset, len_eval_dataset = dataset.length
|
| 622 |
steps_per_epoch = (
|
| 623 |
-
len_train_dataset //
|
| 624 |
if len_train_dataset is not None
|
| 625 |
else None
|
| 626 |
)
|
|
@@ -633,7 +650,7 @@ def main():
|
|
| 633 |
logger.info(f" Num examples = {len_train_dataset}")
|
| 634 |
logger.info(f" Num Epochs = {num_epochs}")
|
| 635 |
logger.info(
|
| 636 |
-
f" Batch size per device = {training_args.per_device_train_batch_size}"
|
| 637 |
)
|
| 638 |
logger.info(f" Number of devices = {jax.device_count()}")
|
| 639 |
logger.info(
|
|
@@ -701,22 +718,32 @@ def main():
|
|
| 701 |
# create adam optimizer
|
| 702 |
if training_args.optim == "distributed_shampoo":
|
| 703 |
# parameters from https://github.com/tensorflow/lingvo/blob/03ee9d7cd50764b0424c7c863733c91fc0b053ec/lingvo/jax/optimizers.py#L729
|
| 704 |
optimizer = distributed_shampoo(
|
| 705 |
learning_rate_fn,
|
| 706 |
block_size=training_args.block_size,
|
| 707 |
beta1=training_args.beta1,
|
| 708 |
beta2=training_args.beta2,
|
| 709 |
diagonal_epsilon=1e-10,
|
| 710 |
-
matrix_epsilon=1e-
|
| 711 |
-
start_preconditioning_step=
|
|
|
|
|
|
|
| 712 |
preconditioning_compute_steps=training_args.preconditioning_compute_steps,
|
| 713 |
statistics_compute_steps=1,
|
| 714 |
best_effort_shape_interpretation=True,
|
| 715 |
-
graft_type=
|
| 716 |
nesterov=False,
|
| 717 |
exponent_override=0,
|
| 718 |
-
statistics_partition_spec=PartitionSpec(None, "
|
| 719 |
-
preconditioner_partition_spec=PartitionSpec("
|
| 720 |
num_devices_for_pjit=training_args.dp_devices,
|
| 721 |
shard_optimizer_states=True,
|
| 722 |
inverse_failure_threshold=0.1,
|
|
@@ -779,7 +806,7 @@ def main():
|
|
| 779 |
opt_state_spec = opt_fn.pspec_fn(
|
| 780 |
params=model.params,
|
| 781 |
params_partition_spec=param_spec,
|
| 782 |
-
partition_spec_for_statistics=PartitionSpec(None, "
|
| 783 |
)
|
| 784 |
else:
|
| 785 |
raise NotImplementedError
|
|
@@ -790,7 +817,8 @@ def main():
|
|
| 790 |
# create a mesh
|
| 791 |
mesh_shape = (training_args.dp_devices, training_args.mp_devices)
|
| 792 |
devices = np.asarray(jax.devices()).reshape(*mesh_shape)
|
| 793 |
-
mesh = maps.Mesh(devices, ("
|
|
|
|
| 794 |
|
| 795 |
# define state spec
|
| 796 |
state_spec = TrainState(
|
|
@@ -801,28 +829,39 @@ def main():
|
|
| 801 |
epoch=None,
|
| 802 |
train_time=None,
|
| 803 |
train_samples=None,
|
| 804 |
-
apply_fn=
|
| 805 |
tx=optimizer,
|
| 806 |
)
|
| 807 |
|
| 808 |
-
#
|
| 809 |
with maps.mesh(mesh.devices, mesh.axis_names):
|
|
|
|
| 810 |
if not model_args.restore_state:
|
| 811 |
|
| 812 |
def init_state(params):
|
| 813 |
return TrainState.create(
|
| 814 |
-
apply_fn=
|
| 815 |
tx=optimizer,
|
| 816 |
-
params=params,
|
| 817 |
dropout_rng=dropout_rng,
|
| 818 |
)
|
| 819 |
|
| 820 |
state = pjit(
|
| 821 |
init_state,
|
| 822 |
-
in_axis_resources=(param_spec,)
|
|
|
|
|
|
|
| 823 |
out_axis_resources=state_spec,
|
| 824 |
donate_argnums=(0,),
|
| 825 |
-
)(model.params)
|
| 826 |
|
| 827 |
else:
|
| 828 |
# load opt_state
|
|
@@ -836,7 +875,7 @@ def main():
|
|
| 836 |
|
| 837 |
def restore_state(params, opt_state):
|
| 838 |
return TrainState(
|
| 839 |
-
apply_fn=
|
| 840 |
tx=optimizer,
|
| 841 |
params=params,
|
| 842 |
opt_state=opt_state,
|
|
@@ -846,7 +885,10 @@ def main():
|
|
| 846 |
|
| 847 |
state = pjit(
|
| 848 |
restore_state,
|
| 849 |
-
in_axis_resources=(
|
|
|
|
|
|
|
|
|
|
| 850 |
out_axis_resources=state_spec,
|
| 851 |
donate_argnums=(0, 1),
|
| 852 |
)(model.params, opt_state)
|
|
@@ -854,37 +896,32 @@ def main():
|
|
| 854 |
# remove opt_state from CPU
|
| 855 |
del opt_state
|
| 856 |
|
| 857 |
-
# free memory
|
| 858 |
del model._params, opt_state_spec, opt_state_shape
|
| 859 |
|
| 860 |
# define batch specs
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
grad_batch_spec = freeze({k: PartitionSpec(None, "batch") for k in keys})
|
| 864 |
|
| 865 |
-
#
|
| 866 |
def loss_fn(logits, labels):
|
| 867 |
loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))
|
| 868 |
loss = loss.mean()
|
| 869 |
return loss
|
| 870 |
|
| 871 |
# Define gradient update step fn
|
| 872 |
def train_step(state, batch, delta_time):
|
| 873 |
-
# we reshape to (gradient_accumulation_steps, dp_devices, ...)
|
| 874 |
-
# allows feeding partial batch size per node for full model parallel
|
| 875 |
-
batch = jax.tree_map(
|
| 876 |
-
lambda x: x.reshape(
|
| 877 |
-
(
|
| 878 |
-
training_args.gradient_accumulation_steps,
|
| 879 |
-
training_args.dp_devices,
|
| 880 |
-
training_args.per_device_train_batch_size,
|
| 881 |
-
)
|
| 882 |
-
+ x.shape[2:]
|
| 883 |
-
),
|
| 884 |
-
batch,
|
| 885 |
-
)
|
| 886 |
-
# ensure data is sharded correctly per dp device
|
| 887 |
-
batch = with_sharding_constraint(batch, grad_batch_spec)
|
| 888 |
|
| 889 |
# get a minibatch (one gradient accumulation slice)
|
| 890 |
def get_minibatch(batch, grad_idx):
|
|
@@ -904,62 +941,71 @@ def main():
|
|
| 904 |
grad_fn = jax.value_and_grad(compute_loss)
|
| 905 |
|
| 906 |
def loss_and_grad(grad_idx, dropout_rng):
|
| 907 |
-
# minibatch at grad_idx
|
| 908 |
-
minibatch =
|
| 909 |
-
|
| 910 |
-
dropout_rng, _ = jax.random.split(dropout_rng)
|
| 911 |
-
# ensure inputs are sharded per device
|
| 912 |
-
minibatch = jax.tree_map(
|
| 913 |
-
lambda x: with_sharding_constraint(x, PartitionSpec("batch")),
|
| 914 |
-
minibatch,
|
| 915 |
-
)
|
| 916 |
-
# only 1 single rng per grad step, let us handle larger batch size
|
| 917 |
-
loss_grads = jax.vmap(grad_fn, in_axes=(None, 0, None), out_axes=(0, 0))(
|
| 918 |
-
state.params, minibatch, dropout_rng
|
| 919 |
)
|
| 920 |
-
# ensure
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
# return loss and grads
|
| 928 |
-
return
|
| 929 |
|
| 930 |
if training_args.gradient_accumulation_steps == 1:
|
| 931 |
-
|
| 932 |
else:
|
| 933 |
# create initial state for cumul_minibatch_step loop
|
| 934 |
init_minibatch_step = (
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
jax.tree_map(jnp.zeros_like, state.params),
|
| 938 |
),
|
| 939 |
state.dropout_rng,
|
| 940 |
)
|
| 941 |
|
| 942 |
# accumulate gradients
|
| 943 |
def cumul_minibatch_step(grad_idx, cumul_loss_grad_dropout):
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
|
|
|
|
|
|
|
|
|
|
| 948 |
|
| 949 |
# loop over gradients
|
| 950 |
-
|
| 951 |
0,
|
| 952 |
training_args.gradient_accumulation_steps,
|
| 953 |
cumul_minibatch_step,
|
| 954 |
init_minibatch_step,
|
| 955 |
)
|
|
|
|
| 956 |
# sum -> mean
|
| 957 |
-
|
| 958 |
-
lambda x: x / training_args.gradient_accumulation_steps,
|
| 959 |
)
|
| 960 |
|
| 961 |
# update state
|
| 962 |
-
|
| 963 |
state = state.apply_gradients(
|
| 964 |
grads=grads,
|
| 965 |
dropout_rng=dropout_rng,
|
|
@@ -976,37 +1022,32 @@ def main():
|
|
| 976 |
|
| 977 |
# Define eval fn
|
| 978 |
def eval_step(state, batch):
|
| 979 |
-
# we reshape to (dp_devices, ...)
|
| 980 |
-
batch = jax.tree_map(
|
| 981 |
-
lambda x: x.reshape(
|
| 982 |
-
(
|
| 983 |
-
training_args.dp_devices,
|
| 984 |
-
training_args.per_device_eval_batch_size,
|
| 985 |
-
)
|
| 986 |
-
+ x.shape[1:]
|
| 987 |
-
),
|
| 988 |
-
batch,
|
| 989 |
-
)
|
| 990 |
-
# ensure data is sharded correctly per dp device
|
| 991 |
-
batch = with_sharding_constraint(batch, batch_spec)
|
| 992 |
-
|
| 993 |
def compute_eval_loss(batch):
|
| 994 |
batch, labels = batch.pop("labels")
|
| 995 |
logits = eval_fn(**batch, params=state.params, train=False)[0]
|
| 996 |
return loss_fn(logits, labels)
|
| 997 |
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
|
|
|
|
|
|
|
|
|
| 1004 |
return loss
|
| 1005 |
|
| 1006 |
# Create parallel version of the train and eval step
|
| 1007 |
p_train_step = pjit(
|
| 1008 |
train_step,
|
| 1009 |
-
in_axis_resources=(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1010 |
out_axis_resources=(state_spec, None),
|
| 1011 |
donate_argnums=(0,),
|
| 1012 |
)
|
|
@@ -1022,7 +1063,10 @@ def main():
|
|
| 1022 |
step = int(state.step)
|
| 1023 |
metrics_logger = MetricsLogger(step)
|
| 1024 |
epochs = tqdm(
|
| 1025 |
-
range(state.epoch, num_epochs),
|
|
|
|
|
|
|
|
|
|
| 1026 |
)
|
| 1027 |
|
| 1028 |
def run_evaluation():
|
|
@@ -1041,6 +1085,7 @@ def main():
|
|
| 1041 |
position=2,
|
| 1042 |
leave=False,
|
| 1043 |
total=eval_steps,
|
|
|
|
| 1044 |
):
|
| 1045 |
# need to keep only eval_batch_size_per_node items relevant to the node
|
| 1046 |
batch = jax.tree_map(
|
|
@@ -1050,6 +1095,17 @@ def main():
|
|
| 1050 |
batch,
|
| 1051 |
)
|
| 1052 |
batch = jax.tree_map(lambda x: x[jax.process_index()], batch)
|
| 1053 |
# freeze batch to pass safely to jax transforms
|
| 1054 |
batch = freeze(batch)
|
| 1055 |
# accumulate losses async
|
|
@@ -1166,6 +1222,7 @@ def main():
|
|
| 1166 |
)
|
| 1167 |
wandb.run.log_artifact(artifact_state)
|
| 1168 |
|
|
|
|
| 1169 |
with maps.mesh(mesh.devices, mesh.axis_names):
|
| 1170 |
for epoch in epochs:
|
| 1171 |
state.replace(epoch=epoch)
|
|
@@ -1186,21 +1243,33 @@ def main():
|
|
| 1186 |
position=1,
|
| 1187 |
leave=False,
|
| 1188 |
total=steps_per_epoch,
|
|
|
|
| 1189 |
):
|
| 1190 |
# calculate delta time (we have a lag of one step but it's ok)
|
| 1191 |
new_time = time.perf_counter()
|
| 1192 |
delta_time = new_time - last_time
|
| 1193 |
last_time = new_time
|
| 1194 |
|
| 1195 |
-
#
|
| 1196 |
batch = jax.tree_map(
|
| 1197 |
-
lambda x: x.reshape(
|
| 1198 |
-
(
|
| 1199 |
-
training_args.gradient_accumulation_steps,
|
| 1200 |
-
batch_size_per_node_per_grad_step,
|
| 1201 |
-
)
|
| 1202 |
-
+ x.shape[1:]
|
| 1203 |
-
),
|
| 1204 |
batch,
|
| 1205 |
)
|
| 1206 |
# freeze batch to pass safely to jax transforms
|
|
|
|
| 1 |
#!/usr/bin/env python
|
| 2 |
# coding=utf-8
|
| 3 |
+
# Copyright 2021-2022 The HuggingFace & DALL·E Mini team. All rights reserved.
|
| 4 |
#
|
| 5 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 6 |
# you may not use this file except in compliance with the License.
|
|
|
|
| 37 |
import transformers
|
| 38 |
import wandb
|
| 39 |
from datasets import Dataset
|
|
|
|
| 40 |
from flax.core.frozen_dict import FrozenDict, freeze
|
| 41 |
from flax.serialization import from_bytes, to_bytes
|
| 42 |
from flax.training import train_state
|
|
|
|
| 45 |
from jax.experimental import PartitionSpec, maps
|
| 46 |
from jax.experimental.compilation_cache import compilation_cache as cc
|
| 47 |
from jax.experimental.pjit import pjit, with_sharding_constraint
|
| 48 |
+
from scalable_shampoo.distributed_shampoo import GraftingType, distributed_shampoo
|
| 49 |
from tqdm import tqdm
|
| 50 |
from transformers import HfArgumentParser
|
| 51 |
|
|
|
|
| 57 |
set_partitions,
|
| 58 |
)
|
| 59 |
|
| 60 |
+
cc.initialize_cache("./jax_cache", max_cache_size_bytes=10 * 2**30)
|
| 61 |
|
| 62 |
logger = logging.getLogger(__name__)
|
| 63 |
|
|
|
|
| 203 |
"help": "Whether to shard data files by host in multi-host environments."
|
| 204 |
},
|
| 205 |
)
|
| 206 |
+
blank_caption_prob: Optional[float] = field(
|
| 207 |
+
default=0.0,
|
| 208 |
+
metadata={
|
| 209 |
+
"help": "Probability of removing some captions for classifier-free guidance."
|
| 210 |
+
},
|
| 211 |
+
)
|
| 212 |
max_train_samples: Optional[int] = field(
|
| 213 |
default=None,
|
| 214 |
metadata={
|
|
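For context, classifier-free guidance needs the model to occasionally see caption-less examples, which is what blank_caption_prob controls. A minimal sketch of the idea (illustrative only; the project's actual handling lives in src/dalle_mini/data.py, and maybe_blank_caption is a hypothetical helper):

```python
import numpy as np


def maybe_blank_caption(example, blank_caption_prob, text_column="caption", rng=np.random):
    # With probability blank_caption_prob, drop the caption so the model also
    # learns an unconditional branch usable for classifier-free guidance.
    if rng.random() < blank_caption_prob:
        example[text_column] = ""
    return example
```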
|
|
| 320 |
default=1024,
|
| 321 |
metadata={"help": "Chunked size for large layers with Distributed Shampoo."},
|
| 322 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
preconditioning_compute_steps: int = field(
|
| 324 |
default=10, metadata={"help": "Number of steps to update preconditioner."}
|
| 325 |
)
|
|
|
|
| 327 |
default=4096,
|
| 328 |
metadata={"help": "Max size for preconditioning with Distributed Shampoo."},
|
| 329 |
)
|
| 330 |
+
graft_type: str = field(
|
| 331 |
+
default="rmsprop_normalized",
|
| 332 |
+
metadata={
|
| 333 |
+
"help": "The type of grafting to use. Can be 'rmsprop_normalized' (default), 'rmsprop', 'adagrad', 'adagrad_normalized', 'sgd' or 'sqrt_n'"
|
| 334 |
+
},
|
| 335 |
+
)
|
| 336 |
optim_quantized: bool = field(
|
| 337 |
default=False,
|
| 338 |
metadata={
|
|
|
|
| 421 |
dp_devices: int = field(init=False)
|
| 422 |
|
| 423 |
def __post_init__(self):
|
| 424 |
+
if self.assert_TPU_available:
|
| 425 |
+
assert (
|
| 426 |
+
jax.local_device_count() == 8
|
| 427 |
+
), "TPUs in use, please check running processes"
|
| 428 |
assert self.optim in [
|
| 429 |
"distributed_shampoo",
|
| 430 |
"adam",
|
| 431 |
"adafactor",
|
| 432 |
], f"Selected optimizer not supported: {self.optim}"
|
| 433 |
+
assert self.graft_type in [
|
| 434 |
+
"rmsprop_normalized",
|
| 435 |
+
"rmsprop",
|
| 436 |
+
"adagrad",
|
| 437 |
+
"adagrad_normalized",
|
| 438 |
+
"sgd",
|
| 439 |
+
"sqrt_n",
|
| 440 |
+
], f"Selected graft type not supported: {self.graft_type}"
|
| 441 |
+
assert self.lr_decay in [
|
| 442 |
+
None,
|
| 443 |
+
"linear",
|
| 444 |
+
"exponential",
|
| 445 |
+
], f"Selected learning rate decay not supported: {self.lr_decay}"
|
| 446 |
if self.per_device_eval_batch_size is None:
|
| 447 |
self.per_device_eval_batch_size = self.per_device_train_batch_size
|
| 448 |
if (
|
|
|
|
| 455 |
f"Output directory ({self.output_dir}) already exists and is not empty."
|
| 456 |
"Use --overwrite_output_dir to overcome."
|
| 457 |
)
|
| 458 |
+
assert (
|
| 459 |
+
self.mp_devices > 0
|
| 460 |
+
), f"Number of devices for model parallelism must be > 0"
|
| 461 |
assert (
|
| 462 |
jax.device_count() % self.mp_devices == 0
|
| 463 |
), f"Number of available devices ({jax.device_count()} must be divisible by number of devices used for model parallelism ({self.mp_devices})."
|
|
|
|
| 542 |
|
| 543 |
logger.info(f"Local TPUs: {jax.local_device_count()}")
|
| 544 |
logger.info(f"Global TPUs: {jax.device_count()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 545 |
|
| 546 |
# Set up wandb run
|
| 547 |
if jax.process_index() == 0:
|
|
|
|
| 568 |
config=config,
|
| 569 |
seed=training_args.seed_model,
|
| 570 |
dtype=getattr(jnp, model_args.dtype),
|
| 571 |
+
abstract_init=True, # we overwrite them with loaded checkpoint
|
|
|
|
| 572 |
# initializing params with gradient checkpointing creates issues
|
| 573 |
# we correctly set it later per training_args
|
| 574 |
gradient_checkpointing=False,
|
|
|
|
| 578 |
config,
|
| 579 |
seed=training_args.seed_model,
|
| 580 |
dtype=getattr(jnp, model_args.dtype),
|
| 581 |
+
abstract_init=True,
|
| 582 |
)
|
| 583 |
|
| 584 |
+
# define model eval and train functions
|
| 585 |
+
eval_fn = model.__call__
|
|
|
|
|
|
|
|
|
|
| 586 |
if training_args.gradient_checkpointing:
|
| 587 |
+
remat_config = copy.deepcopy(model.config)
|
| 588 |
+
remat_config.gradient_checkpointing = True
|
| 589 |
+
remat_model = DalleBart(
|
| 590 |
+
remat_config,
|
|
|
|
| 591 |
seed=training_args.seed_model,
|
| 592 |
dtype=getattr(jnp, model_args.dtype),
|
| 593 |
+
init_weights=False,
|
|
|
|
| 594 |
)
|
| 595 |
+
train_fn = remat_model.__call__
|
|
|
|
| 596 |
else:
|
| 597 |
+
train_fn = model.__call__
|
| 598 |
|
| 599 |
# get model metadata
|
| 600 |
model_metadata = model_args.get_metadata()
|
|
|
|
| 637 |
eval_batch_size_per_step = eval_batch_size_per_node * jax.process_count()
|
| 638 |
len_train_dataset, len_eval_dataset = dataset.length
|
| 639 |
steps_per_epoch = (
|
| 640 |
+
len_train_dataset // batch_size_per_node
|
| 641 |
if len_train_dataset is not None
|
| 642 |
else None
|
| 643 |
)
|
|
|
|
| 650 |
logger.info(f" Num examples = {len_train_dataset}")
|
| 651 |
logger.info(f" Num Epochs = {num_epochs}")
|
| 652 |
logger.info(
|
| 653 |
+
f" Batch size per dp device = {training_args.per_device_train_batch_size}"
|
| 654 |
)
|
| 655 |
logger.info(f" Number of devices = {jax.device_count()}")
|
| 656 |
logger.info(
|
|
|
|
| 718 |
# create adam optimizer
|
| 719 |
if training_args.optim == "distributed_shampoo":
|
| 720 |
# parameters from https://github.com/tensorflow/lingvo/blob/03ee9d7cd50764b0424c7c863733c91fc0b053ec/lingvo/jax/optimizers.py#L729
|
| 721 |
+
graft_type = {
|
| 722 |
+
"sgd": GraftingType.SGD,
|
| 723 |
+
"adagrad": GraftingType.ADAGRAD,
|
| 724 |
+
"rmsprop": GraftingType.RMSPROP,
|
| 725 |
+
"rmsprop_normalized": GraftingType.RMSPROP_NORMALIZED,
|
| 726 |
+
"sqrt_n": GraftingType.SQRT_N,
|
| 727 |
+
"adagrad_normalized": GraftingType.ADAGRAD_NORMALIZED,
|
| 728 |
+
}[training_args.graft_type]
|
| 729 |
optimizer = distributed_shampoo(
|
| 730 |
learning_rate_fn,
|
| 731 |
block_size=training_args.block_size,
|
| 732 |
beta1=training_args.beta1,
|
| 733 |
beta2=training_args.beta2,
|
| 734 |
diagonal_epsilon=1e-10,
|
| 735 |
+
matrix_epsilon=1e-6,
|
| 736 |
+
start_preconditioning_step=max(
|
| 737 |
+
training_args.preconditioning_compute_steps + 1, 101
|
| 738 |
+
),
|
| 739 |
preconditioning_compute_steps=training_args.preconditioning_compute_steps,
|
| 740 |
statistics_compute_steps=1,
|
| 741 |
best_effort_shape_interpretation=True,
|
| 742 |
+
graft_type=graft_type,
|
| 743 |
nesterov=False,
|
| 744 |
exponent_override=0,
|
| 745 |
+
statistics_partition_spec=PartitionSpec(None, "dp", None),
|
| 746 |
+
preconditioner_partition_spec=PartitionSpec("dp", None, None),
|
| 747 |
num_devices_for_pjit=training_args.dp_devices,
|
| 748 |
shard_optimizer_states=True,
|
| 749 |
inverse_failure_threshold=0.1,
|
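A condensed sketch of how the new graft_type flag and the preconditioning start step fit together (assumptions: scalable_shampoo is importable as in this commit, make_shampoo is a hypothetical helper, and the remaining distributed_shampoo arguments keep their upstream defaults):

```python
from scalable_shampoo.distributed_shampoo import GraftingType, distributed_shampoo


def make_shampoo(learning_rate_fn, args):
    # Map the CLI string to the GraftingType enum, as in the diff above.
    graft_type = {
        "sgd": GraftingType.SGD,
        "adagrad": GraftingType.ADAGRAD,
        "rmsprop": GraftingType.RMSPROP,
        "rmsprop_normalized": GraftingType.RMSPROP_NORMALIZED,
        "sqrt_n": GraftingType.SQRT_N,
        "adagrad_normalized": GraftingType.ADAGRAD_NORMALIZED,
    }[args.graft_type]
    # Preconditioning starts no earlier than step 101 and only after at least
    # one refresh interval has passed, e.g. 10 -> 101, 150 -> 151.
    start_step = max(args.preconditioning_compute_steps + 1, 101)
    return distributed_shampoo(
        learning_rate_fn,
        block_size=args.block_size,
        graft_type=graft_type,
        start_preconditioning_step=start_step,
        preconditioning_compute_steps=args.preconditioning_compute_steps,
    )
```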
|
|
|
| 806 |
opt_state_spec = opt_fn.pspec_fn(
|
| 807 |
params=model.params,
|
| 808 |
params_partition_spec=param_spec,
|
| 809 |
+
partition_spec_for_statistics=PartitionSpec(None, "dp", None),
|
| 810 |
)
|
| 811 |
else:
|
| 812 |
raise NotImplementedError
|
|
|
|
| 817 |
# create a mesh
|
| 818 |
mesh_shape = (training_args.dp_devices, training_args.mp_devices)
|
| 819 |
devices = np.asarray(jax.devices()).reshape(*mesh_shape)
|
| 820 |
+
mesh = maps.Mesh(devices, ("dp", "mp"))
|
| 821 |
+
logger.info(f" Mesh shape: {mesh_shape}")
|
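A minimal sketch of the dp × mp mesh built here and of how a PartitionSpec refers to its axes (assumes 8 devices split 4 × 2 and the jax.experimental API used by this file):

```python
import numpy as np
import jax
from jax.experimental import PartitionSpec, maps

dp_devices, mp_devices = 4, 2  # assumes jax.device_count() == 8
devices = np.asarray(jax.devices()).reshape(dp_devices, mp_devices)
mesh = maps.Mesh(devices, ("dp", "mp"))

# Batches are sharded along "dp"; Shampoo statistics use the specs from the diff.
batch_spec = PartitionSpec("dp")
statistics_spec = PartitionSpec(None, "dp", None)
```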
| 822 |
|
| 823 |
# define state spec
|
| 824 |
state_spec = TrainState(
|
|
|
|
| 829 |
epoch=None,
|
| 830 |
train_time=None,
|
| 831 |
train_samples=None,
|
| 832 |
+
apply_fn=train_fn,
|
| 833 |
tx=optimizer,
|
| 834 |
)
|
| 835 |
|
| 836 |
+
# init params if not available yet
|
| 837 |
+
def maybe_init_params(params):
|
| 838 |
+
if model_args.model_name_or_path:
|
| 839 |
+
# model params are correctly loaded
|
| 840 |
+
return params
|
| 841 |
+
else:
|
| 842 |
+
# params have not been initialized yet
|
| 843 |
+
return model.init_weights()
|
| 844 |
+
|
| 845 |
with maps.mesh(mesh.devices, mesh.axis_names):
|
| 846 |
+
logger.info(" Creating state")
|
| 847 |
if not model_args.restore_state:
|
| 848 |
|
| 849 |
def init_state(params):
|
| 850 |
return TrainState.create(
|
| 851 |
+
apply_fn=train_fn,
|
| 852 |
tx=optimizer,
|
| 853 |
+
params=maybe_init_params(params),
|
| 854 |
dropout_rng=dropout_rng,
|
| 855 |
)
|
| 856 |
|
| 857 |
state = pjit(
|
| 858 |
init_state,
|
| 859 |
+
in_axis_resources=(param_spec,)
|
| 860 |
+
if model_args.model_name_or_path
|
| 861 |
+
else None,
|
| 862 |
out_axis_resources=state_spec,
|
| 863 |
donate_argnums=(0,),
|
| 864 |
+
)(model.params if model_args.model_name_or_path else None)
|
| 865 |
|
| 866 |
else:
|
| 867 |
# load opt_state
|
|
|
|
| 875 |
|
| 876 |
def restore_state(params, opt_state):
|
| 877 |
return TrainState(
|
| 878 |
+
apply_fn=train_fn,
|
| 879 |
tx=optimizer,
|
| 880 |
params=params,
|
| 881 |
opt_state=opt_state,
|
|
|
|
| 885 |
|
| 886 |
state = pjit(
|
| 887 |
restore_state,
|
| 888 |
+
in_axis_resources=(
|
| 889 |
+
param_spec,
|
| 890 |
+
opt_state_spec,
|
| 891 |
+
),
|
| 892 |
out_axis_resources=state_spec,
|
| 893 |
donate_argnums=(0, 1),
|
| 894 |
)(model.params, opt_state)
|
|
|
|
| 896 |
# remove opt_state from CPU
|
| 897 |
del opt_state
|
| 898 |
|
| 899 |
+
# free CPU memory
|
| 900 |
del model._params, opt_state_spec, opt_state_shape
|
| 901 |
|
| 902 |
# define batch specs
|
| 903 |
+
batch_spec = PartitionSpec("dp")
|
| 904 |
+
grad_batch_spec = PartitionSpec(None, "dp")
|
|
|
|
| 905 |
|
| 906 |
+
# define loss
|
| 907 |
def loss_fn(logits, labels):
|
| 908 |
loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))
|
| 909 |
loss = loss.mean()
|
| 910 |
return loss
|
| 911 |
|
| 912 |
+
# "vmap trick" avoids a crash when mp_devices > 1 (not sure why it happens)
|
| 913 |
+
# it also leads to better perf: see https://wandb.ai/dalle-mini/dalle-mini/reports/JAX-pmap-vs-pjit--VmlldzoxNDg1ODA2
|
| 914 |
+
use_vmap_trick = True
|
| 915 |
+
|
| 916 |
+
# make grad_param_spec for vmap
|
| 917 |
+
if use_vmap_trick:
|
| 918 |
+
grad_param_spec = jax.tree_map(
|
| 919 |
+
lambda x: PartitionSpec(*("dp",) + (x if x is not None else (None,))),
|
| 920 |
+
param_spec,
|
| 921 |
+
)
|
| 922 |
+
|
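Roughly what the "vmap trick" amounts to, as a simplified standalone sketch with made-up shapes and a toy loss (not the actual train_step): the local per-device axis is exposed explicitly, grad_fn is vmapped over it, and the per-device losses and gradients are averaged.

```python
import jax
import jax.numpy as jnp


def loss_fn(params, batch):
    # toy loss: mean squared error of a linear model
    preds = batch["x"] @ params["w"]
    return jnp.mean((preds - batch["y"]) ** 2)


grad_fn = jax.value_and_grad(loss_fn)


def vmap_trick_step(params, batch):
    # batch leaves have shape (local_dp_devices, per_device_bs, ...)
    loss, grads = jax.vmap(grad_fn, in_axes=(None, 0), out_axes=(0, 0))(params, batch)
    # average the per-device results, as the diff does before applying gradients
    return jax.tree_map(lambda x: jnp.mean(x, axis=0), (loss, grads))


params = {"w": jnp.ones((3, 1))}
batch = {"x": jnp.ones((2, 4, 3)), "y": jnp.zeros((2, 4, 1))}
loss, grads = vmap_trick_step(params, batch)
```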
| 923 |
# Define gradient update step fn
|
| 924 |
def train_step(state, batch, delta_time):
|
| 925 |
|
| 926 |
# get a minibatch (one gradient accumulation slice)
|
| 927 |
def get_minibatch(batch, grad_idx):
|
|
|
|
| 941 |
grad_fn = jax.value_and_grad(compute_loss)
|
| 942 |
|
| 943 |
def loss_and_grad(grad_idx, dropout_rng):
|
| 944 |
+
# minibatch at grad_idx for gradient accumulation (None otherwise)
|
| 945 |
+
minibatch = (
|
| 946 |
+
get_minibatch(batch, grad_idx) if grad_idx is not None else batch
|
| 947 |
)
|
| 948 |
+
# ensure it is sharded properly
|
| 949 |
+
minibatch = with_sharding_constraint(minibatch, batch_spec)
|
| 950 |
+
# only 1 rng per grad step: lets us handle a larger batch size (not sure why)
|
| 951 |
+
dropout_rng, _ = jax.random.split(dropout_rng)
|
| 952 |
+
|
| 953 |
+
if use_vmap_trick:
|
| 954 |
+
# "vmap trick", calculate loss and grads independently per dp_device
|
| 955 |
+
loss, grads = jax.vmap(
|
| 956 |
+
grad_fn, in_axes=(None, 0, None), out_axes=(0, 0)
|
| 957 |
+
)(state.params, minibatch, dropout_rng)
|
| 958 |
+
# ensure they are sharded correctly
|
| 959 |
+
loss = with_sharding_constraint(loss, batch_spec)
|
| 960 |
+
grads = with_sharding_constraint(grads, grad_param_spec)
|
| 961 |
+
# average across all devices
|
| 962 |
+
# Note: we could average per device only after gradient accumulation, right before params update
|
| 963 |
+
loss, grads = jax.tree_map(lambda x: jnp.mean(x, axis=0), (loss, grads))
|
| 964 |
+
else:
|
| 965 |
+
# "vmap trick" does not work in multi-hosts and requires too much hbm
|
| 966 |
+
loss, grads = grad_fn(state.params, minibatch, dropout_rng)
|
| 967 |
+
# ensure grads are sharded
|
| 968 |
+
grads = with_sharding_constraint(grads, param_spec)
|
| 969 |
# return loss and grads
|
| 970 |
+
return loss, grads, dropout_rng
|
| 971 |
|
| 972 |
if training_args.gradient_accumulation_steps == 1:
|
| 973 |
+
loss, grads, dropout_rng = loss_and_grad(None, state.dropout_rng)
|
| 974 |
else:
|
| 975 |
# create initial state for cumul_minibatch_step loop
|
| 976 |
init_minibatch_step = (
|
| 977 |
+
0.0,
|
| 978 |
+
with_sharding_constraint(
|
| 979 |
+
jax.tree_map(jnp.zeros_like, state.params), param_spec
|
| 980 |
),
|
| 981 |
state.dropout_rng,
|
| 982 |
)
|
| 983 |
|
| 984 |
# accumulate gradients
|
| 985 |
def cumul_minibatch_step(grad_idx, cumul_loss_grad_dropout):
|
| 986 |
+
cumul_loss, cumul_grads, dropout_rng = cumul_loss_grad_dropout
|
| 987 |
+
loss, grads, dropout_rng = loss_and_grad(grad_idx, dropout_rng)
|
| 988 |
+
cumul_loss, cumul_grads = jax.tree_map(
|
| 989 |
+
jnp.add, (cumul_loss, cumul_grads), (loss, grads)
|
| 990 |
+
)
|
| 991 |
+
cumul_grads = with_sharding_constraint(cumul_grads, param_spec)
|
| 992 |
+
return cumul_loss, cumul_grads, dropout_rng
|
| 993 |
|
| 994 |
# loop over gradients
|
| 995 |
+
loss, grads, dropout_rng = jax.lax.fori_loop(
|
| 996 |
0,
|
| 997 |
training_args.gradient_accumulation_steps,
|
| 998 |
cumul_minibatch_step,
|
| 999 |
init_minibatch_step,
|
| 1000 |
)
|
| 1001 |
+
grads = with_sharding_constraint(grads, param_spec)
|
| 1002 |
# sum -> mean
|
| 1003 |
+
loss, grads = jax.tree_map(
|
| 1004 |
+
lambda x: x / training_args.gradient_accumulation_steps, (loss, grads)
|
| 1005 |
)
|
| 1006 |
|
| 1007 |
# update state
|
| 1008 |
+
grads = with_sharding_constraint(grads, param_spec)
|
| 1009 |
state = state.apply_gradients(
|
| 1010 |
grads=grads,
|
| 1011 |
dropout_rng=dropout_rng,
|
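A standalone sketch of the gradient-accumulation pattern above, simplified to a toy loss (illustrative only): jax.lax.fori_loop sums losses and gradients over minibatch slices, then the sums are divided by the number of accumulation steps.

```python
import jax
import jax.numpy as jnp


def loss_fn(params, minibatch):
    preds = minibatch["x"] @ params["w"]
    return jnp.mean((preds - minibatch["y"]) ** 2)


grad_fn = jax.value_and_grad(loss_fn)


def accumulate(params, batch, accumulation_steps):
    # batch leaves carry a leading axis of length accumulation_steps
    def body(grad_idx, carry):
        cumul_loss, cumul_grads = carry
        minibatch = jax.tree_map(lambda x: x[grad_idx], batch)
        loss, grads = grad_fn(params, minibatch)
        return jax.tree_map(jnp.add, (cumul_loss, cumul_grads), (loss, grads))

    init = (jnp.zeros(()), jax.tree_map(jnp.zeros_like, params))
    loss, grads = jax.lax.fori_loop(0, accumulation_steps, body, init)
    # sum -> mean, as in the diff
    return jax.tree_map(lambda x: x / accumulation_steps, (loss, grads))


params = {"w": jnp.ones((3, 1))}
batch = {"x": jnp.ones((4, 8, 3)), "y": jnp.zeros((4, 8, 1))}
loss, grads = accumulate(params, batch, accumulation_steps=4)
```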
|
|
|
| 1022 |
|
| 1023 |
# Define eval fn
|
| 1024 |
def eval_step(state, batch):
|
| 1025 |
def compute_eval_loss(batch):
|
| 1026 |
batch, labels = batch.pop("labels")
|
| 1027 |
logits = eval_fn(**batch, params=state.params, train=False)[0]
|
| 1028 |
return loss_fn(logits, labels)
|
| 1029 |
|
| 1030 |
+
if use_vmap_trick:
|
| 1031 |
+
loss = jax.vmap(compute_eval_loss)(batch)
|
| 1032 |
+
# ensure they are sharded correctly
|
| 1033 |
+
loss = with_sharding_constraint(loss, batch_spec)
|
| 1034 |
+
# average across all devices
|
| 1035 |
+
loss = jnp.mean(loss)
|
| 1036 |
+
else:
|
| 1037 |
+
loss = compute_eval_loss(batch)
|
| 1038 |
+
|
| 1039 |
return loss
|
| 1040 |
|
| 1041 |
# Create parallel version of the train and eval step
|
| 1042 |
p_train_step = pjit(
|
| 1043 |
train_step,
|
| 1044 |
+
in_axis_resources=(
|
| 1045 |
+
state_spec,
|
| 1046 |
+
grad_batch_spec
|
| 1047 |
+
if training_args.gradient_accumulation_steps > 1
|
| 1048 |
+
else batch_spec,
|
| 1049 |
+
None,
|
| 1050 |
+
),
|
| 1051 |
out_axis_resources=(state_spec, None),
|
| 1052 |
donate_argnums=(0,),
|
| 1053 |
)
|
|
|
|
| 1063 |
step = int(state.step)
|
| 1064 |
metrics_logger = MetricsLogger(step)
|
| 1065 |
epochs = tqdm(
|
| 1066 |
+
range(state.epoch, num_epochs),
|
| 1067 |
+
desc=f"Epoch ... (1/{num_epochs})",
|
| 1068 |
+
position=0,
|
| 1069 |
+
disable=jax.process_index() > 0,
|
| 1070 |
)
|
| 1071 |
|
| 1072 |
def run_evaluation():
|
|
|
|
| 1085 |
position=2,
|
| 1086 |
leave=False,
|
| 1087 |
total=eval_steps,
|
| 1088 |
+
disable=jax.process_index() > 0,
|
| 1089 |
):
|
| 1090 |
# need to keep only eval_batch_size_per_node items relevant to the node
|
| 1091 |
batch = jax.tree_map(
|
|
|
|
| 1095 |
batch,
|
| 1096 |
)
|
| 1097 |
batch = jax.tree_map(lambda x: x[jax.process_index()], batch)
|
| 1098 |
+
|
| 1099 |
+
# add dp dimension when using "vmap trick"
|
| 1100 |
+
if use_vmap_trick:
|
| 1101 |
+
bs_shape = (
|
| 1102 |
+
jax.local_device_count() // training_args.mp_devices,
|
| 1103 |
+
training_args.per_device_eval_batch_size,
|
| 1104 |
+
)
|
| 1105 |
+
batch = jax.tree_map(
|
| 1106 |
+
lambda x: x.reshape(bs_shape + x.shape[1:]), batch
|
| 1107 |
+
)
|
| 1108 |
+
|
| 1109 |
# freeze batch to pass safely to jax transforms
|
| 1110 |
batch = freeze(batch)
|
| 1111 |
# accumulate losses async
|
|
|
|
| 1222 |
)
|
| 1223 |
wandb.run.log_artifact(artifact_state)
|
| 1224 |
|
| 1225 |
+
logger.info(" Ready to start training")
|
| 1226 |
with maps.mesh(mesh.devices, mesh.axis_names):
|
| 1227 |
for epoch in epochs:
|
| 1228 |
state.replace(epoch=epoch)
|
|
|
|
| 1243 |
position=1,
|
| 1244 |
leave=False,
|
| 1245 |
total=steps_per_epoch,
|
| 1246 |
+
disable=jax.process_index() > 0,
|
| 1247 |
):
|
| 1248 |
# calculate delta time (we have a lag of one step but it's ok)
|
| 1249 |
new_time = time.perf_counter()
|
| 1250 |
delta_time = new_time - last_time
|
| 1251 |
last_time = new_time
|
| 1252 |
|
| 1253 |
+
# set correct shape to batch
|
| 1254 |
+
# - add grad_step dim if gradient_accumulation_steps > 1
|
| 1255 |
+
# - split per dp device if not multi-host for vmap trick (does not work in multi-host)
|
| 1256 |
+
bs_shape = (
|
| 1257 |
+
(batch_size_per_node_per_grad_step,)
|
| 1258 |
+
if not use_vmap_trick
|
| 1259 |
+
else (
|
| 1260 |
+
jax.local_device_count()
|
| 1261 |
+
// training_args.mp_devices, # local dp devices
|
| 1262 |
+
training_args.per_device_train_batch_size,
|
| 1263 |
+
)
|
| 1264 |
+
)
|
| 1265 |
+
if training_args.gradient_accumulation_steps > 1:
|
| 1266 |
+
# reshape data into (gradient_accumulation_steps, batch_per_node, ...)
|
| 1267 |
+
# to avoid any data redistribution when sharding
|
| 1268 |
+
bs_shape = (training_args.gradient_accumulation_steps,) + bs_shape
|
| 1269 |
+
|
| 1270 |
+
# reshape batch
|
| 1271 |
batch = jax.tree_map(
|
| 1272 |
+
lambda x: x.reshape(bs_shape + x.shape[1:]),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1273 |
batch,
|
| 1274 |
)
|
| 1275 |
# freeze batch to pass safely to jax transforms
|