Spaces:

flax-community
/

dalle-mini

Running

boris committed on Oct 24, 2021

Commit

e2400cc

1 Parent(s): 36cb737

fix: OOM with checkpoints

Files changed (1) hide show

dev/seq2seq/run_seq2seq_flax.py CHANGED Viewed

@@ -262,15 +262,15 @@ class TrainState(train_state.TrainState):
     def restore_state(self, artifact_dir):
         # restore optimizer state
         with (Path(artifact_dir) / "opt_state.msgpack").open("rb") as f:
-            opt_state = from_bytes(self.opt_state, f.read())
         # restore steps
         with (Path(artifact_dir) / "training_state.json").open("r") as f:
             training_state = json.load(f)
-        step = training_state["step"]
         # replace state
-        return self.replace(step=step, opt_state=opt_state)
 class CustomFlaxBartModule(FlaxBartModule):
@@ -802,6 +802,7 @@ def main():
     # Replicate the train state on each device
     state = state.replicate()
     logger.info("***** Running training *****")
     logger.info(f"  Num examples = {len_train_dataset}")

     def restore_state(self, artifact_dir):
         # restore optimizer state
         with (Path(artifact_dir) / "opt_state.msgpack").open("rb") as f:
+            new_opt_state = from_bytes(self.opt_state, f.read())
         # restore steps
         with (Path(artifact_dir) / "training_state.json").open("r") as f:
             training_state = json.load(f)
+        new_step = training_state["step"]
         # replace state
+        return self.replace(step=new_step, opt_state=new_opt_state)
 class CustomFlaxBartModule(FlaxBartModule):
     # Replicate the train state on each device
     state = state.replicate()
+    del model._params
     logger.info("***** Running training *****")
     logger.info(f"  Num examples = {len_train_dataset}")