# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import logging
import os
from pathlib import Path
from typing import List, Optional

import submitit

from dinov2.utils.cluster import (
    get_slurm_executor_parameters,
    get_slurm_partition,
    get_user_checkpoint_path,
)


logger = logging.getLogger("dinov2")


def get_args_parser(
    description: Optional[str] = None,
    parents: Optional[List[argparse.ArgumentParser]] = None,
    add_help: bool = True,
) -> argparse.ArgumentParser:
    parents = parents or []
    slurm_partition = get_slurm_partition()
    parser = argparse.ArgumentParser(
        description=description,
        parents=parents,
        add_help=add_help,
    )
    parser.add_argument(
        "--ngpus",
        "--gpus",
        "--gpus-per-node",
        default=8,
        type=int,
        help="Number of GPUs to request on each node",
    )
    parser.add_argument(
        "--nodes",
        "--nnodes",
        default=2,
        type=int,
        help="Number of nodes to request",
    )
    parser.add_argument(
        "--timeout",
        default=2800,
        type=int,
        help="Duration of the job",
    )
    parser.add_argument(
        "--partition",
        default=slurm_partition,
        type=str,
        help="Partition where to submit",
    )
    parser.add_argument(
        "--use-volta32",
        action="store_true",
        help="Request V100-32GB GPUs",
    )
    parser.add_argument(
        "--comment",
        default="",
        type=str,
        help="Comment to pass to scheduler, e.g. priority message",
    )
    parser.add_argument(
        "--exclude",
        default="",
        type=str,
        help="Nodes to exclude",
    )
    return parser
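

# Illustrative sketch, not part of the original module: the `parents` argument is meant to
# receive task-specific parsers so that launcher flags (--nodes, --ngpus, ...) and task
# flags are parsed together. The --output-dir flag below is an assumption for the example;
# submit_jobs expects args.output_dir to exist on the parsed namespace.
def _example_build_parser() -> argparse.ArgumentParser:
    task_parser = argparse.ArgumentParser(add_help=False)
    task_parser.add_argument("--output-dir", default="", type=str, help="Output directory")
    return get_args_parser(description="example launcher", parents=[task_parser])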


def get_shared_folder() -> Path:
    user_checkpoint_path = get_user_checkpoint_path()
    if user_checkpoint_path is None:
        raise RuntimeError("Path to user checkpoint cannot be determined")
    path = user_checkpoint_path / "experiments"
    path.mkdir(exist_ok=True)
    return path


def submit_jobs(task_class, args, name: str):
    # Default the output directory to a per-job folder under the shared experiments path;
    # the "%j" placeholder is substituted with the Slurm job id once it is known.
    if not args.output_dir:
        args.output_dir = str(get_shared_folder() / "%j")

    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30)

    # Optional Slurm parameters, only forwarded when the corresponding flags were provided
    kwargs = {}
    if args.use_volta32:
        kwargs["slurm_constraint"] = "volta32gb"
    if args.comment:
        kwargs["slurm_comment"] = args.comment
    if args.exclude:
        kwargs["slurm_exclude"] = args.exclude

    executor_params = get_slurm_executor_parameters(
        nodes=args.nodes,
        num_gpus_per_node=args.ngpus,
        timeout_min=args.timeout,  # max is 60 * 72
        slurm_signal_delay_s=120,
        slurm_partition=args.partition,
        **kwargs,
    )
    executor.update_parameters(name=name, **executor_params)

    task = task_class(args)
    job = executor.submit(task)

    logger.info(f"Submitted job_id: {job.job_id}")
    str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id))
    logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}")