From af842a18edf2bc61b16c23ea44bb187f6d080c48 Mon Sep 17 00:00:00 2001
From: anicolson
Date: Mon, 12 Sep 2022 10:21:40 +1000
Subject: [PATCH] Fixes for /~https://github.com/csiro-mlai/dl_hpc_starter_pack/issues/6
 and /~https://github.com/csiro-mlai/dl_hpc_starter_pack/issues/3. Also fixed the
 entrypoint. Modified cluster.py to handle entrypoints.

---
 README.md                               | 101 ++++++++++--------------
 setup.cfg                               |   9 ++-
 src/dlhpcstarter/__main__.py            |   6 ++
 src/dlhpcstarter/cluster.py             |  13 ++-
 task/cifar10/config/baseline_local.yaml |   4 +
 task/cifar10/config/resnet18_local.yaml |   6 ++
 task/cifar10/stages.py                  |   1 +
 7 files changed, 73 insertions(+), 67 deletions(-)
 create mode 100644 task/cifar10/config/baseline_local.yaml
 create mode 100644 task/cifar10/config/resnet18_local.yaml

diff --git a/README.md b/README.md
index f14bcdf..870ad8a 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,17 @@
 - [Neptune.ai](https://neptune.ai/) is used to track experiments; metric scores are automatically uploaded to [Neptune.ai](https://neptune.ai/), allowing you to easily track your experiments from your browser.
 - Scripts for submission to a cluster manager, such as [SLURM](https://slurm.schedmd.com/documentation.html) are written for you. Also, cluster manager jobs are automatically resubmitted and resumed if they haven't finished before the time-limit.
 
+# Installation
+
+The Deep Learning and HPC starter pack is available on PyPI:
+```shell
+pip install dlhpcstarter
+```
+
 # Table of Contents
 
-- [Repository map](#repository-map)
+- [How to structure your project](#how-to-structure-your-project)
+- [Package map](#package-map)
 - [Tasks](#tasks)
 - [Models](#models)
 - [Innovate via Model Composition and Inheritance](#innovate-via-model-composition-and-inheritance)
@@ -29,51 +36,44 @@
 - [Stages and Trainer](#stages-and-trainer)
-- [Tying it all together: `main.py`](#tying-it-all-together-mainpy)
+- [Tying it all together: `dlhpcstarter`](#tying-it-all-together-dlhpcstarter)
 - [Cluster manager and distributed computing](#cluster-manager-and-distributed-computing)
-- [Installing required packages in a `venv`](#installing-required-packages-in-a-venv)
 - [Monitoring using Neptune.ai](#monitoring-using-neptuneai)
 - [Where all the outputs go: `exp_dir`](#where-all-the-outputs-go-exp_dir)
 - [Repository Wish List](#repository-wish-list)
 
-# Repository map
+# How to structure your project
 
 ---
 
-
-
-Overview of the repository. ***The most important parts are: `task`, `config`, `models`, and `stages`.***
+Your project will have a `task` directory containing each of your tasks, e.g., `cifar10`. Each task has a set of configurations and models, stored in its `config` and `models` directories, respectively, as well as a `stages.py` module that defines the stages of model development, e.g., training and testing. A minimal `stages.py` is sketched after the map below.
 
 ```
-├── src
-│   │
-│   └── tools - for all other modules; tools that are repeadetly used.
-│   └── cluster.py - contains the cluster management object.
-│   └── command_line_arguments.py - argparse for reading command line arguments.
-│   └── trainer.py - contains a wrapper for pytorch_lightning.Trainer.
-│   └── utils.py - small utility definitions.
-│
-│
-│
-│
 ├── task
 │   │
 │   └── TASK_NAME - name of the task, e.g., cifar10.
 │   └── config - .yaml configuration files for a model.
 │   └── models - .py modules that contain pytorch_lightning.LightningModule definitions that represent models.
 │   └── stages.py - training and testing stages for a task.
-│
-│
-│
-│
-├── main.py - main.py does the following:
-│    1. Reads command line arguments using argparse.
-│    2. Imports the 'stages' function for the task from task/TASK_NAME/stages.py.
-│    3. Loads the specified configuration .yaml for the job from task/TASK_NAME/config.
-│    4. Submits the job (the configuration + 'stages') to the cluster manager (or runs it locally if 'submit' is false).
-│
-│
-│
-│
-└── requirements.txt - Packages required by the library (pip install -r requirements.txt).
 ```
 
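+Each `stages.py` exposes a `stages` function that receives the job configuration as an `argparse.Namespace`. A minimal sketch (assuming the model is imported directly, and that `trainer_instance`, the `pytorch_lightning.Trainer` wrapper shown later in this README, lives in `dlhpcstarter.trainer`):
+
+```python
+# task/TASK_NAME/stages.py (illustrative)
+from argparse import Namespace
+
+from dlhpcstarter.trainer import trainer_instance
+
+from .models.resnet18 import ResNet18  # your pytorch_lightning.LightningModule
+
+
+def stages(args: Namespace):
+    model = ResNet18(**vars(args))            # construct the model from the configuration
+    trainer = trainer_instance(**vars(args))  # wrapper around pytorch_lightning.Trainer
+    trainer.fit(model)                        # training stage
+    trainer.test(model)                       # testing stage
+```
+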
+# Package map
+
+---
+
+The package is structured as follows:
+
+```
+├── dlhpcstarter
+│   │
+│   ├── tools - for all other modules; tools that are repeatedly used.
+│   ├── __main__.py - __main__.py does the following:
+│   │    1. Reads command line arguments using argparse.
+│   │    2. Imports the 'stages' function for the task from task/TASK_NAME/stages.py.
+│   │    3. Loads the specified configuration .yaml for the job from task/TASK_NAME/config.
+│   │    4. Submits the job (the configuration + 'stages') to the cluster manager (or runs it locally if 'submit' is false).
+│   ├── cluster.py - contains the cluster management object.
+│   ├── command_line_arguments.py - argparse for reading command line arguments.
+│   ├── trainer.py - contains a wrapper for pytorch_lightning.Trainer.
+│   └── utils.py - small utility definitions.
+
+```
 
 # Tasks
 
@@ -268,7 +268,7 @@ Currently, there are two methods for giving arguments:
 ***`task` and `config` must be given as command line arguments for `argparse`:***
 
 ```shell
-python3 main.py --config baseline --task cifar10
+dlhpcstarter --config baseline --task cifar10
 ```
 
 ***`module`, `definition`, and `exp_dir` can be given either as command line arguments, or be placed in the configuration file.***
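+
+For example, in the configuration file (illustrative values; `module` names the .py file in `task/TASK_NAME/models`, and `definition` names the class within it):
+
+```yaml
+module: resnet18
+definition: ResNet18
+exp_dir: /my/experiments/directory
+```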
@@ -317,7 +317,7 @@ dataset_dir: /my/datasets/directory
 ```
 
 ```shell
-python3 main.py --config baseline_rev_a --task cifar10
+dlhpcstarter --config baseline_rev_a --task cifar10
 ```
 
 # Next level: Configuration composition via Hydra
@@ -474,17 +474,19 @@ trainer = trainer_instance(**vars(args))
 
 Place any of the parameters for the trainer detailed at https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#trainer-class-api in your configuration file, and they will be passed to the `pytorch_lightning.Trainer` instance.
 
-# Tying it all together: `main.py`
+# Tying it all together: `dlhpcstarter`
 
 ---
 
-***This is an overview of what occurs in `main.py`, you do not need to modify it.***
+***This is an overview of what occurs when the `dlhpcstarter` entrypoint is executed; it is not necessary to understand this in order to use the package.***
+
+
 
-The main function does the following:
+`dlhpcstarter` does the following:
 - Gets the command line arguments using `argparse`, e.g., arguments like this:
   ```shell
-  python3 main.py --config baseline --task cifar10
+  dlhpcstarter --config baseline --task cifar10
   ```
-- Imports the `stages` definition for the task using `src.utils.importer`.
+- Imports the `stages` definition for the task using `dlhpcstarter.utils.importer`.
 - Reads the configuration `.yaml` and combines it with the command line arguments.
@@ -518,7 +520,7 @@ The following arguments are used to configure a job for a cluster manager (the d
 ***These can be given as command line arguments:***
 
 ```shell
-    python3 main.py --config baseline --task cifar10 --submit 1 --num-gpus 4 --num-workers 5 --memory 32GB
+    dlhpcstarter --config baseline --task cifar10 --submit 1 --num-gpus 4 --num-workers 5 --memory 32GB
 ```
 
 ***Or they can be placed in the configuration `.yaml` file:***
@@ -543,29 +545,10 @@ dataset_dir: /my/datasets/directory
 ```
 And executed with:
 ```shell
-    python3 main.py --config baseline --task cifar10 --submit True
+    dlhpcstarter --config baseline --task cifar10 --submit True
 ```
 
-# Installing required packages in a `venv`
-
-Set the following variables:
-```shell
-ENV_NAME = /my/env/name
-REQ_PATH = /my/repositories/path/dl_hpc_starter_pack/requirements.txt
-```
-Note:
- - `ENV_NAME` can be of your choosing.
- - `REQ_PATH` is the path of the `requirements.txt` in this repository.
-
-Then run the following with the `python` version of your choosing (most HPCs have `python` available as a module package: `module load python`):
-```
-python3 -m venv --system-site-packages $ENV_NAME
-source $ENV_NAME/bin/activate
-pip install --upgrade pip
-pip install --upgrade -r $REQ_PATH --no-cache-dir
-```
-
-If using a cluster manager, add the path to the `bin/activate` of the venv:
+If using a cluster manager, add the path to the `bin/activate` of your virtual environment:
 ```yaml
 ...
 venv_path: /my/env/name/bin/activate
diff --git a/setup.cfg b/setup.cfg
index 5273fab..ead5fd8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = dlhpcstarter
-version = 0.0.0
+version = 0.0.1
 description = Deep Learning and HPC Starter Pack
 long_description = file: README.md
 long_description_content_type = text/markdown; charset=UTF-8
@@ -22,6 +22,7 @@ install_requires =
     numpy>=1.21
     pandas>=1.3
     pytorch-lightning>=1.7
+    rich>=12.5
     scipy>=1.7
     torch>=1.12
     torchmetrics>=0.9
@@ -34,6 +35,6 @@ test = pytest
 [options.packages.find]
 where=src
 
-;[options.entry_points]
-;console_scripts =
-;    dlhpcstarter = src.main:main
+[options.entry_points]
+console_scripts =
+    dlhpcstarter = dlhpcstarter.__main__:main
diff --git a/src/dlhpcstarter/__main__.py b/src/dlhpcstarter/__main__.py
index 82eb615..b9de20e 100644
--- a/src/dlhpcstarter/__main__.py
+++ b/src/dlhpcstarter/__main__.py
@@ -1,3 +1,4 @@
+import os
 from argparse import Namespace
 from dlhpcstarter.command_line_arguments import read_command_line_arguments
 from dlhpcstarter.utils import importer, gpu_usage_and_visibility, load_config_and_update_args
@@ -77,6 +78,8 @@ def submit(stages_fnc: Callable, args: Namespace):
         num_nodes=args.num_nodes,
         num_workers=args.num_workers,
         memory=args.memory,
+        python_cmd='',
+        entrypoint='dlhpcstarter',
     )
 
     # Cluster commands
@@ -85,6 +88,9 @@ def submit(stages_fnc: Callable, args: Namespace):
     # Source virtual environment
     cluster.add_command('source ' + args.venv_path)
 
+    # NCCL debug flag
+    cluster.add_command('export NCCL_DEBUG=INFO')
+
     # Request the quality of service for the job
     if args.qos:
         cluster.add_manager_cmd(cmd='qos', value=args.qos)
diff --git a/src/dlhpcstarter/cluster.py b/src/dlhpcstarter/cluster.py
index f86f2b6..36d952a 100644
--- a/src/dlhpcstarter/cluster.py
+++ b/src/dlhpcstarter/cluster.py
@@ -1,6 +1,6 @@
 from argparse import Namespace
 from subprocess import call
-from typing import Callable
+from typing import Callable, Optional
 import datetime
 import os
 import signal
@@ -28,6 +28,7 @@ def __init__(
         memory: str = '16GB',
         no_srun: bool = False,
         python_cmd: str = 'python3',
+        entrypoint: Optional[str] = None,
         resubmit: bool = True,
     ):
         """
@@ -46,6 +47,7 @@ def __init__(
             memory - minimum memory amount.
             no_srun - don't use 'srun'.
             python_cmd - python command name.
+            entrypoint - entrypoint to use instead of a script.
             resubmit - automatically resubmit job just before timout.
         """
         self.fnc = fnc
@@ -62,6 +64,7 @@ def __init__(
         self.memory = memory
         self.no_srun = no_srun
         self.python_cmd = python_cmd
+        self.entrypoint = entrypoint
         self.resubmit = resubmit
 
         self.script_name = os.path.realpath(sys.argv[0])
@@ -230,10 +233,12 @@ def build_slurm_command(self, manager_cmd_script_path, timestamp, session):
         args_string = self.args_to_string(self.fnc_kwargs)
         args_string = '{} --{} {}'.format(args_string, "slurm_cmd_path", manager_cmd_script_path)
 
-        if self.no_srun:
-            cmd = '{} {} {}'.format(self.python_cmd, self.script_name, args_string)
+        if self.entrypoint:
+            cmd = f'{self.entrypoint} {args_string}'
         else:
-            cmd = 'srun {} {} {}'.format(self.python_cmd, self.script_name, args_string)
+            cmd = '{} {} {}'.format(self.python_cmd, self.script_name, args_string)
+        if not self.no_srun:
+            cmd = 'srun ' + cmd
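+        # e.g. (illustrative), with entrypoint='dlhpcstarter' and no_srun=False this yields:
+        #   srun dlhpcstarter <args> --slurm_cmd_path <manager_cmd_script_path>
+        # with no entrypoint, it falls back to the script form:
+        #   srun python3 /path/to/script.py <args> --slurm_cmd_path <manager_cmd_script_path>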
         sub_commands.append(cmd)
 
         return '\n'.join(sub_commands)
diff --git a/task/cifar10/config/baseline_local.yaml b/task/cifar10/config/baseline_local.yaml
new file mode 100644
index 0000000..d5c11e3
--- /dev/null
+++ b/task/cifar10/config/baseline_local.yaml
@@ -0,0 +1,4 @@
+defaults:
+  - baseline
+  - paths/local_paths@_global_
+  - _self_
diff --git a/task/cifar10/config/resnet18_local.yaml b/task/cifar10/config/resnet18_local.yaml
new file mode 100644
index 0000000..1522d0a
--- /dev/null
+++ b/task/cifar10/config/resnet18_local.yaml
@@ -0,0 +1,6 @@
+defaults:
+  - baseline_local
+  - _self_
+
+module: resnet18
+definition: ResNet18
diff --git a/task/cifar10/stages.py b/task/cifar10/stages.py
index 123130e..44c2b41 100644
--- a/task/cifar10/stages.py
+++ b/task/cifar10/stages.py
@@ -1,3 +1,4 @@
+import os
 from argparse import Namespace
 from pytorch_lightning.utilities.seed import seed_everything
 from dlhpcstarter.tools.ext.collect_env_details import main as collect_env_details
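+
+# NOTE (illustrative): the 'stages' function defined below is imported and run by
+# the 'dlhpcstarter' entrypoint, e.g.:
+#   dlhpcstarter --task cifar10 --config resnet18_local
+# where resnet18_local composes baseline_local (and its local paths group) via Hydra.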