From af842a18edf2bc61b16c23ea44bb187f6d080c48 Mon Sep 17 00:00:00 2001
From: anicolson
Date: Mon, 12 Sep 2022 10:21:40 +1000
Subject: [PATCH] Fixes for /~https://github.com/csiro-mlai/dl_hpc_starter_pack/issues/6
 and /~https://github.com/csiro-mlai/dl_hpc_starter_pack/issues/3. Also fixed the
 entrypoint. Modified cluster.py to handle entrypoints.

---
 README.md                               | 101 ++++++++++--------------
 setup.cfg                               |   9 ++-
 src/dlhpcstarter/__main__.py            |   6 ++
 src/dlhpcstarter/cluster.py             |  13 ++-
 task/cifar10/config/baseline_local.yaml |   4 +
 task/cifar10/config/resnet18_local.yaml |   6 ++
 task/cifar10/stages.py                  |   1 +
 7 files changed, 73 insertions(+), 67 deletions(-)
 create mode 100644 task/cifar10/config/baseline_local.yaml
 create mode 100644 task/cifar10/config/resnet18_local.yaml

diff --git a/README.md b/README.md
index f14bcdf..870ad8a 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,17 @@
 - [Neptune.ai](https://neptune.ai/) is used to track experiments; metric scores are automatically uploaded to [Neptune.ai](https://neptune.ai/), allowing you to easily track your experiments from your browser.
 - Scripts for submission to a cluster manager, such as [SLURM](https://slurm.schedmd.com/documentation.html) are written for you. Also, cluster manager jobs are automatically resubmitted and resumed if they haven't finished before the time-limit.
 
+# Installation
+
+The Deep Learning and HPC starter pack is available on PyPI:
+```shell
+pip install dlhpcstarter
+```
+
 # Table of Contents
 
-- [Repository map](#repository-map)
+- [How to structure your project](#how-to-structure-your-project)
+- [Package map](#package-map)
 - [Tasks](#tasks)
 - [Models](#models)
 - [Innovate via Model Composition and Inheritance](#innovate-via-model-composition-and-inheritance)
@@ -29,51 +36,44 @@
 - [Stages and Trainer](#stages-and-trainer)
-- [Tying it all together: `main.py`](#tying-it-all-together-mainpy)
+- [Tying it all together: `dlhpcstarter`](#tying-it-all-together-dlhpcstarter)
 - [Cluster manager and distributed computing](#cluster-manager-and-distributed-computing)
-- [Installing required packages in a `venv`](#installing-required-packages-in-a-venv)
 - [Monitoring using Neptune.ai](#monitoring-using-neptuneai)
 - [Where all the outputs go: `exp_dir`](#where-all-the-outputs-go-exp_dir)
 - [Repository Wish List](#repository-wish-list)
 
-# Repository map
+# How to structure your project
 
 ---
 
-
-
-Overview of the repository. ***The most important parts are: `task`, `config`, `models`, and `stages`.***
+Your project will have a `task` directory containing each of your tasks, e.g., `cifar10`. Each task has a set of configurations and models, stored in its `config` and `models` directories, respectively, as well as a `stages.py` module that defines the stages of model development, e.g., training and testing. A minimal `stages.py` is sketched after the map below.
 
 ```
-├── src
-│   │
-│   └── tools - for all other modules; tools that are repeadetly used.
-│   └── cluster.py - contains the cluster management object.
-│   └── command_line_arguments.py - argparse for reading command line arguments.
-│   └── trainer.py - contains a wrapper for pytorch_lightning.Trainer.
-│   └── utils.py - small utility definitions.
-│
-│
-│
-│
 ├── task
 │   │
 │   └── TASK_NAME - name of the task, e.g., cifar10.
 │   └── config - .yaml configuration files for a model.
 │   └── models - .py modules that contain pytorch_lightning.LightningModule definitions that represent models.
 │   └── stages.py - training and testing stages for a task.
-│
-│
-│
-│
-├── main.py - main.py does the following:
-│    1. Reads command line arguments using argparse.
-│    2. Imports the 'stages' function for the task from task/TASK_NAME/stages.py.
-│    3. Loads the specified configuration .yaml for the job from task/TASK_NAME/config.
-│    4. Submits the job (the configuration + 'stages') to the cluster manager (or runs it locally if 'submit' is false).
-│
-│
-│
-│
-└── requirements.txt - Packages required by the library (pip install -r requirements.txt).
 ```
 
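+Each `stages.py` exposes a `stages` function that receives the job configuration as an `argparse.Namespace`. A minimal sketch (assuming the model is imported directly, and that `trainer_instance`, the `pytorch_lightning.Trainer` wrapper shown later in this README, lives in `dlhpcstarter.trainer`):
+
+```python
+# task/TASK_NAME/stages.py (illustrative)
+from argparse import Namespace
+
+from dlhpcstarter.trainer import trainer_instance
+
+from .models.resnet18 import ResNet18  # your pytorch_lightning.LightningModule
+
+
+def stages(args: Namespace):
+    model = ResNet18(**vars(args))            # construct the model from the configuration
+    trainer = trainer_instance(**vars(args))  # wrapper around pytorch_lightning.Trainer
+    trainer.fit(model)                        # training stage
+    trainer.test(model)                       # testing stage
+```
+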
+# Package map
+
+---
+
+The package is structured as follows:
+
+```
+├── dlhpcstarter
+│   │
+│   ├── tools - for all other modules; tools that are repeatedly used.
+│   ├── __main__.py - __main__.py does the following:
+│   │    1. Reads command line arguments using argparse.
+│   │    2. Imports the 'stages' function for the task from task/TASK_NAME/stages.py.
+│   │    3. Loads the specified configuration .yaml for the job from task/TASK_NAME/config.
+│   │    4. Submits the job (the configuration + 'stages') to the cluster manager (or runs it locally if 'submit' is false).
+│   ├── cluster.py - contains the cluster management object.
+│   ├── command_line_arguments.py - argparse for reading command line arguments.
+│   ├── trainer.py - contains a wrapper for pytorch_lightning.Trainer.
+│   └── utils.py - small utility definitions.
+
+```
 
 # Tasks
 
@@ -268,7 +268,7 @@ Currently, there are two methods for giving arguments:
 ***`task` and `config` must be given as command line arguments for `argparse`:***
 
 ```shell
-python3 main.py --config baseline --task cifar10
+dlhpcstarter --config baseline --task cifar10
 ```
 
 ***`module`, `definition`, and `exp_dir` can be given either as command line arguments, or be placed in the configuration file.***
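+
+For example, in the configuration file (illustrative values; `module` names the .py file in `task/TASK_NAME/models`, and `definition` names the class within it):
+
+```yaml
+module: resnet18
+definition: ResNet18
+exp_dir: /my/experiments/directory
+```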
@@ -317,7 +317,7 @@ dataset_dir: /my/datasets/directory
 ```
 
 ```shell
-python3 main.py --config baseline_rev_a --task cifar10
+dlhpcstarter --config baseline_rev_a --task cifar10
 ```
 
 # Next level: Configuration composition via Hydra
@@ -474,17 +474,19 @@ trainer = trainer_instance(**vars(args))
 
 Place any of the parameters for the trainer detailed at https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#trainer-class-api in your configuration file, and they will be passed to the `pytorch_lightning.Trainer` instance.
 
-# Tying it all together: `main.py`
+# Tying it all together: `dlhpcstarter`
 
 ---
 
-***This is an overview of what occurs in `main.py`, you do not need to modify it.***
+***This is an overview of what occurs when the `dlhpcstarter` entrypoint is executed; it is not necessary to understand this in order to use the package.***
+
+
 
-The main function does the following:
+`dlhpcstarter` does the following:
 - Gets the command line arguments using `argparse`, e.g., arguments like this:
   ```shell
-  python3 main.py --config baseline --task cifar10
+  dlhpcstarter --config baseline --task cifar10
   ```
-- Imports the `stages` definition for the task using `src.utils.importer`.
+- Imports the `stages` definition for the task using `dlhpcstarter.utils.importer`.
 - Reads the configuration `.yaml` and combines it with the command line arguments.
@@ -518,7 +520,7 @@ The following arguments are used to configure a job for a cluster manager (the d
 ***These can be given as command line arguments:***
 
 ```shell
-    python3 main.py --config baseline --task cifar10 --submit 1 --num-gpus 4 --num-workers 5 --memory 32GB
+    dlhpcstarter --config baseline --task cifar10 --submit 1 --num-gpus 4 --num-workers 5 --memory 32GB
 ```
 
 ***Or they can be placed in the configuration `.yaml` file:***
@@ -543,29 +545,10 @@ dataset_dir: /my/datasets/directory
 ```
 And executed with:
 ```shell
-    python3 main.py --config baseline --task cifar10 --submit True
+    dlhpcstarter --config baseline --task cifar10 --submit True
 ```
 
-# Installing required packages in a `venv`
-
-Set the following variables:
-```shell
-ENV_NAME = /my/env/name
-REQ_PATH = /my/repositories/path/dl_hpc_starter_pack/requirements.txt
-```
-Note:
- - `ENV_NAME` can be of your choosing.
- - `REQ_PATH` is the path of the `requirements.txt` in this repository.
-
-Then run the following with the `python` version of your choosing (most HPCs have `python` available as a module package: `module load python`):
-```
-python3 -m venv --system-site-packages $ENV_NAME
-source $ENV_NAME/bin/activate
-pip install --upgrade pip
-pip install --upgrade -r $REQ_PATH --no-cache-dir
-```
-
-If using a cluster manager, add the path to the `bin/activate` of the venv:
+If using a cluster manager, add the path to the `bin/activate` of your virtual environment:
 ```yaml
 ...
 venv_path: /my/env/name/bin/activate
diff --git a/setup.cfg b/setup.cfg
index 5273fab..ead5fd8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = dlhpcstarter
-version = 0.0.0
+version = 0.0.1
 description = Deep Learning and HPC Starter Pack
 long_description = file: README.md
 long_description_content_type = text/markdown; charset=UTF-8
@@ -22,6 +22,7 @@ install_requires =
     numpy>=1.21
     pandas>=1.3
     pytorch-lightning>=1.7
+    rich>=12.5
     scipy>=1.7
     torch>=1.12
     torchmetrics>=0.9
@@ -34,6 +35,6 @@ test = pytest
 [options.packages.find]
 where=src
 
-;[options.entry_points]
-;console_scripts =
-;    dlhpcstarter = src.main:main
+[options.entry_points]
+console_scripts =
+    dlhpcstarter = dlhpcstarter.__main__:main
diff --git a/src/dlhpcstarter/__main__.py b/src/dlhpcstarter/__main__.py
index 82eb615..b9de20e 100644
--- a/src/dlhpcstarter/__main__.py
+++ b/src/dlhpcstarter/__main__.py
@@ -1,3 +1,4 @@
+import os
 from argparse import Namespace
 from dlhpcstarter.command_line_arguments import read_command_line_arguments
 from dlhpcstarter.utils import importer, gpu_usage_and_visibility, load_config_and_update_args
@@ -77,6 +78,8 @@ def submit(stages_fnc: Callable, args: Namespace):
         num_nodes=args.num_nodes,
         num_workers=args.num_workers,
         memory=args.memory,
+        python_cmd='',
+        entrypoint='dlhpcstarter',
     )
 
     # Cluster commands
@@ -85,6 +88,9 @@ def submit(stages_fnc: Callable, args: Namespace):
     # Source virtual environment
     cluster.add_command('source ' + args.venv_path)
 
+    # NCCL debug flag
+    cluster.add_command('export NCCL_DEBUG=INFO')
+
     # Request the quality of service for the job
     if args.qos:
         cluster.add_manager_cmd(cmd='qos', value=args.qos)
diff --git a/src/dlhpcstarter/cluster.py b/src/dlhpcstarter/cluster.py
index f86f2b6..36d952a 100644
--- a/src/dlhpcstarter/cluster.py
+++ b/src/dlhpcstarter/cluster.py
@@ -1,6 +1,6 @@
 from argparse import Namespace
 from subprocess import call
-from typing import Callable
+from typing import Callable, Optional
 import datetime
 import os
 import signal
@@ -28,6 +28,7 @@ def __init__(
         memory: str = '16GB',
         no_srun: bool = False,
         python_cmd: str = 'python3',
+        entrypoint: Optional[str] = None,
         resubmit: bool = True,
     ):
         """
@@ -46,6 +47,7 @@ def __init__(
             memory - minimum memory amount.
             no_srun - don't use 'srun'.
             python_cmd - python command name.
+            entrypoint - entrypoint to use instead of a script.
             resubmit - automatically resubmit job just before timout.
         """
         self.fnc = fnc
@@ -62,6 +64,7 @@ def __init__(
         self.memory = memory
         self.no_srun = no_srun
         self.python_cmd = python_cmd
+        self.entrypoint = entrypoint
         self.resubmit = resubmit
 
         self.script_name = os.path.realpath(sys.argv[0])
@@ -230,10 +233,12 @@ def build_slurm_command(self, manager_cmd_script_path, timestamp, session):
         args_string = self.args_to_string(self.fnc_kwargs)
         args_string = '{} --{} {}'.format(args_string, "slurm_cmd_path", manager_cmd_script_path)
 
-        if self.no_srun:
-            cmd = '{} {} {}'.format(self.python_cmd, self.script_name, args_string)
+        if self.entrypoint:
+            cmd = f'{self.entrypoint} {args_string}'
         else:
-            cmd = 'srun {} {} {}'.format(self.python_cmd, self.script_name, args_string)
+            cmd = '{} {} {}'.format(self.python_cmd, self.script_name, args_string)
+        if not self.no_srun:
+            cmd = 'srun ' + cmd
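+        # e.g. (illustrative), with entrypoint='dlhpcstarter' and no_srun=False this yields:
+        #   srun dlhpcstarter <args> --slurm_cmd_path <manager_cmd_script_path>
+        # with no entrypoint, it falls back to the script form:
+        #   srun python3 /path/to/script.py <args> --slurm_cmd_path <manager_cmd_script_path>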
         sub_commands.append(cmd)
 
         return '\n'.join(sub_commands)
diff --git a/task/cifar10/config/baseline_local.yaml b/task/cifar10/config/baseline_local.yaml
new file mode 100644
index 0000000..d5c11e3
--- /dev/null
+++ b/task/cifar10/config/baseline_local.yaml
@@ -0,0 +1,4 @@
+defaults:
+  - baseline
+  - paths/local_paths@_global_
+  - _self_
diff --git a/task/cifar10/config/resnet18_local.yaml b/task/cifar10/config/resnet18_local.yaml
new file mode 100644
index 0000000..1522d0a
--- /dev/null
+++ b/task/cifar10/config/resnet18_local.yaml
@@ -0,0 +1,6 @@
+defaults:
+  - baseline_local
+  - _self_
+
+module: resnet18
+definition: ResNet18
diff --git a/task/cifar10/stages.py b/task/cifar10/stages.py
index 123130e..44c2b41 100644
--- a/task/cifar10/stages.py
+++ b/task/cifar10/stages.py
@@ -1,3 +1,4 @@
+import os
 from argparse import Namespace
 from pytorch_lightning.utilities.seed import seed_everything
 from dlhpcstarter.tools.ext.collect_env_details import main as collect_env_details
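+
+# NOTE (illustrative): the 'stages' function defined below is imported and run by
+# the 'dlhpcstarter' entrypoint, e.g.:
+#   dlhpcstarter --task cifar10 --config resnet18_local
+# where resnet18_local composes baseline_local (and its local paths group) via Hydra.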