Merge pull request #64 from RobertTLange/gcp-launch
GCP Experiment Launch of VMs
RobertTLange authored May 6, 2021
2 parents 176fb97 + 40dec19 commit 61a5969
Showing 34 changed files with 774 additions and 226 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -4,13 +4,19 @@

- Adds `HypothesisTester`: simple time-average difference comparison between individual runs, with multiple-testing correction and p-value plotting. See the example `hypothesis_testing.ipynb` notebook.
- Adds `MetaLog` and `HyperLog` classes: they implement convenient functionality like `hyper_log.filter()` and ease post-processing analysis.
- Adds GCP job launch/monitor support for all experiment types and organizes GCS syncing of results.

##### Changed

- `load_result_logs` is now directly imported with `import mle_toolbox` since it is part of the core functionality.
- Major restructuring of the `experiment` sub-directory (`local`, `cluster`, `cloud`) with an easy 3-part extension for new resources (see the sketch after this list):
1. `monitor`
2. `launch`
3. `check_job_args`
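
A hedged sketch of what this 3-part contract might look like for a hypothetical new resource backend (module layout and signatures are illustrative, not necessarily the toolbox's actual API):

```python
# Illustrative skeleton for a new resource backend - hypothetical,
# not the toolbox's real interface.
def check_job_args(job_args: dict) -> dict:
    """Validate and fill in resource-specific defaults before launching."""
    job_args.setdefault("num_logical_cores", 2)
    return job_args


def launch(filename: str, cmd_line_args: str, job_args: dict) -> str:
    """Submit the job on the resource and return an id to poll."""
    raise NotImplementedError


def monitor(job_id: str) -> bool:
    """Return True while the job with `job_id` is still running."""
    raise NotImplementedError
```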

##### Fixed

- Fixes plotting with new `MetaLog` and `HyperLog` classes.

### v0.2.7 - 04/24/2021

4 changes: 2 additions & 2 deletions README.md
@@ -3,7 +3,7 @@

> Coming up with the right research hypotheses is hard - testing them should be easy.
ML researchers need to coordinate different types of experiments on separate remote resources. The *Machine Learning Experiment (MLE)-Toolbox* is designed to facilitate the workflow by providing a simple interface, standardized logging, many common ML experiment types (multi-seed/configurations, grid-searches and hyperparameter optimization pipelines). You can run experiments on your local machine, on [Slurm](https://slurm.schedmd.com/overview.html) and [Sun Grid Engine](http://bioinformatics.mdc-berlin.de/intro2UnixandSGE/sun_grid_engine_for_beginners/README.html) clusters. The results are archived (locally/Google Cloud Storage bucket) and can easily be retrieved or automatically summarized/reported as `.md`/`.html` files.
ML researchers need to coordinate different types of experiments on separate remote resources. The *Machine Learning Experiment (MLE)-Toolbox* is designed to facilitate the workflow by providing a simple interface, standardized logging, and many common ML experiment types (multi-seed/multi-configuration runs, grid searches, and hyperparameter optimization pipelines). You can run experiments on your local machine, on high-performance compute clusters ([Slurm](https://slurm.schedmd.com/overview.html) and [Sun Grid Engine](http://bioinformatics.mdc-berlin.de/intro2UnixandSGE/sun_grid_engine_for_beginners/README.html)) as well as on cloud VMs ([GCP](https://cloud.google.com/gcp/)). The results are archived (locally/[GCS bucket](https://cloud.google.com/products/storage/)) and can easily be retrieved or automatically summarized/reported as `.md`/`.html` files.

## The 4 Step `mle-toolbox` Cooking Recipe :stew:

@@ -15,7 +15,7 @@ ML researchers need to coordinate different types of experiments on separate rem

## Installation :memo:

If you want to use the toolbox on your local machine follow the instructions locally. Otherwise do so on your respective remote resource (Slurm or SGE). A PyPI installation is available via:
If you want to use the toolbox on your local machine, follow the instructions locally. Otherwise, do so on your respective cluster resource (Slurm/SGE). A PyPI installation is available via:

```
pip install mle-toolbox
```
3 changes: 3 additions & 0 deletions examples/requirements.txt
@@ -0,0 +1,3 @@
numpy
matplotlib
mle-toolbox
166 changes: 166 additions & 0 deletions mle_toolbox/experiment/cloud/gcp/helpers_launch_gcp.py
@@ -0,0 +1,166 @@
import os, time
import subprocess as sp
from dotmap import DotMap
from typing import Tuple
from .startup_script_gcp import *


cores_to_machine_type = {1: "n2-highcpu-2",
                         2: "c2-standard-4",
                         4: "c2-standard-8",
                         8: "c2-standard-16",
                         15: "c2-standard-30"}

gpu_types = ["nvidia-tesla-p100",
             "nvidia-tesla-v100",
             "nvidia-tesla-t4",
             "nvidia-tesla-p4",
             "nvidia-tesla-k80"]


base_gcp_args = DotMap({
    'ZONE': 'us-west1-a',
    'ACCELERATOR_TYPE': None,
    'ACCELERATOR_COUNT': 0,
    'MACHINE_TYPE': 'n2-highcpu-8',
    'IMAGE_NAME': 'c1-deeplearning-tf-2-4-cu110-v20210414-debian-10',
    'IMAGE_PROJECT': 'ml-images',
})


tpu_gcp_args = DotMap({
    'ZONE': 'europe-west4-a',
    'ACCELERATOR_TYPE': 'v3-8',
    'RUNTIME_VERSION': 'v2-alpha',
})


def gcp_get_submission_cmd(vm_name: str,
                           job_args: DotMap,
                           startup_fname: str) -> Tuple[list, DotMap]:
    """ Construct gcloud VM instance creation cmd to execute via cmd line. """
    if job_args.use_tpus:
        job_gcp_args = tpu_gcp_args
    else:
        job_gcp_args = base_gcp_args
        job_gcp_args.MACHINE_TYPE = cores_to_machine_type[
            job_args.num_logical_cores]
        if job_args.num_gpus > 0:
            job_gcp_args.ACCELERATOR_TYPE = "nvidia-tesla-v100"
            job_gcp_args.ACCELERATOR_COUNT = job_args.num_gpus

    if job_args.use_tpus:
        # TPU VM Alpha gcloud create CMD
        gcp_launch_cmd = [
            'gcloud', 'alpha', 'compute', 'tpus', 'tpu-vm', 'create',
            f'{vm_name}',
            '--preemptible',
            f'--zone={job_gcp_args.ZONE}',
            f'--accelerator-type={job_gcp_args.ACCELERATOR_TYPE}',
            f'--version={job_gcp_args.RUNTIME_VERSION}',
            f'--metadata-from-file=startup-script={startup_fname}',
            '--no-user-output-enabled', '--verbosity', 'error'
        ]
    else:
        # CPU VM gcloud create CMD w/o any GPU attached
        gcp_launch_cmd = [
            'gcloud', 'compute', 'instances', 'create',
            f'{vm_name}',
            '--preemptible',
            f'--zone={job_gcp_args.ZONE}',
            f'--machine-type={job_gcp_args.MACHINE_TYPE}',
            f'--image={job_gcp_args.IMAGE_NAME}',
            f'--image-project={job_gcp_args.IMAGE_PROJECT}',
            f'--metadata-from-file=startup-script={startup_fname}',
            '--scopes=cloud-platform,storage-full',
            '--boot-disk-size=128GB',
            '--boot-disk-type=pd-standard',
            '--no-user-output-enabled', '--verbosity', 'error'
        ]

        # Attach GPUs to the job if desired - make sure the NVIDIA
        # driver gets installed on the VM
        if (job_gcp_args.ACCELERATOR_COUNT > 0 and
                job_gcp_args.ACCELERATOR_TYPE is not None):
            gcp_launch_cmd += [
                '--metadata=install-nvidia-driver=True',
                '--maintenance-policy=TERMINATE',
                f'--accelerator=type={job_gcp_args.ACCELERATOR_TYPE},'
                + f'count={job_gcp_args.ACCELERATOR_COUNT}'
            ]

    return gcp_launch_cmd, job_gcp_args


def gcp_generate_startup_file(remote_code_dir: str,
                              remote_results_dir: str,
                              gcp_bucket_name: str,
                              job_filename: str,
                              experiment_dir: str,
                              startup_fname: str,
                              cmd_line_arguments: str,
                              use_tpus: bool = False,
                              use_cuda: bool = False) -> None:
    """ Generate bash script template to launch at VM startup. """
    # Build the startup job execution script:
    # 1. Connect to tmux via: gcloud compute ssh $VM -- /sudo_tmux_a.sh
    # 2a. Launch venv & install dependencies from requirements.txt
    # 2b. [OPTIONAL] Setup JAX TPU/GPU build
    # 3. Separate tmux split for rsync of results to GCS bucket
    startup_script_content = (
        "#!/bin/bash" +
        tmux_setup +
        clone_gcp_bucket_dir.format(
            remote_dir=remote_code_dir,
            gcp_bucket_name=gcp_bucket_name) +
        install_venv.format(
            remote_dir=remote_code_dir)
    )

    if use_tpus:
        # Install TPU version of JAX
        startup_script_content += jax_tpu_build
    elif use_cuda:
        # Install GPU version of JAX
        startup_script_content += jax_gpu_build

    # Append the desired python/bash execution to the VM startup file
    f_name, f_extension = os.path.splitext(job_filename)
    if f_extension == ".py":
        startup_script_content += exec_python.format(
            remote_dir=remote_code_dir,
            filename=job_filename,
            cmd_line_arguments=cmd_line_arguments)
    elif f_extension == ".sh":
        startup_script_content += exec_bash.format(
            remote_dir=remote_code_dir,
            filename=job_filename,
            cmd_line_arguments=cmd_line_arguments)
    else:
        raise ValueError(f"Script with {f_extension} cannot be handled"
                         " by mle-toolbox. Only base .py, .sh experiments"
                         " are so far implemented. Please open an issue.")

    startup_script_content += sync_results_from_dir.format(
        remote_code_dir=remote_code_dir,
        remote_results_dir=remote_results_dir,
        gcp_bucket_name=gcp_bucket_name,
        experiment_dir=experiment_dir)

    # Write startup script to a physical file
    with open(startup_fname, 'w', encoding='utf8') as f:
        f.write(startup_script_content)


def gcp_delete_vm_instance(vm_name: str, use_tpus: bool = False) -> None:
    """ Quietly delete a VM by its name + zone. TODO: Add robustness check. """
    vm_zone = 'europe-west4-a' if use_tpus else 'us-west1-a'
    if not use_tpus:
        gcp_delete_cmd = ['gcloud', 'compute', 'instances', 'delete',
                          f'{vm_name}', '--zone', f'{vm_zone}', '--quiet',
                          '--no-user-output-enabled', '--verbosity', 'error']
    else:
        gcp_delete_cmd = ['gcloud', 'alpha', 'compute', 'tpus', 'tpu-vm',
                          'delete', f'{vm_name}', '--zone', f'{vm_zone}',
                          '--quiet', '--no-user-output-enabled',
                          '--verbosity', 'error']
    sp.run(gcp_delete_cmd)
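
To see how these helpers fit together, here is a minimal usage sketch (the VM name, bucket, and job settings are hypothetical, not part of this commit):

```python
# Hypothetical end-to-end launch sketch using the helpers above.
from dotmap import DotMap
import subprocess as sp

job_args = DotMap({"use_tpus": False,       # plain CPU VM
                   "num_logical_cores": 8,  # -> c2-standard-16 via lookup table
                   "num_gpus": 0})

# 1. Render the startup script the VM will run on boot
gcp_generate_startup_file(remote_code_dir="mle_code_dir",
                          remote_results_dir="mle_results_dir",
                          gcp_bucket_name="my-mle-bucket",
                          job_filename="train.py",
                          experiment_dir="experiments/run_0",
                          startup_fname="startup.sh",
                          cmd_line_arguments="--config base.json")

# 2. Build & fire the gcloud create command
cmd, vm_settings = gcp_get_submission_cmd(vm_name="mle-vm-0",
                                          job_args=job_args,
                                          startup_fname="startup.sh")
sp.run(cmd)

# 3. Once results are synced to GCS, tear the VM down again
gcp_delete_vm_instance("mle-vm-0", use_tpus=False)
```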
94 changes: 94 additions & 0 deletions mle_toolbox/experiment/cloud/gcp/startup_script_gcp.py
@@ -0,0 +1,94 @@
# Useful string lego building blocks for GCP startup file formatting
# 1. Copy code directory from GCS bucket
# 2. Setting up tmux session (a) htop (b) startup exec (c) GCS rsync results
# 3. Installation of venv, requirements and jaxlib accelerator dependencies
# 4. Python Base Job File Execution
# 5. Sync Results with GCS bucket


clone_gcp_bucket_dir = """
mkdir {remote_dir}
sudo chmod 777 {remote_dir}
gsutil cp -r gs://{gcp_bucket_name}/{remote_dir} /
"""

tmux_setup = """
# Login directly with:
# gcloud compute ssh $VM -- /sudo_tmux_a.sh
echo -e '#!/bin/bash\nsudo /tmux_a.sh' > /sudo_tmux_a.sh
chmod a+x /sudo_tmux_a.sh
echo -e '#!/bin/bash\ntmux a' > /tmux_a.sh
chmod a+x /tmux_a.sh
"""

install_venv = '''# Setup virtual env + install base required packages
tmux new-session -s gcp_exp -d htop ENTER
tmux split-window
tmux send "
set -x
[ -d gcp_exp ] || (
cd /{remote_dir}
python3 -m pip install virtualenv
python3 -m virtualenv env
. env/bin/activate
pip install -U pip
pip install -r requirements.txt
)
"
'''

jax_tpu_build = '''
tmux send "
cd /{remote_dir}
. env/bin/activate
PYTHON_VERSION=cp36 # Supported python versions: cp36, cp37, cp38
pip install --upgrade --user https://storage.googleapis.com/jax-releases/tpu/jaxlib-0.1.55+tpu-$PYTHON_VERSION-none-manylinux2010_x86_64.whl
pip install --upgrade --user jax
"
'''

jax_gpu_build = '''
tmux send "
cd /{remote_dir}
. env/bin/activate
pip install --upgrade jax jaxlib==0.1.64+cuda110 -f https://storage.googleapis.com/jax-releases/jax_releases.html
"
'''

exec_python = '''
tmux send "
(
cd /{remote_dir} &&
. env/bin/activate &&
python3 {filename} {cmd_line_arguments}
) 2>&1 | tee -a /{remote_dir}/log.txt
echo WILL SHUT DOWN SHORTLY ...
sleep 100 && sudo shutdown now
"
'''

exec_bash = '''
tmux send "
(
cd /{remote_dir} &&
. env/bin/activate &&
bash {filename} {cmd_line_arguments}
) 2>&1 | tee -a /{remote_dir}/log.txt
echo WILL SHUT DOWN SHORTLY ...
sleep 100 && sudo shutdown now
"
'''

sync_results_from_dir = '''
# Wait for experiment startup before continuous rsync
sleep 120
tmux split-window -h
tmux send "
while true; do
gsutil rsync -x 'env' -r /{remote_code_dir}/{experiment_dir} gs://{gcp_bucket_name}/{remote_results_dir}/{experiment_dir}
sleep 10
done
" ENTER
'''
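
For reference, a small composition sketch (directory, bucket, and file names are hypothetical), mirroring what `gcp_generate_startup_file` in `helpers_launch_gcp.py` does with these blocks:

```python
# Compose the string blocks above into one startup script (illustrative
# values; assumes the template strings from this module are in scope).
script = ("#!/bin/bash"
          + tmux_setup
          + clone_gcp_bucket_dir.format(remote_dir="mle_code_dir",
                                        gcp_bucket_name="my-mle-bucket")
          + install_venv.format(remote_dir="mle_code_dir")
          + exec_python.format(remote_dir="mle_code_dir",
                               filename="train.py",
                               cmd_line_arguments="--config base.json"))
print(script)  # inspect the rendered bash before shipping it to a VM
```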