-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #64 from RobertTLange/gcp-launch
GCP Experiment Launch of VMs
- Loading branch information
Showing
34 changed files
with
774 additions
and
226 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
numpy | ||
matplotlib | ||
mle-toolbox |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
import os, time | ||
import subprocess as sp | ||
from dotmap import DotMap | ||
from typing import Union | ||
from .startup_script_gcp import * | ||
|
||
|
||
# Lookup table: requested number of logical cores -> GCP machine type name.
# Only the core counts listed here can be requested.
cores_to_machine_type = {
    1: "n2-highcpu-2",
    2: "c2-standard-4",
    4: "c2-standard-8",
    8: "c2-standard-16",
    15: "c2-standard-30",
}
|
||
# Names of the GPU accelerator types known to this module.
gpu_types = [
    "nvidia-tesla-p100",
    "nvidia-tesla-v100",
    "nvidia-tesla-t4",
    "nvidia-tesla-p4",
    "nvidia-tesla-k80",
]
|
||
|
||
# Default settings for regular (CPU/GPU) VM instances. No accelerator is
# attached by default; the machine type is the fallback when no core count
# is mapped by the caller.
base_gcp_args = DotMap({
    "ZONE": "us-west1-a",
    "ACCELERATOR_TYPE": None,
    "ACCELERATOR_COUNT": 0,
    "MACHINE_TYPE": "n2-highcpu-8",
    "IMAGE_NAME": "c1-deeplearning-tf-2-4-cu110-v20210414-debian-10",
    "IMAGE_PROJECT": "ml-images",
})
|
||
|
||
# Default settings for TPU-VM instances (zone, TPU accelerator type and the
# runtime version passed to `--version`).
tpu_gcp_args = DotMap({
    "ZONE": "europe-west4-a",
    "ACCELERATOR_TYPE": "v3-8",
    "RUNTIME_VERSION": "v2-alpha",
})
|
||
|
||
def gcp_get_submission_cmd(vm_name: str,
                           job_args: DotMap,
                           startup_fname: str) -> tuple:
    """Construct the gcloud instance-creation command for a single job.

    Args:
        vm_name: Name of the VM / TPU-VM instance to create.
        job_args: Resource request of the job. Reads `use_tpus` and, for
            non-TPU jobs, `num_logical_cores` and `num_gpus`.
        startup_fname: Path of the startup script handed to gcloud via
            `--metadata-from-file=startup-script=...`.

    Returns:
        A tuple `(gcp_launch_cmd, job_gcp_args)`: the command as a list of
        arguments and the resolved GCP settings used to build it.

    Raises:
        KeyError: If `job_args.num_logical_cores` has no entry in
            `cores_to_machine_type`.
    """
    # Flags that silence gcloud; appended to both kinds of create command.
    quiet_flags = ['--no-user-output-enabled', '--verbosity', 'error']

    if job_args.use_tpus:
        # Work on a copy so the module-level defaults are never modified.
        job_gcp_args = DotMap(tpu_gcp_args.toDict())
        # TPU VM Alpha gcloud create CMD
        gcp_launch_cmd = [
            'gcloud', 'alpha', 'compute', 'tpus', 'tpu-vm', 'create',
            f'{vm_name}',
            '--preemptible',
            f'--zone={job_gcp_args.ZONE}',
            f'--accelerator-type={job_gcp_args.ACCELERATOR_TYPE}',
            f'--version={job_gcp_args.RUNTIME_VERSION}',
            f'--metadata-from-file=startup-script={startup_fname}',
        ] + quiet_flags
        return gcp_launch_cmd, job_gcp_args

    # Copy the defaults: writing into the shared `base_gcp_args` would make
    # the GPU settings of one job leak into every job launched afterwards.
    job_gcp_args = DotMap(base_gcp_args.toDict())
    try:
        job_gcp_args.MACHINE_TYPE = cores_to_machine_type[
            job_args.num_logical_cores]
    except KeyError as err:
        raise KeyError(
            f"No GCP machine type for {job_args.num_logical_cores} logical"
            f" cores. Supported: {sorted(cores_to_machine_type)}") from err
    if job_args.num_gpus > 0:
        job_gcp_args.ACCELERATOR_TYPE = "nvidia-tesla-v100"
        job_gcp_args.ACCELERATOR_COUNT = job_args.num_gpus

    # CPU VM gcloud create CMD w/o any GPU attached
    gcp_launch_cmd = [
        'gcloud', 'compute', 'instances', 'create',
        f'{vm_name}',
        '--preemptible',
        f'--zone={job_gcp_args.ZONE}',
        f'--machine-type={job_gcp_args.MACHINE_TYPE}',
        f'--image={job_gcp_args.IMAGE_NAME}',
        f'--image-project={job_gcp_args.IMAGE_PROJECT}',
        f'--metadata-from-file=startup-script={startup_fname}',
        '--scopes=cloud-platform,storage-full',
        '--boot-disk-size=128GB',
        '--boot-disk-type=pd-standard',
    ] + quiet_flags

    # Attach GPUs to Job if desired - make sure to install nvidia driver
    if (job_gcp_args.ACCELERATOR_COUNT > 0 and
            job_gcp_args.ACCELERATOR_TYPE is not None):
        gcp_launch_cmd += [
            # Each flag is its own list element; a missing comma here would
            # silently concatenate two flags into one invalid argument.
            '--metadata=install-nvidia-driver=True',
            '--maintenance-policy=TERMINATE',
            f'--accelerator=type={job_gcp_args.ACCELERATOR_TYPE},'
            f'count={job_gcp_args.ACCELERATOR_COUNT}',
        ]

    return gcp_launch_cmd, job_gcp_args
|
||
|
||
def gcp_generate_startup_file(remote_code_dir: str,
                              remote_results_dir: str,
                              gcp_bucket_name: str,
                              job_filename: str,
                              experiment_dir: str,
                              startup_fname: str,
                              cmd_line_arguments: str,
                              use_tpus: bool = False,
                              use_cuda: bool = False) -> None:
    """Generate the bash script that a VM executes at startup.

    The script is assembled from the string templates of the startup-script
    module, in this order:
      1. tmux helper scripts (attach via
         `gcloud compute ssh $VM -- /sudo_tmux_a.sh`)
      2. copy of the code directory from the GCS bucket
      3. virtualenv creation and `requirements.txt` installation
      4. [optional] JAX TPU or GPU build (`use_tpus` wins over `use_cuda`)
      5. execution of the job file
      6. separate tmux split that rsyncs results to the GCS bucket

    Args:
        remote_code_dir: Code directory name, used both in the bucket and
            at the root of the VM filesystem.
        remote_results_dir: Bucket directory that receives the results.
        gcp_bucket_name: Name of the GCS bucket.
        job_filename: Job file to execute; must end in `.py` or `.sh`.
        experiment_dir: Results directory inside the code directory.
        startup_fname: Path the generated script is written to.
        cmd_line_arguments: Arguments appended to the job invocation.
        use_tpus: Add the JAX TPU installation step.
        use_cuda: Add the JAX GPU installation step (if not `use_tpus`).

    Raises:
        ValueError: If `job_filename` is neither a `.py` nor a `.sh` file.
    """
    # Select the job execution template up front so that an unsupported
    # job type fails before any script content is assembled.
    _, f_extension = os.path.splitext(job_filename)
    if f_extension == ".py":
        exec_template = exec_python
    elif f_extension == ".sh":
        exec_template = exec_bash
    else:
        raise ValueError(f"Script with {f_extension} cannot be handled"
                         " by mle-toolbox. Only base .py, .sh experiments"
                         " are so far implemented. Please open an issue.")

    script_parts = [
        "#!/bin/bash",
        tmux_setup,
        clone_gcp_bucket_dir.format(
            remote_dir=remote_code_dir,
            gcp_bucket_name=gcp_bucket_name),
        install_venv.format(
            remote_dir=remote_code_dir),
    ]

    # The JAX templates contain a `{remote_dir}` placeholder as well, so
    # they have to be formatted like every other template.
    if use_tpus:
        # Install TPU version JAX
        script_parts.append(jax_tpu_build.format(remote_dir=remote_code_dir))
    elif use_cuda:
        # Install GPU version JAX
        script_parts.append(jax_gpu_build.format(remote_dir=remote_code_dir))

    # Add the desired python/bash execution to the startup script
    script_parts.append(exec_template.format(
        remote_dir=remote_code_dir,
        filename=job_filename,
        cmd_line_arguments=cmd_line_arguments))

    script_parts.append(sync_results_from_dir.format(
        remote_code_dir=remote_code_dir,
        remote_results_dir=remote_results_dir,
        gcp_bucket_name=gcp_bucket_name,
        experiment_dir=experiment_dir))

    # Write startup script to physical file
    with open(startup_fname, 'w', encoding='utf8') as f:
        f.write("".join(script_parts))
|
||
|
||
def gcp_delete_vm_instance(vm_name: str, use_tpus: bool = False):
    """Quietly delete a VM (or TPU-VM) instance by its name + zone.

    The zone is read from the module-level default settings that are also
    used when creating instances, so creation and deletion cannot drift
    apart. The gcloud exit status is not checked.
    TODO: Add robustness check.

    Args:
        vm_name: Name of the instance to delete.
        use_tpus: Delete a TPU-VM (`gcloud alpha compute tpus tpu-vm`)
            instead of a regular compute instance.
    """
    if use_tpus:
        vm_zone = tpu_gcp_args.ZONE
        gcloud_group = ['gcloud', 'alpha', 'compute', 'tpus', 'tpu-vm']
    else:
        vm_zone = base_gcp_args.ZONE
        gcloud_group = ['gcloud', 'compute', 'instances']

    gcp_delete_cmd = gcloud_group + [
        'delete', f'{vm_name}', '--zone', f'{vm_zone}', '--quiet',
        '--no-user-output-enabled', '--verbosity', 'error']
    sp.run(gcp_delete_cmd)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# Useful string lego building blocks for GCP startup file formatting | ||
# 1. Copy code directory from GCS bucket | ||
# 2. Setting up tmux session (a) htop (b) startup exec (c) GCS rsync results | ||
# 3. Installation of venv, requirements and jaxlib accelerator dependencies | ||
# 4. Python Base Job File Execution | ||
# 5. Sync Results with GCS bucket | ||
|
||
|
||
# Create the code directory at the filesystem root and copy the code from
# the GCS bucket into it. Absolute paths match the `/{remote_dir}` location
# used by the other templates, and `mkdir -p` tolerates a directory that
# already exists.
# Placeholders: {remote_dir}, {gcp_bucket_name}.
clone_gcp_bucket_dir = """
mkdir -p /{remote_dir}
sudo chmod 777 /{remote_dir}
gsutil cp -r gs://{gcp_bucket_name}/{remote_dir} /
"""
|
||
# Writes two small helper scripts to the VM root so a user can attach to
# the tmux session with: gcloud compute ssh $VM -- /sudo_tmux_a.sh
# Note: Python turns the `\n` inside the echo arguments into a real newline,
# so the quoted echo argument spans two lines in the generated script.
tmux_setup = "\n".join([
    "",
    "# Login directly with:",
    "# gcloud compute ssh $VM -- /sudo_tmux_a.sh",
    "echo -e '#!/bin/bash\nsudo /tmux_a.sh' > /sudo_tmux_a.sh",
    "chmod a+x /sudo_tmux_a.sh",
    "echo -e '#!/bin/bash\ntmux a' > /tmux_a.sh",
    "chmod a+x /tmux_a.sh",
    "",
])
|
||
# Starts the detached tmux session `gcp_exp` (first pane: htop), opens a
# second pane and sends it the commands that create the virtualenv inside
# /{remote_dir} and install requirements.txt.
# Placeholder: {remote_dir}.
install_venv = "\n".join([
    "# Setup virtual env + install base required packages",
    "tmux new-session -s gcp_exp -d htop ENTER",
    "tmux split-window",
    'tmux send "',
    "set -x",
    "[ -d gcp_exp ] || (",
    "cd /{remote_dir}",
    "python3 -m pip install virtualenv",
    "python3 -m virtualenv env",
    ". env/bin/activate",
    "pip install -U pip",
    "pip install -r requirements.txt",
    ")",
    '"',
    "",
])
|
||
# Sends the TPU jaxlib/jax installation commands to the active tmux pane.
# Placeholder: {remote_dir}.
# - `\\$` keeps `$PYTHON_VERSION` literal inside the double-quoted
#   `tmux send` argument, so it is expanded by the pane shell AFTER the
#   assignment instead of by the startup script (where it is unset).
# - No `--user`: pip refuses user installs inside an activated virtualenv.
jax_tpu_build = '''
tmux send "
cd /{remote_dir}
. env/bin/activate
PYTHON_VERSION=cp36 # Supported python versions: cp36, cp37, cp38
pip install --upgrade https://storage.googleapis.com/jax-releases/tpu/jaxlib-0.1.55+tpu-\\$PYTHON_VERSION-none-manylinux2010_x86_64.whl
pip install --upgrade jax
"
'''
|
||
# Sends the CUDA 11.0 jaxlib/jax installation command to the active tmux
# pane, run inside the virtualenv of /{remote_dir}.
# Placeholder: {remote_dir}.
jax_gpu_build = "\n".join([
    "",
    'tmux send "',
    "cd /{remote_dir}",
    ". env/bin/activate",
    "pip install --upgrade jax jaxlib==0.1.64+cuda110"
    " -f https://storage.googleapis.com/jax-releases/jax_releases.html",
    '"',
    "",
])
|
||
# Sends the python job execution to the active tmux pane: run the job file
# inside the virtualenv, append all output to /{remote_dir}/log.txt, then
# shut the VM down after a 100 second grace period (the announcement now
# states the delay that is actually slept).
# Placeholders: {remote_dir}, {filename}, {cmd_line_arguments}.
exec_python = '''
tmux send "
(
cd /{remote_dir} &&
. env/bin/activate &&
python3 {filename} {cmd_line_arguments}
) 2>&1 | tee -a /{remote_dir}/log.txt
echo WILL SHUT DOWN IN 100 SEC ...
sleep 100 && sudo shutdown now
"
'''
|
||
# Sends the bash job execution to the active tmux pane: run the job script
# inside the virtualenv, append all output to /{remote_dir}/log.txt, then
# shut the VM down after a 100 second grace period (the announcement now
# states the delay that is actually slept).
# Placeholders: {remote_dir}, {filename}, {cmd_line_arguments}.
exec_bash = '''
tmux send "
(
cd /{remote_dir} &&
. env/bin/activate &&
bash {filename} {cmd_line_arguments}
) 2>&1 | tee -a /{remote_dir}/log.txt
echo WILL SHUT DOWN IN 100 SEC ...
sleep 100 && sudo shutdown now
"
'''
|
||
# Waits 120 seconds, opens a horizontal tmux split and starts an endless
# loop in it that rsyncs the experiment directory to the GCS bucket every
# 10 seconds (excluding paths matching 'env').
# Placeholders: {remote_code_dir}, {experiment_dir}, {gcp_bucket_name},
# {remote_results_dir}.
sync_results_from_dir = "\n".join([
    "",
    "# Wait for experiment startup before continuous rsync",
    "sleep 120",
    "tmux split-window -h",
    'tmux send "',
    "while true; do",
    "gsutil rsync -x 'env' -r /{remote_code_dir}/{experiment_dir}"
    " gs://{gcp_bucket_name}/{remote_results_dir}/{experiment_dir}",
    "sleep 10",
    "done",
    '" ENTER',
    "",
])
Oops, something went wrong.