From b95d1022c2fd344dd8612094fd88e13cb189dcb8 Mon Sep 17 00:00:00 2001 From: Frank Liu Date: Wed, 6 Mar 2019 19:16:12 -0800 Subject: [PATCH] #14199: catch subprocess.CalledProcessError in get_gpus() (#14212) * Fixes #14199: use proper API to get number of gpus. 1. Added get_gpu_count() and get_gpu_memory() API to python binding. 2. Update example script to use proper API for getting gpu numbers. * retrigger CI --- benchmark/python/control_flow/rnn.py | 7 +------ example/image-classification/common/util.py | 8 +++----- python/mxnet/test_utils.py | 11 ++--------- python/mxnet/util.py | 16 ++++++++++++++++ tools/bandwidth/test_measure.py | 10 ++++------ 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/benchmark/python/control_flow/rnn.py b/benchmark/python/control_flow/rnn.py index 08498724b1b4..24e326c9afd1 100644 --- a/benchmark/python/control_flow/rnn.py +++ b/benchmark/python/control_flow/rnn.py @@ -79,12 +79,7 @@ def _array(shape, ctx): def _get_gpus(): - try: - re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) - except OSError: - return [] - return range(len([i for i in re.split('\n') if 'GPU' in i])) - + return range(mx.util.get_gpu_count()) def run_benchmark(cell_type, ctx, seq_len, batch_size, hidden_dim): obj = {"foreach": ForeachRNN, "while_loop": WhileRNN}[args.benchmark] diff --git a/example/image-classification/common/util.py b/example/image-classification/common/util.py index 5f70411ab084..8737b69a7351 100644 --- a/example/image-classification/common/util.py +++ b/example/image-classification/common/util.py @@ -19,6 +19,8 @@ import os import errno +import mxnet as mx + def download_file(url, local_fname=None, force_write=False): # requests is not default installed import requests @@ -49,8 +51,4 @@ def get_gpus(): """ return a list of GPUs """ - try: - re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) - except OSError: - return [] - return range(len([i for i in re.split('\n') if 'GPU' in i])) + 
return range(mx.util.get_gpu_count()) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 4138e4d2d755..6d1749b1a611 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -23,7 +23,6 @@ import struct import traceback import numbers -import subprocess import sys import os import errno @@ -213,6 +212,7 @@ def _get_powerlaw_dataset_csr(num_rows, num_cols, density=0.1, dtype=None): else: return mx.nd.array(output_arr).tostype("csr") + def assign_each(the_input, function): """Return ndarray composed of passing each array value through some function""" if function is None: @@ -1391,14 +1391,7 @@ def list_gpus(): If there are n GPUs, then return a list [0,1,...,n-1]. Otherwise returns []. """ - re = '' - nvidia_smi = ['nvidia-smi', '/usr/bin/nvidia-smi', '/usr/local/nvidia/bin/nvidia-smi'] - for cmd in nvidia_smi: - try: - re = subprocess.check_output([cmd, "-L"], universal_newlines=True) - except (subprocess.CalledProcessError, OSError): - pass - return range(len([i for i in re.split('\n') if 'GPU' in i])) + return range(mx.util.get_gpu_count()) def download(url, fname=None, dirname=None, overwrite=False, retries=5): """Download an given URL diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 62c05d252828..fc8d985b9566 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -16,9 +16,12 @@ # under the License. """general utility functions""" +import ctypes import os import sys +from .base import _LIB, check_call + def makedirs(d): """Create directories recursively if they don't exist. 
os.makedirs(exist_ok=True) is not @@ -28,3 +31,16 @@ def makedirs(d): mkpath(d) else: os.makedirs(d, exist_ok=True) # pylint: disable=unexpected-keyword-arg + + +def get_gpu_count(): + size = ctypes.c_int() + check_call(_LIB.MXGetGPUCount(ctypes.byref(size))) + return size.value + + +def get_gpu_memory(gpu_dev_id): + free_mem = ctypes.c_uint64(0) + total_mem = ctypes.c_uint64(0) + check_call(_LIB.MXGetGPUMemoryInformation64(gpu_dev_id, ctypes.byref(free_mem), ctypes.byref(total_mem))) + return free_mem.value, total_mem.value diff --git a/tools/bandwidth/test_measure.py b/tools/bandwidth/test_measure.py index 375290fe6853..d14a7aae5196 100644 --- a/tools/bandwidth/test_measure.py +++ b/tools/bandwidth/test_measure.py @@ -21,13 +21,11 @@ from measure import run import subprocess import logging + +import mxnet as mx + def get_gpus(): - try: - re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True) - except OSError: - return '' - gpus = [i for i in re.split('\n') if 'GPU' in i] - return ','.join([str(i) for i in range(len(gpus))]) + return ','.join([str(i) for i in range(mx.util.get_gpu_count())]) def test_measure(**kwargs): logging.info(kwargs)