Skip to content

Commit

Permalink
Handle horovod errors (#24)
Browse files Browse the repository at this point in the history
* add status manager to handle errors in Horovod

* add unit tests for testing errors

* leverage MXNet callback to populate errors

* invoke callback in a cleaner way

* pass in dmlc::Error instead of char*

* update imagenet example

* use a function to invoke callback

* fix wording
  • Loading branch information
yuxihu authored Jan 18, 2019
1 parent 61b47dc commit acd0d3f
Show file tree
Hide file tree
Showing 8 changed files with 245 additions and 78 deletions.
35 changes: 17 additions & 18 deletions examples/mxnet_imagenet_resnet50.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import math
import os


from gluoncv.model_zoo import get_model
import horovod.mxnet as hvd
import mxnet as mx
Expand Down Expand Up @@ -67,25 +66,23 @@
(default is : 40,60)')
parser.add_argument('--warmup-lr', type=float, default=0.0,
help='starting warmup learning rate (default: 0.0)')
parser.add_argument('--warmup-epochs', type=int, default=5,
help='number of warmup epochs (default: 5)')
parser.add_argument('--warmup-epochs', type=int, default=10,
help='number of warmup epochs (default: 10)')
parser.add_argument('--last-gamma', action='store_true', default=False,
help='whether to init gamma of the last BN layer in \
each bottleneck to 0 (default: False)')
parser.add_argument('--model', type=str, default='resnet50_v1',
help='type of model to use. see vision_model for options.')
parser.add_argument('--use-pretrained', action='store_true', default=False,
help='load pretrained model weights (default: False)')
parser.add_argument('--optimizer', type=str, default='nag',
help='optimizer to use for training (default: nag)')
parser.add_argument('--eval-epoch', action='store_true', default=False,
help='evaluate validation accuracy after each epoch (default: False)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training (default: False)')
parser.add_argument('--log-interval', type=int, default=0,
help='number of batches to wait before logging (default: 0)')
parser.add_argument('--save-frequency', type=int, default=0,
help='frequency of model saving. (default: 0)')
parser.add_argument('--save-frequency', type=int, default=10,
help='frequency of model saving. (default: 10)')


args = parser.parse_args()
Expand Down Expand Up @@ -266,7 +263,7 @@ def reset(self):
val_data = None


def main():
def train():
# Get model from GluonCV model zoo
# https://gluon-cv.mxnet.io/model_zoo/index.html
net = get_model(args.model, **kwargs)
Expand Down Expand Up @@ -303,7 +300,7 @@ def main():
'lr_scheduler': lr_sched}
if args.dtype == 'float16':
optimizer_params['multi_precision'] = True
opt = mx.optimizer.create(args.optimizer, sym=out, **optimizer_params)
opt = mx.optimizer.create('sgd', sym=out, **optimizer_params)

# Horovod: wrap optimizer with DistributedOptimizer
opt = hvd.DistributedOptimizer(opt)
Expand All @@ -329,7 +326,8 @@ def main():
eval_data = val_data
batch_callback = None
if args.log_interval > 0:
batch_callback = mx.callback.Speedometer(batch_size, max(1, args.log_interval))
batch_callback = mx.callback.Speedometer(batch_size,
max(1, args.log_interval))
epoch_callback = None
if args.save_frequency > 0:
epoch_callback = mx.callback.do_checkpoint(
Expand All @@ -346,14 +344,15 @@ def main():
optimizer=opt,
optimizer_params=optimizer_params)

# Evaluate performance
acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)
res = mod.score(val_data, [acc_top1, acc_top5])
for name, val in res:
logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
args.num_epochs - 1, hvd.rank(), name, val)
# Evaluate performance if not using synthetic data
if args.use_rec:
acc_top1 = mx.metric.Accuracy()
acc_top5 = mx.metric.TopKAccuracy(5)
res = mod.score(val_data, [acc_top1, acc_top5])
for name, val in res:
logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
args.num_epochs - 1, hvd.rank(), name, val)


if __name__ == '__main__':
main()
train()
5 changes: 4 additions & 1 deletion horovod/mxnet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@

from horovod.common import check_extension

from horovod.mxnet.mpi_ops import allreduce, allreduce_
check_extension('horovod.mxnet', 'HOROVOD_WITH_MXNET',
__file__, 'mpi_lib')

from horovod.mxnet.mpi_ops import allgather
from horovod.mxnet.mpi_ops import allreduce, allreduce_
from horovod.mxnet.mpi_ops import broadcast, broadcast_
from horovod.mxnet.mpi_ops import init, shutdown
from horovod.mxnet.mpi_ops import size, local_size, rank, local_rank
Expand Down
15 changes: 0 additions & 15 deletions horovod/mxnet/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#include "cuda.h"
#endif

#include <mxnet/base.h>

#include "adapter.h"
#include "cuda_util.h"
#include "tensor_util.h"
Expand Down Expand Up @@ -124,19 +122,6 @@ template <class T> Framework MXOpContext<T>::framework() const {
return Framework::MXNET;
}

// Converts a Status into a C++ exception; returns normally only when the
// status type is OK. The exception message is always status.reason().
//   PRECONDITION_ERROR       -> std::logic_error
//   ABORTED / any other type -> std::runtime_error
void ThrowIfError(Status status) {
switch (status.type()) {
case StatusType::OK:
return;
case StatusType::PRECONDITION_ERROR:
throw std::logic_error(status.reason());
case StatusType::ABORTED:
throw std::runtime_error(status.reason());
default: // Includes UNKNOWN_ERROR
throw std::runtime_error(status.reason());
}
}

template class MXTensor<NDArray>;
template class MXTemporaryBuffer<NDArray>;
template class MXOpContext<NDArray>;
Expand Down
8 changes: 7 additions & 1 deletion horovod/mxnet/adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#ifndef HOROVOD_MXNET_ADAPTER_H
#define HOROVOD_MXNET_ADAPTER_H

#include <mxnet/base.h>

#include "../common/common.h"

namespace horovod {
Expand Down Expand Up @@ -68,7 +70,11 @@ template <class T> class MXOpContext : public OpContext {
T* output_;
};

void ThrowIfError(Status status);
// Throws dmlc::Error carrying status.reason() when status.ok() is false;
// does nothing otherwise. Every non-OK status yields the same exception type.
inline void ThrowIfError(const Status& status) {
if (!status.ok()) {
throw dmlc::Error(status.reason());
}
}

} // namespace mxnet
} // namespace horovod
Expand Down
Loading

0 comments on commit acd0d3f

Please sign in to comment.