Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DO NOT MERGE] Added error handling in MXNet #19

Open
wants to merge 47 commits into
base: mxnet_feature_fp16
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
d625394
Make mxnet build successful in CPU
apeforest Nov 30, 2018
02ab771
update required mxnet version
apeforest Dec 3, 2018
9abcc4e
remove outdated comment
apeforest Dec 3, 2018
cd096e4
remove commented line
apeforest Dec 3, 2018
f9b2083
Merge remote-tracking branch 'origin/mxnet_feature_fp16' into develop…
apeforest Dec 3, 2018
b0e2e58
fix test in CPU
apeforest Dec 4, 2018
dd4f9e2
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Dec 4, 2018
b617e14
refactor
apeforest Dec 4, 2018
84ed58e
Merge branch 'mxnet_feature_fp16' into develop/mxnet
yuxihu Dec 4, 2018
2b902ae
link nccl to mpi_lib for mxnet
yuxihu Dec 4, 2018
ff57e51
Merge branch 'develop/mxnet' of /~https://github.com/ctcyang/horovod in…
apeforest Dec 4, 2018
6013957
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Dec 4, 2018
bc47aa9
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Dec 19, 2018
297e79a
make mxnet build process the same as tensorflow
apeforest Dec 19, 2018
f28ba01
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Dec 28, 2018
ab78201
compute allreduce average in C++ to avoid perf deg
apeforest Dec 28, 2018
dc62625
rename variable
apeforest Dec 28, 2018
c56322f
add mxnet mnist example
apeforest Jan 1, 2019
4eb787e
fix lint
apeforest Jan 1, 2019
3e5491a
reduce epoch and acc check
apeforest Jan 2, 2019
9589209
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 2, 2019
b42f0c5
broadcast initial parames
apeforest Jan 2, 2019
13adbb3
Update README
apeforest Jan 2, 2019
b4aa9f2
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 3, 2019
f9c9d73
remove unused handle manager
apeforest Jan 3, 2019
dc96acc
renaming variable type
apeforest Jan 3, 2019
aaf3d7f
return non empty op name
apeforest Jan 4, 2019
0797570
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 4, 2019
89ba103
scale learning rate by workers
apeforest Jan 4, 2019
60877b7
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 4, 2019
b3a24db
refactor test_mxnet to make it easier to read
apeforest Jan 5, 2019
6e4b845
fix a bug in building on GPU
apeforest Jan 5, 2019
710c703
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 5, 2019
0112e6a
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 5, 2019
4a1c010
polish imagenet example
apeforest Jan 6, 2019
61741e8
add handle_manager
apeforest Jan 6, 2019
c24d0bd
error handling in MXNet
apeforest Jan 7, 2019
effd043
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 7, 2019
1c9443f
add exception handling
apeforest Jan 8, 2019
9b9bab1
rename c_api_common
apeforest Jan 8, 2019
2d64e05
wrap MXNet C API with exception handling
apeforest Jan 8, 2019
1cd08be
remove unused function declaration
apeforest Jan 9, 2019
77cbb8b
fix a typo
apeforest Jan 9, 2019
4f1a626
fix a bug
apeforest Jan 9, 2019
c1c476c
fix build error
apeforest Jan 9, 2019
51f81d0
Merge branch 'mxnet_feature_fp16' into develop/mxnet
Jan 14, 2019
75c56f7
Merge remote-tracking branch 'origin/mxnet_feature_fp16' into develop…
Jan 14, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions horovod/mxnet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@

from horovod.common import check_extension

check_extension('horovod.mxnet', 'HOROVOD_WITH_MXNET',
__file__, 'mpi_lib')

from horovod.mxnet.mpi_ops import allreduce, allreduce_
from horovod.mxnet.mpi_ops import allgather
from horovod.mxnet.mpi_ops import broadcast, broadcast_
Expand Down
11 changes: 3 additions & 8 deletions horovod/mxnet/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,16 +125,11 @@ template <class T> Framework MXOpContext<T>::framework() const {
}

void ThrowIfError(Status status) {
switch (status.type()) {
case StatusType::OK:
if (StatusType::OK == status.type()) {
return;
case StatusType::PRECONDITION_ERROR:
throw std::logic_error(status.reason());
case StatusType::ABORTED:
throw std::runtime_error(status.reason());
default: // Includes UNKNOWN_ERROR
throw std::runtime_error(status.reason());
}

throw dmlc::Error(status.reason());
}

template class MXTensor<NDArray>;
Expand Down
1 change: 1 addition & 0 deletions horovod/mxnet/adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ template <class T> class MXOpContext : public OpContext {
};

void ThrowIfError(Status status);
int CheckStatus(const Status &status);

} // namespace mxnet
} // namespace horovod
Expand Down
76 changes: 76 additions & 0 deletions horovod/mxnet/handle_manager.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

#include "handle_manager.h"

namespace horovod {
namespace mxnet {

typedef ::mxnet::Engine::CallbackOnComplete Callback;

// Reserves a new handle id and registers it as "not finished yet".
//
// The id is taken from the atomic counter (previous value + 1), and an empty
// (null) status slot is stored for it under the mutex so that PollHandle()
// reports the handle as pending until MarkDone() fills the slot.
//
// Returns the newly allocated handle id.
int HandleManager::AllocateHandle() {
  // Pre-increment on the atomic yields the incremented value, i.e. the same
  // result as fetch_add(1) + 1.
  const int fresh_handle = ++last_handle_;

  std::lock_guard<std::mutex> guard(mutex_);
  results_[fresh_handle] = nullptr;
  return fresh_handle;
}

void HandleManager::MarkDone(int handle, const Status& status) {
std::lock_guard<std::mutex> guard(mutex_);
results_[handle] = std::make_shared<Status>(status);
}

void HandleManager::AttachCallback(int handle, Callback cb) {
std::unique_lock<std::mutex> lock(mutex_);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can use lock_guard here

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated

if (callbacks_.find(handle) == callbacks_.end()) {
callbacks_[handle] = std::make_shared<Callback>(cb);
}
}

void HandleManager::ExecuteCallback(int handle) {
std::unique_lock<std::mutex> lock(mutex_);
if (callbacks_.find(handle) == callbacks_.end()) {
return;
}
auto cb_ptr = callbacks_[handle];
lock.unlock();
if (cb_ptr != nullptr) {
(*cb_ptr)();
}
}

// Reports whether the operation behind `handle` has finished.
//
// Returns true once MarkDone() has stored a status for the handle, false
// while the slot is still empty.
// Throws std::invalid_argument if the handle is unknown (never allocated, or
// already removed by ReleaseHandle()).
bool HandleManager::PollHandle(int handle) {
  std::lock_guard<std::mutex> guard(mutex_);
  const auto entry = results_.find(handle);
  if (entry == results_.end()) {
    throw std::invalid_argument("Handle " + std::to_string(handle) +
                                " was not created or has been cleared.");
  }
  return static_cast<bool>(entry->second);
}

// Removes all bookkeeping for `handle` and hands its status to the caller.
//
// Both the result slot and any attached callback are erased, so the handle is
// unknown to the manager afterwards.
//
// Returns the stored status; this is a null pointer if MarkDone() was never
// called for the handle.
// Throws std::invalid_argument if the handle is unknown (never allocated, or
// already released).
std::shared_ptr<Status> HandleManager::ReleaseHandle(int handle) {
  std::lock_guard<std::mutex> guard(mutex_);
  const auto entry = results_.find(handle);
  if (entry == results_.end()) {
    throw std::invalid_argument("Handle " + std::to_string(handle) +
                                " was not created or has been cleared.");
  }
  // Copy the pointer out before the map entry is destroyed.
  std::shared_ptr<Status> outcome = entry->second;
  results_.erase(entry);
  callbacks_.erase(handle);
  return outcome;
}

} // namespace mxnet
} // namespace horovod
56 changes: 56 additions & 0 deletions horovod/mxnet/handle_manager.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

#ifndef HOROVOD_MXNET_HANDLE_MANAGER_H
#define HOROVOD_MXNET_HANDLE_MANAGER_H

#include <atomic>
#include <memory>
#include <mutex>
#include <unordered_map>

#include "../common/common.h"

#include <mxnet/engine.h>

namespace horovod {
namespace mxnet {

using namespace horovod::common;

typedef ::mxnet::Engine Engine;
typedef ::mxnet::NDArray NDArray;
typedef ::mxnet::Engine::CallbackOnComplete Callback;

// Tracks asynchronous operations by integer handle.
//
// Each handle owns a result slot (null until the operation is marked done)
// and, optionally, one completion callback. Every member function takes
// `mutex_` before touching the maps, and handle ids come from an atomic
// counter.
class HandleManager {
public:
  // Reserves a new handle id and registers an empty result slot for it.
  int AllocateHandle();

  // Stores `cb` for `handle` unless a callback is already registered.
  void AttachCallback(int handle, Callback cb);

  // Stores a copy of `status` as the result of `handle`.
  void MarkDone(int handle, const Status& status);

  // Invokes the callback registered for `handle`, if any.
  void ExecuteCallback(int handle);

  // Returns true once a result has been stored for `handle`.
  // Throws std::invalid_argument for an unknown handle.
  bool PollHandle(int handle);

  // Removes `handle` (result and callback) and returns its stored result,
  // which is null if the handle was never marked done.
  // Throws std::invalid_argument for an unknown handle.
  std::shared_ptr<Status> ReleaseHandle(int handle);

private:
  // Last id handed out. Explicitly zero-initialized: a default-constructed
  // std::atomic holds an indeterminate value (before C++20) unless the
  // object happens to have static storage duration.
  std::atomic_int last_handle_{0};
  // handle -> result; a null pointer means "still pending".
  std::unordered_map<int, std::shared_ptr<Status>> results_;
  // handle -> completion callback (at most one per handle).
  std::unordered_map<int, std::shared_ptr<Callback>> callbacks_;
  // Guards results_ and callbacks_.
  std::mutex mutex_;
};

} // namespace mxnet
} // namespace horovod

#endif // HOROVOD_MXNET_HANDLE_MANAGER_H
Loading