Skip to content

Commit

Permalink
[rabit harden] fix model recovery tests failures (#510)
Browse files Browse the repository at this point in the history
* [rabit harden] fix rabit model recovery unit test failure

* fix cpplint issue in python3

* per feedback cleanup

* make travis osx CXX setting consistent with cmakeetst

* Revert "make travis osx CXX setting consistent with cmakeetst"

This reverts commit 4c84baf.
  • Loading branch information
chenqin authored and CodingCat committed Mar 8, 2019
1 parent 7e2a1ec commit cb9e014
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 5 deletions.
1 change: 1 addition & 0 deletions include/dmlc/concurrency.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <queue>
#include <mutex>
#include <vector>
#include <utility>
#include <condition_variable>
#include "dmlc/base.h"

Expand Down
2 changes: 2 additions & 0 deletions include/dmlc/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#define DMLC_MEMORY_H_

#include <vector>
#include <memory>
#include <utility>
#include "./base.h"
#include "./logging.h"
#include "./thread_local.h"
Expand Down
2 changes: 1 addition & 1 deletion include/dmlc/optional.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct nullopt_t {
explicit nullopt_t(int a) {}
#else
/*! \brief dummy constructor */
constexpr nullopt_t(int a) {}
constexpr explicit nullopt_t(int a) {}
#endif
};

Expand Down
2 changes: 2 additions & 0 deletions include/dmlc/thread_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <dmlc/logging.h>
#include <string>
#include <mutex>
#include <utility>
#include <memory>
#include <set>
#include <thread>
#include <unordered_set>
Expand Down
22 changes: 18 additions & 4 deletions tracker/dmlc_tracker/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,42 @@ def exec_cmd(cmd, role, taskid, pass_env):
"""Execute the command line command."""
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt':
cmd[0] = './' + cmd[0]
cmd = ' '.join(cmd)
cmdline = ' '.join(cmd)
env = os.environ.copy()
for k, v in pass_env.items():
env[k] = str(v)

env['DMLC_TASK_ID'] = str(taskid)
env['DMLC_ROLE'] = role
env['DMLC_JOB_CLUSTER'] = 'local'

num_retry = env.get('DMLC_NUM_ATTEMPT', 0)

# overwrite the default number of retries with the command-line value
for param in cmd:
if param.startswith('DMLC_NUM_ATTEMPT'):
num_retry = int(param.split('=')[1])
logging.debug('num of retry %d',num_retry)

while True:
if os.name == 'nt':
ret = subprocess.call(cmd, shell=True, env=env)
ret = subprocess.call(cmdline, shell=True, env=env)
else:
ret = subprocess.call(cmd, shell=True, executable='bash', env=env)
ret = subprocess.call(cmdline, shell=True, executable='bash', env=env)
if ret == 0:
logging.debug('Thread %d exit with 0', taskid)
return
else:
num_retry -= 1
newcmd = []
if num_retry >= 0:
# increase the failure trial count by 1 and restart the failed worker
for arg in cmd:
if arg.startswith('rabit_num_trial'):
val = arg.split('=')[1]
arg = arg.replace(val, str(int(val)+1))
newcmd.append(arg)
cmdline = ' '.join(newcmd)
cmd = newcmd
continue
if os.name == 'nt':
sys.exit(-1)
Expand Down

0 comments on commit cb9e014

Please sign in to comment.