Skip to content

Commit

Permalink
[launch] fix log more stable; default to stdout (#41314)
Browse files: browse the repository at this point in the history
  • Loading branch information
kuizhiqing authored Apr 2, 2022
1 parent 0f6412c commit 90b95be
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 14 deletions.
1 change: 1 addition & 0 deletions python/paddle/distributed/launch/context/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def get_ports_occupied(self):
return self.free_ports

def get_free_port(self):
# retry in a loop to avoid port conflicts
for _ in range(100):
with closing(socket.socket(socket.AF_INET,
socket.SOCK_STREAM)) as s:
Expand Down
5 changes: 3 additions & 2 deletions python/paddle/distributed/launch/controllers/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,9 @@ def watch(self) -> bool:
while not self.ctx.status.is_done():
status = self.pod.watch(timeout=2)

if self.ctx.continous_log():
self.pod.logs()
#if self.ctx.continous_log():
# print logs by default
self.pod.logs()

# completed
if status == self.ctx.status.COMPLETED:
Expand Down
25 changes: 14 additions & 11 deletions python/paddle/distributed/launch/job/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,31 +145,34 @@ def __str__(self):
self.errfile,
self._env, )

def logs(self, fn=None, offset=0, whence=1, lines=1000):
def logs(self, fn=None, offset=0, whence=1, limit=1000):
if not self._log_handler:
self._log_handler = open(self._out)

if fn is None:
fn = sys.stdout

self._log_handler.seek(offset, whence)

try:
idx = 0
for line in self._log_handler:
fn.write(line)
idx += 1
if idx > lines:
if offset != 0 or whence != 1:
self._log_handler.seek(offset, whence)

for _ in range(limit):
line = self._log_handler.readline()
if not line:
break
finally:
fn.write(line)
except:
return

def tail(self, length=3000):
if not self._log_handler:
self._log_handler = open(self._out)

self._log_handler.seek(0, 2)
ed = self._log_handler.tell()
try:
self._log_handler.seek(0, 2)
ed = self._log_handler.tell()
except:
pass

if ed > length:
self.logs(offset=ed - length, whence=0)
Expand Down
2 changes: 1 addition & 1 deletion python/paddle/distributed/launch/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def launch():
- ``--rank``: The rank of the node, can be auto assigned by master. Default ``--rank=-1``.
- ``--log_level``: The log level to set for logging.setLevel which can be CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET, case insensitive. The rank 0 log will not print in the terminal by default, while you can enable it by adding --log_level=debug. Default ``--log_level=INFO``.
- ``--log_level``: The log level to set for logging.setLevel which can be CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET, case insensitive. Default ``--log_level=INFO``.
- ``--nnodes``: The number of nodes for a distributed job, it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``.
Expand Down

0 comments on commit 90b95be

Please sign in to comment.