Print IPS in auto parallel Engine (PaddlePaddle#46554)
From00 authored and zhaoyingli committed Oct 19, 2022
1 parent 84b8145 commit 24f4753
Showing 1 changed file with 32 additions and 26 deletions.
58 changes: 32 additions & 26 deletions python/paddle/distributed/auto_parallel/engine.py
@@ -23,7 +23,7 @@
 import paddle
 import paddle.utils as utils
 
-from paddle import fluid, static
+from paddle import fluid, profiler, static
 from paddle.jit import to_static
 from paddle.metric import Metric
 from paddle.static import InputSpec
@@ -570,7 +570,8 @@ def _print_log(self,
                    step=None,
                    lr=None,
                    fetch_new_names=None,
-                   fetch_sections=None):
+                   fetch_sections=None,
+                   profiler_log=""):
         prefix = "[{}] ".format(mode)
         logs = {}
         if epoch is not None:
@@ -596,7 +597,7 @@
             else:
                 for i in range(section_start, section_end):
                     logs[fetch_new_names[i] + ": {} "] = outs[i]
-        string = prefix + ''.join(list(logs.keys()))
+        string = prefix + ''.join(list(logs.keys())) + profiler_log
         self._logger.info(string.format(*list(logs.values())))
 
     def fit(self,
@@ -695,29 +696,34 @@ def fit(self,
             mode=self.mode)
         lr_scheduler = self._get_lr_scheduler(self.main_program)
 
-        for epoch in range(epochs):
-            for step, _ in enumerate(train_dataloader):
-                try:
-                    outs = self._executor.run(
-                        self.main_program,
-                        fetch_list=fetch_list,
-                        use_program_cache=self._strategy.use_cache,
-                        return_numpy=self._strategy.return_numpy)
-                except core.EOFException:
-                    break
-                if lr_scheduler and step % self._k_steps == 0:
-                    lr_scheduler.step()
-                lr = self._get_lr(self._lr_optimizer)
-                self._print_log(outs, self.mode, epoch, step, lr,
-                                fetch_new_names, fetch_sections)
-
-            if valid_data and epoch % valid_freq == 0:
-                self.evaluate(valid_data, valid_sample_split, batch_size,
-                              valid_steps, collate_fn, callbacks)
-                self._switch_mode("train")
-            else:
-                self._reset_metrics()
-        return outs
+        with profiler.Profiler(timer_only=True) as prof:
+            for epoch in range(epochs):
+                for step, _ in enumerate(train_dataloader):
+                    try:
+                        outs = self._executor.run(
+                            self.main_program,
+                            fetch_list=fetch_list,
+                            use_program_cache=self._strategy.use_cache,
+                            return_numpy=self._strategy.return_numpy)
+                    except core.EOFException:
+                        break
+                    if lr_scheduler and step % self._k_steps == 0:
+                        lr_scheduler.step()
+                    lr = self._get_lr(self._lr_optimizer)
+
+                    prof.step()
+
+                    self._print_log(outs, self.mode, epoch, step, lr,
+                                    fetch_new_names, fetch_sections,
+                                    prof.step_info())
+
+                if valid_data and epoch % valid_freq == 0:
+                    self.evaluate(valid_data, valid_sample_split, batch_size,
+                                  valid_steps, collate_fn, callbacks)
+                    self._switch_mode("train")
+                else:
+                    self._reset_metrics()
+            return outs
 
     def evaluate(self,
                  valid_data,
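
The change above wraps the training loop in paddle.profiler.Profiler(timer_only=True), calls prof.step() once per iteration, and appends prof.step_info() to the per-step log line through the new profiler_log argument of _print_log; that is how the IPS (throughput) figure reaches the training log. The snippet below is a minimal standalone sketch of that pattern, not the Engine's executor-based loop; the toy run_one_step function is an illustrative stand-in.

import paddle
from paddle import profiler


def run_one_step():
    # Hypothetical stand-in for one training iteration; the real Engine runs
    # its static main_program through an executor instead.
    x = paddle.randn([32, 16])
    return (x * 2.0).mean()


# timer_only=True collects timing statistics only, without kernel tracing.
with profiler.Profiler(timer_only=True) as prof:
    for step in range(10):
        loss = run_one_step()
        prof.step()  # mark the end of one iteration so the timer can update
        # step_info() returns a summary string (reader cost, batch cost and
        # ips), which is what the Engine appends to its per-step log line.
        print("[train] step: {} loss: {:.4f} ".format(step, float(loss))
              + prof.step_info())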
