Skip to content

Commit

Permalink
[cherry-pick] Refine statistic table and bug fix (#41581)
Browse files Browse the repository at this point in the history
* Refine statistic table (#41524)

* Add get profiler from config (#41532)

* no

* maintain old profiler

* add get profiler from serialization config

* add unit test

* improve coverage

* fix

* Revert "improve coverage"

This reverts commit 4a980bf.

* fix unit

* fix

* fix
  • Loading branch information
rainyfly authored Apr 11, 2022
1 parent f3296ea commit 365975f
Show file tree
Hide file tree
Showing 4 changed files with 428 additions and 114 deletions.
141 changes: 141 additions & 0 deletions python/paddle/fluid/tests/unittests/test_newprofiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import unittest
import numpy as np
import tempfile

import paddle
import paddle.profiler as profiler
Expand Down Expand Up @@ -138,6 +139,146 @@ def test_nvprof(self):
y = x / 2.0


class TestGetProfiler(unittest.TestCase):
    """Exercise ``paddle.profiler.profiler.get_profiler``, which builds a
    Profiler from a JSON serialization config file.

    The first config is valid and the resulting profiler is actually run for
    a few steps; the remaining configs are deliberately malformed and exist
    only to cover the config-parsing / error-handling branches of
    ``get_profiler`` (the original test asserts nothing beyond "no crash").
    """

    @staticmethod
    def _make_config_file(content):
        """Write *content* to a named temporary file and return the OPEN
        handle.

        The handle must stay open while ``get_profiler`` reads the file:
        ``NamedTemporaryFile`` deletes the file on close.
        """
        handle = tempfile.NamedTemporaryFile(mode='w')
        handle.write(content)
        handle.flush()
        return handle

    def test_getprofiler(self):
        # Import under a distinct local name so the module object is not
        # shadowed by the profiler instances created below (the original
        # rebound `profiler` and had to re-import the module four times).
        import paddle.profiler.profiler as profiler_mod

        # --- valid config: run 5 iterations under the profiler -----------
        config_content = '''
{
    "targets": ["CPU"],
    "scheduler": [3,4],
    "on_trace_ready": {
        "export_chrome_tracing":{
            "module": "paddle.profiler",
            "use_direct": false,
            "args": [],
            "kwargs": {
                "dir_name": "testdebug/"
            }
        }
    },
    "timer_only": false
}
'''
        filehandle = self._make_config_file(config_content)
        prof = profiler_mod.get_profiler(filehandle.name)
        x_value = np.random.randn(2, 3, 3)
        x = paddle.to_tensor(
            x_value, stop_gradient=False, place=paddle.CPUPlace())
        with prof:
            for i in range(5):
                y = x / 2.0
                ones_like_y = paddle.ones_like(y)
                prof.step()
        filehandle.close()

        # Below tests are just for coverage; the configs are wrong on
        # purpose.

        # --- wrong config: use_direct=true with a bogus module name ------
        config_content = '''
{
    "targets": ["Cpu", "Gpu"],
    "scheduler": {
        "make_scheduler":{
            "module": "paddle.profiler",
            "use_direct": true,
            "args": [],
            "kwargs": {}
        }
    },
    "on_trace_ready": {
        "export_chrome_tracing":{
            "module": "paddle.profiler1",
            "use_direct": true,
            "args": [],
            "kwargs": {
            }
        }
    },
    "timer_only": false
}
'''
        filehandle = self._make_config_file(config_content)
        try:
            profiler_mod.get_profiler(filehandle.name)
        except Exception:
            # Expected: "paddle.profiler1" does not exist. Catch Exception
            # (not bare except) so SystemExit/KeyboardInterrupt still
            # propagate.
            pass
        filehandle.close()

        # --- scheduler built via make_scheduler kwargs --------------------
        config_content = '''
{
    "targets": ["Cpu", "Gpu"],
    "scheduler": {
        "make_scheduler":{
            "module": "paddle.profiler",
            "use_direct": false,
            "args": [],
            "kwargs": {
                "closed": 1,
                "ready": 1,
                "record": 2
            }
        }
    },
    "on_trace_ready": {
        "export_chrome_tracing":{
            "module": "paddle.profiler",
            "use_direct": true,
            "args": [],
            "kwargs": {
            }
        }
    },
    "timer_only": false
}
'''
        filehandle = self._make_config_file(config_content)
        prof = profiler_mod.get_profiler(filehandle.name)
        filehandle.close()

        # --- unknown keys / wrong value types ------------------------------
        # Presumably get_profiler ignores the unrecognized entries and falls
        # back to defaults — the test only checks it does not raise.
        config_content = '''
{
    "targets": [1],
    "scheduler": {
        "make_scheduler1":{
            "module": "paddle.profiler",
            "use_direct": false,
            "args": [],
            "kwargs": {
                "closed": 1,
                "ready": 1,
                "record": 2
            }
        }
    },
    "on_trace_ready": {
        "export_chrome_tracing1":{
            "module": "paddle.profiler",
            "use_direct": false,
            "args": [],
            "kwargs": {
                "dir_name": "testdebug/"
            }
        }
    },
    "timer_only": 1
}
'''
        filehandle = self._make_config_file(config_content)
        prof = profiler_mod.get_profiler(filehandle.name)
        filehandle.close()

        # --- non-existent config path --------------------------------------
        # Must still return a usable (default) profiler rather than raise.
        prof = profiler_mod.get_profiler('nopath.json')


class RandomDataset(Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
Expand Down
88 changes: 46 additions & 42 deletions python/paddle/fluid/tests/unittests/test_profiler_statistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,20 +185,22 @@ def test_statistic_case1(self):
profiler.TracerEventType.Communication), 5)
self.assertEqual(len(event_summary.items), 2)
self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 3)
self.assertEqual(len(event_summary.model_perspective_items), 4)
self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 100)
self.assertEqual(
event_summary.model_perspective_items['Forward'].gpu_time, 135)
event_summary.model_perspective_items['Forward'].general_gpu_time,
135)
self.assertEqual(
event_summary.model_perspective_items['Backward'].gpu_time, 0)
event_summary.model_perspective_items['Backward'].general_gpu_time,
0)
self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
.general_gpu_time, 60)
print(
profiler.profiler_statistic._build_table(
statistic_data,
Expand Down Expand Up @@ -226,31 +228,31 @@ def test_statistic_case2(self):
userdefined_node = HostPythonNode('Communication Time',
profiler.TracerEventType.UserDefined,
100, 110, 1000, 1001)
reduce_all_launchkernel0 = HostPythonNode(
allreduce_launchkernel0 = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
1000, 1001)

nccl_reduce_all_kernel0 = DevicePythonNode(
'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120,
nccl_allreduce_kernel0 = DevicePythonNode(
'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 105, 120,
0, 0, 2)

communication_node = HostPythonNode(
'Communication', profiler.TracerEventType.Communication, 105, 110,
1000, 1001)

reduce_all_op1 = HostPythonNode('reduce_all_op1',
profiler.TracerEventType.Operator, 105,
108, 1000, 1001)
reduce_all_op1_infershape = HostPythonNode(
'reduce_all_op1::infershape',
profiler.TracerEventType.OperatorInner, 105, 106, 1000, 1001)
allreduce_op1 = HostPythonNode('allreduce_op1',
profiler.TracerEventType.Operator, 105,
108, 1000, 1001)
allreduce_op1_infershape = HostPythonNode(
'allreduce_op1::infershape', profiler.TracerEventType.OperatorInner,
105, 106, 1000, 1001)

reduce_all_launchkernel1 = HostPythonNode(
allreduce_launchkernel1 = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107,
1000, 1001)

nccl_reduce_all_kernel1 = DevicePythonNode(
'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150,
nccl_allreduce_kernel1 = DevicePythonNode(
'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 130, 150,
0, 0, 2)

backward_node = HostPythonNode('Gradient Backward',
Expand Down Expand Up @@ -305,19 +307,19 @@ def test_statistic_case2(self):
'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200,
0, 0, 1)

reduce_all_node2 = HostPythonNode('reduce_all',
profiler.TracerEventType.Operator,
230, 250, 1000, 1001)
allreduce_node2 = HostPythonNode('allreduce',
profiler.TracerEventType.Operator, 230,
250, 1000, 1001)

reduce_all_node2_infershape = HostPythonNode(
'reduce_all_node2::infershape',
allreduce_node2_infershape = HostPythonNode(
'allreduce_node2::infershape',
profiler.TracerEventType.OperatorInner, 231, 232, 1000, 1001)
reduce_all_launchkernel2 = HostPythonNode(
allreduce_launchkernel2 = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240,
1000, 1001)

nccl_reduce_all_kernel2 = DevicePythonNode(
'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280,
nccl_allreduce_kernel2 = DevicePythonNode(
'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 250, 280,
0, 0, 2)

root_node.children_node.append(profilerstep_node)
Expand All @@ -329,12 +331,12 @@ def test_statistic_case2(self):
yolonet_node.children_node.extend(
[sync_batch_norm_node, userdefined_node])
userdefined_node.children_node.append(communication_node)
userdefined_node.runtime_node.append(reduce_all_launchkernel0)
reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0)
communication_node.children_node.append(reduce_all_op1)
reduce_all_op1.children_node.append(reduce_all_op1_infershape)
reduce_all_op1.runtime_node.append(reduce_all_launchkernel1)
reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1)
userdefined_node.runtime_node.append(allreduce_launchkernel0)
allreduce_launchkernel0.device_node.append(nccl_allreduce_kernel0)
communication_node.children_node.append(allreduce_op1)
allreduce_op1.children_node.append(allreduce_op1_infershape)
allreduce_op1.runtime_node.append(allreduce_launchkernel1)
allreduce_launchkernel1.device_node.append(nccl_allreduce_kernel1)
conv2d_node.children_node.extend(
[conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
conv2d_compute.runtime_node.append(conv2d_launchkernel)
Expand All @@ -350,10 +352,10 @@ def test_statistic_case2(self):
sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
optimization_node.children_node.append(reduce_all_node2)
reduce_all_node2.children_node.append(reduce_all_node2_infershape)
reduce_all_node2.runtime_node.append(reduce_all_launchkernel2)
reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2)
optimization_node.children_node.append(allreduce_node2)
allreduce_node2.children_node.append(allreduce_node2_infershape)
allreduce_node2.runtime_node.append(allreduce_launchkernel2)
allreduce_launchkernel2.device_node.append(nccl_allreduce_kernel2)
thread_tree = {'thread1001': root_node}
extra_info = {
'Process Cpu Utilization': '1.02',
Expand Down Expand Up @@ -415,20 +417,22 @@ def test_statistic_case2(self):
distributed_summary.overlap_range), 85)
self.assertEqual(len(event_summary.items), 4)
self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 3)
self.assertEqual(len(event_summary.model_perspective_items), 4)
self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 100)
self.assertEqual(
event_summary.model_perspective_items['Forward'].gpu_time, 315)
event_summary.model_perspective_items['Forward'].general_gpu_time,
315)
self.assertEqual(
event_summary.model_perspective_items['Backward'].gpu_time, 0)
event_summary.model_perspective_items['Backward'].general_gpu_time,
0)
self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
.general_gpu_time, 60)
print(
profiler.profiler_statistic._build_table(
statistic_data,
Expand Down
Loading

0 comments on commit 365975f

Please sign in to comment.