From f0e35c441511cdd45d3f7643a34f1c62351c50bb Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 29 Nov 2021 10:48:14 +0800 Subject: [PATCH] Added performance benchmarks for Eager Dygraph (#37643) --- .../performance_tests/benchmark_eager_cpu.cc | 180 +++++++++++++ .../performance_tests/benchmark_eager_cuda.cc | 187 +++++++++++++ .../performance_tests/benchmark_fluid_cpu.cc | 221 +++++++++++++++ .../performance_tests/benchmark_fluid_cuda.cc | 254 ++++++++++++++++++ 4 files changed, 842 insertions(+) create mode 100644 paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc create mode 100644 paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc create mode 100644 paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc create mode 100644 paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc new file mode 100644 index 00000000000000..0a84f3b523aeed --- /dev/null +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Eager Dygraph + +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/flags.h" + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" + +#include "paddle/fluid/imperative/tracer.h" + +#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif + +// TODO(jiabin): remove nolint here!!! +using namespace egr; // NOLINT + +// Disable pten path +DECLARE_bool(run_pten_kernel); + +TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } + +TEST(Benchmark, EagerScaleCPU) { + // Prepare Device Contexts + egr::InitEnv(paddle::platform::CPUPlace()); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); + egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 5.0, true); + RetainGradForTensor(tensor); + + if (mode == "Accuracy") { + benchmark_eager_scale(tensor, true /* accuracy_check*/); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_scale_cpu.out"); +#endif + benchmark_eager_scale(tensor); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +TEST(Benchmark, EagerIntermediateMatmulCPU) { + // Prepare Device Contexts + InitEnv(paddle::platform::CPUPlace()); + + auto tracer = std::make_shared(); + paddle::imperative::SetCurrentTracer(tracer); + + for (const std::string& mode : 
{"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); + egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); + egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_intermediate_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_intermediate_matmul_cpu.out"); +#endif + benchmark_eager_intermediate_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +TEST(Benchmark, EagerIntermediateMLPCPU) { + // Prepare Device Contexts + InitEnv(paddle::platform::CPUPlace()); + + auto tracer = std::make_shared(); + paddle::imperative::SetCurrentTracer(tracer); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = + paddle::framework::make_ddim({MLP_M, MLP_N}); + egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, MLP_X_VAL, true); + RetainGradForTensor(X); + + std::vector Ws; + std::vector Bs; + for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { + paddle::framework::DDim ddimW = + paddle::framework::make_ddim({MLP_N, MLP_K}); + egr::EagerTensor W = 
EagerUtils::CreateTensorWithValue( + ddimW, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, MLP_W_VAL, true); + RetainGradForTensor(W); + + paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); + egr::EagerTensor B = EagerUtils::CreateTensorWithValue( + ddimB, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, MLP_B_VAL, true); + RetainGradForTensor(B); + + Ws.emplace_back(std::move(W)); + Bs.emplace_back(std::move(B)); + } + + if (mode == "Accuracy") { + benchmark_eager_intermediate_mlp(X, Ws, Bs, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_intermediate_mlp_cpu.out"); +#endif + benchmark_eager_intermediate_mlp(X, Ws, Bs); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc new file mode 100644 index 00000000000000..b373802c79eb45 --- /dev/null +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Eager Dygraph +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/flags.h" + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" + +#include "paddle/fluid/imperative/tracer.h" + +#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif + +// TODO(jiabin): remove nolint here!!! +using namespace egr; // NOLINT + +DECLARE_bool(run_pten_kernel); + +TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } + +TEST(Benchmark, EagerScaleCUDA) { + egr::InitEnv(paddle::platform::CUDAPlace()); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); + egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( + ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); + RetainGradForTensor(tensor); + + if (mode == "Accuracy") { + benchmark_eager_scale(tensor, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_scale(tensor); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_scale_cuda.out"); +#endif + benchmark_eager_scale(tensor); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double 
elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +TEST(Benchmark, EagerIntermediateMatmulCUDA) { + paddle::platform::CUDAPlace place; + egr::InitEnv(place); + + auto tracer = std::make_shared(); + tracer->SetExpectedPlace(place); + paddle::imperative::SetCurrentTracer(tracer); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); + egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); + egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( + ddimY, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_intermediate_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_intermediate_matmul(X, Y); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_intermediate_matmul_cuda.out"); +#endif + benchmark_eager_intermediate_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +TEST(Benchmark, EagerIntermediateMLPCUDA) { + paddle::platform::CUDAPlace place; + egr::InitEnv(place); + + auto tracer = std::make_shared(); + 
tracer->SetExpectedPlace(place); + paddle::imperative::SetCurrentTracer(tracer); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = + paddle::framework::make_ddim({MLP_M, MLP_N}); + egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, MLP_X_VAL, true); + RetainGradForTensor(X); + + std::vector Ws; + std::vector Bs; + for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { + paddle::framework::DDim ddimW = + paddle::framework::make_ddim({MLP_N, MLP_K}); + egr::EagerTensor W = EagerUtils::CreateTensorWithValue( + ddimW, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, MLP_W_VAL, true); + RetainGradForTensor(W); + + paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); + egr::EagerTensor B = EagerUtils::CreateTensorWithValue( + ddimB, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, MLP_B_VAL, true); + RetainGradForTensor(B); + + Ws.emplace_back(std::move(W)); + Bs.emplace_back(std::move(B)); + } + + if (mode == "Accuracy") { + benchmark_eager_intermediate_mlp(X, Ws, Bs, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_intermediate_mlp(X, Ws, Bs); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_intermediate_mlp_cuda.out"); +#endif + benchmark_eager_intermediate_mlp(X, Ws, Bs); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc 
b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc new file mode 100644 index 00000000000000..20844055e300d6 --- /dev/null +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/fluid/imperative/basic_engine.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/memory/memcpy.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif + +// Disable pten path +DECLARE_bool(run_pten_kernel); + +TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } + +namespace paddle { +namespace imperative { + +TEST(Benchmark, FluidScaleCPU) { + // Prepare Device Contexts + platform::CPUPlace place; + egr::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + std::shared_ptr X(new imperative::VarBase(true, "X")); + X->SetOverridedStopGradient(false); + + std::vector src_data(128, 5.0); + std::vector dims = {2, 4, 4, 4}; + + auto* x_tensor = X->MutableVar()->GetMutable(); + x_tensor->Resize(framework::make_ddim(dims)); + auto* mutable_x = x_tensor->mutable_data(place); + 
paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + + if (mode == "Accuracy") { + benchmark_fluid_scale(X, platform::Place(place), + true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("fluid_scale_cpu.out"); +#endif + benchmark_fluid_scale(X, platform::Place(place)); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +TEST(Benchmark, FluidMatmulCPU) { + // Prepare Device Contexts + platform::CPUPlace place; + egr::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + std::shared_ptr X(new imperative::VarBase(true, "X")); + X->SetOverridedStopGradient(false); + std::shared_ptr Y(new imperative::VarBase(true, "Y")); + Y->SetOverridedStopGradient(false); + + std::vector x_src_data(4, 1.0); + std::vector y_src_data(4, 2.0); + std::vector dims = {2, 2}; + + auto* x_tensor = X->MutableVar()->GetMutable(); + x_tensor->Resize(framework::make_ddim(dims)); + auto* mutable_x = x_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, x_src_data.data(), + sizeof(float) * x_src_data.size()); + + auto* y_tensor = Y->MutableVar()->GetMutable(); + y_tensor->Resize(framework::make_ddim(dims)); + auto* mutable_y = y_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, y_src_data.data(), + sizeof(float) * y_src_data.size()); + + if (mode == "Accuracy") { + benchmark_fluid_matmul(X, Y, platform::Place(place), + true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef 
WITH_GPERFTOOLS + ProfilerStart("fluid_matmul_cpu.out"); +#endif + benchmark_fluid_matmul(X, Y, platform::Place(place)); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +TEST(Benchmark, FluidMLPCPU) { + // Prepare Device Contexts + platform::CPUPlace place; + egr::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + std::vector x_src_data(MLP_M * MLP_N, MLP_X_VAL); + std::vector w_src_data(MLP_N * MLP_K, MLP_W_VAL); + std::vector b_src_data(MLP_K, MLP_B_VAL); + + std::vector x_dims = {MLP_M, MLP_N}; + std::vector w_dims = {MLP_N, MLP_K}; + std::vector b_dims = {MLP_K}; + + std::shared_ptr X(new imperative::VarBase(true, "X")); + X->SetOverridedStopGradient(false); + + auto* x_tensor = X->MutableVar()->GetMutable(); + x_tensor->Resize(framework::make_ddim(x_dims)); + auto* mutable_x = x_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, x_src_data.data(), + sizeof(float) * x_src_data.size()); + + std::vector> Ws; + std::vector> Bs; + for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { + std::shared_ptr W( + new imperative::VarBase(true, "W")); + W->SetOverridedStopGradient(false); + std::shared_ptr B( + new imperative::VarBase(true, "B")); + B->SetOverridedStopGradient(false); + + auto* w_tensor = W->MutableVar()->GetMutable(); + w_tensor->Resize(framework::make_ddim(w_dims)); + auto* mutable_w = w_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_w, place, w_src_data.data(), + sizeof(float) * w_src_data.size()); + + auto* b_tensor = B->MutableVar()->GetMutable(); + b_tensor->Resize(framework::make_ddim(b_dims)); + auto* mutable_b = b_tensor->mutable_data(place); + paddle::memory::Copy(place, 
mutable_b, place, b_src_data.data(), + sizeof(float) * b_src_data.size()); + + Ws.emplace_back(std::move(W)); + Bs.emplace_back(std::move(B)); + } + + if (mode == "Accuracy") { + benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place), + true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("fluid_mlp_cpu.out"); +#endif + benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place)); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +} // namespace imperative +} // namespace paddle + +USE_OP(scale); +USE_OP(matmul_v2); +USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc new file mode 100644 index 00000000000000..620a4d1cd128d4 --- /dev/null +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/fluid/imperative/basic_engine.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/memory/memcpy.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif + +// Disable pten path +DECLARE_bool(run_pten_kernel); + +TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } + +namespace paddle { +namespace imperative { + +TEST(Benchmark, FluidScaleCUDA) { + // Prepare Device Contexts + platform::CUDAPlace place; + egr::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + std::shared_ptr X(new imperative::VarBase(true, "X")); + X->SetOverridedStopGradient(false); + + std::vector src_data(128, 5.0); + std::vector dims = {2, 4, 4, 4}; + + auto* x_tensor = X->MutableVar()->GetMutable(); + x_tensor->Resize(framework::make_ddim(dims)); + auto* mutable_x = x_tensor->mutable_data(place); + + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = + dynamic_cast(pool.Get(place)); + auto stream = dev_ctx->stream(); + paddle::memory::Copy(place, mutable_x, platform::CPUPlace(), + src_data.data(), sizeof(float) * src_data.size(), + stream); + + if (mode == "Accuracy") { + benchmark_fluid_scale(X, platform::Place(place), + true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_fluid_scale(X, platform::Place(place)); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("fluid_scale_cuda.out"); +#endif + benchmark_fluid_scale(X, platform::Place(place)); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + 
std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +TEST(Benchmark, FluidMatmulCUDA) { + // Prepare Device Contexts + platform::CUDAPlace place; + egr::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + std::shared_ptr X(new imperative::VarBase(true, "X")); + X->SetOverridedStopGradient(false); + std::shared_ptr Y(new imperative::VarBase(true, "Y")); + Y->SetOverridedStopGradient(false); + + std::vector x_src_data(4, 1.0); + std::vector y_src_data(4, 2.0); + std::vector dims = {2, 2}; + + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = + dynamic_cast(pool.Get(place)); + auto stream = dev_ctx->stream(); + + auto* x_tensor = X->MutableVar()->GetMutable(); + x_tensor->Resize(framework::make_ddim(dims)); + auto* mutable_x = x_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, platform::CPUPlace(), + x_src_data.data(), sizeof(float) * x_src_data.size(), + stream); + + auto* y_tensor = Y->MutableVar()->GetMutable(); + y_tensor->Resize(framework::make_ddim(dims)); + auto* mutable_y = y_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, platform::CPUPlace(), + y_src_data.data(), sizeof(float) * y_src_data.size(), + stream); + + if (mode == "Accuracy") { + benchmark_fluid_matmul(X, Y, platform::Place(place), + true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_fluid_matmul(X, Y, platform::Place(place)); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("fluid_matmul_cuda.out"); +#endif + benchmark_fluid_matmul(X, Y, platform::Place(place)); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + 
double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +TEST(Benchmark, FluidMLPCUDA) { + // Prepare Device Contexts + platform::CUDAPlace place; + egr::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = + dynamic_cast(pool.Get(place)); + auto stream = dev_ctx->stream(); + + std::vector x_src_data(MLP_M * MLP_N, MLP_X_VAL); + std::vector w_src_data(MLP_N * MLP_K, MLP_W_VAL); + std::vector b_src_data(MLP_K, MLP_B_VAL); + + std::vector x_dims = {MLP_M, MLP_N}; + std::vector w_dims = {MLP_N, MLP_K}; + std::vector b_dims = {MLP_K}; + + std::shared_ptr X(new imperative::VarBase(true, "X")); + X->SetOverridedStopGradient(false); + + auto* x_tensor = X->MutableVar()->GetMutable(); + x_tensor->Resize(framework::make_ddim(x_dims)); + auto* mutable_x = x_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, platform::CPUPlace(), + x_src_data.data(), sizeof(float) * x_src_data.size(), + stream); + + std::vector> Ws; + std::vector> Bs; + for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { + std::shared_ptr W( + new imperative::VarBase(true, "W")); + W->SetOverridedStopGradient(false); + std::shared_ptr B( + new imperative::VarBase(true, "B")); + B->SetOverridedStopGradient(false); + + auto* w_tensor = W->MutableVar()->GetMutable(); + w_tensor->Resize(framework::make_ddim(w_dims)); + auto* mutable_w = w_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_w, platform::CPUPlace(), + w_src_data.data(), sizeof(float) * w_src_data.size(), + stream); + + auto* b_tensor = B->MutableVar()->GetMutable(); + b_tensor->Resize(framework::make_ddim(b_dims)); + auto* mutable_b = b_tensor->mutable_data(place); + 
paddle::memory::Copy(place, mutable_b, platform::CPUPlace(), + b_src_data.data(), sizeof(float) * b_src_data.size(), + stream); + + Ws.emplace_back(std::move(W)); + Bs.emplace_back(std::move(B)); + } + + if (mode == "Accuracy") { + benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place), + true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place)); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("fluid_mlp_cuda.out"); +#endif + benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place)); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + +} // namespace imperative +} // namespace paddle + +USE_OP(scale); +USE_OP(matmul_v2); +USE_OP(reduce_sum); +USE_OP(reduce_sum_grad);