Move transpose to pten #39327

Merged 47 commits on Mar 2, 2022.

Changes shown below are from 41 commits.

Commits
c36ffdb  immigrate_transpose_to_pten cpu kernel only; test=develop (phlrain, Jan 18, 2022)
910f2a2  fix bug; test=develop (phlrain, Jan 18, 2022)
95dc685  add transpose cuda api (phlrain, Jan 19, 2022)
5d3abf3  bug fix; (phlrain, Jan 19, 2022)
407e91d  fix bugs (phlrain, Jan 20, 2022)
4f592a7  fix bugs; test=develop (phlrain, Jan 21, 2022)
a263da2  bug fix; (phlrain, Jan 21, 2022)
f756d1e  move transepose to pten; test=develop (phlrain, Jan 29, 2022)
aacd428  fix bug; test=develop (phlrain, Feb 2, 2022)
67b9e54  fix bugs; test=develop (phlrain, Feb 2, 2022)
fc2b941  add transpose grad fp16 support; test=develop (phlrain, Feb 2, 2022)
3a2c761  fix bug; test=develop (phlrain, Feb 3, 2022)
7a2a463  fix npu bug; test=develop (phlrain, Feb 3, 2022)
74d3aac  fix nemul = 0 bug; test=develop (phlrain, Feb 3, 2022)
ad67d1e  add fp16 support; test=develop (phlrain, Feb 4, 2022)
49d7d17  fix data type register bug; test=develop (phlrain, Feb 6, 2022)
41134fa  fix transpose bug; test=develop (phlrain, Feb 7, 2022)
2dbf42b  update transpose (phlrain, Feb 9, 2022)
c25488f  fix transpose bug; test=develop (phlrain, Feb 14, 2022)
7826e7d  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 14, 2022)
69ebc79  remove useless code; test=develop (phlrain, Feb 14, 2022)
5362142  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 14, 2022)
1da1d06  remove useless code; test=develop (phlrain, Feb 15, 2022)
b261f44  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 15, 2022)
dd6815c  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 15, 2022)
4ce0d5d  fix transpose alias bug; test=develop (phlrain, Feb 16, 2022)
d62f092  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 16, 2022)
b017cf7  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 16, 2022)
4bde16b  polish code; test=develop (phlrain, Feb 16, 2022)
644c383  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 16, 2022)
e5d67c9  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 16, 2022)
b88d53d  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 18, 2022)
07e3f81  resolve confict; test=develop (phlrain, Feb 18, 2022)
9a60334  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 18, 2022)
0ca4770  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 21, 2022)
50f111b  resolve confilct; test=develop (phlrain, Feb 21, 2022)
9414470  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 21, 2022)
4c66d0e  recover prepared operator; test=develop (phlrain, Feb 23, 2022)
30009c2  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 23, 2022)
35c2743  fix bug; test=develop (phlrain, Feb 23, 2022)
4270a07  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 24, 2022)
99f05be  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 25, 2022)
16475bd  polish code; test=develop (phlrain, Feb 25, 2022)
a566861  fix bug; test=develop (phlrain, Feb 25, 2022)
7d37978  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 25, 2022)
6177792  fix bug; test=develop (phlrain, Feb 25, 2022)
92d94e2  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into… (phlrain, Feb 28, 2022)
2 changes: 1 addition & 1 deletion paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
@@ -29,7 +29,7 @@ USE_OP(pool2d);
USE_OP_DEVICE_KERNEL(pool2d, MKLDNN);
USE_OP(relu);
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
USE_OP(transpose);
USE_OP_ITSELF(transpose);
USE_OP_DEVICE_KERNEL(transpose, MKLDNN);

namespace paddle {
60 changes: 11 additions & 49 deletions paddle/fluid/operators/transpose_op.cc
@@ -339,6 +339,14 @@ class Transpose2OpGrad : public framework::OperatorWithKernel {
}
};

class TransposeGradInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
ctx->SyncTypeAndDataType(framework::GradVarName("Out"),
framework::GradVarName("X"));
}
};

} // namespace operators
} // namespace paddle

@@ -347,59 +355,13 @@ REGISTER_OPERATOR(
transpose, ops::TransposeOp, ops::TransposeOpMaker,
paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);

REGISTER_OP_CPU_KERNEL(
transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
transpose_grad,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad,
ops::TransposeGradInferVarType);

REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
ops::Transpose2GradMaker<paddle::framework::OpDesc>,
ops::Transpose2GradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad,
ops::TransposeGradInferVarType,
ops::Transpose2DoubleGradMaker<paddle::framework::OpDesc>,
ops::Transpose2DoubleGradMaker<paddle::imperative::OpBase>);

REGISTER_OP_CPU_KERNEL(
transpose2, ops::TransposeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, int32_t>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::TransposeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, int32_t>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
139 changes: 0 additions & 139 deletions paddle/fluid/operators/transpose_op.cu

This file was deleted.

42 changes: 21 additions & 21 deletions paddle/fluid/operators/transpose_op.cu.h
@@ -16,8 +16,9 @@ limitations under the License. */

#include "paddle/fluid/framework/gpu_utils.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"

namespace paddle {
namespace operators {
@@ -258,10 +259,10 @@ struct SystemElemType<16> {
};

template <typename T, int tile_long, int tile_short>
void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d,
int tile_size_i, int tile_size_j,
int total_tiles_count, const T* input,
const Dim3& input_dims, T* output) {
void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims,
T* output) {
constexpr int NumThreads = tile_long;
if (tile_size_i <= tile_long && tile_size_j <= tile_short) {
TilingSwapDim1And2<
@@ -278,7 +279,7 @@ void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d,

template <typename T, int tile_long, int tile_short, typename dummy = void>
struct NarrowDims2TransposeDispatch {
static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i,
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
PADDLE_ENFORCE_EQ(
@@ -319,7 +320,7 @@ struct NarrowDims2TransposeDispatch<
T, tile_long, tile_short,
typename std::enable_if<
CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> {
static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i,
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
PADDLE_ENFORCE_EQ(
@@ -351,7 +352,7 @@ struct NarrowDims2TransposeDispatch<
T, tile_long, tile_short,
typename std::enable_if<CheckLongTileSize(tile_long, tile_short, sizeof(T)),
void>::type> {
static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i,
static void DoTranspose(const phi::GPUContext& d, int tile_size_i,
int tile_size_j, int total_tiles_count,
const T* input, const Dim3& input_dims, T* output) {
PADDLE_ENFORCE_EQ(
@@ -368,7 +369,7 @@
};

template <typename T, bool conjugate = false>
void SwapDim1And2InNarrow(const platform::CUDADeviceContext& d, const T* input,
void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input,
const Dim3& input_dims, T* output,
const int kMinTileSize) {
// First get available tile sizes for the data type requested as backups
@@ -473,9 +474,8 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input,

// Here suppose convert all tensor to dim3, so just change dim1 and 2.
template <typename T>
void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d,
const T* input, const Dim3& input_dims,
T* output) {
void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
const Dim3& input_dims, T* output) {
// Suppose tile size > 16
static const int kMinTileSize = 16;
static const int kMinNarrowTileSize = 96;
@@ -512,7 +512,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d,
} else {
// If input shape is small, such as 8X8, just do simple copy
int total_elements = input_dims[0] * input_dims[1] * input_dims[2];
auto config = GetGpuLaunchConfig1D(d, total_elements);
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements);
TransposeSimpleKernel<T, 0, 2, 1><<<
config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>(
total_elements, input, input_dims, output);
@@ -521,7 +521,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d,

template <typename T>
struct SwapDim1And2InTranspose {
typedef platform::CUDADeviceContext Device;
typedef phi::GPUContext Device;
void operator()(const Device& d, const T* in,
const std::vector<int>& combined_dims, T* out) {
Dim3 input_dims = {static_cast<int>(combined_dims[0]),
@@ -533,15 +533,15 @@ struct SwapDim1And2InTranspose {

template <typename T>
struct SwapDim0And2InTranspose {
typedef platform::CUDADeviceContext Device;
typedef phi::GPUContext Device;
void operator()(const Device& d, const T* in,
const std::vector<int>& combined_dims, T* out) {
Dim3 input_dims = {static_cast<int>(combined_dims[0]),
static_cast<int>(combined_dims[1]),
static_cast<int>(combined_dims[2])};

size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2];
auto config = GetGpuLaunchConfig1D(d, total_size);
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size);

TransposeSimpleKernel<T, 2, 1, 0><<<
config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>(
@@ -607,7 +607,7 @@ inline void CombineTransposeDim3(const framework::DDim& shape,

template <typename T>
struct TransposeSimple {
static bool run(const platform::CUDADeviceContext& ctx, const Tensor& in,
static bool run(const phi::GPUContext& ctx, const Tensor& in,
const std::vector<int32_t> perm, Tensor* out) {
// First reduce the dimensions of the input tensor if possible.
std::vector<int> new_perm;
@@ -654,12 +654,12 @@ struct TransposeSimple {
};

template <typename T>
void TransposeGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
const int ndims, const Tensor& in,
const std::vector<int32_t> perm, Tensor* out) {
void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims,
const Tensor& in, const std::vector<int32_t> perm,
Review comment (Contributor): A small optimization point: couldn't the perm parameter be passed by reference (&)? (A standalone sketch of this suggestion appears after the function below.)

Tensor* out) {
auto ret = TransposeSimple<T>::run(dev_ctx, in, perm, out);
if (!ret) {
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, in, out, perm);
TransCompute<phi::GPUContext, T>(ndims, dev_ctx, in, out, perm);
}
}
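
The following is a minimal standalone sketch, plain C++ rather than Paddle code, illustrating the reviewer's point above: taking a std::vector by const reference avoids copying it on every call, whereas taking it by value copies the whole vector. The SumByValue/SumByConstRef helpers are hypothetical names introduced only for this illustration.

#include <cstdio>
#include <vector>

// Hypothetical helper: takes the vector by value, so the caller's vector is
// copied on every call.
int SumByValue(std::vector<int> perm) {
  int total = 0;
  for (int v : perm) total += v;
  return total;
}

// Hypothetical helper: takes the vector by const reference, so no copy is
// made and the caller's data is read in place.
int SumByConstRef(const std::vector<int>& perm) {
  int total = 0;
  for (int v : perm) total += v;
  return total;
}

int main() {
  std::vector<int> perm = {0, 2, 1};
  // Both calls return the same result; only the copying behavior differs.
  std::printf("%d %d\n", SumByValue(perm), SumByConstRef(perm));
  return 0;
}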
