Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP #40560

Merged (2 commits) on Mar 17, 2022
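
This diff moves paddle/phi/kernels/funcs/reduce_function.h off the PADDLE_WITH_XPU2 guard and onto the Kernel Primitive (KP) guard PADDLE_WITH_XPU_KP (plus defined(__xpu__) in the top-level include guard), has the reduce entry points take KPDevice instead of phi::GPUContext, replaces the branch-guarded output store with kps::details::WriteData, and hoists the KP-vs-GPU launch configuration into grid_num/block_num variables so each kernel keeps a single launch site. The sketch below illustrates that launch pattern; DummyReduceKernel and LaunchDummyReduce are hypothetical stand-ins, and only the #ifdef selection of grid_num/block_num and the fixed 8 x 64 KP configuration mirror the diff.

```cpp
// Sketch only: DummyReduceKernel and LaunchDummyReduce are hypothetical
// stand-ins; the grid_num/block_num selection is the pattern this PR applies.
template <typename T>
__global__ void DummyReduceKernel(const T* x, T* y, int n) {
  // Illustrative body: copy one element per block so the launch compiles.
  if (threadIdx.x == 0 && blockIdx.x < n) {
    y[blockIdx.x] = x[blockIdx.x];
  }
}

template <typename T>
void LaunchDummyReduce(const T* x, T* y, int n, const dim3& cfg_grid,
                       const dim3& cfg_block, KPStream stream) {
#ifdef PADDLE_WITH_XPU_KP
  // KP builds in this file use a fixed 8-cluster x 64-core configuration.
  auto grid_num = 8;
  auto block_num = 64;
#else
  // CUDA/HIP builds keep the configuration computed by ReduceConfig.
  auto grid_num = cfg_grid;
  auto block_num = cfg_block;
#endif
  // One launch statement now covers both backends.
  DummyReduceKernel<T><<<grid_num, block_num, 0, stream>>>(x, y, n);
}
```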
192 changes: 57 additions & 135 deletions paddle/phi/kernels/funcs/reduce_function.h
@@ -14,8 +14,8 @@

#pragma once

// CUDA and HIP use same api
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// CUDA, XPU and HIP use same api
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__)

#include <algorithm>
#include <cmath>
@@ -220,7 +220,7 @@ struct IndexCalculator {
phi::Array<int, kMaxRank> dims;
phi::Array<int, kMaxRank> strides;
phi::Array<int, kMaxRank> reduce_strides;
#ifndef PADDLE_WITH_XPU2
#ifndef PADDLE_WITH_XPU_KP
phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
#endif
};
@@ -231,81 +231,65 @@ struct ReduceIndexMapping {
HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
: dim(dims) {}

#ifdef PADDLE_WITH_XPU_KP
__device__ __forceinline__ int BlockIdX() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return (cluster_id() / dim.split_num_x % dim.split_num_y);
} else {
return cluster_id() % dim.split_num_x;
}
#else
return blockIdx.x;
#endif
}

__device__ __forceinline__ int BlockIdY() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return (cluster_id() % dim.split_num_x);
} else {
return (cluster_id() / dim.split_num_x % dim.split_num_y);
}
#else
return blockIdx.y;
#endif
}

__device__ __forceinline__ int BlockDimX() {
#ifdef PADDLE_WITH_XPU2
return dim.deal_size_x;
#else
return blockDim.x;
#endif
}
__device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; }

__device__ __forceinline__ int BlockDimY() {
#ifdef PADDLE_WITH_XPU2
return 1;
#else
return blockDim.y;
#endif
}
__device__ __forceinline__ int BlockDimY() { return 1; }

__device__ __forceinline__ int GridDimX() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.split_num_y;
} else {
return dim.split_num_x;
}
#else
return gridDim.x;
#endif
}

__device__ __forceinline__ int GridDimY() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.split_num_x;
} else {
return dim.split_num_y;
}
#else
return gridDim.y;
#endif
}

__device__ __forceinline__ int GetLoopSize() {
#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.deal_size_y;
} else {
return dim.deal_size_x;
}
}
#else
return 1;
__device__ __forceinline__ int BlockIdX() { return blockIdx.x; }

__device__ __forceinline__ int BlockIdY() { return blockIdx.y; }

__device__ __forceinline__ int BlockDimX() { return blockDim.x; }

__device__ __forceinline__ int BlockDimY() { return blockDim.y; }

__device__ __forceinline__ int GridDimX() { return gridDim.x; }

__device__ __forceinline__ int GridDimY() { return gridDim.y; }

__device__ int GetLoopSize() { return 1; }
#endif
}
};

// when reduce_type == kReduceLastDim this struct will be used
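
In the ReduceIndexMapping hunk above, the old code wrapped the body of every accessor in its own #ifdef PADDLE_WITH_XPU2 / #else / #endif, while the new code keeps one PADDLE_WITH_XPU_KP block with the KP accessors and one #else block with the plain CUDA/HIP accessors. A reduced sketch of that shape follows; it is an illustrative struct with two accessors and plain ints standing in for the kps::DimConfig members, not the real class.

```cpp
// Reduced, hypothetical sketch of the consolidated guard in the new code.
template <bool ReduceLastDim>
struct IndexMappingSketch {
  int split_num_x;  // stand-in for kps::DimConfig::split_num_x
  int deal_size_x;  // stand-in for kps::DimConfig::deal_size_x

#ifdef PADDLE_WITH_XPU_KP
  // KP path: indices derive from the software-managed dim config.
  __device__ __forceinline__ int BlockIdX() {
    return ReduceLastDim ? (cluster_id() / split_num_x)
                         : (cluster_id() % split_num_x);
  }
  __device__ __forceinline__ int BlockDimX() { return deal_size_x; }
#else
  // CUDA/HIP path: built-in launch indices.
  __device__ __forceinline__ int BlockIdX() { return blockIdx.x; }
  __device__ __forceinline__ int BlockDimX() { return blockDim.x; }
#endif
};
```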
@@ -341,7 +325,7 @@ struct ReduceConfig

// when should_reduce_again is true, we need malloc temp space for temp data
void SetOutputData(Ty* y_data,
const phi::GPUContext& dev_ctx,
const KPDevice& dev_ctx,
phi::DenseTensor* tmp) {
if (should_reduce_again) {
tmp->Resize(phi::make_ddim(
@@ -640,9 +624,7 @@ struct ReduceConfig {
int blocking_size;
bool should_reduce_again;
bool reduce_last_dim;

Ty* output_data;

dim3 block;
dim3 grid;
};
@@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x,

kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
&reduce_var, &reduce_var, reducer, reduce_last_dim);
if (need_store) {
y[store_offset + i] = static_cast<Ty>(reduce_var);
}

Ty result = static_cast<Ty>(reduce_var);
kps::details::WriteData<Ty>(
y + store_offset + i, &result, static_cast<int>(need_store));
}
}
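
The store at the end of the loop above changes from an if (need_store) branch to kps::details::WriteData, passing the need_store predicate as the element count, so the guarded write goes through the same kernel-primitive helper on both backends. A hypothetical helper with the same call shape (not the real kps::details::WriteData) is sketched below.

```cpp
// Hypothetical stand-in showing the call shape the diff relies on: copy
// `num` contiguous elements (here 0 or 1) from src to dst.
template <typename T>
__device__ __forceinline__ void WriteDataSketch(T* dst, const T* src, int num) {
  for (int i = 0; i < num; ++i) {
    dst[i] = src[i];
  }
}

// Call shape matching the new code:
//   Ty result = static_cast<Ty>(reduce_var);
//   WriteDataSketch<Ty>(y + store_offset + i, &result,
//                       static_cast<int>(need_store));
```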

@@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data,
dim.SetRem(config.reduce_num % config.block.x, 0, 0);

#ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
OneDimIndexCal><<<8, 64, 0, stream>>>(
x_data,
config.output_data,
reducer,
transform,
init,
config.reduce_num,
config.left_num,
config.reduce_last_dim,
reduce_index_calculator,
left_index_calculator,
dim);
auto grid_num = 8;
auto block_num = 64;
#else
auto grid_num = config.grid;
auto block_num = config.block;
#endif
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
OneDimIndexCal><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
@@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data,
reduce_index_calculator,
left_index_calculator,
dim);
#endif

} else {
int reduce_rank = config.reduce_strides.size();
@@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data,
dim.SetRem(config.reduce_num % config.block.x, 0, 0);

#ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
IndexCalculator><<<8, 64, 0, stream>>>(
x_data,
config.output_data,
reducer,
transform,
init,
config.reduce_num,
config.left_num,
config.reduce_last_dim,
reduce_index_calculator,
left_index_calculator,
dim);
auto grid_num = 8;
auto block_num = 64;
#else
auto grid_num = config.grid;
auto block_num = config.block;
#endif
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
IndexCalculator><<<config.grid, config.block, 0, stream>>>(
IndexCalculator><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
@@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data,
reduce_index_calculator,
left_index_calculator,
dim);
#endif
}

if (config.should_reduce_again) {
@@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp,
kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data,
y_data,
reducer,
kps::IdentityFunctor<Ty, MPType>(),
init,
config.grid.y,
config.left_num,
config.grid.y,
dim);
#else
grid = 8;
block = 64;
#endif
ReduceHigherDimKernel<
Ty,
Ty,
@@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data,
config.left_num,
config.grid.y,
dim);
#endif
}
}

@@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const phi::GPUContext& dev_ctx,
const KPDevice& dev_ctx,
KPStream stream) {
auto reducer = ReduceOp<Ty>();
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
@@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const phi::GPUContext& dev_ctx,
const KPDevice& dev_ctx,
KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
@@ -1087,12 +1030,16 @@ template <typename Tx,
typename Ty,
template <typename> class ReduceOp,
typename TransformOp>
void ReduceKernel(const phi::GPUContext& dev_ctx,
void ReduceKernel(const KPDevice& dev_ctx,
const phi::DenseTensor& x,
phi::DenseTensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims) {
#ifdef PADDLE_WITH_XPU_KP
auto stream = dev_ctx.x_context()->xpu_stream;
#else
auto stream = dev_ctx.stream();
#endif
dev_ctx.Alloc<Ty>(y);

auto x_dim = phi::vectorize<int>(x.dims());
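
In the hunk above, ReduceKernel now takes const KPDevice& instead of const phi::GPUContext&, and the stream is read from dev_ctx.x_context()->xpu_stream on KP builds versus dev_ctx.stream() on CUDA/HIP. A small sketch of that selection follows; GetKernelStream is a hypothetical wrapper, not a Paddle API, and it assumes KPDevice resolves to the XPU context on KP builds and the GPU context otherwise.

```cpp
// Hypothetical wrapper: pick the raw stream for whichever context type
// the KPDevice alias refers to in this build.
inline KPStream GetKernelStream(const KPDevice& dev_ctx) {
#ifdef PADDLE_WITH_XPU_KP
  // XPU KP: the stream lives on the underlying xpu::Context.
  return dev_ctx.x_context()->xpu_stream;
#else
  // CUDA/HIP: the GPU context exposes the stream directly.
  return dev_ctx.stream();
#endif
}
```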
@@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
0);

#ifdef PADDLE_WITH_XPU_KP
auto grid_num = 8;
auto block_num = 64;
#else
auto grid_num = config.grid;
auto block_num = config.block;
#endif
ReduceHigherDimKernel<Tx,
Ty,
MPType,
ReduceOp<MPType>,
TransformOp><<<8, 64, 0, stream>>>(
TransformOp><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
@@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
config.left_num,
config.blocking_size,
dim);
#else
ReduceHigherDimKernel<
Tx,
Ty,
MPType,
ReduceOp<MPType>,
TransformOp><<<config.grid, config.block, 0, stream>>>(
x_data,
config.output_data,
reducer,
transform,
reducer.initial(),
config.reduce_num,
config.left_num,
config.blocking_size,
dim);
#endif

if (config.should_reduce_again) {
dim3 block = dim3(config.block.x, 1, 1);
@@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
dim2.SetRem(config.left_num % config.block.x, 0, 0);

#ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp<MPType>,
kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data,
y_data,
reducer,
kps::IdentityFunctor<Ty, MPType>(config.grid.y),
reducer.initial(),
config.grid.y,
config.left_num,
config.grid.y,
dim2);
#else
grid = 8;
block = 64;
#endif
ReduceHigherDimKernel<
Ty,
Ty,
@@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
config.left_num,
config.grid.y,
dim2);
#endif
}
return;
}