H2D data transfer optimization for stack kernel #48899

Merged
164 changes: 113 additions & 51 deletions paddle/phi/kernels/gpu/stack_kernel.cu
@@ -18,30 +18,101 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/fast_divmod.h"

namespace phi {

template <typename T, typename IntType>
__global__ void StackCUDAKernel(T** input_ptrs,
IntType split_size,
IntType rows,
IntType cols,
template <typename IndexT>
struct DivmodWarpper {
public:
void SetDivden(IndexT dividen) { divmoder = phi::funcs::FastDivMod(dividen); }
__device__ inline phi::funcs::FastDivMod::DivModT div_mod(IndexT val) {
return divmoder.Divmod(val);
}

private:
phi::funcs::FastDivMod divmoder;
Contributor: I still feel this should be handled by specializing DivMod itself where it is defined, so that it supports fast division/modulo for uint32_t and ordinary division/modulo for the other integer types?

Contributor (Author): A specialization can be added; I will open a separate PR to finish that work.

Contributor: If DivMod supported all integer types, this wrapper layer would be unnecessary. Also, at least one of SetDivden and dividen must be misspelled, and FastDivMod calls the same quantity divisor — they all refer to the same thing, right? (A rough sketch of such a specialization follows the int64_t specialization below.)

};

template <>
struct DivmodWarpper<int64_t> {
public:
using DivModT = phi::AlignedVector<int64_t, 2>;

void SetDivden(int64_t dividen) { dividen_ = dividen; }
__device__ inline DivModT div_mod(int64_t val) {
DivModT data;
data[0] = val / dividen_;
data[1] = val - data[0] * dividen_;
return data;
}

private:
int64_t dividen_;
};
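
A minimal sketch of the specialization discussed in the thread above, assuming FastDivMod keeps the constructor and Divmod() interface used in this diff; the DivMod name and layout here are illustrative only and are not part of this PR:

template <typename IndexT>
struct DivMod {
  using DivModT = phi::AlignedVector<IndexT, 2>;
  explicit DivMod(IndexT divisor) : divisor_(divisor) {}
  __device__ inline DivModT Divmod(IndexT val) {
    DivModT result;
    result[0] = val / divisor_;              // ordinary integer division
    result[1] = val - result[0] * divisor_;  // remainder without a second '%'
    return result;
  }
  IndexT divisor_;
};

// uint32_t gets the fast path by delegating to the existing phi::funcs::FastDivMod.
template <>
struct DivMod<uint32_t> {
  using DivModT = phi::funcs::FastDivMod::DivModT;
  explicit DivMod(uint32_t divisor) : fast_(divisor) {}
  __device__ inline DivModT Divmod(uint32_t val) { return fast_.Divmod(val); }
  phi::funcs::FastDivMod fast_;
};

With such a template, the wrappers below could hold a DivMod<IndexT> directly and the per-type branching in this file would disappear.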

constexpr int kWarpperSize = 64;
template <typename T, typename IndexT>
struct PointerArray : public DivmodWarpper<IndexT> {
Contributor:
  • I don't think PointerArray and DivmodWarpper should be related by inheritance; rather, PointerArray should hold a DivmodWarpper member. The types the two define have no particular relationship to each other — the inheritance is only there for convenience. (A sketch follows after this struct.)
  • Move the definition of kWarpperSize inside the class.

public:
const T* data[kWarpperSize];
PointerArray(const std::vector<const DenseTensor*>& x,
int num,
int64_t dividen) {
this->SetDivden(dividen);
for (auto i = 0; i < num; ++i) {
data[i] = x[i]->data<T>();
}
}
};
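
A rough illustration of the layout suggested in the comment above (DivmodWarpper held as a member rather than a base, and the capacity constant defined inside the class), keeping the identifiers used in this diff; this is a sketch only, not what the PR merges:

template <typename T, typename IndexT>
struct PointerArray {
  static constexpr int kWarpperSize = 64;
  const T* data[kWarpperSize];

  PointerArray(const std::vector<const DenseTensor*>& x,
               int num,
               int64_t dividen) {
    divmoder_.SetDivden(dividen);
    for (int i = 0; i < num; ++i) {
      data[i] = x[i]->data<T>();
    }
  }

  // Forward div_mod so the kernel-side call sites stay unchanged.
  __device__ inline auto div_mod(IndexT val) { return divmoder_.div_mod(val); }

 private:
  DivmodWarpper<IndexT> divmoder_;
};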

template <typename Context, typename T, typename IndexT>
struct PointerToPointer : public DivmodWarpper<IndexT> {
Contributor: PointerArray could be used here as well; the two different forms are also generally reusable in other operators. (A sketch of one way to share them follows after this struct.)

public:
T** data;
PointerToPointer(const Context& ctx,
const std::vector<const DenseTensor*>& x,
int num,
int64_t dividen) {
this->SetDivden(dividen);
auto byte_len = num * sizeof(T*);
std::vector<const T*> x_datas(num);
for (int i = 0; i < num; ++i) {
x_datas[i] = x[i]->data<T>();
}
auto tmp_x_data = paddle::memory::Alloc(
ctx.GetPlace(),
byte_len,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
paddle::memory::Copy(ctx.GetPlace(),
tmp_x_data->ptr(),
phi::CPUPlace(),
reinterpret_cast<void*>(x_datas.data()),
x_datas.size() * sizeof(T*),
ctx.stream());
data = reinterpret_cast<T**>(tmp_x_data->ptr());
}
};
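
One possible shape for that reuse, sketched under the assumption that a caller passes a generic lambda to launch its kernel; the helper name and signature are hypothetical and not part of this PR:

template <typename Context, typename T, typename IndexT, typename Functor>
void DispatchPointerWarpper(const Context& ctx,
                            const std::vector<const DenseTensor*>& x,
                            int num,
                            int64_t dividen,
                            Functor launch) {
  if (num <= kWarpperSize) {
    // Few inputs: the pointer table travels inside the kernel argument itself.
    launch(PointerArray<T, IndexT>(x, num, dividen));
  } else {
    // Many inputs: fall back to an explicit H2D copy of the pointer table.
    launch(PointerToPointer<Context, T, IndexT>(ctx, x, num, dividen));
  }
}

A caller would invoke launch with whichever wrapper was built, for example via a generic lambda that forwards it to its own __global__ kernel.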

template <typename T, typename IndexT, typename WarpT>
__global__ void StackCUDAKernel(WarpT input_warpper,
IndexT split_size,
IndexT rows,
IndexT cols,
T* __restrict__ output) {
IntType grid_x = static_cast<IntType>(blockIdx.x) * blockDim.x + threadIdx.x;
IntType grid_x_stride = static_cast<IntType>(blockDim.x) * gridDim.x;
IntType grid_y_stride = static_cast<IntType>(blockDim.y) * gridDim.y;
IndexT grid_x = static_cast<IndexT>(blockIdx.x) * blockDim.x + threadIdx.x;
IndexT grid_x_stride = static_cast<IndexT>(blockDim.x) * gridDim.x;
IndexT grid_y_stride = static_cast<IndexT>(blockDim.y) * gridDim.y;

for (; grid_x < cols; grid_x += grid_x_stride) {
IntType grid_y =
static_cast<IntType>(blockIdx.y) * blockDim.y + threadIdx.y;
IndexT grid_y = static_cast<IndexT>(blockIdx.y) * blockDim.y + threadIdx.y;

IntType split = grid_x / split_size;
const T* input_ptr = input_ptrs[split];
IntType col_offset = grid_x % split_size;
auto divmod_rslt = input_warpper.div_mod(grid_x);
const T* input_ptr = input_warpper.data[divmod_rslt[0]];
#pragma unroll
for (; grid_y < rows; grid_y += grid_y_stride) {
output[grid_y * cols + grid_x] =
input_ptr[grid_y * split_size + col_offset];
input_ptr[grid_y * split_size + divmod_rslt[1]];
}
}
}
@@ -52,24 +52,8 @@ void StackKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
if (axis < 0) axis += (x[0]->dims().size() + 1);

int n = static_cast<int>(x.size());
T* y_data = dev_ctx.template Alloc<T>(out);
std::vector<const T*> x_datas(n);
for (int i = 0; i < n; i++) {
x_datas[i] = x[i]->data<T>();
}

auto tmp_x_data = paddle::memory::Alloc(
dev_ctx.GetPlace(),
x_datas.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_x_data->ptr(),
phi::CPUPlace(),
reinterpret_cast<void*>(x_datas.data()),
x_datas.size() * sizeof(T*),
dev_ctx.stream());

// Split x dim from axis to matrix
int64_t x_row = 1, x_col = 1;
@@ -78,33 +133,40 @@ void StackKernel(const Context& dev_ctx,
}
x_col = x[0]->numel() / x_row;
int64_t out_col = x_col * n;

auto config =
phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);

if (out->numel() < std::numeric_limits<int32_t>::max()) {
StackCUDAKernel<T, int32_t>
<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
static_cast<int32_t>(x_col),
static_cast<int32_t>(x_row),
static_cast<int32_t>(out_col),
y_data);
#define IMPL_STACK_CUDA_KERNEL(index_t, input_warpper) \
StackCUDAKernel<T, index_t, decltype(input_warpper)> \
<<<config.block_per_grid, \
config.thread_per_block, \
0, \
dev_ctx.stream()>>>(input_warpper, \
static_cast<index_t>(x_col), \
static_cast<index_t>(x_row), \
static_cast<index_t>(out_col), \
y_data);

bool use_int32 = out->numel() < std::numeric_limits<int32_t>::max();
if (n <= kWarpperSize) {
if (use_int32) {
PointerArray<T, int32_t> ptr_array(x, n, x_col);
IMPL_STACK_CUDA_KERNEL(int32_t, ptr_array);
} else {
PointerArray<T, int64_t> ptr_array(x, n, x_col);
IMPL_STACK_CUDA_KERNEL(int64_t, ptr_array);
}
} else {
StackCUDAKernel<T, int64_t>
<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
x_col,
x_row,
out_col,
y_data);
if (use_int32) {
PointerToPointer<Context, T, int32_t> ptr_array(dev_ctx, x, n, x_col);
IMPL_STACK_CUDA_KERNEL(int32_t, ptr_array);
} else {
PointerToPointer<Context, T, int64_t> ptr_array(dev_ctx, x, n, x_col);
IMPL_STACK_CUDA_KERNEL(int64_t, ptr_array);
}
}
#undef IMPL_STACK_CUDA_KERNEL
}

} // namespace phi

PD_REGISTER_KERNEL(stack,