H2D data transfer optimization for stack kernel #48899

Merged: JamesLim-sy merged 5 commits into PaddlePaddle:develop from JamesLim-sy:optimization_for_stack_kernel on Dec 11, 2022.
Commits (5), all by JamesLim-sy:

a5fd9ea  first commit.
945bf4b  Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
da6ba39  refine performance with fast_divmod
441bb1e  refine performance with fast_divmod
ff39bba  fix conflicts
Diff (GPU stack kernel):

```diff
@@ -18,30 +18,101 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/fast_divmod.h"

 namespace phi {

-template <typename T, typename IntType>
-__global__ void StackCUDAKernel(T** input_ptrs,
-                                IntType split_size,
-                                IntType rows,
-                                IntType cols,
+template <typename IndexT>
+struct DivmodWarpper {
+ public:
+  void SetDivden(IndexT dividen) { divmoder = phi::funcs::FastDivMod(dividen); }
+  __device__ inline phi::funcs::FastDivMod::DivModT div_mod(IndexT val) {
+    return divmoder.Divmod(val);
+  }
+
+ private:
+  phi::funcs::FastDivMod divmoder;
+};
+
+template <>
+struct DivmodWarpper<int64_t> {
+ public:
+  using DivModT = phi::AlignedVector<int64_t, 2>;
+
+  void SetDivden(int64_t dividen) { dividen_ = dividen; }
+  __device__ inline DivModT div_mod(int64_t val) {
+    DivModT data;
+    data[0] = val / dividen_;
+    data[1] = val - data[0] * dividen_;
+    return data;
+  }
+
+ private:
+  int64_t dividen_;
+};
```
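For context, here is a minimal standalone sketch (not Paddle code; all names are hypothetical) of why the int64_t specialization above falls back to plain division: FastDivMod-style division precomputes a 32-bit magic multiplier and shift, a trick that does not extend directly to 64-bit operands, so the wide-index path uses one true divide and derives the remainder with a multiply-subtract instead of a second division.

```cpp
// Hypothetical stand-in for the int64_t fallback path above.
#include <cassert>
#include <cstdint>

struct PlainDivMod {
  int64_t divisor_ = 1;
  void SetDivisor(int64_t d) { divisor_ = d; }
  // One true divide, then multiply-subtract for the remainder,
  // mirroring data[1] = val - data[0] * dividen_ in the diff.
  void DivMod(int64_t val, int64_t* q, int64_t* r) const {
    *q = val / divisor_;
    *r = val - *q * divisor_;
  }
};

int main() {
  PlainDivMod d;
  d.SetDivisor(7);
  int64_t q = 0, r = 0;
  d.DivMod(23, &q, &r);
  assert(q == 3 && r == 2);  // 23 = 3 * 7 + 2
  return 0;
}
```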
```diff
+constexpr int kWarpperSize = 64;
+template <typename T, typename IndexT>
+struct PointerArray : public DivmodWarpper<IndexT> {
+ public:
+  const T* data[kWarpperSize];
+  PointerArray(const std::vector<const DenseTensor*>& x,
+               int num,
+               int64_t dividen) {
+    this->SetDivden(dividen);
+    for (auto i = 0; i < num; ++i) {
+      data[i] = x[i]->data<T>();
+    }
+  }
+};
```
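PointerArray is the heart of the H2D optimization: for up to kWarpperSize (64) inputs, the input pointers ride inside the kernel-argument struct itself, which the CUDA runtime delivers with the launch, so no device allocation and no explicit host-to-device copy is needed. A simplified illustration of the pass-by-value idea (hypothetical names, not the PR's code):

```cuda
#include <cuda_runtime.h>

constexpr int kMaxInputs = 64;  // mirrors kWarpperSize

// Passed by value: the table travels in the kernel's parameter
// buffer with the launch, so no cudaMalloc/cudaMemcpy is needed.
struct PtrTable {
  const float* data[kMaxInputs];
};

__global__ void GatherFirstElement(PtrTable table, int n, float* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = table.data[i][0];  // read element 0 of input i
}
```

Kernel arguments live in a size-limited parameter space (4 KB on most architectures), which is presumably why the fast path caps the table at 64 pointers: 512 bytes on a 64-bit platform, leaving comfortable headroom.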
```diff
+template <typename Context, typename T, typename IndexT>
+struct PointerToPointer : public DivmodWarpper<IndexT> {
+ public:
+  T** data;
+  PointerToPointer(const Context& ctx,
+                   const std::vector<const DenseTensor*>& x,
+                   int num,
+                   int64_t dividen) {
+    this->SetDivden(dividen);
+    auto byte_len = num * sizeof(T*);
+    std::vector<const T*> x_datas(num);
+    for (int i = 0; i < num; ++i) {
+      x_datas[i] = x[i]->data<T>();
+    }
+    auto tmp_x_data = paddle::memory::Alloc(
+        ctx.GetPlace(),
+        byte_len,
+        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
+    paddle::memory::Copy(ctx.GetPlace(),
+                         tmp_x_data->ptr(),
+                         phi::CPUPlace(),
+                         reinterpret_cast<void*>(x_datas.data()),
+                         x_datas.size() * sizeof(T*),
+                         ctx.stream());
+    data = reinterpret_cast<T**>(tmp_x_data->ptr());
+  }
+};
```
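PointerToPointer keeps the original strategy for the case of more than 64 inputs: the pointer table is staged in device memory with one allocation and one asynchronous H2D copy per launch — exactly the per-call overhead the fast path avoids. In plain CUDA (a rough sketch, not Paddle's memory API; cudaMallocAsync assumes CUDA 11.2+):

```cuda
#include <cuda_runtime.h>
#include <vector>

void UploadPtrTable(const std::vector<float*>& host_ptrs,
                    float*** dev_table, cudaStream_t stream) {
  size_t bytes = host_ptrs.size() * sizeof(float*);
  // One device allocation and one async H2D copy per launch --
  // the overhead the PointerArray fast path avoids entirely.
  cudaMallocAsync(reinterpret_cast<void**>(dev_table), bytes, stream);
  cudaMemcpyAsync(*dev_table, host_ptrs.data(), bytes,
                  cudaMemcpyHostToDevice, stream);
}
```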
```diff
+template <typename T, typename IndexT, typename WarpT>
+__global__ void StackCUDAKernel(WarpT input_warpper,
+                                IndexT split_size,
+                                IndexT rows,
+                                IndexT cols,
                                 T* __restrict__ output) {
-  IntType grid_x = static_cast<IntType>(blockIdx.x) * blockDim.x + threadIdx.x;
-  IntType grid_x_stride = static_cast<IntType>(blockDim.x) * gridDim.x;
-  IntType grid_y_stride = static_cast<IntType>(blockDim.y) * gridDim.y;
+  IndexT grid_x = static_cast<IndexT>(blockIdx.x) * blockDim.x + threadIdx.x;
+  IndexT grid_x_stride = static_cast<IndexT>(blockDim.x) * gridDim.x;
+  IndexT grid_y_stride = static_cast<IndexT>(blockDim.y) * gridDim.y;

   for (; grid_x < cols; grid_x += grid_x_stride) {
-    IntType grid_y =
-        static_cast<IntType>(blockIdx.y) * blockDim.y + threadIdx.y;
+    IndexT grid_y = static_cast<IndexT>(blockIdx.y) * blockDim.y + threadIdx.y;

-    IntType split = grid_x / split_size;
-    const T* input_ptr = input_ptrs[split];
-    IntType col_offset = grid_x % split_size;
+    auto divmod_rslt = input_warpper.div_mod(grid_x);
+    const T* input_ptr = input_warpper.data[divmod_rslt[0]];
 #pragma unroll
     for (; grid_y < rows; grid_y += grid_y_stride) {
       output[grid_y * cols + grid_x] =
-          input_ptr[grid_y * split_size + col_offset];
+          input_ptr[grid_y * split_size + divmod_rslt[1]];
     }
   }
 }
```
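The kernel's only real change is replacing the separate `/` and `%` with one fused div_mod: the quotient selects the source tensor, the remainder is the column within it. A tiny host-side check of that mapping (illustration only):

```cpp
// Column grid_x of the stacked output maps to input
// grid_x / split_size, column grid_x % split_size of that input.
#include <cassert>

int main() {
  const int split_size = 4;                      // columns per input
  for (int grid_x = 0; grid_x < 12; ++grid_x) {  // 3 inputs stacked
    int input_idx  = grid_x / split_size;        // divmod_rslt[0]
    int col_offset = grid_x % split_size;        // divmod_rslt[1]
    assert(input_idx * split_size + col_offset == grid_x);
  }
  return 0;
}
```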
```diff
@@ -52,24 +123,8 @@ void StackKernel(const Context& dev_ctx,
                  int axis,
                  DenseTensor* out) {
   if (axis < 0) axis += (x[0]->dims().size() + 1);

   int n = static_cast<int>(x.size());
   T* y_data = dev_ctx.template Alloc<T>(out);
-  std::vector<const T*> x_datas(n);
-  for (int i = 0; i < n; i++) {
-    x_datas[i] = x[i]->data<T>();
-  }
-
-  auto tmp_x_data = paddle::memory::Alloc(
-      dev_ctx.GetPlace(),
-      x_datas.size() * sizeof(T*),
-      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-  paddle::memory::Copy(dev_ctx.GetPlace(),
-                       tmp_x_data->ptr(),
-                       phi::CPUPlace(),
-                       reinterpret_cast<void*>(x_datas.data()),
-                       x_datas.size() * sizeof(T*),
-                       dev_ctx.stream());

   // Split x dim from axis to matrix
   int64_t x_row = 1, x_col = 1;
```
```diff
@@ -78,33 +133,40 @@ void StackKernel(const Context& dev_ctx,
   }
   x_col = x[0]->numel() / x_row;
   int64_t out_col = x_col * n;

   auto config =
       phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);

-  if (out->numel() < std::numeric_limits<int32_t>::max()) {
-    StackCUDAKernel<T, int32_t>
-        <<<config.block_per_grid,
-           config.thread_per_block,
-           0,
-           dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
-                               static_cast<int32_t>(x_col),
-                               static_cast<int32_t>(x_row),
-                               static_cast<int32_t>(out_col),
-                               y_data);
+#define IMPL_STACK_CUDA_KERNEL(index_t, input_warpper)          \
+  StackCUDAKernel<T, index_t, decltype(input_warpper)>          \
+      <<<config.block_per_grid,                                 \
+         config.thread_per_block,                               \
+         0,                                                     \
+         dev_ctx.stream()>>>(input_warpper,                     \
+                             static_cast<index_t>(x_col),       \
+                             static_cast<index_t>(x_row),       \
+                             static_cast<index_t>(out_col),     \
+                             y_data);
+
+  bool use_int32 = out->numel() < std::numeric_limits<int32_t>::max();
+  if (n <= kWarpperSize) {
+    if (use_int32) {
+      PointerArray<T, int32_t> ptr_array(x, n, x_col);
+      IMPL_STACK_CUDA_KERNEL(int32_t, ptr_array);
+    } else {
+      PointerArray<T, int64_t> ptr_array(x, n, x_col);
+      IMPL_STACK_CUDA_KERNEL(int64_t, ptr_array);
+    }
   } else {
-    StackCUDAKernel<T, int64_t>
-        <<<config.block_per_grid,
-           config.thread_per_block,
-           0,
-           dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
-                               x_col,
-                               x_row,
-                               out_col,
-                               y_data);
+    if (use_int32) {
+      PointerToPointer<Context, T, int32_t> ptr_array(dev_ctx, x, n, x_col);
+      IMPL_STACK_CUDA_KERNEL(int32_t, ptr_array);
+    } else {
+      PointerToPointer<Context, T, int64_t> ptr_array(dev_ctx, x, n, x_col);
+      IMPL_STACK_CUDA_KERNEL(int64_t, ptr_array);
+    }
   }
+#undef IMPL_STACK_CUDA_KERNEL
 }

 }  // namespace phi

 PD_REGISTER_KERNEL(stack,
```
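The launch macro is instantiated four ways: {PointerArray, PointerToPointer} × {int32_t, int64_t}. The index-width half of that dispatch is a common GPU pattern: 32-bit indexing is chosen whenever the output's element count fits, since 32-bit integer arithmetic is cheaper in address computations. A generic sketch of the pattern (hypothetical names, not Paddle code):

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>

template <typename IndexT>
void LaunchImpl(int64_t numel) {
  // Stand-in for the templated kernel launch.
  std::printf("%lld elements, %zu-bit indexing\n",
              static_cast<long long>(numel), sizeof(IndexT) * 8);
}

void DispatchByIndexWidth(int64_t numel) {
  if (numel < std::numeric_limits<int32_t>::max()) {
    LaunchImpl<int32_t>(numel);  // fast path: indices fit in 32 bits
  } else {
    LaunchImpl<int64_t>(numel);  // large tensors need 64-bit indices
  }
}

int main() {
  DispatchByIndexWidth(1 << 20);           // 32-bit path
  DispatchByIndexWidth(5'000'000'000LL);   // 64-bit path
  return 0;
}
```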
Review discussion (translated from Chinese):

Comment: Shouldn't this be handled by specializing DivMod itself at its definition — fast divide/modulo for the uint32_t type, ordinary divide/modulo for the other integer types?

Reply (author): A specialization can be added; I will open a separate PR to complete that work.

Comment: If DivMod supported all the types, this wrapper layer would be unnecessary. Also, one of SetDivden and dividen is surely misspelled, and FastDivMod itself uses divisor — they all mean the same thing.