optimize performance of lookup_table_v2_op #39856

Merged (2 commits) on Feb 24, 2022
45 changes: 27 additions & 18 deletions paddle/fluid/operators/lookup_table_v2_op.cu
@@ -21,19 +21,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T, typename IdT, int BlockDimX, int BlockDimY, int GridDimX,
-          bool PaddingFlag>
+template <typename T, typename IdT, bool PaddingFlag>
 __global__ void LookupTableV2(T *output, const T *table, const IdT *ids,
                               const int64_t N, const int64_t K, const int64_t D,
                               const int64_t padding_idx) {
   int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  int idy = blockIdx.x + threadIdx.y * gridDim.x;
 
   while (idy < K) {
     auto id = static_cast<int64_t>(ids[idy]);
     T *out = output + idy * D;
     const T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
+    for (int i = idx; i < D; i += blockDim.x) {
       if (PaddingFlag) {
         if (id == padding_idx)
           out[i] = static_cast<T>(0);
@@ -43,25 +42,29 @@ __global__ void LookupTableV2(T *output, const T *table, const IdT *ids,
         out[i] = tab[i];
       }
     }
-    idy += BlockDimY * GridDimX;
+    idy += blockDim.y * gridDim.x;
   }
 }
 
-template <typename T, typename IdT, int BlockDimX, int BlockDimY, int GridDimX>
+template <typename T, typename IdT>
 __global__ void LookupTableV2Grad(T *table, const T *output, const IdT *ids,
                                   const int64_t N, const int64_t K,
                                   const int64_t D) {
   int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  int idy = blockIdx.x + threadIdx.y * gridDim.x;
 
   while (idy < K) {
     auto id = static_cast<int64_t>(ids[idy]);
     const T *out = output + idy * D;
     T *tab = table + id * D;
-    for (int i = idx; i < D; i += BlockDimX) {
+#ifdef PADDLE_WITH_CUDA
+    paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
+#else
+    for (int i = idx; i < D; i += blockDim.x) {
       paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
     }
-    idy += BlockDimY * GridDimX;
+#endif
+    idy += blockDim.y * gridDim.x;
   }
 }

@@ -81,19 +84,20 @@ struct LookupTableV2CUDAFunctor {
     size_t D = table_t->dims()[1];
     size_t K = ids_t_->numel();
 
+    const int gridx = 2 * context_.cuda_device_context().GetSMCount();
     dim3 threads(256, 4);
-    dim3 grids(80, 1);
+    dim3 grids(gridx, 1);
 
     const auto *table = table_t->template data<T>();
     const auto *ids = ids_t_->template data<IdT>();
     auto *output = output_t->template mutable_data<T>(context_.GetPlace());
     auto stream = context_.cuda_device_context().stream();
 
     if (padding_idx == -1) {
-      LookupTableV2<T, IdT, 256, 4, 80, false><<<grids, threads, 0, stream>>>(
+      LookupTableV2<T, IdT, false><<<grids, threads, 0, stream>>>(
           output, table, ids, N, K, D, padding_idx);
     } else {
-      LookupTableV2<T, IdT, 256, 4, 80, true><<<grids, threads, 0, stream>>>(
+      LookupTableV2<T, IdT, true><<<grids, threads, 0, stream>>>(
           output, table, ids, N, K, D, padding_idx);
     }
   }
@@ -193,17 +197,22 @@ struct LookupTableV2GradCUDAFunctor {
       int D = d_table_t->dims()[1];
       int K = ids_t_->numel();
 
-      dim3 threads(128, 8);
-      dim3 grids(8, 1);
       const T *d_output = d_output_t->template data<T>();
       const auto *ids = ids_t_->template data<IdT>();
       T *d_table = d_table_t->mutable_data<T>(context_.GetPlace());
 
-      auto t = framework::EigenVector<T>::Flatten(*d_table_t);
-      t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream()));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream()));
+#endif
 
-      LookupTableV2Grad<T, IdT, 128, 8,
-                        8><<<grids, threads, 0, dev_ctx.stream()>>>(
+      const int gridx = 2 * dev_ctx.GetSMCount();
+      dim3 threads(128, 8);
+      dim3 grids(gridx, 1);
+      LookupTableV2Grad<T, IdT><<<grids, threads, 0, dev_ctx.stream()>>>(
           d_table, d_output, ids, N, K, D);
     }
   }
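
Note on the launch changes above: the compile-time BlockDimX/BlockDimY/GridDimX template parameters are replaced by the built-in blockDim/gridDim values, and the grid is sized from the device's SM count (2 * GetSMCount()) instead of a hard-coded 80 (or 8) blocks. Below is a minimal, self-contained sketch of that pattern; the kernel name EmbeddingLookup and the host code are hypothetical illustrations, not the PR's code.

// Hypothetical sketch: grid-stride embedding lookup with run-time block/grid
// dimensions, launched with a grid sized from the device's SM count.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
__global__ void EmbeddingLookup(T *out, const T *table, const int64_t *ids,
                                int64_t K, int64_t D) {
  // Each (blockIdx.x, threadIdx.y) pair walks over rows with a stride of
  // blockDim.y * gridDim.x; threadIdx.x walks over the D columns of a row.
  int64_t row = blockIdx.x + threadIdx.y * gridDim.x;
  while (row < K) {
    const T *src = table + ids[row] * D;
    T *dst = out + row * D;
    for (int64_t col = threadIdx.x; col < D; col += blockDim.x) {
      dst[col] = src[col];
    }
    row += blockDim.y * gridDim.x;
  }
}

int main() {
  int sm_count = 0;
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0);
  dim3 threads(256, 4);
  dim3 grids(2 * sm_count, 1);  // about two blocks per SM, not a fixed 80
  printf("would launch %u x %u blocks of %u x %u threads\n", grids.x, grids.y,
         threads.x, threads.y);
  // With device buffers d_out, d_table, d_ids and sizes K, D this would be:
  // EmbeddingLookup<float><<<grids, threads>>>(d_out, d_table, d_ids, K, D);
  return 0;
}

The apparent intent is that roughly two blocks per SM keep every multiprocessor busy on any GPU, and dropping the compile-time dimensions lets one kernel instantiation cover every launch configuration.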
88 changes: 88 additions & 0 deletions paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -147,6 +147,94 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
}
}
#endif

// atomicAdd on a single __half is slow, while atomicAdd on a __half2 is fast,
// so for the fp16 type we pack the value into a __half2 before adding.
template <typename T, typename std::enable_if<std::is_same<
                          platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index,
                                              const size_t numel, T value) {
#if ((CUDA_VERSION < 10000) || \
     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
  CudaAtomicAdd(reinterpret_cast<platform::float16 *>(tensor) + index,
                static_cast<platform::float16>(value));
#else
  // Check whether the target address is aligned to __half2 (32 bits).
  __half *target_addr = reinterpret_cast<__half *>(tensor + index);
  bool aligned_half2 =
      (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);

  if (aligned_half2 && index < (numel - 1)) {
    __half2 value2;
    value2.x = *reinterpret_cast<__half *>(&value);
    value2.y = __int2half_rz(0);
    atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2);

  } else if (!aligned_half2 && index > 0) {
    __half2 value2;
    value2.x = __int2half_rz(0);
    value2.y = *reinterpret_cast<__half *>(&value);
    atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2);

  } else {
    atomicAdd(reinterpret_cast<__half *>(tensor) + index,
              *reinterpret_cast<__half *>(&value));
  }
#endif
}

template <typename T, typename std::enable_if<!std::is_same<
                          platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index,
                                              const size_t numel, T value) {
  CudaAtomicAdd(arr + index, value);
}

#ifdef PADDLE_WITH_CUDA
/*
 * One thread block performs an element-wise atomicAdd of a vector of
 * length len into out.
 * @in:  [x1, x2, x3, ...]
 * @out: [y1+x1, y2+x2, y3+x3, ...]
 */
template <typename T, typename std::enable_if<!std::is_same<
                          platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
  for (int i = tid; i < len; i += threads_per_block) {
    CudaAtomicAdd(&out[i], in[i]);
  }
}

// Pairs of elements are added as __half2; any odd tail element (or an
// unaligned out pointer) falls back to fastAtomicAdd.
template <typename T, typename std::enable_if<std::is_same<
                          platform::float16, T>::value>::type * = nullptr>
__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
  int i = 0;
  int loops = len / 2 * 2;

  bool aligned_half2 =
      (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);

  if (aligned_half2) {
    for (i = tid * 2; i < loops; i += threads_per_block * 2) {
      __half2 value2;
      T value_1 = in[i];
      T value_2 = in[i + 1];
      value2.x = *reinterpret_cast<__half *>(&value_1);
      value2.y = *reinterpret_cast<__half *>(&value_2);
      atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
    }
    for (; i < len; i += threads_per_block) {
      fastAtomicAdd(out, i, len, in[i]);
    }
  } else {
    for (int i = tid; i < len; i += threads_per_block) {
      fastAtomicAdd(out, i, len, in[i]);
    }
  }
}
#endif
#endif

CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
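
Note on the fp16 atomics above: the sketch below illustrates the __half2 packing idea. The kernel name AccumulateRow and the host code are hypothetical; it assumes a build for sm_70 or newer and 4-byte-aligned buffers (cudaMalloc guarantees this), whereas fastAtomicAdd/VectorizedAtomicAddPerBlock above also handle unaligned addresses and older CUDA versions or architectures.

// Hypothetical sketch: one block accumulates a length-len half-precision
// vector into dst, packing pairs into __half2 so the fast __half2 atomicAdd
// covers the even-length prefix.
#include <cuda_fp16.h>
#include <cuda_runtime.h>

__global__ void AccumulateRow(__half *dst, const __half *src, int len) {
  int even_len = len / 2 * 2;  // portion covered by paired __half2 adds
  for (int i = threadIdx.x * 2; i < even_len; i += blockDim.x * 2) {
    __half2 v = *reinterpret_cast<const __half2 *>(src + i);  // assumes alignment
    atomicAdd(reinterpret_cast<__half2 *>(dst + i), v);
  }
  // At most one odd tail element; a scalar __half atomicAdd handles it.
  if (threadIdx.x == 0 && even_len < len) {
    atomicAdd(dst + even_len, src[even_len]);
  }
}

int main() {
  const int len = 7;  // odd on purpose, to exercise the tail path
  __half *dst = nullptr, *src = nullptr;
  cudaMalloc(&dst, len * sizeof(__half));
  cudaMalloc(&src, len * sizeof(__half));
  cudaMemset(dst, 0, len * sizeof(__half));
  cudaMemset(src, 0, len * sizeof(__half));
  AccumulateRow<<<1, 32>>>(dst, src, len);
  cudaDeviceSynchronize();
  cudaFree(dst);
  cudaFree(src);
  return 0;
}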