Merge improve Ell kernel
This PR improves the Ell kernel.

Summary
- improve global memory access coalescing
- use atomic_add on shared memory to combine the partial results of the multiple threads assigned to the same row (a minimal sketch of this scheme follows below)
- use `max_thread_per_worker=32` for NVIDIA and AMD GPUs
- specialize the kernel for `num_thread_per_worker=1`

Related PR: #411
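
The core of the change is the "worker" decomposition: each row of the ELL matrix is assigned to a worker of `num_thread_per_worker` threads, consecutive `threadIdx.x` values map to consecutive rows (so loads of `val`/`col` are coalesced), and the partial sums of a worker meet in shared memory through an atomic add. The snippet below is a minimal, self-contained sketch of that scheme, not the Ginkgo kernel itself: the kernel name, the fixed `block_size`, the `float` value type, the single right-hand side, and the `col = -1` padding convention are illustrative assumptions; the real kernel in the diff additionally supports several workers per row (with a global atomic_add), strided `b`/`c`, and a `num_thread_per_worker = 1` specialization.

```cuda
// Minimal sketch (not the Ginkgo kernel): one "worker" of threads_per_worker
// threads accumulates one row; partial sums meet in shared memory via
// atomicAdd. Padding entries are marked with col = -1 here for simplicity.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int block_size = 128;  // total threads per block (x * y)

template <int threads_per_worker>
__global__ void ell_spmv_worker(int num_rows, int max_nnz, int stride,
                                const float *val, const int *col,
                                const float *b, float *c)
{
    // One shared slot per worker, i.e. per threadIdx.x, in this block.
    __shared__ float partial[block_size / threads_per_worker];
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIdx.y == 0) {
        partial[threadIdx.x] = 0.0f;
    }
    __syncthreads();
    if (row < num_rows) {
        float temp = 0.0f;
        // The threads of a worker stride over the stored entries of the row;
        // consecutive threadIdx.x values read consecutive addresses.
        for (int idx = threadIdx.y; idx < max_nnz; idx += threads_per_worker) {
            const int ind = row + idx * stride;  // column-major ELL layout
            const int col_idx = col[ind];
            if (col_idx >= 0) {
                temp += val[ind] * b[col_idx];
            }
        }
        atomicAdd(&partial[threadIdx.x], temp);
    }
    __syncthreads();
    if (row < num_rows && threadIdx.y == 0) {
        c[row] = partial[threadIdx.x];
    }
}

int main()
{
    // 4x4 example: A = 2*I stored in ELL with one entry per row.
    const int n = 4, max_nnz = 1, stride = n;
    float val[n] = {2, 2, 2, 2}, b[n] = {1, 2, 3, 4}, c[n];
    int col[n] = {0, 1, 2, 3};
    float *d_val, *d_b, *d_c;
    int *d_col;
    cudaMalloc(&d_val, sizeof(val));
    cudaMalloc(&d_col, sizeof(col));
    cudaMalloc(&d_b, sizeof(b));
    cudaMalloc(&d_c, sizeof(c));
    cudaMemcpy(d_val, val, sizeof(val), cudaMemcpyHostToDevice);
    cudaMemcpy(d_col, col, sizeof(col), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(b), cudaMemcpyHostToDevice);

    constexpr int threads_per_worker = 4;
    const dim3 block(block_size / threads_per_worker, threads_per_worker);
    const dim3 grid((n + block.x - 1) / block.x);
    ell_spmv_worker<threads_per_worker>
        <<<grid, block>>>(n, max_nnz, stride, d_val, d_col, d_b, d_c);
    cudaMemcpy(c, d_c, sizeof(c), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i) {
        printf("c[%d] = %g\n", i, c[i]);  // expected: 2, 4, 6, 8
    }
    cudaFree(d_val); cudaFree(d_col); cudaFree(d_b); cudaFree(d_c);
}
```

Compiled with `nvcc`, the example prints 2, 4, 6, 8 for the 2*I matrix.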
yhmtsai authored Dec 7, 2019
2 parents c53a9b2 + 4f48a76 commit b4262c9
Showing 3 changed files with 185 additions and 146 deletions.
125 changes: 72 additions & 53 deletions common/matrix/ell_kernels.hpp.inc
@@ -34,74 +34,94 @@ namespace kernel {
 namespace {
 
 
-template <int subwarp_size, bool atomic, typename ValueType, typename IndexType,
-          typename Closure>
-__device__ void spmv_kernel(const size_type num_rows, const int nwarps_per_row,
-                            const ValueType *__restrict__ val,
-                            const IndexType *__restrict__ col,
-                            const size_type stride,
-                            const size_type num_stored_elements_per_row,
-                            const ValueType *__restrict__ b,
-                            const size_type b_stride, ValueType *__restrict__ c,
-                            const size_type c_stride, Closure op)
+template <int num_thread_per_worker, bool atomic, typename ValueType,
+          typename IndexType, typename Closure>
+__device__ void spmv_kernel(
+    const size_type num_rows, const int num_worker_per_row,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride, Closure op)
 {
     const auto tidx =
-        static_cast<IndexType>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const IndexType x = tidx / subwarp_size / nwarps_per_row;
-    const auto warp_id = tidx / subwarp_size % nwarps_per_row;
-    const auto y_start = tidx % subwarp_size +
-                         num_stored_elements_per_row * warp_id / nwarps_per_row;
-    const auto y_end =
-        num_stored_elements_per_row * (warp_id + 1) / nwarps_per_row;
-    if (x < num_rows) {
-        const auto tile_block =
-            group::tiled_partition<subwarp_size>(group::this_thread_block());
-        ValueType temp = zero<ValueType>();
-        const auto column_id = blockIdx.y;
-        for (IndexType idx = y_start; idx < y_end; idx += subwarp_size) {
-            const auto ind = x + idx * stride;
-            const auto col_idx = col[ind];
-            if (col_idx < idx) {
-                break;
-            } else {
-                temp += val[ind] * b[col_idx * b_stride + column_id];
-            }
-        }
-        const auto answer = reduce(
-            tile_block, temp, [](ValueType x, ValueType y) { return x + y; });
-        if (tile_block.thread_rank() == 0) {
-            if (atomic) {
-                atomic_add(&(c[x * c_stride + column_id]),
-                           op(answer, c[x * c_stride + column_id]));
-            } else {
-                c[x * c_stride + column_id] =
-                    op(answer, c[x * c_stride + column_id]);
-            }
-        }
-    }
-}
+        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
+    const auto column_id = blockIdx.y;
+    if (num_thread_per_worker == 1) {
+        // Specialize the num_thread_per_worker = 1. It doesn't need the shared
+        // memory, __syncthreads, and atomic_add
+        if (tidx < num_rows) {
+            ValueType temp = zero<ValueType>();
+            for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+                const auto ind = tidx + idx * stride;
+                const auto col_idx = col[ind];
+                if (col_idx < idx) {
+                    break;
+                } else {
+                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                }
+            }
+            const auto c_ind = tidx * c_stride + column_id;
+            c[c_ind] = op(temp, c[c_ind]);
+        }
+    } else {
+        if (tidx < num_worker_per_row * num_rows) {
+            const auto idx_in_worker = threadIdx.y;
+            const auto x = tidx % num_rows;
+            const auto worker_id = tidx / num_rows;
+            const auto step_size = num_worker_per_row * num_thread_per_worker;
+            __shared__ UninitializedArray<ValueType, default_block_size /
+                                                         num_thread_per_worker>
+                storage;
+            if (idx_in_worker == 0) {
+                storage[threadIdx.x] = 0;
+            }
+            __syncthreads();
+            ValueType temp = zero<ValueType>();
+            for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker;
+                 idx < num_stored_elements_per_row; idx += step_size) {
+                const auto ind = x + idx * stride;
+                const auto col_idx = col[ind];
+                if (col_idx < idx) {
+                    break;
+                } else {
+                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                }
+            }
+            atomic_add(&storage[threadIdx.x], temp);
+            __syncthreads();
+            if (idx_in_worker == 0) {
+                const auto c_ind = x * c_stride + column_id;
+                if (atomic) {
+                    atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind]));
+                } else {
+                    c[c_ind] = op(storage[threadIdx.x], c[c_ind]);
+                }
+            }
+        }
+    }
+}
 
 
-template <int subwarp_size, bool atomic = false, typename ValueType,
+template <int num_thread_per_worker, bool atomic = false, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int nwarps_per_row,
+    const size_type num_rows, const int num_worker_per_row,
     const ValueType *__restrict__ val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
     const ValueType *__restrict__ b, const size_type b_stride,
     ValueType *__restrict__ c, const size_type c_stride)
 {
-    spmv_kernel<subwarp_size, atomic>(
-        num_rows, nwarps_per_row, val, col, stride, num_stored_elements_per_row,
-        b, b_stride, c, c_stride,
+    spmv_kernel<num_thread_per_worker, atomic>(
+        num_rows, num_worker_per_row, val, col, stride,
+        num_stored_elements_per_row, b, b_stride, c, c_stride,
        [](const ValueType &x, const ValueType &y) { return x; });
 }
 
 
-template <int subwarp_size, bool atomic = false, typename ValueType,
+template <int num_thread_per_worker, bool atomic = false, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int nwarps_per_row,
+    const size_type num_rows, const int num_worker_per_row,
     const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
     const IndexType *__restrict__ col, const size_type stride,
     const size_type num_stored_elements_per_row,
@@ -116,15 +136,15 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     // Thus, the cuda kernel only computes alpha * a * b when it uses atomic
     // operation.
     if (atomic) {
-        spmv_kernel<subwarp_size, atomic>(
-            num_rows, nwarps_per_row, val, col, stride,
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
             [&alpha_val](const ValueType &x, const ValueType &y) {
                 return alpha_val * x;
             });
     } else {
-        spmv_kernel<subwarp_size, atomic>(
-            num_rows, nwarps_per_row, val, col, stride,
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
             [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
                 return alpha_val * x + beta_val * y;
@@ -137,10 +157,9 @@ __global__ __launch_bounds__(default_block_size) void spmv(
 
 
 template <typename ValueType>
-__global__
-    __launch_bounds__(config::max_block_size) void initialize_zero_dense(
-        size_type num_rows, size_type num_cols, size_type stride,
-        ValueType *__restrict__ result)
+__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ result)
 {
     const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
     const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
108 changes: 60 additions & 48 deletions cuda/matrix/ell_kernels.cu
@@ -85,14 +85,20 @@ constexpr int num_threads_per_core = 4;
 constexpr double ratio = 1e-2;
 
 
+/**
+ * max_thread_per_worker is the max number of thread per worker. The
+ * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker>
+ */
+constexpr int max_thread_per_worker = 32;
+
+
 /**
  * A compile-time list of sub-warp sizes for which the spmv kernels should be
  * compiled.
  * 0 is a special case where it uses a sub-warp size of warp_size in
  * combination with atomic_adds.
  */
-using compiled_kernels =
-    syn::value_list<int, 0, 1, 2, 4, 8, 16, 32, config::warp_size>;
+using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 
 
 #include "common/matrix/ell_kernels.hpp.inc"
@@ -102,35 +108,39 @@ namespace {
 
 
 template <int info, typename ValueType, typename IndexType>
-void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
+void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                    const matrix::Ell<ValueType, IndexType> *a,
                    const matrix::Dense<ValueType> *b,
                    matrix::Dense<ValueType> *c,
                    const matrix::Dense<ValueType> *alpha = nullptr,
                    const matrix::Dense<ValueType> *beta = nullptr)
 {
     const auto nrows = a->get_size()[0];
-    constexpr int subwarp_size = (info == 0) ? config::warp_size : info;
+    constexpr int num_thread_per_worker =
+        (info == 0) ? max_thread_per_worker : info;
     constexpr bool atomic = (info == 0);
-    const dim3 block_size(default_block_size, 1, 1);
-    const dim3 grid_size(
-        ceildiv(nrows * subwarp_size * nwarps_per_row, block_size.x),
-        b->get_size()[1], 1);
+    const dim3 block_size(default_block_size / num_thread_per_worker,
+                          num_thread_per_worker, 1);
+    const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
+                         b->get_size()[1], 1);
     if (alpha == nullptr && beta == nullptr) {
-        kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
-            nrows, nwarps_per_row, as_cuda_type(a->get_const_values()),
-            a->get_const_col_idxs(), a->get_stride(),
-            a->get_num_stored_elements_per_row(),
-            as_cuda_type(b->get_const_values()), b->get_stride(),
-            as_cuda_type(c->get_values()), c->get_stride());
+        kernel::spmv<num_thread_per_worker, atomic>
+            <<<grid_size, block_size, 0, 0>>>(
+                nrows, num_worker_per_row, as_cuda_type(a->get_const_values()),
+                a->get_const_col_idxs(), a->get_stride(),
+                a->get_num_stored_elements_per_row(),
+                as_cuda_type(b->get_const_values()), b->get_stride(),
+                as_cuda_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
-        kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
-            nrows, nwarps_per_row, as_cuda_type(alpha->get_const_values()),
-            as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
-            as_cuda_type(b->get_const_values()), b->get_stride(),
-            as_cuda_type(beta->get_const_values()),
-            as_cuda_type(c->get_values()), c->get_stride());
+        kernel::spmv<num_thread_per_worker, atomic>
+            <<<grid_size, block_size, 0, 0>>>(
+                nrows, num_worker_per_row,
+                as_cuda_type(alpha->get_const_values()),
+                as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
+                a->get_stride(), a->get_num_stored_elements_per_row(),
+                as_cuda_type(b->get_const_values()), b->get_stride(),
+                as_cuda_type(beta->get_const_values()),
+                as_cuda_type(c->get_values()), c->get_stride());
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -140,42 +150,43 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv);
 
 
 template <typename ValueType, typename IndexType>
-std::array<int, 3> compute_subwarp_size_and_atomicity(
+std::array<int, 3> compute_thread_worker_and_atomicity(
     std::shared_ptr<const CudaExecutor> exec,
     const matrix::Ell<ValueType, IndexType> *a)
 {
-    int subwarp_size = 1;
+    int num_thread_per_worker = 1;
     int atomic = 0;
-    int nwarps_per_row = 1;
+    int num_worker_per_row = 1;
 
     const auto nrows = a->get_size()[0];
     const auto ell_ncols = a->get_num_stored_elements_per_row();
+    // TODO: num_threads_per_core should be tuned for AMD gpu
     const auto nwarps = exec->get_num_warps_per_sm() *
                         exec->get_num_multiprocessor() * num_threads_per_core;
 
     // Use multithreads to perform the reduction on each row when the matrix is
     // wide.
     // To make every thread have computation, so pick the value which is the
-    // power of 2 less than warp_size and is less than or equal to ell_ncols. If
-    // the subwarp_size is warp_size and allow more than one warps to work on
-    // the same row, use atomic add to handle the warps write the value into the
-    // same position. The #warps is decided according to the number of warps
-    // allowed on GPU.
+    // power of 2 less than max_thread_per_worker and is less than or equal to
+    // ell_ncols. If the num_thread_per_worker is max_thread_per_worker and
+    // allow more than one worker to work on the same row, use atomic add to
+    // handle the worker write the value into the same position. The #worker is
+    // decided according to the number of worker allowed on GPU.
     if (static_cast<double>(ell_ncols) / nrows > ratio) {
-        while (subwarp_size < config::warp_size &&
-               (subwarp_size << 1) <= ell_ncols) {
-            subwarp_size <<= 1;
+        while (num_thread_per_worker < max_thread_per_worker &&
+               (num_thread_per_worker << 1) <= ell_ncols) {
+            num_thread_per_worker <<= 1;
         }
-        if (subwarp_size == config::warp_size) {
-            nwarps_per_row =
-                std::min(ell_ncols / config::warp_size, nwarps / nrows);
-            nwarps_per_row = std::max(nwarps_per_row, 1);
+        if (num_thread_per_worker == max_thread_per_worker) {
+            num_worker_per_row =
+                std::min(ell_ncols / max_thread_per_worker, nwarps / nrows);
+            num_worker_per_row = std::max(num_worker_per_row, 1);
        }
-        if (nwarps_per_row > 1) {
+        if (num_worker_per_row > 1) {
             atomic = 1;
         }
     }
-    return {subwarp_size, atomic, nwarps_per_row};
+    return {num_thread_per_worker, atomic, num_worker_per_row};
 }
 
 
@@ -187,24 +198,25 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
           const matrix::Ell<ValueType, IndexType> *a,
           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
 {
-    const auto data = compute_subwarp_size_and_atomicity(exec, a);
-    const int subwarp_size = std::get<0>(data);
+    const auto data = compute_thread_worker_and_atomicity(exec, a);
+    const int num_thread_per_worker = std::get<0>(data);
     const int atomic = std::get<1>(data);
-    const int nwarps_per_row = std::get<2>(data);
+    const int num_worker_per_row = std::get<2>(data);
 
     /**
      * info is the parameter for selecting the cuda kernel.
     * for info == 0, it uses the kernel by warp_size threads with atomic
     * operation for other value, it uses the kernel without atomic_add
     */
-    const int info = (!atomic) * subwarp_size;
+    const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         zero_array(c->get_num_stored_elements(), c->get_values());
     }
     select_abstract_spmv(
         compiled_kernels(),
         [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), nwarps_per_row, a, b, c);
+        syn::value_list<int>(), syn::type_list<>(), num_worker_per_row, a, b,
+        c);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
@@ -218,24 +230,24 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
                    const matrix::Dense<ValueType> *beta,
                    matrix::Dense<ValueType> *c)
 {
-    const auto data = compute_subwarp_size_and_atomicity(exec, a);
-    const int subwarp_size = std::get<0>(data);
+    const auto data = compute_thread_worker_and_atomicity(exec, a);
+    const int num_thread_per_worker = std::get<0>(data);
     const int atomic = std::get<1>(data);
-    const int nwarps_per_row = std::get<2>(data);
+    const int num_worker_per_row = std::get<2>(data);
 
     /**
     * info is the parameter for selecting the cuda kernel.
     * for info == 0, it uses the kernel by warp_size threads with atomic
     * operation for other value, it uses the kernel without atomic_add
     */
-    const int info = (!atomic) * subwarp_size;
+    const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         dense::scale(exec, beta, c);
     }
     select_abstract_spmv(
         compiled_kernels(),
         [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), nwarps_per_row, a, b, c,
+        syn::value_list<int>(), syn::type_list<>(), num_worker_per_row, a, b, c,
         alpha, beta);
 }
 
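For reference, here is a standalone sketch of the selection heuristic in `compute_thread_worker_and_atomicity` and the launch shape built in `abstract_spmv`, run with made-up sizes. `default_block_size = 512`, the warp count, and the `Config`/`compute` names below are assumptions for illustration only, not Ginkgo code.

```cpp
// Illustration of the new heuristic: pick num_thread_per_worker as the largest
// power of two <= the stored row length (capped at 32), add extra workers per
// row only when the GPU has spare warps, and switch to atomics in that case.
#include <algorithm>
#include <cstdio>

constexpr int max_thread_per_worker = 32;
constexpr int default_block_size = 512;  // assumed block size for the example
constexpr double ratio = 1e-2;

struct Config {
    int num_thread_per_worker;
    int atomic;
    int num_worker_per_row;
};

// nwarps stands in for get_num_warps_per_sm() * get_num_multiprocessor() *
// num_threads_per_core in the real code.
Config compute(long nrows, long ell_ncols, long nwarps)
{
    Config cfg{1, 0, 1};
    if (static_cast<double>(ell_ncols) / nrows > ratio) {
        while (cfg.num_thread_per_worker < max_thread_per_worker &&
               (cfg.num_thread_per_worker << 1) <= ell_ncols) {
            cfg.num_thread_per_worker <<= 1;
        }
        if (cfg.num_thread_per_worker == max_thread_per_worker) {
            cfg.num_worker_per_row = static_cast<int>(std::max<long>(
                std::min(ell_ncols / max_thread_per_worker, nwarps / nrows),
                1));
        }
        cfg.atomic = cfg.num_worker_per_row > 1;  // workers collide on c
    }
    return cfg;
}

int main()
{
    // Example: 10,000 rows, 256 stored elements per row, 8,192 warps available.
    const auto cfg = compute(10000, 256, 8192);
    const int block_x = default_block_size / cfg.num_thread_per_worker;
    const int block_y = cfg.num_thread_per_worker;
    const long grid_x =
        (10000L * cfg.num_worker_per_row + block_x - 1) / block_x;
    std::printf("threads/worker=%d workers/row=%d atomic=%d block=(%d,%d) "
                "grid.x=%ld\n",
                cfg.num_thread_per_worker, cfg.num_worker_per_row, cfg.atomic,
                block_x, block_y, grid_x);
}
```

For this example the heuristic picks 32 threads per worker, one worker per row, no atomics, and a (16, 32) block with grid.x = 625.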