Merge improve Ell kernel
This PR improves the Ell kernel.

Summary
- improve global memory access coalescing
- use atomic_add on shared memory to combine the partial results of the multiple threads assigned to the same row (a minimal sketch of this scheme follows below)
- use `max_thread_per_worker=32` for NVIDIA and AMD GPUs
- specialize the kernel for `num_thread_per_worker=1`

Related PR: #411
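
The core of the change is the "worker" decomposition: each row of the ELL matrix is assigned to a worker of `num_thread_per_worker` threads, consecutive `threadIdx.x` values map to consecutive rows (so loads of `val`/`col` are coalesced), and the partial sums of a worker meet in shared memory through an atomic add. The snippet below is a minimal, self-contained sketch of that scheme, not the Ginkgo kernel itself: the kernel name, the fixed `block_size`, the `float` value type, the single right-hand side, and the `col = -1` padding convention are illustrative assumptions; the real kernel in the diff additionally supports several workers per row (with a global atomic_add), strided `b`/`c`, and a `num_thread_per_worker = 1` specialization.

```cuda
// Minimal sketch (not the Ginkgo kernel): one "worker" of threads_per_worker
// threads accumulates one row; partial sums meet in shared memory via
// atomicAdd. Padding entries are marked with col = -1 here for simplicity.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int block_size = 128;  // total threads per block (x * y)

template <int threads_per_worker>
__global__ void ell_spmv_worker(int num_rows, int max_nnz, int stride,
                                const float *val, const int *col,
                                const float *b, float *c)
{
    // One shared slot per worker, i.e. per threadIdx.x, in this block.
    __shared__ float partial[block_size / threads_per_worker];
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIdx.y == 0) {
        partial[threadIdx.x] = 0.0f;
    }
    __syncthreads();
    if (row < num_rows) {
        float temp = 0.0f;
        // The threads of a worker stride over the stored entries of the row;
        // consecutive threadIdx.x values read consecutive addresses.
        for (int idx = threadIdx.y; idx < max_nnz; idx += threads_per_worker) {
            const int ind = row + idx * stride;  // column-major ELL layout
            const int col_idx = col[ind];
            if (col_idx >= 0) {
                temp += val[ind] * b[col_idx];
            }
        }
        atomicAdd(&partial[threadIdx.x], temp);
    }
    __syncthreads();
    if (row < num_rows && threadIdx.y == 0) {
        c[row] = partial[threadIdx.x];
    }
}

int main()
{
    // 4x4 example: A = 2*I stored in ELL with one entry per row.
    const int n = 4, max_nnz = 1, stride = n;
    float val[n] = {2, 2, 2, 2}, b[n] = {1, 2, 3, 4}, c[n];
    int col[n] = {0, 1, 2, 3};
    float *d_val, *d_b, *d_c;
    int *d_col;
    cudaMalloc(&d_val, sizeof(val));
    cudaMalloc(&d_col, sizeof(col));
    cudaMalloc(&d_b, sizeof(b));
    cudaMalloc(&d_c, sizeof(c));
    cudaMemcpy(d_val, val, sizeof(val), cudaMemcpyHostToDevice);
    cudaMemcpy(d_col, col, sizeof(col), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(b), cudaMemcpyHostToDevice);

    constexpr int threads_per_worker = 4;
    const dim3 block(block_size / threads_per_worker, threads_per_worker);
    const dim3 grid((n + block.x - 1) / block.x);
    ell_spmv_worker<threads_per_worker>
        <<<grid, block>>>(n, max_nnz, stride, d_val, d_col, d_b, d_c);
    cudaMemcpy(c, d_c, sizeof(c), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i) {
        printf("c[%d] = %g\n", i, c[i]);  // expected: 2, 4, 6, 8
    }
    cudaFree(d_val); cudaFree(d_col); cudaFree(d_b); cudaFree(d_c);
}
```

Compiled with `nvcc`, the example prints 2, 4, 6, 8 for the 2*I matrix.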
yhmtsai authored Dec 7, 2019
2 parents c53a9b2 + 4f48a76 commit b4262c9
Showing 3 changed files with 185 additions and 146 deletions.
125 changes: 72 additions & 53 deletions common/matrix/ell_kernels.hpp.inc
@@ -34,74 +34,94 @@ namespace kernel {
 namespace {
 
 
-template <int subwarp_size, bool atomic, typename ValueType, typename IndexType,
-          typename Closure>
-__device__ void spmv_kernel(const size_type num_rows, const int nwarps_per_row,
-                            const ValueType *__restrict__ val,
-                            const IndexType *__restrict__ col,
-                            const size_type stride,
-                            const size_type num_stored_elements_per_row,
-                            const ValueType *__restrict__ b,
-                            const size_type b_stride, ValueType *__restrict__ c,
-                            const size_type c_stride, Closure op)
+template <int num_thread_per_worker, bool atomic, typename ValueType,
+          typename IndexType, typename Closure>
+__device__ void spmv_kernel(
+    const size_type num_rows, const int num_worker_per_row,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    const ValueType *__restrict__ b, const size_type b_stride,
+    ValueType *__restrict__ c, const size_type c_stride, Closure op)
 {
     const auto tidx =
-        static_cast<IndexType>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const IndexType x = tidx / subwarp_size / nwarps_per_row;
-    const auto warp_id = tidx / subwarp_size % nwarps_per_row;
-    const auto y_start = tidx % subwarp_size +
-                         num_stored_elements_per_row * warp_id / nwarps_per_row;
-    const auto y_end =
-        num_stored_elements_per_row * (warp_id + 1) / nwarps_per_row;
-    if (x < num_rows) {
-        const auto tile_block =
-            group::tiled_partition<subwarp_size>(group::this_thread_block());
-        ValueType temp = zero<ValueType>();
-        const auto column_id = blockIdx.y;
-        for (IndexType idx = y_start; idx < y_end; idx += subwarp_size) {
-            const auto ind = x + idx * stride;
-            const auto col_idx = col[ind];
-            if (col_idx < idx) {
-                break;
-            } else {
-                temp += val[ind] * b[col_idx * b_stride + column_id];
-            }
-        }
-        const auto answer = reduce(
-            tile_block, temp, [](ValueType x, ValueType y) { return x + y; });
-        if (tile_block.thread_rank() == 0) {
-            if (atomic) {
-                atomic_add(&(c[x * c_stride + column_id]),
-                           op(answer, c[x * c_stride + column_id]));
-            } else {
-                c[x * c_stride + column_id] =
-                    op(answer, c[x * c_stride + column_id]);
-            }
-        }
-    }
-}
+        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
+    const auto column_id = blockIdx.y;
+    if (num_thread_per_worker == 1) {
+        // Specialize the num_thread_per_worker = 1. It doesn't need the shared
+        // memory, __syncthreads, and atomic_add
+        if (tidx < num_rows) {
+            ValueType temp = zero<ValueType>();
+            for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+                const auto ind = tidx + idx * stride;
+                const auto col_idx = col[ind];
+                if (col_idx < idx) {
+                    break;
+                } else {
+                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                }
+            }
+            const auto c_ind = tidx * c_stride + column_id;
+            c[c_ind] = op(temp, c[c_ind]);
+        }
+    } else {
+        if (tidx < num_worker_per_row * num_rows) {
+            const auto idx_in_worker = threadIdx.y;
+            const auto x = tidx % num_rows;
+            const auto worker_id = tidx / num_rows;
+            const auto step_size = num_worker_per_row * num_thread_per_worker;
+            __shared__ UninitializedArray<ValueType, default_block_size /
+                                                         num_thread_per_worker>
+                storage;
+            if (idx_in_worker == 0) {
+                storage[threadIdx.x] = 0;
+            }
+            __syncthreads();
+            ValueType temp = zero<ValueType>();
+            for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker;
+                 idx < num_stored_elements_per_row; idx += step_size) {
+                const auto ind = x + idx * stride;
+                const auto col_idx = col[ind];
+                if (col_idx < idx) {
+                    break;
+                } else {
+                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                }
+            }
+            atomic_add(&storage[threadIdx.x], temp);
+            __syncthreads();
+            if (idx_in_worker == 0) {
+                const auto c_ind = x * c_stride + column_id;
+                if (atomic) {
+                    atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind]));
+                } else {
+                    c[c_ind] = op(storage[threadIdx.x], c[c_ind]);
+                }
+            }
+        }
+    }
+}
 
 
-template <int subwarp_size, bool atomic = false, typename ValueType,
+template <int num_thread_per_worker, bool atomic = false, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int nwarps_per_row,
+    const size_type num_rows, const int num_worker_per_row,
     const ValueType *__restrict__ val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
     const ValueType *__restrict__ b, const size_type b_stride,
     ValueType *__restrict__ c, const size_type c_stride)
 {
-    spmv_kernel<subwarp_size, atomic>(
-        num_rows, nwarps_per_row, val, col, stride, num_stored_elements_per_row,
-        b, b_stride, c, c_stride,
+    spmv_kernel<num_thread_per_worker, atomic>(
+        num_rows, num_worker_per_row, val, col, stride,
+        num_stored_elements_per_row, b, b_stride, c, c_stride,
        [](const ValueType &x, const ValueType &y) { return x; });
 }
 
 
-template <int subwarp_size, bool atomic = false, typename ValueType,
+template <int num_thread_per_worker, bool atomic = false, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int nwarps_per_row,
+    const size_type num_rows, const int num_worker_per_row,
     const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
     const IndexType *__restrict__ col, const size_type stride,
     const size_type num_stored_elements_per_row,
@@ -116,15 +136,15 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     // Thus, the cuda kernel only computes alpha * a * b when it uses atomic
     // operation.
     if (atomic) {
-        spmv_kernel<subwarp_size, atomic>(
-            num_rows, nwarps_per_row, val, col, stride,
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
             [&alpha_val](const ValueType &x, const ValueType &y) {
                 return alpha_val * x;
             });
     } else {
-        spmv_kernel<subwarp_size, atomic>(
-            num_rows, nwarps_per_row, val, col, stride,
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
             [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
                 return alpha_val * x + beta_val * y;
@@ -137,10 +157,9 @@ __global__ __launch_bounds__(default_block_size) void spmv(
 
 
 template <typename ValueType>
-__global__
-    __launch_bounds__(config::max_block_size) void initialize_zero_dense(
-        size_type num_rows, size_type num_cols, size_type stride,
-        ValueType *__restrict__ result)
+__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
+    size_type num_rows, size_type num_cols, size_type stride,
+    ValueType *__restrict__ result)
 {
     const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
     const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
108 changes: 60 additions & 48 deletions cuda/matrix/ell_kernels.cu
@@ -85,14 +85,20 @@ constexpr int num_threads_per_core = 4;
 constexpr double ratio = 1e-2;
 
 
+/**
+ * max_thread_per_worker is the max number of thread per worker. The
+ * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker>
+ */
+constexpr int max_thread_per_worker = 32;
+
+
 /**
  * A compile-time list of sub-warp sizes for which the spmv kernels should be
  * compiled.
  * 0 is a special case where it uses a sub-warp size of warp_size in
  * combination with atomic_adds.
  */
-using compiled_kernels =
-    syn::value_list<int, 0, 1, 2, 4, 8, 16, 32, config::warp_size>;
+using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 
 
 #include "common/matrix/ell_kernels.hpp.inc"
@@ -102,35 +108,39 @@ namespace {
 
 
 template <int info, typename ValueType, typename IndexType>
-void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
+void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                    const matrix::Ell<ValueType, IndexType> *a,
                    const matrix::Dense<ValueType> *b,
                    matrix::Dense<ValueType> *c,
                    const matrix::Dense<ValueType> *alpha = nullptr,
                    const matrix::Dense<ValueType> *beta = nullptr)
 {
     const auto nrows = a->get_size()[0];
-    constexpr int subwarp_size = (info == 0) ? config::warp_size : info;
+    constexpr int num_thread_per_worker =
+        (info == 0) ? max_thread_per_worker : info;
     constexpr bool atomic = (info == 0);
-    const dim3 block_size(default_block_size, 1, 1);
-    const dim3 grid_size(
-        ceildiv(nrows * subwarp_size * nwarps_per_row, block_size.x),
-        b->get_size()[1], 1);
+    const dim3 block_size(default_block_size / num_thread_per_worker,
+                          num_thread_per_worker, 1);
+    const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
+                         b->get_size()[1], 1);
     if (alpha == nullptr && beta == nullptr) {
-        kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
-            nrows, nwarps_per_row, as_cuda_type(a->get_const_values()),
-            a->get_const_col_idxs(), a->get_stride(),
-            a->get_num_stored_elements_per_row(),
-            as_cuda_type(b->get_const_values()), b->get_stride(),
-            as_cuda_type(c->get_values()), c->get_stride());
+        kernel::spmv<num_thread_per_worker, atomic>
+            <<<grid_size, block_size, 0, 0>>>(
+                nrows, num_worker_per_row, as_cuda_type(a->get_const_values()),
+                a->get_const_col_idxs(), a->get_stride(),
+                a->get_num_stored_elements_per_row(),
+                as_cuda_type(b->get_const_values()), b->get_stride(),
+                as_cuda_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
-        kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
-            nrows, nwarps_per_row, as_cuda_type(alpha->get_const_values()),
-            as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
-            as_cuda_type(b->get_const_values()), b->get_stride(),
-            as_cuda_type(beta->get_const_values()),
-            as_cuda_type(c->get_values()), c->get_stride());
+        kernel::spmv<num_thread_per_worker, atomic>
+            <<<grid_size, block_size, 0, 0>>>(
+                nrows, num_worker_per_row,
+                as_cuda_type(alpha->get_const_values()),
+                as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
+                a->get_stride(), a->get_num_stored_elements_per_row(),
+                as_cuda_type(b->get_const_values()), b->get_stride(),
+                as_cuda_type(beta->get_const_values()),
+                as_cuda_type(c->get_values()), c->get_stride());
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -140,42 +150,43 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv);
 
 
 template <typename ValueType, typename IndexType>
-std::array<int, 3> compute_subwarp_size_and_atomicity(
+std::array<int, 3> compute_thread_worker_and_atomicity(
     std::shared_ptr<const CudaExecutor> exec,
     const matrix::Ell<ValueType, IndexType> *a)
 {
-    int subwarp_size = 1;
+    int num_thread_per_worker = 1;
     int atomic = 0;
-    int nwarps_per_row = 1;
+    int num_worker_per_row = 1;
 
     const auto nrows = a->get_size()[0];
     const auto ell_ncols = a->get_num_stored_elements_per_row();
+    // TODO: num_threads_per_core should be tuned for AMD gpu
     const auto nwarps = exec->get_num_warps_per_sm() *
                         exec->get_num_multiprocessor() * num_threads_per_core;
 
     // Use multithreads to perform the reduction on each row when the matrix is
     // wide.
     // To make every thread have computation, so pick the value which is the
-    // power of 2 less than warp_size and is less than or equal to ell_ncols. If
-    // the subwarp_size is warp_size and allow more than one warps to work on
-    // the same row, use atomic add to handle the warps write the value into the
-    // same position. The #warps is decided according to the number of warps
-    // allowed on GPU.
+    // power of 2 less than max_thread_per_worker and is less than or equal to
+    // ell_ncols. If the num_thread_per_worker is max_thread_per_worker and
+    // allow more than one worker to work on the same row, use atomic add to
+    // handle the worker write the value into the same position. The #worker is
+    // decided according to the number of worker allowed on GPU.
     if (static_cast<double>(ell_ncols) / nrows > ratio) {
-        while (subwarp_size < config::warp_size &&
-               (subwarp_size << 1) <= ell_ncols) {
-            subwarp_size <<= 1;
+        while (num_thread_per_worker < max_thread_per_worker &&
+               (num_thread_per_worker << 1) <= ell_ncols) {
+            num_thread_per_worker <<= 1;
         }
-        if (subwarp_size == config::warp_size) {
-            nwarps_per_row =
-                std::min(ell_ncols / config::warp_size, nwarps / nrows);
-            nwarps_per_row = std::max(nwarps_per_row, 1);
+        if (num_thread_per_worker == max_thread_per_worker) {
+            num_worker_per_row =
+                std::min(ell_ncols / max_thread_per_worker, nwarps / nrows);
+            num_worker_per_row = std::max(num_worker_per_row, 1);
        }
-        if (nwarps_per_row > 1) {
+        if (num_worker_per_row > 1) {
             atomic = 1;
         }
     }
-    return {subwarp_size, atomic, nwarps_per_row};
+    return {num_thread_per_worker, atomic, num_worker_per_row};
 }
 
 
@@ -187,24 +198,25 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
           const matrix::Ell<ValueType, IndexType> *a,
           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
 {
-    const auto data = compute_subwarp_size_and_atomicity(exec, a);
-    const int subwarp_size = std::get<0>(data);
+    const auto data = compute_thread_worker_and_atomicity(exec, a);
+    const int num_thread_per_worker = std::get<0>(data);
     const int atomic = std::get<1>(data);
-    const int nwarps_per_row = std::get<2>(data);
+    const int num_worker_per_row = std::get<2>(data);
 
     /**
      * info is the parameter for selecting the cuda kernel.
     * for info == 0, it uses the kernel by warp_size threads with atomic
     * operation for other value, it uses the kernel without atomic_add
     */
-    const int info = (!atomic) * subwarp_size;
+    const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         zero_array(c->get_num_stored_elements(), c->get_values());
     }
     select_abstract_spmv(
         compiled_kernels(),
         [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), nwarps_per_row, a, b, c);
+        syn::value_list<int>(), syn::type_list<>(), num_worker_per_row, a, b,
+        c);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
@@ -218,24 +230,24 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
                    const matrix::Dense<ValueType> *beta,
                    matrix::Dense<ValueType> *c)
 {
-    const auto data = compute_subwarp_size_and_atomicity(exec, a);
-    const int subwarp_size = std::get<0>(data);
+    const auto data = compute_thread_worker_and_atomicity(exec, a);
+    const int num_thread_per_worker = std::get<0>(data);
     const int atomic = std::get<1>(data);
-    const int nwarps_per_row = std::get<2>(data);
+    const int num_worker_per_row = std::get<2>(data);
 
     /**
     * info is the parameter for selecting the cuda kernel.
     * for info == 0, it uses the kernel by warp_size threads with atomic
     * operation for other value, it uses the kernel without atomic_add
     */
-    const int info = (!atomic) * subwarp_size;
+    const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         dense::scale(exec, beta, c);
     }
     select_abstract_spmv(
         compiled_kernels(),
         [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), nwarps_per_row, a, b, c,
+        syn::value_list<int>(), syn::type_list<>(), num_worker_per_row, a, b, c,
         alpha, beta);
 }
 
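For reference, here is a standalone sketch of the selection heuristic in `compute_thread_worker_and_atomicity` and the launch shape built in `abstract_spmv`, run with made-up sizes. `default_block_size = 512`, the warp count, and the `Config`/`compute` names below are assumptions for illustration only, not Ginkgo code.

```cpp
// Illustration of the new heuristic: pick num_thread_per_worker as the largest
// power of two <= the stored row length (capped at 32), add extra workers per
// row only when the GPU has spare warps, and switch to atomics in that case.
#include <algorithm>
#include <cstdio>

constexpr int max_thread_per_worker = 32;
constexpr int default_block_size = 512;  // assumed block size for the example
constexpr double ratio = 1e-2;

struct Config {
    int num_thread_per_worker;
    int atomic;
    int num_worker_per_row;
};

// nwarps stands in for get_num_warps_per_sm() * get_num_multiprocessor() *
// num_threads_per_core in the real code.
Config compute(long nrows, long ell_ncols, long nwarps)
{
    Config cfg{1, 0, 1};
    if (static_cast<double>(ell_ncols) / nrows > ratio) {
        while (cfg.num_thread_per_worker < max_thread_per_worker &&
               (cfg.num_thread_per_worker << 1) <= ell_ncols) {
            cfg.num_thread_per_worker <<= 1;
        }
        if (cfg.num_thread_per_worker == max_thread_per_worker) {
            cfg.num_worker_per_row = static_cast<int>(std::max<long>(
                std::min(ell_ncols / max_thread_per_worker, nwarps / nrows),
                1));
        }
        cfg.atomic = cfg.num_worker_per_row > 1;  // workers collide on c
    }
    return cfg;
}

int main()
{
    // Example: 10,000 rows, 256 stored elements per row, 8,192 warps available.
    const auto cfg = compute(10000, 256, 8192);
    const int block_x = default_block_size / cfg.num_thread_per_worker;
    const int block_y = cfg.num_thread_per_worker;
    const long grid_x =
        (10000L * cfg.num_worker_per_row + block_x - 1) / block_x;
    std::printf("threads/worker=%d workers/row=%d atomic=%d block=(%d,%d) "
                "grid.x=%ld\n",
                cfg.num_thread_per_worker, cfg.num_worker_per_row, cfg.atomic,
                block_x, block_y, grid_x);
}
```

For this example the heuristic picks 32 threads per worker, one worker per row, no atomics, and a (16, 32) block with grid.x = 625.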