Commit 3f9321a

Merge fallback transpose kernel
Adds a custom fallback transpose implementation for Csr and Fbcsr, used for value and index types that the vendor sparse libraries do not support.

Related PR: #1123
upsj authored Sep 19, 2022 · 2 parents 693bc4f + 123f3c4 · commit 3f9321a
Showing 10 changed files with 485 additions and 39 deletions.
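For orientation before the diff: the fallback follows the classic COO-based transpose — expand the CSR row pointers into explicit row indices, swap the roles of row and column indices, sort the entries by their new coordinates, and rebuild the row pointers. A minimal host-side sketch of that idea (illustration only, not code from this commit; the names `Entry` and `transpose_csr_ref` are made up here):

#include <algorithm>
#include <tuple>
#include <vector>

struct Entry {
    int row;
    int col;
    double val;
};

void transpose_csr_ref(int num_rows, int num_cols,
                       const std::vector<int>& row_ptrs,
                       const std::vector<int>& col_idxs,
                       const std::vector<double>& vals,
                       std::vector<int>& out_row_ptrs,
                       std::vector<int>& out_col_idxs,
                       std::vector<double>& out_vals)
{
    // expand row pointers to explicit indices, swapping row and column
    // (this is what convert_ptrs_to_idxs does in the kernels below)
    std::vector<Entry> entries;
    for (int row = 0; row < num_rows; ++row) {
        for (int k = row_ptrs[row]; k < row_ptrs[row + 1]; ++k) {
            entries.push_back({col_idxs[k], row, vals[k]});
        }
    }
    // sort lexicographically by the transposed (row, col) coordinates
    std::sort(entries.begin(), entries.end(),
              [](const Entry& a, const Entry& b) {
                  return std::tie(a.row, a.col) < std::tie(b.row, b.col);
              });
    // rebuild row pointers by counting entries per row and prefix-summing
    // (convert_idxs_to_ptrs in the kernels below)
    out_row_ptrs.assign(num_cols + 1, 0);
    out_col_idxs.clear();
    out_vals.clear();
    for (const auto& e : entries) {
        ++out_row_ptrs[e.row + 1];
        out_col_idxs.push_back(e.col);
        out_vals.push_back(e.val);
    }
    for (int i = 0; i < num_cols; ++i) {
        out_row_ptrs[i + 1] += out_row_ptrs[i];
    }
}

The device kernels in this commit implement the same steps with Thrust on the GPU.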
4 changes: 2 additions & 2 deletions .gitlab/scripts.yml
@@ -94,7 +94,7 @@
- ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install
- |
(( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1
- - ctest -V --timeout 3000
+ - ctest -V --timeout 6000
- ninja test_install
- pushd test/test_install
- ninja install
@@ -145,7 +145,7 @@
- cd ${CI_JOB_NAME/test/build}
- |
(( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1
- - ctest -V --timeout 3000
+ - ctest -V --timeout 6000
- ninja test_install
- pushd test/test_install
- ninja install
32 changes: 32 additions & 0 deletions common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1098,3 +1098,35 @@ void build_lookup(std::shared_ptr<const DefaultExecutor> exec,
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);


template <typename ValueType, typename IndexType>
void fallback_transpose(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* input,
matrix::Csr<ValueType, IndexType>* output)
{
const auto in_num_rows = input->get_size()[0];
const auto out_num_rows = output->get_size()[0];
const auto nnz = output->get_num_stored_elements();
const auto in_row_ptrs = input->get_const_row_ptrs();
const auto in_col_idxs = input->get_const_col_idxs();
// workaround for CUDA 9.2 Thrust unconstrained constructor issues
const auto in_vals = reinterpret_cast<const device_member_type<ValueType>*>(
input->get_const_values());
const auto out_row_ptrs = output->get_row_ptrs();
const auto out_col_idxs = output->get_col_idxs();
const auto out_vals =
reinterpret_cast<device_member_type<ValueType>*>(output->get_values());
array<IndexType> out_row_idxs{exec, nnz};
components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows,
out_col_idxs);
exec->copy(nnz, in_vals, out_vals);
exec->copy(nnz, in_col_idxs, out_row_idxs.get_data());
auto loc_it = thrust::make_zip_iterator(
thrust::make_tuple(out_row_idxs.get_data(), out_col_idxs));
using tuple_type =
thrust::tuple<IndexType, IndexType, device_type<ValueType>>;
thrust::sort_by_key(thrust::device, loc_it, loc_it + nnz, out_vals);
components::convert_idxs_to_ptrs(exec, out_row_idxs.get_data(), nnz,
out_num_rows, out_row_ptrs);
}
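A note on the sort step above: the keys passed to thrust::sort_by_key come from a zip iterator over the (row, col) index arrays, and thrust::tuple comparison is lexicographic, so entries end up ordered by row and then by column — exactly CSR order — while the values are permuted along. A small self-contained illustration (hypothetical host-side code, not from this commit):

#include <thrust/execution_policy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
#include <cstdio>
#include <vector>

int main()
{
    // (row, col) coordinates of four entries, in arbitrary order
    std::vector<int> rows{2, 0, 1, 0};
    std::vector<int> cols{1, 2, 0, 0};
    std::vector<double> vals{1.0, 2.0, 3.0, 4.0};
    auto keys = thrust::make_zip_iterator(
        thrust::make_tuple(rows.data(), cols.data()));
    // tuple keys compare lexicographically: by row first, then by column
    thrust::sort_by_key(thrust::host, keys, keys + 4, vals.data());
    // prints (0,0) (0,2) (1,0) (2,1) with values 4 2 3 1
    for (int i = 0; i < 4; ++i) {
        std::printf("(%d, %d) -> %.1f\n", rows[i], cols[i], vals[i]);
    }
}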
65 changes: 65 additions & 0 deletions common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -244,3 +244,68 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);


namespace kernel {


template <typename ValueType, typename IndexType>
__global__ void __launch_bounds__(default_block_size)
permute_transpose(const ValueType* __restrict__ in,
ValueType* __restrict__ out, int bs, size_type nnzb,
const IndexType* perm)
{
const auto idx = thread::get_thread_id_flat();
const auto block = idx / (bs * bs);
const auto i = (idx % (bs * bs)) / bs;
const auto j = idx % bs;
if (block < nnzb) {
out[block * bs * bs + j * bs + i] =
in[perm[block] * bs * bs + i * bs + j];
}
}


} // namespace kernel


template <typename ValueType, typename IndexType>
void fallback_transpose(const std::shared_ptr<const DefaultExecutor> exec,
const matrix::Fbcsr<ValueType, IndexType>* const input,
matrix::Fbcsr<ValueType, IndexType>* const output)
{
const auto in_num_row_blocks = input->get_num_block_rows();
const auto out_num_row_blocks = output->get_num_block_rows();
const auto nnzb = output->get_num_stored_blocks();
const auto bs = input->get_block_size();
const auto in_row_ptrs = input->get_const_row_ptrs();
const auto in_col_idxs = input->get_const_col_idxs();
const auto in_vals = as_device_type(input->get_const_values());
const auto out_row_ptrs = output->get_row_ptrs();
const auto out_col_idxs = output->get_col_idxs();
const auto out_vals = as_device_type(output->get_values());
array<IndexType> out_row_idxs{exec, nnzb};
array<IndexType> permutation{exec, nnzb};
components::fill_seq_array(exec, permutation.get_data(), nnzb);
components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_row_blocks,
out_col_idxs);
exec->copy(nnzb, in_col_idxs, out_row_idxs.get_data());
auto zip_it = thrust::make_zip_iterator(thrust::make_tuple(
thrust::device_pointer_cast(out_row_idxs.get_data()),
thrust::device_pointer_cast(out_col_idxs),
thrust::device_pointer_cast(permutation.get_data())));
using tuple_type =
thrust::tuple<IndexType, IndexType, device_type<ValueType>>;
thrust::sort(thrust::device, zip_it, zip_it + nnzb,
[] __device__(const tuple_type& a, const tuple_type& b) {
return thrust::tie(thrust::get<0>(a), thrust::get<1>(a)) <
thrust::tie(thrust::get<0>(b), thrust::get<1>(b));
});
components::convert_idxs_to_ptrs(exec, out_row_idxs.get_data(), nnzb,
out_num_row_blocks, out_row_ptrs);
const auto grid_size = ceildiv(nnzb * bs * bs, default_block_size);
if (grid_size > 0) {
kernel::permute_transpose<<<grid_size, default_block_size>>>(
in_vals, out_vals, bs, nnzb, permutation.get_const_data());
}
}
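The Fbcsr fallback works in two phases: it first sorts the block coordinates together with a permutation array that records each block's original position, then kernel::permute_transpose gathers the bs×bs value blocks through that permutation while transposing each block. A host-side reference of the kernel's index mapping (hypothetical, for illustration only):

#include <cstddef>
#include <vector>

// Mirrors kernel::permute_transpose: output block `b` is the transpose of
// input block perm[b], i.e. entry (i, j) of the source lands at (j, i).
std::vector<double> permute_transpose_ref(const std::vector<double>& in,
                                          const std::vector<int>& perm,
                                          int bs)
{
    std::vector<double> out(in.size());
    for (std::size_t block = 0; block < perm.size(); ++block) {
        for (int i = 0; i < bs; ++i) {
            for (int j = 0; j < bs; ++j) {
                out[block * bs * bs + j * bs + i] =
                    in[perm[block] * bs * bs + i * bs + j];
            }
        }
    }
    return out;
}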
21 changes: 10 additions & 11 deletions cuda/matrix/csr_kernels.cu
@@ -952,7 +952,7 @@ void transpose(std::shared_ptr<const CudaExecutor> exec,
idxBase, alg, buffer);
#endif
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
}

@@ -967,11 +967,10 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
if (orig->get_size()[0] == 0) {
return;
}
+ const auto block_size = default_block_size;
+ const auto grid_size =
+     ceildiv(trans->get_num_stored_elements(), block_size);
if (cusparse::is_supported<ValueType, IndexType>::value) {
- const auto block_size = default_block_size;
- const auto grid_size =
-     ceildiv(trans->get_num_stored_elements(), block_size);
-
#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
@@ -1006,13 +1005,13 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
idxBase, alg, buffer);
#endif
- if (grid_size > 0) {
-     kernel::conjugate<<<grid_size, block_size, 0, 0>>>(
-         trans->get_num_stored_elements(),
-         as_cuda_type(trans->get_values()));
- }
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
+ if (grid_size > 0 && is_complex<ValueType>()) {
+     kernel::conjugate<<<grid_size, block_size, 0, 0>>>(
+         trans->get_num_stored_elements(),
+         as_cuda_type(trans->get_values()));
+ }
}
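Note the restructuring in conj_transpose: the grid size computation and the conjugate kernel launch now sit outside the cusparse::is_supported branch, so conjugation also runs after the new fallback path, and the added is_complex<ValueType>() guard skips the launch for real value types, where conjugation is a no-op. The HIP and Fbcsr variants below apply the same pattern.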

4 changes: 2 additions & 2 deletions cuda/matrix/fbcsr_kernels.cu
@@ -340,7 +340,7 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
[bs](int compiled_block_size) { return bs == compiled_block_size; },
syn::value_list<int>(), syn::type_list<>(), trans);
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
}

@@ -356,7 +356,7 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
const int grid_size =
ceildiv(trans->get_num_stored_elements(), default_block_size);
transpose(exec, orig, trans);
- if (grid_size > 0) {
+ if (grid_size > 0 && is_complex<ValueType>()) {
kernel::conjugate<<<grid_size, default_block_size>>>(
trans->get_num_stored_elements(),
as_cuda_type(trans->get_values()));
28 changes: 28 additions & 0 deletions cuda/test/matrix/csr_kernels.cpp
@@ -500,6 +500,20 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
}


TEST_F(Csr, Transpose64IsEquivalentToRef)
{
using Mtx64 = gko::matrix::Csr<double, gko::int64>;
auto mtx = gen_mtx<Mtx64>(123, 234, 0);
auto dmtx = gko::clone(cuda, mtx);

auto trans = gko::as<Mtx64>(mtx->transpose());
auto d_trans = gko::as<Mtx64>(dmtx->transpose());

GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
ASSERT_TRUE(d_trans->is_sorted_by_column_index());
}


TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
{
set_up_apply_complex_data(std::make_shared<ComplexMtx::automatical>(cuda));
@@ -512,6 +526,20 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
}


TEST_F(Csr, ConjugateTranspose64IsEquivalentToRef)
{
using Mtx64 = gko::matrix::Csr<double, gko::int64>;
auto mtx = gen_mtx<Mtx64>(123, 234, 0);
auto dmtx = gko::clone(cuda, mtx);

auto trans = gko::as<Mtx64>(mtx->conj_transpose());
auto d_trans = gko::as<Mtx64>(dmtx->conj_transpose());

GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
ASSERT_TRUE(d_trans->is_sorted_by_column_index());
}
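These new tests instantiate Csr<double, gko::int64>; 64-bit index types fail the cusparse::is_supported check in the kernels above, so the tests exercise the fallback path rather than the vendor library. The matching HIP tests below do the same for hipsparse::is_supported.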


TEST_F(Csr, ConvertToDenseIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::sparselib>());
22 changes: 10 additions & 12 deletions hip/matrix/csr_kernels.hip.cpp
@@ -755,7 +755,7 @@ void transpose(std::shared_ptr<const HipExecutor> exec,
orig->get_const_col_idxs(), trans->get_values(),
trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
}

@@ -770,11 +770,10 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
if (orig->get_size()[0] == 0) {
return;
}
+ const auto block_size = default_block_size;
+ const auto grid_size =
+     ceildiv(trans->get_num_stored_elements(), block_size);
if (hipsparse::is_supported<ValueType, IndexType>::value) {
- const auto block_size = default_block_size;
- const auto grid_size =
-     ceildiv(trans->get_num_stored_elements(), block_size);
-
hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;

@@ -784,14 +783,13 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
orig->get_const_values(), orig->get_const_row_ptrs(),
orig->get_const_col_idxs(), trans->get_values(),
trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);

- if (grid_size > 0) {
-     hipLaunchKernelGGL(kernel::conjugate, grid_size, block_size, 0, 0,
-         trans->get_num_stored_elements(),
-         as_hip_type(trans->get_values()));
- }
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
+ if (grid_size > 0 && is_complex<ValueType>()) {
+     hipLaunchKernelGGL(kernel::conjugate, grid_size, block_size, 0, 0,
+         trans->get_num_stored_elements(),
+         as_hip_type(trans->get_values()));
+ }
}

21 changes: 16 additions & 5 deletions hip/matrix/fbcsr_kernels.hip.cpp
@@ -288,10 +288,12 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
- void transpose(const std::shared_ptr<const HipExecutor> exec,
-     const matrix::Fbcsr<ValueType, IndexType>* const orig,
-     matrix::Fbcsr<ValueType, IndexType>* const trans)
-     GKO_NOT_IMPLEMENTED;
+ void transpose(const std::shared_ptr<const DefaultExecutor> exec,
+     const matrix::Fbcsr<ValueType, IndexType>* const input,
+     matrix::Fbcsr<ValueType, IndexType>* const output)
+ {
+     fallback_transpose(exec, input, output);
+ }

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
@@ -301,7 +303,16 @@ template <typename ValueType, typename IndexType>
void conj_transpose(std::shared_ptr<const HipExecutor> exec,
const matrix::Fbcsr<ValueType, IndexType>* orig,
matrix::Fbcsr<ValueType, IndexType>* trans)
- GKO_NOT_IMPLEMENTED;
+ {
+     const int grid_size =
+         ceildiv(trans->get_num_stored_elements(), default_block_size);
+     transpose(exec, orig, trans);
+     if (grid_size > 0 && is_complex<ValueType>()) {
+         kernel::conjugate<<<grid_size, default_block_size>>>(
+             trans->get_num_stored_elements(),
+             as_device_type(trans->get_values()));
+     }
+ }

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
28 changes: 28 additions & 0 deletions hip/test/matrix/csr_kernels.hip.cpp
@@ -510,6 +510,20 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
}


TEST_F(Csr, Transpose64IsEquivalentToRef)
{
using Mtx64 = gko::matrix::Csr<double, gko::int64>;
auto mtx = gen_mtx<Mtx64>(123, 234, 0);
auto dmtx = gko::clone(hip, mtx);

auto trans = gko::as<Mtx64>(mtx->transpose());
auto d_trans = gko::as<Mtx64>(dmtx->transpose());

GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
ASSERT_TRUE(d_trans->is_sorted_by_column_index());
}


TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
@@ -522,6 +536,20 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
}


TEST_F(Csr, ConjugateTranspose64IsEquivalentToRef)
{
using Mtx64 = gko::matrix::Csr<double, gko::int64>;
auto mtx = gen_mtx<Mtx64>(123, 234, 0);
auto dmtx = gko::clone(hip, mtx);

auto trans = gko::as<Mtx64>(mtx->conj_transpose());
auto d_trans = gko::as<Mtx64>(dmtx->conj_transpose());

GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
ASSERT_TRUE(d_trans->is_sorted_by_column_index());
}


TEST_F(Csr, ConvertToDenseIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::sparselib>());