Commit 3f9321a

Merge fallback transpose kernel
Adds a custom fallback transpose implementation for Csr and Fbcsr, used for value and index types that the vendor sparse libraries do not support.

Related PR: #1123
upsj authored Sep 19, 2022 · 2 parents 693bc4f + 123f3c4 · commit 3f9321a
Showing 10 changed files with 485 additions and 39 deletions.
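For orientation before the diff: the fallback follows the classic COO-based transpose — expand the CSR row pointers into explicit row indices, swap the roles of row and column indices, sort the entries by their new coordinates, and rebuild the row pointers. A minimal host-side sketch of that idea (illustration only, not code from this commit; the names `Entry` and `transpose_csr_ref` are made up here):

#include <algorithm>
#include <tuple>
#include <vector>

struct Entry {
    int row;
    int col;
    double val;
};

void transpose_csr_ref(int num_rows, int num_cols,
                       const std::vector<int>& row_ptrs,
                       const std::vector<int>& col_idxs,
                       const std::vector<double>& vals,
                       std::vector<int>& out_row_ptrs,
                       std::vector<int>& out_col_idxs,
                       std::vector<double>& out_vals)
{
    // expand row pointers to explicit indices, swapping row and column
    // (this is what convert_ptrs_to_idxs does in the kernels below)
    std::vector<Entry> entries;
    for (int row = 0; row < num_rows; ++row) {
        for (int k = row_ptrs[row]; k < row_ptrs[row + 1]; ++k) {
            entries.push_back({col_idxs[k], row, vals[k]});
        }
    }
    // sort lexicographically by the transposed (row, col) coordinates
    std::sort(entries.begin(), entries.end(),
              [](const Entry& a, const Entry& b) {
                  return std::tie(a.row, a.col) < std::tie(b.row, b.col);
              });
    // rebuild row pointers by counting entries per row and prefix-summing
    // (convert_idxs_to_ptrs in the kernels below)
    out_row_ptrs.assign(num_cols + 1, 0);
    out_col_idxs.clear();
    out_vals.clear();
    for (const auto& e : entries) {
        ++out_row_ptrs[e.row + 1];
        out_col_idxs.push_back(e.col);
        out_vals.push_back(e.val);
    }
    for (int i = 0; i < num_cols; ++i) {
        out_row_ptrs[i + 1] += out_row_ptrs[i];
    }
}

The device kernels in this commit implement the same steps with Thrust on the GPU.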
4 changes: 2 additions & 2 deletions .gitlab/scripts.yml
@@ -94,7 +94,7 @@
- ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install
- |
(( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1
- - ctest -V --timeout 3000
+ - ctest -V --timeout 6000
- ninja test_install
- pushd test/test_install
- ninja install
@@ -145,7 +145,7 @@
- cd ${CI_JOB_NAME/test/build}
- |
(( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1
- - ctest -V --timeout 3000
+ - ctest -V --timeout 6000
- ninja test_install
- pushd test/test_install
- ninja install
32 changes: 32 additions & 0 deletions common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1098,3 +1098,35 @@ void build_lookup(std::shared_ptr<const DefaultExecutor> exec,
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);


template <typename ValueType, typename IndexType>
void fallback_transpose(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* input,
matrix::Csr<ValueType, IndexType>* output)
{
const auto in_num_rows = input->get_size()[0];
const auto out_num_rows = output->get_size()[0];
const auto nnz = output->get_num_stored_elements();
const auto in_row_ptrs = input->get_const_row_ptrs();
const auto in_col_idxs = input->get_const_col_idxs();
// workaround for CUDA 9.2 Thrust unconstrained constructor issues
const auto in_vals = reinterpret_cast<const device_member_type<ValueType>*>(
input->get_const_values());
const auto out_row_ptrs = output->get_row_ptrs();
const auto out_col_idxs = output->get_col_idxs();
const auto out_vals =
reinterpret_cast<device_member_type<ValueType>*>(output->get_values());
array<IndexType> out_row_idxs{exec, nnz};
components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows,
out_col_idxs);
exec->copy(nnz, in_vals, out_vals);
exec->copy(nnz, in_col_idxs, out_row_idxs.get_data());
auto loc_it = thrust::make_zip_iterator(
thrust::make_tuple(out_row_idxs.get_data(), out_col_idxs));
using tuple_type =
thrust::tuple<IndexType, IndexType, device_type<ValueType>>;
thrust::sort_by_key(thrust::device, loc_it, loc_it + nnz, out_vals);
components::convert_idxs_to_ptrs(exec, out_row_idxs.get_data(), nnz,
out_num_rows, out_row_ptrs);
}
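A note on the sort step above: the keys passed to thrust::sort_by_key come from a zip iterator over the (row, col) index arrays, and thrust::tuple comparison is lexicographic, so entries end up ordered by row and then by column — exactly CSR order — while the values are permuted along. A small self-contained illustration (hypothetical host-side code, not from this commit):

#include <thrust/execution_policy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
#include <cstdio>
#include <vector>

int main()
{
    // (row, col) coordinates of four entries, in arbitrary order
    std::vector<int> rows{2, 0, 1, 0};
    std::vector<int> cols{1, 2, 0, 0};
    std::vector<double> vals{1.0, 2.0, 3.0, 4.0};
    auto keys = thrust::make_zip_iterator(
        thrust::make_tuple(rows.data(), cols.data()));
    // tuple keys compare lexicographically: by row first, then by column
    thrust::sort_by_key(thrust::host, keys, keys + 4, vals.data());
    // prints (0,0) (0,2) (1,0) (2,1) with values 4 2 3 1
    for (int i = 0; i < 4; ++i) {
        std::printf("(%d, %d) -> %.1f\n", rows[i], cols[i], vals[i]);
    }
}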
65 changes: 65 additions & 0 deletions common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -244,3 +244,68 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);


namespace kernel {


template <typename ValueType, typename IndexType>
__global__ void __launch_bounds__(default_block_size)
permute_transpose(const ValueType* __restrict__ in,
ValueType* __restrict__ out, int bs, size_type nnzb,
const IndexType* perm)
{
const auto idx = thread::get_thread_id_flat();
const auto block = idx / (bs * bs);
const auto i = (idx % (bs * bs)) / bs;
const auto j = idx % bs;
if (block < nnzb) {
out[block * bs * bs + j * bs + i] =
in[perm[block] * bs * bs + i * bs + j];
}
}


} // namespace kernel


template <typename ValueType, typename IndexType>
void fallback_transpose(const std::shared_ptr<const DefaultExecutor> exec,
const matrix::Fbcsr<ValueType, IndexType>* const input,
matrix::Fbcsr<ValueType, IndexType>* const output)
{
const auto in_num_row_blocks = input->get_num_block_rows();
const auto out_num_row_blocks = output->get_num_block_rows();
const auto nnzb = output->get_num_stored_blocks();
const auto bs = input->get_block_size();
const auto in_row_ptrs = input->get_const_row_ptrs();
const auto in_col_idxs = input->get_const_col_idxs();
const auto in_vals = as_device_type(input->get_const_values());
const auto out_row_ptrs = output->get_row_ptrs();
const auto out_col_idxs = output->get_col_idxs();
const auto out_vals = as_device_type(output->get_values());
array<IndexType> out_row_idxs{exec, nnzb};
array<IndexType> permutation{exec, nnzb};
components::fill_seq_array(exec, permutation.get_data(), nnzb);
components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_row_blocks,
out_col_idxs);
exec->copy(nnzb, in_col_idxs, out_row_idxs.get_data());
auto zip_it = thrust::make_zip_iterator(thrust::make_tuple(
thrust::device_pointer_cast(out_row_idxs.get_data()),
thrust::device_pointer_cast(out_col_idxs),
thrust::device_pointer_cast(permutation.get_data())));
using tuple_type =
thrust::tuple<IndexType, IndexType, device_type<ValueType>>;
thrust::sort(thrust::device, zip_it, zip_it + nnzb,
[] __device__(const tuple_type& a, const tuple_type& b) {
return thrust::tie(thrust::get<0>(a), thrust::get<1>(a)) <
thrust::tie(thrust::get<0>(b), thrust::get<1>(b));
});
components::convert_idxs_to_ptrs(exec, out_row_idxs.get_data(), nnzb,
out_num_row_blocks, out_row_ptrs);
const auto grid_size = ceildiv(nnzb * bs * bs, default_block_size);
if (grid_size > 0) {
kernel::permute_transpose<<<grid_size, default_block_size>>>(
in_vals, out_vals, bs, nnzb, permutation.get_const_data());
}
}
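The Fbcsr fallback works in two phases: it first sorts the block coordinates together with a permutation array that records each block's original position, then kernel::permute_transpose gathers the bs×bs value blocks through that permutation while transposing each block. A host-side reference of the kernel's index mapping (hypothetical, for illustration only):

#include <cstddef>
#include <vector>

// Mirrors kernel::permute_transpose: output block `b` is the transpose of
// input block perm[b], i.e. entry (i, j) of the source lands at (j, i).
std::vector<double> permute_transpose_ref(const std::vector<double>& in,
                                          const std::vector<int>& perm,
                                          int bs)
{
    std::vector<double> out(in.size());
    for (std::size_t block = 0; block < perm.size(); ++block) {
        for (int i = 0; i < bs; ++i) {
            for (int j = 0; j < bs; ++j) {
                out[block * bs * bs + j * bs + i] =
                    in[perm[block] * bs * bs + i * bs + j];
            }
        }
    }
    return out;
}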
21 changes: 10 additions & 11 deletions cuda/matrix/csr_kernels.cu
@@ -952,7 +952,7 @@ void transpose(std::shared_ptr<const CudaExecutor> exec,
idxBase, alg, buffer);
#endif
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
}

@@ -967,11 +967,10 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
if (orig->get_size()[0] == 0) {
return;
}
+ const auto block_size = default_block_size;
+ const auto grid_size =
+     ceildiv(trans->get_num_stored_elements(), block_size);
if (cusparse::is_supported<ValueType, IndexType>::value) {
- const auto block_size = default_block_size;
- const auto grid_size =
-     ceildiv(trans->get_num_stored_elements(), block_size);
-
#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
@@ -1006,13 +1005,13 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
idxBase, alg, buffer);
#endif
- if (grid_size > 0) {
-     kernel::conjugate<<<grid_size, block_size, 0, 0>>>(
-         trans->get_num_stored_elements(),
-         as_cuda_type(trans->get_values()));
- }
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
+ if (grid_size > 0 && is_complex<ValueType>()) {
+     kernel::conjugate<<<grid_size, block_size, 0, 0>>>(
+         trans->get_num_stored_elements(),
+         as_cuda_type(trans->get_values()));
+ }
}
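Note the restructuring in conj_transpose: the grid size computation and the conjugate kernel launch now sit outside the cusparse::is_supported branch, so conjugation also runs after the new fallback path, and the added is_complex<ValueType>() guard skips the launch for real value types, where conjugation is a no-op. The HIP and Fbcsr variants below apply the same pattern.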

4 changes: 2 additions & 2 deletions cuda/matrix/fbcsr_kernels.cu
@@ -340,7 +340,7 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
[bs](int compiled_block_size) { return bs == compiled_block_size; },
syn::value_list<int>(), syn::type_list<>(), trans);
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
}

@@ -356,7 +356,7 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
const int grid_size =
ceildiv(trans->get_num_stored_elements(), default_block_size);
transpose(exec, orig, trans);
- if (grid_size > 0) {
+ if (grid_size > 0 && is_complex<ValueType>()) {
kernel::conjugate<<<grid_size, default_block_size>>>(
trans->get_num_stored_elements(),
as_cuda_type(trans->get_values()));
28 changes: 28 additions & 0 deletions cuda/test/matrix/csr_kernels.cpp
@@ -500,6 +500,20 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
}


TEST_F(Csr, Transpose64IsEquivalentToRef)
{
using Mtx64 = gko::matrix::Csr<double, gko::int64>;
auto mtx = gen_mtx<Mtx64>(123, 234, 0);
auto dmtx = gko::clone(cuda, mtx);

auto trans = gko::as<Mtx64>(mtx->transpose());
auto d_trans = gko::as<Mtx64>(dmtx->transpose());

GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
ASSERT_TRUE(d_trans->is_sorted_by_column_index());
}


TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
{
set_up_apply_complex_data(std::make_shared<ComplexMtx::automatical>(cuda));
@@ -512,6 +526,20 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
}


TEST_F(Csr, ConjugateTranspose64IsEquivalentToRef)
{
using Mtx64 = gko::matrix::Csr<double, gko::int64>;
auto mtx = gen_mtx<Mtx64>(123, 234, 0);
auto dmtx = gko::clone(cuda, mtx);

auto trans = gko::as<Mtx64>(mtx->conj_transpose());
auto d_trans = gko::as<Mtx64>(dmtx->conj_transpose());

GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
ASSERT_TRUE(d_trans->is_sorted_by_column_index());
}
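These new tests instantiate Csr<double, gko::int64>; 64-bit index types fail the cusparse::is_supported check in the kernels above, so the tests exercise the fallback path rather than the vendor library. The matching HIP tests below do the same for hipsparse::is_supported.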


TEST_F(Csr, ConvertToDenseIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::sparselib>());
22 changes: 10 additions & 12 deletions hip/matrix/csr_kernels.hip.cpp
@@ -755,7 +755,7 @@ void transpose(std::shared_ptr<const HipExecutor> exec,
orig->get_const_col_idxs(), trans->get_values(),
trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
}

@@ -770,11 +770,10 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
if (orig->get_size()[0] == 0) {
return;
}
+ const auto block_size = default_block_size;
+ const auto grid_size =
+     ceildiv(trans->get_num_stored_elements(), block_size);
if (hipsparse::is_supported<ValueType, IndexType>::value) {
- const auto block_size = default_block_size;
- const auto grid_size =
-     ceildiv(trans->get_num_stored_elements(), block_size);
-
hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;

@@ -784,14 +783,13 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
orig->get_const_values(), orig->get_const_row_ptrs(),
orig->get_const_col_idxs(), trans->get_values(),
trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);

- if (grid_size > 0) {
-     hipLaunchKernelGGL(kernel::conjugate, grid_size, block_size, 0, 0,
-         trans->get_num_stored_elements(),
-         as_hip_type(trans->get_values()));
- }
} else {
- GKO_NOT_IMPLEMENTED;
+ fallback_transpose(exec, orig, trans);
}
+ if (grid_size > 0 && is_complex<ValueType>()) {
+     hipLaunchKernelGGL(kernel::conjugate, grid_size, block_size, 0, 0,
+         trans->get_num_stored_elements(),
+         as_hip_type(trans->get_values()));
+ }
}

21 changes: 16 additions & 5 deletions hip/matrix/fbcsr_kernels.hip.cpp
@@ -288,10 +288,12 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
- void transpose(const std::shared_ptr<const HipExecutor> exec,
-     const matrix::Fbcsr<ValueType, IndexType>* const orig,
-     matrix::Fbcsr<ValueType, IndexType>* const trans)
-     GKO_NOT_IMPLEMENTED;
+ void transpose(const std::shared_ptr<const DefaultExecutor> exec,
+     const matrix::Fbcsr<ValueType, IndexType>* const input,
+     matrix::Fbcsr<ValueType, IndexType>* const output)
+ {
+     fallback_transpose(exec, input, output);
+ }

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
@@ -301,7 +303,16 @@ template <typename ValueType, typename IndexType>
void conj_transpose(std::shared_ptr<const HipExecutor> exec,
const matrix::Fbcsr<ValueType, IndexType>* orig,
matrix::Fbcsr<ValueType, IndexType>* trans)
- GKO_NOT_IMPLEMENTED;
+ {
+     const int grid_size =
+         ceildiv(trans->get_num_stored_elements(), default_block_size);
+     transpose(exec, orig, trans);
+     if (grid_size > 0 && is_complex<ValueType>()) {
+         kernel::conjugate<<<grid_size, default_block_size>>>(
+             trans->get_num_stored_elements(),
+             as_device_type(trans->get_values()));
+     }
+ }

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
28 changes: 28 additions & 0 deletions hip/test/matrix/csr_kernels.hip.cpp
@@ -510,6 +510,20 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
}


TEST_F(Csr, Transpose64IsEquivalentToRef)
{
using Mtx64 = gko::matrix::Csr<double, gko::int64>;
auto mtx = gen_mtx<Mtx64>(123, 234, 0);
auto dmtx = gko::clone(hip, mtx);

auto trans = gko::as<Mtx64>(mtx->transpose());
auto d_trans = gko::as<Mtx64>(dmtx->transpose());

GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
ASSERT_TRUE(d_trans->is_sorted_by_column_index());
}


TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
@@ -522,6 +536,20 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
}


TEST_F(Csr, ConjugateTranspose64IsEquivalentToRef)
{
using Mtx64 = gko::matrix::Csr<double, gko::int64>;
auto mtx = gen_mtx<Mtx64>(123, 234, 0);
auto dmtx = gko::clone(hip, mtx);

auto trans = gko::as<Mtx64>(mtx->conj_transpose());
auto d_trans = gko::as<Mtx64>(dmtx->conj_transpose());

GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
ASSERT_TRUE(d_trans->is_sorted_by_column_index());
}


TEST_F(Csr, ConvertToDenseIsEquivalentToRef)
{
set_up_apply_data(std::make_shared<Mtx::sparselib>());