Skip to content

Commit

Permalink
Merge device_matrix_data and device-side matrix::read
Browse files Browse the repository at this point in the history
This adds a device_matrix_data type for device-side matrix initialization.

Related PR: #886
  • Loading branch information
upsj authored Nov 20, 2021
2 parents 4e35fd0 + 3c570da commit 2a34e0e
Show file tree
Hide file tree
Showing 247 changed files with 4,513 additions and 18,890 deletions.
13 changes: 13 additions & 0 deletions cmake/create_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,20 @@ function(ginkgo_create_common_test test_name)
# use float for DPC++ if necessary
if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)
target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1)
target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
endif()
ginkgo_set_test_target_properties(${test_name}_${exec} ${test_target_name})
endforeach()
endfunction(ginkgo_create_common_test)

function(ginkgo_create_common_and_reference_test test_name)
ginkgo_create_common_test(${test_name})
ginkgo_build_test_name(${test_name} test_target_name)
set(test_target_name ${test_target_name}_reference)
add_executable(${test_target_name} ${test_name}.cpp)
target_compile_features(${test_target_name} PUBLIC cxx_std_14)
target_compile_options(${test_target_name} PRIVATE ${GINKGO_COMPILER_FLAGS})
target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=ReferenceExecutor EXEC_NAMESPACE=reference)
target_link_libraries(${test_target_name} PRIVATE ${ARGN})
ginkgo_set_test_target_properties(${test_name}_reference ${test_target_name})
endfunction()
11 changes: 9 additions & 2 deletions common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
set(UNIFIED_SOURCES
base/index_set_kernels.cpp
components/precision_conversion.cpp
components/reduce_array.cpp
components/device_matrix_data_kernels.cpp
components/absolute_array_kernels.cpp
components/fill_array_kernels.cpp
components/precision_conversion_kernels.cpp
components/reduce_array_kernels.cpp
matrix/coo_kernels.cpp
matrix/csr_kernels.cpp
matrix/dense_kernels.cpp
matrix/ell_kernels.cpp
matrix/hybrid_kernels.cpp
matrix/sellp_kernels.cpp
matrix/sparsity_csr_kernels.cpp
matrix/diagonal_kernels.cpp
preconditioner/jacobi_kernels.cpp
solver/bicg_kernels.cpp
Expand Down
61 changes: 0 additions & 61 deletions common/cuda_hip/components/absolute_array.hpp.inc

This file was deleted.

59 changes: 0 additions & 59 deletions common/cuda_hip/components/fill_array.hpp.inc

This file was deleted.

3 changes: 2 additions & 1 deletion common/cuda_hip/matrix/csr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1070,7 +1070,8 @@ __global__
auto res_nnz = result_row_ptrs[res_row];
for (auto nnz = source_row_ptrs[src_row];
nnz < source_row_ptrs[src_row + 1]; ++nnz) {
const auto res_col = source_col_idxs[nnz] - col_offset;
const auto res_col =
source_col_idxs[nnz] - static_cast<IndexType>(col_offset);
if (res_col < num_cols && res_col >= 0) {
result_col_idxs[res_nnz] = res_col;
result_values[res_nnz] = source_values[nnz];
Expand Down
83 changes: 83 additions & 0 deletions common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,86 @@ __global__ __launch_bounds__(default_block_size) void transpose_blocks(


} // namespace kernel


template <typename ValueType, typename IndexType>
void fill_in_matrix_data(
std::shared_ptr<const DefaultExecutor> exec,
const Array<matrix_data_entry<ValueType, IndexType>>& data, int block_size,
Array<IndexType>& row_ptr_array, Array<IndexType>& col_idx_array,
Array<ValueType>& value_array)
{
using entry = matrix_data_entry<ValueType, IndexType>;
using device_entry = device_type<entry>;
const auto nnz = data.get_num_elems();
const auto bs = block_size;
auto row_ptrs = row_ptr_array.get_data();
auto num_rows = row_ptr_array.get_num_elems() - 1;
if (nnz == 0) {
components::fill_array(exec, row_ptrs, num_rows + 1, IndexType{});
col_idx_array.resize_and_reset(0);
value_array.resize_and_reset(0);
return;
}
auto block_ordered_data = data;
auto data_ptr = as_device_type(block_ordered_data.get_data());
auto data_it = thrust::device_pointer_cast(data_ptr);
thrust::stable_sort(thrust::device, data_it, data_it + nnz,
[bs] __device__(device_entry a, device_entry b) {
return thrust::make_pair(a.row / bs,
a.column / bs) <
thrust::make_pair(b.row / bs, b.column / bs);
});
// build block pattern
auto adj_predicate = [bs, data_ptr, nnz] __device__(size_type i) {
const auto a = i > 0 ? data_ptr[i - 1] : device_entry{-bs, -bs, {}};
const auto b = data_ptr[i];
return (a.row / bs != b.row / bs) || (a.column / bs != b.column / bs);
};
auto iota = thrust::make_counting_iterator(size_type{});
// count how many blocks we have by counting how often the block changes
auto num_blocks = static_cast<size_type>(
thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate));
// allocate storage
Array<IndexType> row_idx_array{exec, num_blocks};
Array<size_type> block_ptr_array{exec, num_blocks};
col_idx_array.resize_and_reset(num_blocks);
value_array.resize_and_reset(num_blocks * bs * bs);
auto row_idxs = row_idx_array.get_data();
auto col_idxs = col_idx_array.get_data();
auto values = as_device_type(value_array.get_data());
auto block_ptrs = block_ptr_array.get_data();
auto block_ptr_it = thrust::device_pointer_cast(block_ptrs);
// write (block_row, block_col, block_start_idx) tuples for each block
thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it,
adj_predicate);
auto block_output_it = thrust::make_zip_iterator(
thrust::make_tuple(thrust::device_pointer_cast(row_idxs),
thrust::device_pointer_cast(col_idxs)));
thrust::transform(thrust::device, block_ptr_it, block_ptr_it + num_blocks,
block_output_it, [bs, data_ptr] __device__(size_type i) {
return thrust::make_tuple(data_ptr[i].row / bs,
data_ptr[i].column / bs);
});
// build row pointers from row indices
components::build_row_ptrs_from_idxs(exec, row_idx_array, num_rows,
row_ptrs);
// fill in values
components::fill_array(exec, value_array.get_data(), num_blocks * bs * bs,
zero<ValueType>());
thrust::for_each_n(
thrust::device, iota, num_blocks,
[block_ptrs, nnz, num_blocks, bs, data_ptr,
values] __device__(size_type i) {
const auto block_begin = block_ptrs[i];
const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
for (auto nz = block_begin; nz < block_end; nz++) {
const auto entry = data_ptr[nz];
values[i * bs * bs + (entry.column % bs) * bs +
(entry.row % bs)] = fake_complex_unpack(entry.value);
}
});
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
60 changes: 60 additions & 0 deletions common/cuda_hip/matrix/hybrid_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,63 @@ __global__ __launch_bounds__(default_block_size) void add(


} // namespace kernel


template <typename ValueType, typename IndexType>
struct hybrid_tuple_unpack_functor {
using device_entry = device_type<matrix_data_entry<ValueType, IndexType>>;
device_entry __device__
operator()(thrust::tuple<size_type, device_entry> e) const
{
return thrust::get<1>(e);
}
};


template <typename ValueType, typename IndexType>
void split_matrix_data(
std::shared_ptr<const DefaultExecutor> exec,
const Array<matrix_data_entry<ValueType, IndexType>>& data,
const int64* row_ptrs, size_type ell_limit, size_type num_rows,
Array<matrix_data_entry<ValueType, IndexType>>& ell_data,
Array<matrix_data_entry<ValueType, IndexType>>& coo_data)
{
using device_entry = device_type<matrix_data_entry<ValueType, IndexType>>;
auto iota = thrust::make_counting_iterator(size_type{});
auto data_it =
thrust::device_pointer_cast(as_device_type(data.get_const_data()));
const auto nnz = data.get_num_elems();
auto enumerated_data_it =
thrust::make_zip_iterator(thrust::make_tuple(iota, data_it));
auto ell_predicate = [row_ptrs, ell_limit] __device__(
thrust::tuple<size_type, device_entry> e) {
const auto row_begin = row_ptrs[thrust::get<1>(e).row];
const auto local_nz = thrust::get<0>(e) - row_begin;
return local_nz < ell_limit;
};
auto coo_predicate = [row_ptrs, ell_limit] __device__(
thrust::tuple<size_type, device_entry> e) {
const auto row_begin = row_ptrs[thrust::get<1>(e).row];
const auto local_nz = thrust::get<0>(e) - row_begin;
return local_nz >= ell_limit;
};
const auto ell_nnz = static_cast<size_type>(
thrust::count_if(thrust::device, enumerated_data_it,
enumerated_data_it + nnz, ell_predicate));
const auto coo_nnz = nnz - ell_nnz;
ell_data.resize_and_reset(ell_nnz);
coo_data.resize_and_reset(coo_nnz);
auto ell_data_it = thrust::make_transform_output_iterator(
thrust::device_pointer_cast(as_device_type(ell_data.get_data())),
hybrid_tuple_unpack_functor<ValueType, IndexType>{});
auto coo_data_it = thrust::make_transform_output_iterator(
thrust::device_pointer_cast(as_device_type(coo_data.get_data())),
hybrid_tuple_unpack_functor<ValueType, IndexType>{});
thrust::copy_if(thrust::device, enumerated_data_it,
enumerated_data_it + nnz, ell_data_it, ell_predicate);
thrust::copy_if(thrust::device, enumerated_data_it,
enumerated_data_it + nnz, coo_data_it, coo_predicate);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL);
40 changes: 40 additions & 0 deletions common/unified/base/kernel_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,16 @@ device_type<T> as_device_type(T value)
}


template <typename T>
using unpack_member_type = typename detail::fake_complex_unpack_impl<T>::type;

template <typename T>
GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
{
return fake_complex_unpack(value);
}


} // namespace cuda
} // namespace kernels
} // namespace gko
Expand Down Expand Up @@ -92,6 +102,16 @@ device_type<T> as_device_type(T value)
}


template <typename T>
using unpack_member_type = typename detail::fake_complex_unpack_impl<T>::type;

template <typename T>
GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
{
return fake_complex_unpack(value);
}


} // namespace hip
} // namespace kernels
} // namespace gko
Expand All @@ -117,6 +137,16 @@ device_type<T> as_device_type(T value)
return value;
}


template <typename T>
using unpack_member_type = T;

template <typename T>
GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
{
return value;
}

} // namespace dpcpp
} // namespace kernels
} // namespace gko
Expand All @@ -143,6 +173,16 @@ device_type<T> as_device_type(T value)
}


template <typename T>
using unpack_member_type = T;

template <typename T>
GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
{
return value;
}


} // namespace omp
} // namespace kernels
} // namespace gko
Expand Down
Loading

0 comments on commit 2a34e0e

Please sign in to comment.