Merge device_matrix_data and device-side matrix::read

This adds a device_matrix_data type for device-side matrix initialization. Related PR: #886
ginkgo-project · Nov 20, 2021 · 2a34e0e · 2a34e0e
2 parents 4e35fd0 + 3c570da
commit 2a34e0e
Show file tree

Hide file tree

Showing 247 changed files with 4,513 additions and 18,890 deletions.
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
@@ -171,7 +171,20 @@ function(ginkgo_create_common_test test_name)
         # use float for DPC++ if necessary
         if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)
             target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1)
+            target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
         endif()
         ginkgo_set_test_target_properties(${test_name}_${exec} ${test_target_name})
     endforeach()
 endfunction(ginkgo_create_common_test)
+
+function(ginkgo_create_common_and_reference_test test_name)
+    ginkgo_create_common_test(${test_name})
+    ginkgo_build_test_name(${test_name} test_target_name)
+    set(test_target_name ${test_target_name}_reference)
+    add_executable(${test_target_name} ${test_name}.cpp)
+    target_compile_features(${test_target_name} PUBLIC cxx_std_14)
+    target_compile_options(${test_target_name} PRIVATE ${GINKGO_COMPILER_FLAGS})
+    target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=ReferenceExecutor EXEC_NAMESPACE=reference)
+    target_link_libraries(${test_target_name} PRIVATE ${ARGN})
+    ginkgo_set_test_target_properties(${test_name}_reference ${test_target_name})
+endfunction()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
@@ -1,10 +1,17 @@
 set(UNIFIED_SOURCES
     base/index_set_kernels.cpp
-    components/precision_conversion.cpp
-    components/reduce_array.cpp
+    components/device_matrix_data_kernels.cpp
+    components/absolute_array_kernels.cpp
+    components/fill_array_kernels.cpp
+    components/precision_conversion_kernels.cpp
+    components/reduce_array_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/csr_kernels.cpp
     matrix/dense_kernels.cpp
+    matrix/ell_kernels.cpp
+    matrix/hybrid_kernels.cpp
+    matrix/sellp_kernels.cpp
+    matrix/sparsity_csr_kernels.cpp
     matrix/diagonal_kernels.cpp
     preconditioner/jacobi_kernels.cpp
     solver/bicg_kernels.cpp

diff --git a/common/cuda_hip/components/absolute_array.hpp.inc b/common/cuda_hip/components/absolute_array.hpp.inc
diff --git a/common/cuda_hip/components/fill_array.hpp.inc b/common/cuda_hip/components/fill_array.hpp.inc
diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1070,7 +1070,8 @@ __global__
         auto res_nnz = result_row_ptrs[res_row];
         for (auto nnz = source_row_ptrs[src_row];
              nnz < source_row_ptrs[src_row + 1]; ++nnz) {
-            const auto res_col = source_col_idxs[nnz] - col_offset;
+            const auto res_col =
+                source_col_idxs[nnz] - static_cast<IndexType>(col_offset);
             if (res_col < num_cols && res_col >= 0) {
                 result_col_idxs[res_nnz] = res_col;
                 result_values[res_nnz] = source_values[nnz];

diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -73,3 +73,86 @@ __global__ __launch_bounds__(default_block_size) void transpose_blocks(
 
 
 }  // namespace kernel
+
+
+template <typename ValueType, typename IndexType>
+void fill_in_matrix_data(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const Array<matrix_data_entry<ValueType, IndexType>>& data, int block_size,
+    Array<IndexType>& row_ptr_array, Array<IndexType>& col_idx_array,
+    Array<ValueType>& value_array)
+{
+    using entry = matrix_data_entry<ValueType, IndexType>;
+    using device_entry = device_type<entry>;
+    const auto nnz = data.get_num_elems();
+    const auto bs = block_size;
+    auto row_ptrs = row_ptr_array.get_data();
+    auto num_rows = row_ptr_array.get_num_elems() - 1;
+    if (nnz == 0) {
+        components::fill_array(exec, row_ptrs, num_rows + 1, IndexType{});
+        col_idx_array.resize_and_reset(0);
+        value_array.resize_and_reset(0);
+        return;
+    }
+    auto block_ordered_data = data;
+    auto data_ptr = as_device_type(block_ordered_data.get_data());
+    auto data_it = thrust::device_pointer_cast(data_ptr);
+    thrust::stable_sort(thrust::device, data_it, data_it + nnz,
+                        [bs] __device__(device_entry a, device_entry b) {
+                            return thrust::make_pair(a.row / bs,
+                                                     a.column / bs) <
+                                   thrust::make_pair(b.row / bs, b.column / bs);
+                        });
+    // build block pattern
+    auto adj_predicate = [bs, data_ptr, nnz] __device__(size_type i) {
+        const auto a = i > 0 ? data_ptr[i - 1] : device_entry{-bs, -bs, {}};
+        const auto b = data_ptr[i];
+        return (a.row / bs != b.row / bs) || (a.column / bs != b.column / bs);
+    };
+    auto iota = thrust::make_counting_iterator(size_type{});
+    // count how many blocks we have by counting how often the block changes
+    auto num_blocks = static_cast<size_type>(
+        thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate));
+    // allocate storage
+    Array<IndexType> row_idx_array{exec, num_blocks};
+    Array<size_type> block_ptr_array{exec, num_blocks};
+    col_idx_array.resize_and_reset(num_blocks);
+    value_array.resize_and_reset(num_blocks * bs * bs);
+    auto row_idxs = row_idx_array.get_data();
+    auto col_idxs = col_idx_array.get_data();
+    auto values = as_device_type(value_array.get_data());
+    auto block_ptrs = block_ptr_array.get_data();
+    auto block_ptr_it = thrust::device_pointer_cast(block_ptrs);
+    // write (block_row, block_col, block_start_idx) tuples for each block
+    thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it,
+                    adj_predicate);
+    auto block_output_it = thrust::make_zip_iterator(
+        thrust::make_tuple(thrust::device_pointer_cast(row_idxs),
+                           thrust::device_pointer_cast(col_idxs)));
+    thrust::transform(thrust::device, block_ptr_it, block_ptr_it + num_blocks,
+                      block_output_it, [bs, data_ptr] __device__(size_type i) {
+                          return thrust::make_tuple(data_ptr[i].row / bs,
+                                                    data_ptr[i].column / bs);
+                      });
+    // build row pointers from row indices
+    components::build_row_ptrs_from_idxs(exec, row_idx_array, num_rows,
+                                         row_ptrs);
+    // fill in values
+    components::fill_array(exec, value_array.get_data(), num_blocks * bs * bs,
+                           zero<ValueType>());
+    thrust::for_each_n(
+        thrust::device, iota, num_blocks,
+        [block_ptrs, nnz, num_blocks, bs, data_ptr,
+         values] __device__(size_type i) {
+            const auto block_begin = block_ptrs[i];
+            const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
+            for (auto nz = block_begin; nz < block_end; nz++) {
+                const auto entry = data_ptr[nz];
+                values[i * bs * bs + (entry.column % bs) * bs +
+                       (entry.row % bs)] = fake_complex_unpack(entry.value);
+            }
+        });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
diff --git a/common/cuda_hip/matrix/hybrid_kernels.hpp.inc b/common/cuda_hip/matrix/hybrid_kernels.hpp.inc
@@ -140,3 +140,63 @@ __global__ __launch_bounds__(default_block_size) void add(
 
 
 }  // namespace kernel
+
+
+template <typename ValueType, typename IndexType>
+struct hybrid_tuple_unpack_functor {
+    using device_entry = device_type<matrix_data_entry<ValueType, IndexType>>;
+    device_entry __device__
+    operator()(thrust::tuple<size_type, device_entry> e) const
+    {
+        return thrust::get<1>(e);
+    }
+};
+
+
+template <typename ValueType, typename IndexType>
+void split_matrix_data(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const Array<matrix_data_entry<ValueType, IndexType>>& data,
+    const int64* row_ptrs, size_type ell_limit, size_type num_rows,
+    Array<matrix_data_entry<ValueType, IndexType>>& ell_data,
+    Array<matrix_data_entry<ValueType, IndexType>>& coo_data)
+{
+    using device_entry = device_type<matrix_data_entry<ValueType, IndexType>>;
+    auto iota = thrust::make_counting_iterator(size_type{});
+    auto data_it =
+        thrust::device_pointer_cast(as_device_type(data.get_const_data()));
+    const auto nnz = data.get_num_elems();
+    auto enumerated_data_it =
+        thrust::make_zip_iterator(thrust::make_tuple(iota, data_it));
+    auto ell_predicate = [row_ptrs, ell_limit] __device__(
+                             thrust::tuple<size_type, device_entry> e) {
+        const auto row_begin = row_ptrs[thrust::get<1>(e).row];
+        const auto local_nz = thrust::get<0>(e) - row_begin;
+        return local_nz < ell_limit;
+    };
+    auto coo_predicate = [row_ptrs, ell_limit] __device__(
+                             thrust::tuple<size_type, device_entry> e) {
+        const auto row_begin = row_ptrs[thrust::get<1>(e).row];
+        const auto local_nz = thrust::get<0>(e) - row_begin;
+        return local_nz >= ell_limit;
+    };
+    const auto ell_nnz = static_cast<size_type>(
+        thrust::count_if(thrust::device, enumerated_data_it,
+                         enumerated_data_it + nnz, ell_predicate));
+    const auto coo_nnz = nnz - ell_nnz;
+    ell_data.resize_and_reset(ell_nnz);
+    coo_data.resize_and_reset(coo_nnz);
+    auto ell_data_it = thrust::make_transform_output_iterator(
+        thrust::device_pointer_cast(as_device_type(ell_data.get_data())),
+        hybrid_tuple_unpack_functor<ValueType, IndexType>{});
+    auto coo_data_it = thrust::make_transform_output_iterator(
+        thrust::device_pointer_cast(as_device_type(coo_data.get_data())),
+        hybrid_tuple_unpack_functor<ValueType, IndexType>{});
+    thrust::copy_if(thrust::device, enumerated_data_it,
+                    enumerated_data_it + nnz, ell_data_it, ell_predicate);
+    thrust::copy_if(thrust::device, enumerated_data_it,
+                    enumerated_data_it + nnz, coo_data_it, coo_predicate);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL);
diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
@@ -65,6 +65,16 @@ device_type<T> as_device_type(T value)
 }
 
 
+template <typename T>
+using unpack_member_type = typename detail::fake_complex_unpack_impl<T>::type;
+
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
+{
+    return fake_complex_unpack(value);
+}
+
+
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
@@ -92,6 +102,16 @@ device_type<T> as_device_type(T value)
 }
 
 
+template <typename T>
+using unpack_member_type = typename detail::fake_complex_unpack_impl<T>::type;
+
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
+{
+    return fake_complex_unpack(value);
+}
+
+
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
@@ -117,6 +137,16 @@ device_type<T> as_device_type(T value)
     return value;
 }
 
+
+template <typename T>
+using unpack_member_type = T;
+
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
+{
+    return value;
+}
+
 }  // namespace dpcpp
 }  // namespace kernels
 }  // namespace gko
@@ -143,6 +173,16 @@ device_type<T> as_device_type(T value)
 }
 
 
+template <typename T>
+using unpack_member_type = T;
+
+template <typename T>
+GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
+{
+    return value;
+}
+
+
 }  // namespace omp
 }  // namespace kernels
 }  // namespace gko