Skip to content

Commit

Permalink
Merge Fix CSR->ELL conversion
Browse files Browse the repository at this point in the history
And accelerate benchmarks.

### Conversion details
Fix a bug in CSR `max_nnz_per_row` computation.

+ The same `grid_dim` was used for both the `calculate_nnz_per_row` kernel and
  the `reduce_max_nnz` reduction kernel.
+ That `grid_dim` was capped at `default_block_size` blocks, so at most
  `default_block_size^2` elements were processed.
+ For bigger matrices, the extracted `max_nnz_per_row` could be wrong, due to
  omitted values.

### Benchmark details
+ Move the creation of `matrix_from` to the outer loop.
+ Benchmark the conversion directly into `matrix_to` instead of into a clone.
  • Loading branch information
tcojean authored Jun 14, 2019
2 parents fce8dad + a4e46e5 commit 8bf33e0
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 15 deletions.
15 changes: 7 additions & 8 deletions benchmark/conversions/conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(

// This function supposes that management of `FLAGS_overwrite` is done before
// calling it
void convert_matrix(const gko::LinOp *matrix_from, const gko::LinOp *matrix_to,
void convert_matrix(const gko::LinOp *matrix_from, gko::LinOp *matrix_to,
const char *conversion_name,
std::shared_ptr<gko::Executor> exec,
rapidjson::Value &test_case,
Expand All @@ -89,22 +89,22 @@ void convert_matrix(const gko::LinOp *matrix_from, const gko::LinOp *matrix_to,
rapidjson::Value(rapidjson::kObjectType), allocator);
// warm run
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
auto to_clone = matrix_to->clone();
exec->synchronize();
to_clone->copy_from(matrix_from);
matrix_to->copy_from(matrix_from);
exec->synchronize();
matrix_to->clear();
}
std::chrono::nanoseconds time(0);
// timed run
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
auto to_clone = matrix_to->clone();
exec->synchronize();
auto tic = std::chrono::steady_clock::now();
to_clone->copy_from(matrix_from);
matrix_to->copy_from(matrix_from);
exec->synchronize();
auto toc = std::chrono::steady_clock::now();
time +=
std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
matrix_to->clear();
}
add_or_set_member(conversion_case[conversion_name], "time",
static_cast<double>(time.count()) / FLAGS_repetitions,
Expand Down Expand Up @@ -136,7 +136,6 @@ int main(int argc, char *argv[])
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
auto engine = get_engine();
auto formats = split(FLAGS_formats, ',');

rapidjson::IStreamWrapper jcin(std::cin);
Expand Down Expand Up @@ -166,6 +165,8 @@ int main(int argc, char *argv[])
std::clog << "Matrix is of size (" << data.size[0] << ", "
<< data.size[1] << ")" << std::endl;
for (const auto &format_from : formats) {
auto matrix_from =
share(matrix_factory.at(format_from)(exec, data));
for (const auto &format : matrix_factory) {
const auto format_to = std::get<0>(format);
if (format_from == format_to) {
Expand All @@ -179,8 +180,6 @@ int main(int argc, char *argv[])
continue;
}

auto matrix_from =
share(matrix_factory.at(format_from)(exec, data));
auto matrix_to = share(std::get<1>(format)(exec, data));
convert_matrix(matrix_from.get(), matrix_to.get(),
conversion_name.c_str(), exec, test_case,
Expand Down
1 change: 1 addition & 0 deletions benchmark/run_all_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ for (( i=${LOOP_START}; i < ${LOOP_END}; ++i )); do
run_spmv_benchmarks "${RESULT_FILE}"

if [ "${BENCHMARK}" == "conversions" ]; then
echo -e "${PREFIX}Running Conversion for ${GROUP}/${NAME}" 1>&2
run_conversion_benchmarks "${RESULT_FILE}"
fi

Expand Down
12 changes: 5 additions & 7 deletions cuda/matrix/csr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1363,27 +1363,25 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
{
const auto num_rows = source->get_size()[0];

const auto grid_dim =
(ceildiv(num_rows, default_block_size) < default_block_size)
? ceildiv(num_rows, default_block_size)
: default_block_size;

auto nnz_per_row = Array<size_type>(exec, num_rows);
auto block_results = Array<size_type>(exec, default_block_size);
auto d_result = Array<size_type>(exec, 1);

const auto grid_dim = ceildiv(num_rows, default_block_size);
kernel::calculate_nnz_per_row<<<grid_dim, default_block_size>>>(
num_rows, as_cuda_type(source->get_const_row_ptrs()),
as_cuda_type(nnz_per_row.get_data()));

kernel::reduce_max_nnz<<<grid_dim, default_block_size,
const auto n = ceildiv(num_rows, default_block_size);
const auto reduce_dim = n <= default_block_size ? n : default_block_size;
kernel::reduce_max_nnz<<<reduce_dim, default_block_size,
default_block_size * sizeof(size_type)>>>(
num_rows, as_cuda_type(nnz_per_row.get_const_data()),
as_cuda_type(block_results.get_data()));

kernel::reduce_max_nnz<<<1, default_block_size,
default_block_size * sizeof(size_type)>>>(
grid_dim, as_cuda_type(block_results.get_const_data()),
reduce_dim, as_cuda_type(block_results.get_const_data()),
as_cuda_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
Expand Down

0 comments on commit 8bf33e0

Please sign in to comment.