Remove column major specialization. #5755

Merged · 10 commits · Jun 5, 2020
Changes from 4 commits
6 changes: 4 additions & 2 deletions src/c_api/c_api.cc
@@ -463,7 +463,8 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, float *values,
CHECK_EQ(cache_id, 0) << "Cache ID is not supported yet";
auto *learner = static_cast<xgboost::Learner *>(handle);

auto x = xgboost::data::DenseAdapter(values, n_rows, n_cols);
auto x =
std::make_shared<xgboost::data::DenseAdapter>(values, n_rows, n_cols);
HostDeviceVector<float>* p_predt { nullptr };
std::string type { c_type };
learner->InplacePredict(x, type, missing, &p_predt);
@@ -494,7 +495,8 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle,
CHECK_EQ(cache_id, 0) << "Cache ID is not supported yet";
auto *learner = static_cast<xgboost::Learner *>(handle);

auto x = data::CSRAdapter(indptr, indices, data, nindptr - 1, nelem, num_col);
auto x = std::make_shared<data::CSRAdapter>(indptr, indices, data,
nindptr - 1, nelem, num_col);
HostDeviceVector<float>* p_predt { nullptr };
std::string type { c_type };
learner->InplacePredict(x, type, missing, &p_predt);
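
The dense and CSR adapters are now heap-allocated and handed to `Learner::InplacePredict` as `std::shared_ptr`, presumably so the type-erased argument stays cheap to copy and the predictor can share the same adapter instance. A minimal sketch of that hand-off, using `std::any` in place of `dmlc::any` and a placeholder adapter type:

```cpp
// Sketch only: std::any stands in for dmlc::any, ToyDenseAdapter for
// xgboost::data::DenseAdapter; the real InplacePredict plumbing is omitted.
#include <any>
#include <cstddef>
#include <cstdio>
#include <memory>

struct ToyDenseAdapter {
  const float* values;
  std::size_t num_rows;
  std::size_t num_cols;
};

int main() {
  float data[] = {1.f, 2.f, 3.f, 4.f};

  // C API side: build the adapter on the heap and erase its type.
  auto x = std::make_shared<ToyDenseAdapter>(ToyDenseAdapter{data, 2, 2});
  std::any erased = x;  // copies only the shared_ptr, not the adapter

  // Predictor side: recover exactly the shared_ptr type that was stored.
  auto m = std::any_cast<std::shared_ptr<ToyDenseAdapter>>(erased);
  std::printf("%zu x %zu\n", m->num_rows, m->num_cols);
  return 0;
}
```
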
4 changes: 2 additions & 2 deletions src/c_api/c_api.cu
@@ -69,7 +69,7 @@ XGB_DLL int XGBoosterPredictFromArrayInterfaceColumns(BoosterHandle handle,
auto *learner = static_cast<Learner*>(handle);

std::string json_str{c_json_strs};
auto x = data::CudfAdapter(json_str);
auto x = std::make_shared<data::CudfAdapter>(json_str);
HostDeviceVector<float>* p_predt { nullptr };
std::string type { c_type };
learner->InplacePredict(x, type, missing, &p_predt);
@@ -97,7 +97,7 @@ XGB_DLL int XGBoosterPredictFromArrayInterface(BoosterHandle handle,
auto *learner = static_cast<Learner*>(handle);

std::string json_str{c_json_strs};
auto x = data::CupyAdapter(json_str);
auto x = std::make_shared<data::CupyAdapter>(json_str);
HostDeviceVector<float>* p_predt { nullptr };
std::string type { c_type };
learner->InplacePredict(x, type, missing, &p_predt);
2 changes: 2 additions & 0 deletions src/data/adapter.h
@@ -169,6 +169,7 @@ class CSRAdapter : public detail::SingleBatchDataIter<CSRAdapterBatch> {
const CSRAdapterBatch& Value() const override { return batch_; }
size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return num_columns_; }
~CSRAdapter() noexcept(false) override = default;

private:
CSRAdapterBatch batch_;
@@ -222,6 +223,7 @@ class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {

size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return num_columns_; }
~DenseAdapter() noexcept(false) override = default;

private:
DenseAdapterBatch batch_;
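
The explicitly defaulted `noexcept(false)` destructors restate the exception specification the adapters inherit; the assumption in the sketch below is that the `SingleBatchDataIter` base declares a potentially-throwing virtual destructor (xgboost teardown code can run `CHECK`s that throw). All names are placeholders:

```cpp
// Assumption: the adapter base class declares its virtual destructor
// noexcept(false); every name below is a placeholder, not the real API.
#include <type_traits>

struct IterBase {
  virtual ~IterBase() noexcept(false) {}  // teardown is allowed to throw
};

struct DenseLikeAdapter : IterBase {
  // Restates the inherited specification; a bare `= default` destructor
  // would already be deduced as noexcept(false) from the base.
  ~DenseLikeAdapter() noexcept(false) override = default;
};

static_assert(!std::is_nothrow_destructible<DenseLikeAdapter>::value,
              "destructor inherits the potentially-throwing specification");

int main() { DenseLikeAdapter adapter; (void)adapter; }
```
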
44 changes: 14 additions & 30 deletions src/data/device_adapter.cuh
@@ -34,36 +34,27 @@ struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
};

class CudfAdapterBatch : public detail::NoMetaInfo {
friend class CudfAdapter;

public:
CudfAdapterBatch() = default;
CudfAdapterBatch(common::Span<ArrayInterface> columns,
common::Span<size_t> column_ptr, size_t num_elements)
CudfAdapterBatch(common::Span<ArrayInterface> columns, size_t num_rows)
: columns_(columns),
column_ptr_(column_ptr),
num_elements_(num_elements) {}
size_t Size() const { return num_elements_; }
num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); }
__device__ COOTuple GetElement(size_t idx) const {
size_t column_idx =
thrust::upper_bound(thrust::seq,column_ptr_.begin(), column_ptr_.end(), idx) - column_ptr_.begin() - 1;
auto& column = columns_[column_idx];
size_t row_idx = idx - column_ptr_[column_idx];
size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx];
float value = column.valid.Data() == nullptr || column.valid.Check(row_idx)
? column.GetElement(row_idx)
: std::numeric_limits<float>::quiet_NaN();
return {row_idx, column_idx, value};
}
__device__ float GetValue(size_t ridx, bst_feature_t fidx) const {
auto const& column = columns_[fidx];
float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
? column.GetElement(ridx)
: std::numeric_limits<float>::quiet_NaN();
return value;
}

private:
common::Span<ArrayInterface> columns_;
common::Span<size_t> column_ptr_;
size_t num_elements_;
size_t num_rows_;
};

/*!
@@ -127,7 +118,6 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
CHECK_EQ(typestr.size(), 3) << ArrayInterfaceErrors::TypestrFormat();
CHECK_NE(typestr.front(), '>') << ArrayInterfaceErrors::BigEndian();
std::vector<ArrayInterface> columns;
std::vector<size_t> column_ptr({0});
auto first_column = ArrayInterface(get<Object const>(json_columns[0]));
device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, -1);
@@ -137,31 +127,27 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
auto column = ArrayInterface(get<Object const>(json_col));
columns.push_back(column);
CHECK_EQ(column.num_cols, 1);
column_ptr.emplace_back(column_ptr.back() + column.num_rows);
num_rows_ = std::max(num_rows_, size_t(column.num_rows));
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
<< "All columns should use the same device.";
CHECK_EQ(num_rows_, column.num_rows)
<< "All columns should have same number of rows.";
}
columns_ = columns;
column_ptr_ = column_ptr;
batch_ = CudfAdapterBatch(dh::ToSpan(columns_), dh::ToSpan(column_ptr_),
column_ptr.back());
batch_ = CudfAdapterBatch(dh::ToSpan(columns_), num_rows_);
}
const CudfAdapterBatch& Value() const override {
CHECK_EQ(batch_.columns_.data(), columns_.data().get());
return batch_;
}
const CudfAdapterBatch& Value() const override { return batch_; }

size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return columns_.size(); }
size_t DeviceIdx() const { return device_idx_; }

// Cudf is column major
bool IsRowMajor() { return false; }

private:
CudfAdapterBatch batch_;
dh::device_vector<ArrayInterface> columns_;
dh::device_vector<size_t> column_ptr_; // Exclusive scan of column sizes
size_t num_rows_{0};
int device_idx_;
};
@@ -201,8 +187,6 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
size_t NumColumns() const { return array_interface_.num_cols; }
size_t DeviceIdx() const { return device_idx_; }

bool IsRowMajor() { return true; }

private:
ArrayInterface array_interface_;
CupyAdapterBatch batch_;
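
With `column_ptr_` and its `upper_bound` search gone, `CudfAdapterBatch::GetElement` maps the flat element index as if the data were row-major interleaved: `column_idx = idx % num_columns`, `row_idx = idx / num_columns`, even though each cuDF column stays in its own contiguous buffer. Consecutive indices therefore walk across a row, which is what lets the row-major copy kernels consume cuDF input unchanged and makes the column-major code paths removable. A host-side sketch of the mapping with stand-in types:

```cpp
// Host-side illustration of the new flat-index scheme; FakeCudfBatch is a
// stand-in for CudfAdapterBatch and plain vectors stand in for the
// ArrayInterface column buffers.
#include <cstddef>
#include <cstdio>
#include <vector>

struct COOTuple { std::size_t row_idx, column_idx; float value; };

struct FakeCudfBatch {
  std::vector<std::vector<float>> columns;  // one contiguous buffer per column
  std::size_t num_rows;

  std::size_t Size() const { return num_rows * columns.size(); }
  COOTuple GetElement(std::size_t idx) const {
    std::size_t column_idx = idx % columns.size();  // replaces the upper_bound search
    std::size_t row_idx = idx / columns.size();     // over the removed column_ptr_ scan
    return {row_idx, column_idx, columns[column_idx][row_idx]};
  }
};

int main() {
  FakeCudfBatch batch{{{1, 2, 3}, {10, 20, 30}}, 3};  // 3 rows, 2 columns
  for (std::size_t i = 0; i < batch.Size(); ++i) {
    auto e = batch.GetElement(i);  // consecutive i walk across a row first
    std::printf("idx=%zu -> row=%zu col=%zu value=%g\n",
                i, e.row_idx, e.column_idx, e.value);
  }
}
```
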
56 changes: 3 additions & 53 deletions src/data/ellpack_page.cu
@@ -154,8 +154,8 @@ struct WriteCompressedEllpackFunctor {
// Here the data is already correctly ordered and simply needs to be compacted
// to remove missing data
template <typename AdapterBatchT>
void CopyDataRowMajor(const AdapterBatchT& batch, EllpackPageImpl* dst,
int device_idx, float missing) {
void CopyDataToEllpack(const AdapterBatchT& batch, EllpackPageImpl* dst,
int device_idx, float missing) {
// Some witchcraft happens here
// The goal is to copy valid elements out of the input to an ellpack matrix
// with a given row stride, using no extra working memory Standard stream
@@ -209,51 +209,6 @@ void CopyDataRowMajor(const AdapterBatchT& batch, EllpackPageImpl* dst,
});
}

template <typename AdapterT, typename AdapterBatchT>
void CopyDataColumnMajor(AdapterT* adapter, const AdapterBatchT& batch,
EllpackPageImpl* dst, float missing) {
// Step 1: Get the sizes of the input columns
dh::caching_device_vector<size_t> column_sizes(adapter->NumColumns(), 0);
auto d_column_sizes = column_sizes.data().get();
// Populate column sizes
dh::LaunchN(adapter->DeviceIdx(), batch.Size(), [=] __device__(size_t idx) {
const auto& e = batch.GetElement(idx);
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
&d_column_sizes[e.column_idx]),
static_cast<unsigned long long>(1)); // NOLINT
});

thrust::host_vector<size_t> host_column_sizes = column_sizes;

// Step 2: Iterate over columns, place elements in correct row, increment
// temporary row pointers
dh::caching_device_vector<size_t> temp_row_ptr(adapter->NumRows(), 0);
auto d_temp_row_ptr = temp_row_ptr.data().get();
auto row_stride = dst->row_stride;
size_t begin = 0;
auto device_accessor = dst->GetDeviceAccessor(adapter->DeviceIdx());
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
data::IsValidFunctor is_valid(missing);
for (auto size : host_column_sizes) {
size_t end = begin + size;
dh::LaunchN(adapter->DeviceIdx(), end - begin, [=] __device__(size_t idx) {
auto writer_non_const =
writer; // For some reason this variable gets captured as const
const auto& e = batch.GetElement(idx + begin);
if (!is_valid(e)) return;
size_t output_position =
e.row_idx * row_stride + d_temp_row_ptr[e.row_idx];
auto bin_idx = device_accessor.SearchBin(e.value, e.column_idx);
writer_non_const.AtomicWriteSymbol(d_compressed_buffer, bin_idx,
output_position);
d_temp_row_ptr[e.row_idx] += 1;
});

begin = end;
}
}

void WriteNullValues(EllpackPageImpl* dst, int device_idx,
common::Span<size_t> row_counts) {
// Write the null values
@@ -284,12 +239,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterT* adapter, float missing, bool is_dense

*this = EllpackPageImpl(adapter->DeviceIdx(), cuts, is_dense, row_stride,
adapter->NumRows());
if (adapter->IsRowMajor()) {
CopyDataRowMajor(batch, this, adapter->DeviceIdx(), missing);
} else {
CopyDataColumnMajor(adapter, batch, this, missing);
}

CopyDataToEllpack(batch, this, adapter->DeviceIdx(), missing);
WriteNullValues(this, adapter->DeviceIdx(), row_counts_span);
}

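
With every adapter batch now presenting elements in the same row-major flat order, the single `CopyDataToEllpack` path is enough and the per-column pass with its atomics and temporary row pointers can go. What it computes is a segmented stream compaction: the valid entries of each row are packed to the left within a fixed `row_stride`, and `WriteNullValues` fills the remaining slots with the null bin. A simplified host-side sketch of that effect (NaN-as-missing and the `kNull` placeholder are assumptions of the sketch; the real kernel uses a scan and compressed-symbol writes on the GPU):

```cpp
// Host-side sketch of the compaction effect; kNull stands in for the ellpack
// null bin and NaN marks missing values.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const std::size_t num_rows = 3, num_cols = 4, row_stride = 3;
  const float kNull = -1.0f;  // placeholder for the null bin index
  // Row-major input, already in the order the adapters now guarantee.
  std::vector<float> in = {1, NAN, 2, NAN,
                           NAN, NAN, 3, 4,
                           5, 6, 7, NAN};
  std::vector<float> ellpack(num_rows * row_stride, kNull);

  for (std::size_t r = 0; r < num_rows; ++r) {
    std::size_t out = 0;  // per-row write position (the scan result in the kernel)
    for (std::size_t c = 0; c < num_cols; ++c) {
      float v = in[r * num_cols + c];
      if (!std::isnan(v)) ellpack[r * row_stride + out++] = v;  // pack valid entries left
    }
  }
  for (std::size_t r = 0; r < num_rows; ++r) {
    for (std::size_t s = 0; s < row_stride; ++s) {
      std::printf("%g ", ellpack[r * row_stride + s]);
    }
    std::printf("\n");
  }
}
```
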
54 changes: 5 additions & 49 deletions src/data/simple_dmatrix.cu
@@ -35,51 +35,12 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
thrust::device_pointer_cast(offset.data()));
}

template <typename AdapterT>
void CopyDataColumnMajor(AdapterT* adapter, common::Span<Entry> data,
int device_idx, float missing,
common::Span<size_t> row_ptr) {
// Step 1: Get the sizes of the input columns
dh::device_vector<size_t> column_sizes(adapter->NumColumns());
auto d_column_sizes = column_sizes.data().get();
auto& batch = adapter->Value();
// Populate column sizes
dh::LaunchN(device_idx, batch.Size(), [=] __device__(size_t idx) {
const auto& e = batch.GetElement(idx);
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
&d_column_sizes[e.column_idx]),
static_cast<unsigned long long>(1)); // NOLINT
});

thrust::host_vector<size_t> host_column_sizes = column_sizes;

// Step 2: Iterate over columns, place elements in correct row, increment
// temporary row pointers
dh::device_vector<size_t> temp_row_ptr(
thrust::device_pointer_cast(row_ptr.data()),
thrust::device_pointer_cast(row_ptr.data() + row_ptr.size()));
auto d_temp_row_ptr = temp_row_ptr.data().get();
size_t begin = 0;
IsValidFunctor is_valid(missing);
for (auto size : host_column_sizes) {
size_t end = begin + size;
dh::LaunchN(device_idx, end - begin, [=] __device__(size_t idx) {
const auto& e = batch.GetElement(idx + begin);
if (!is_valid(e)) return;
data[d_temp_row_ptr[e.row_idx]] = Entry(e.column_idx, e.value);
d_temp_row_ptr[e.row_idx] += 1;
});

begin = end;
}
}

// Here the data is already correctly ordered and simply needs to be compacted
// to remove missing data
template <typename AdapterT>
void CopyDataRowMajor(AdapterT* adapter, common::Span<Entry> data,
int device_idx, float missing,
common::Span<size_t> row_ptr) {
void CopyDataToDMatrix(AdapterT* adapter, common::Span<Entry> data,
int device_idx, float missing,
common::Span<size_t> row_ptr) {
auto& batch = adapter->Value();
auto transform_f = [=] __device__(size_t idx) {
const auto& e = batch.GetElement(idx);
@@ -116,13 +77,8 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
CountRowOffsets(batch, s_offset, adapter->DeviceIdx(), missing);
info_.num_nonzero_ = sparse_page_.offset.HostVector().back();
sparse_page_.data.Resize(info_.num_nonzero_);
if (adapter->IsRowMajor()) {
CopyDataRowMajor(adapter, sparse_page_.data.DeviceSpan(),
adapter->DeviceIdx(), missing, s_offset);
} else {
CopyDataColumnMajor(adapter, sparse_page_.data.DeviceSpan(),
adapter->DeviceIdx(), missing, s_offset);
}
CopyDataToDMatrix(adapter, sparse_page_.data.DeviceSpan(),
adapter->DeviceIdx(), missing, s_offset);

info_.num_col_ = adapter->NumColumns();
info_.num_row_ = adapter->NumRows();
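
The same simplification applies to `SimpleDMatrix` construction: `CountRowOffsets` produces the CSR row offsets by counting valid entries, and the renamed `CopyDataToDMatrix` compacts the valid entries into the data array in a single row-major pass, so the column-major variant is no longer needed. A host-side sketch of the two steps together (illustrative names only; the real code does both on the device with Thrust):

```cpp
// Host-side sketch with illustrative names; missing values are NaN and
// Entry mirrors the (column, value) pair stored in the sparse page.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

struct Entry {
  std::size_t col;
  float value;
};

int main() {
  const std::size_t num_rows = 2, num_cols = 3;
  std::vector<float> dense = {1, NAN, 2,
                              NAN, 3, NAN};

  // Step 1 (CountRowOffsets): count valid entries per row, then exclusive scan.
  std::vector<std::size_t> offset(num_rows + 1, 0);
  for (std::size_t r = 0; r < num_rows; ++r)
    for (std::size_t c = 0; c < num_cols; ++c)
      if (!std::isnan(dense[r * num_cols + c])) ++offset[r + 1];
  for (std::size_t r = 0; r < num_rows; ++r) offset[r + 1] += offset[r];

  // Step 2 (CopyDataToDMatrix): compact valid entries in one row-major pass.
  std::vector<Entry> data(offset.back());
  std::vector<std::size_t> cursor(offset.begin(), offset.end() - 1);
  for (std::size_t r = 0; r < num_rows; ++r)
    for (std::size_t c = 0; c < num_cols; ++c) {
      float v = dense[r * num_cols + c];
      if (!std::isnan(v)) data[cursor[r]++] = {c, v};
    }

  for (std::size_t r = 0; r < num_rows; ++r)
    for (std::size_t i = offset[r]; i < offset[r + 1]; ++i)
      std::printf("row %zu: (%zu, %g)\n", r, data[i].col, data[i].value);
}
```
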
14 changes: 7 additions & 7 deletions src/predictor/cpu_predictor.cc
@@ -271,30 +271,30 @@ class CPUPredictor : public Predictor {
PredictionCacheEntry *out_preds,
uint32_t tree_begin, uint32_t tree_end) const {
auto threads = omp_get_max_threads();
auto m = dmlc::get<Adapter>(x);
CHECK_EQ(m.NumColumns(), model.learner_model_param->num_feature)
auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model.";
MetaInfo info;
info.num_col_ = m.NumColumns();
info.num_row_ = m.NumRows();
info.num_col_ = m->NumColumns();
info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model);
std::vector<Entry> workspace(info.num_col_ * 8 * threads);
auto &predictions = out_preds->predictions.HostVector();
std::vector<RegTree::FVec> thread_temp;
InitThreadTemp(threads, model.learner_model_param->num_feature, &thread_temp);
size_t constexpr kUnroll = 8;
PredictBatchKernel(AdapterView<Adapter, kUnroll>(
&m, missing, common::Span<Entry>{workspace}),
m.get(), missing, common::Span<Entry>{workspace}),
&predictions, model, tree_begin, tree_end, &thread_temp);
}

void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model,
float missing, PredictionCacheEntry *out_preds,
uint32_t tree_begin, unsigned tree_end) const override {
if (x.type() == typeid(data::DenseAdapter)) {
if (x.type() == typeid(std::shared_ptr<data::DenseAdapter>)) {
this->DispatchedInplacePredict<data::DenseAdapter>(
x, model, missing, out_preds, tree_begin, tree_end);
} else if (x.type() == typeid(data::CSRAdapter)) {
} else if (x.type() == typeid(std::shared_ptr<data::CSRAdapter>)) {
this->DispatchedInplacePredict<data::CSRAdapter>(
x, model, missing, out_preds, tree_begin, tree_end);
} else {
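
The `typeid` checks in `InplacePredict` must compare against the type actually stored in the `dmlc::any`, which is now `std::shared_ptr<Adapter>` rather than the adapter itself; matching on the bare adapter type would silently fall through to the unsupported branch. A compact sketch of the dispatch with `std::any` and placeholder adapter types:

```cpp
// Sketch of the dispatch pattern with std::any standing in for dmlc::any;
// DenseLike and CSRLike are placeholders for the real adapter types.
#include <any>
#include <cstdio>
#include <memory>
#include <typeinfo>

struct DenseLike {};
struct CSRLike {};

void Dispatch(const std::any& x) {
  if (x.type() == typeid(std::shared_ptr<DenseLike>)) {
    std::printf("dense path\n");
  } else if (x.type() == typeid(std::shared_ptr<CSRLike>)) {
    std::printf("csr path\n");
  } else {
    std::printf("unsupported adapter\n");  // a bare DenseLike would land here
  }
}

int main() {
  Dispatch(std::make_shared<DenseLike>());  // dense path
  Dispatch(std::make_shared<CSRLike>());    // csr path
  Dispatch(DenseLike{});                    // unsupported: not a shared_ptr
}
```
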