Host Solver workspace query fix #733

Merged: 2 commits, merged Aug 26, 2024

Changes from all commits
47 changes: 6 additions & 41 deletions include/matx/transforms/chol/chol_lapack.h
@@ -77,7 +77,7 @@ class matxDnCholHostPlan_t : matxDnHostSolver_t<typename remove_cvref_t<ATensor>
* matrix using the Cholesky method, where M is either the upper or lower
* triangular portion of A. Input matrix A must be a square, positive-definite
* Hermitian matrix of which only the upper or lower triangle is used. This does
* require a workspace.
* not require a workspace.
*
* @tparam T1
* Data type of A matrix
@@ -181,45 +181,22 @@ class matxDnCholHostPlan_t : matxDnHostSolver_t<typename remove_cvref_t<ATensor>
DnCholHostParams_t params;
};

/**
* Crude hash to get a reasonably good delta for collisions. This doesn't need
* to be perfect, but fast enough to not slow down lookups, and different enough
* so the common solver parameters change
*/
struct DnCholHostParamsKeyHash {
std::size_t operator()(const DnCholHostParams_t &k) const noexcept
{
return (std::hash<uint64_t>()(k.n)) + (std::hash<uint64_t>()(k.batch_size));
}
};

/**
* Test cholesky parameters for equality. Unlike the hash, all parameters must
* match.
*/
struct DnCholHostParamsKeyEq {
bool operator()(const DnCholHostParams_t &l, const DnCholHostParams_t &t) const
noexcept
{
return l.n == t.n && l.batch_size == t.batch_size && l.dtype == t.dtype;
}
};

using chol_Host_cache_t = std::unordered_map<DnCholHostParams_t, std::any, DnCholHostParamsKeyHash, DnCholHostParamsKeyEq>;
#endif

} // end namespace detail


/**
* Perform a Cholesky decomposition using a cached plan
* Perform a Cholesky decomposition.
*
* See documentation of matxDnCholHostPlan_t for a description of how the
* algorithm works. This function provides a simple interface to the LAPACK
* library by deducing all parameters needed to perform a Cholesky decomposition
* from only the matrix A. The input and output parameters may be the same
* tensor. In that case, the input is destroyed and the output is stored
* in-place. Input must be a positive-definite Hermitian or real symmetric matrix.
*
* No caching is performed, since no LAPACK allocation or setup is needed.
*
* @tparam T1
* Data type of matrix A
@@ -280,20 +257,8 @@ void chol_impl([[maybe_unused]] OutputTensor &&out,

const char uplo_lapack = (uplo == SolverFillMode::UPPER)? 'U' : 'L';

// Get parameters required by these tensors
auto params = detail::matxDnCholHostPlan_t<OutputTensor, decltype(tv)>::GetCholParams(tv, uplo_lapack);

using cache_val_type = detail::matxDnCholHostPlan_t<OutputTensor, decltype(a_new)>;
detail::GetCache().LookupAndExec<detail::chol_Host_cache_t>(
detail::GetCacheIdFromType<detail::chol_Host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(tv, uplo_lapack);
},
[&](std::shared_ptr<cache_val_type> ctype) {
ctype->Exec(tv, tv, exec, uplo_lapack);
}
);
detail::matxDnCholHostPlan_t<OutputTensor, decltype(a_new)> chol_plan(tv, uplo_lapack);
chol_plan.Exec(tv, tv, exec, uplo_lapack);

if (! allContiguous) {
matx::copy(out, tv, exec);
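For context, a hedged usage sketch of the host Cholesky path this diff simplifies. The public names below (matx::chol, matx::SolverFillMode, matx::HostExecutor) are assumed from the MatX documentation and may differ across versions; this is a sketch, not code from the PR:

#include "matx.h"

int main() {
  MATX_ENTER_HANDLER();
  // Assumed public API: matx::chol(), matx::SolverFillMode, matx::HostExecutor.
  auto A = matx::make_tensor<float>({4, 4});
  // ... fill A with a positive-definite Hermitian/symmetric matrix ...
  matx::HostExecutor exec{};
  // With this PR, each call constructs the LAPACK plan directly; there is no
  // cache lookup, since potrf needs no workspace allocation or setup.
  (A = matx::chol(A, matx::SolverFillMode::LOWER)).run(exec);
  MATX_EXIT_HANDLER();
  return 0;
}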
40 changes: 26 additions & 14 deletions include/matx/transforms/eig/eig_lapack.h
@@ -71,7 +71,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
using T2 = typename WTensor::value_type;
static constexpr int RANK = OutTensor_t::Rank();
static_assert(RANK >= 2, "Input/Output tensor must be rank 2 or higher");

/**
* Plan computing eigenvalues/vectors on square Hermitian A such that:
*
@@ -122,12 +122,20 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
{
// Perform a workspace query with lwork = -1.
lapack_int_t info;
T1 work_query;
T2 rwork_query;

// Due to a LAPACK bug that yields incorrect workspace sizes for single-precision
// routines at large matrix sizes (caused by float-to-int conversions), use double
// precision for the query. This only affects routines that may need O(n^2) memory.
using WorkQuery = std::conditional_t<std::is_same_v<T1, float>, double,
std::conditional_t<std::is_same_v<T1, cuda::std::complex<float>>,
cuda::std::complex<double>, T1>>;
using RworkQuery = std::conditional_t<std::is_same_v<T2, float>, double, T2>;
WorkQuery work_query;
RworkQuery rwork_query;
lapack_int_t iwork_query;

// Use vector mode for a larger workspace size that works for both modes
syevd_dispatch("V", &params.uplo, &params.n, nullptr, &params.n,
syevd_dispatch<WorkQuery, RworkQuery>("V", &params.uplo, &params.n, nullptr, &params.n,
nullptr, &work_query, &this->lwork, &rwork_query,
&this->lrwork, &iwork_query, &this->liwork, &info);
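An aside on why the query is promoted to double precision (an illustrative standalone snippet, not code from this PR): single-precision routines report the optimal lwork through a float, whose 24-bit mantissa cannot represent every integer above 2^24, so the float-to-int conversion can round a large workspace size down. A minimal C++ demonstration:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical optimal lwork for an n = 20,000 query: 1 + 2*n + n*n.
  const int64_t lwork_exact = 1 + 2*20000LL + 20000LL*20000LL; // 400,040,001

  // A single-precision routine returns this through a float; above 2^24 the
  // value is rounded to the nearest representable float.
  const float  via_float  = static_cast<float>(lwork_exact);
  const double via_double = static_cast<double>(lwork_exact);

  std::printf("exact:      %lld\n", static_cast<long long>(lwork_exact)); // 400040001
  std::printf("via float:  %lld\n", static_cast<long long>(via_float));   // 400040000, one element short
  std::printf("via double: %lld\n", static_cast<long long>(via_double));  // 400040001, exact
  return 0;
}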

Expand All @@ -140,7 +148,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
this->lrwork = static_cast<lapack_int_t>(rwork_query);
} else {
this->lwork = static_cast<lapack_int_t>(work_query);
this->lrwork = 0; // Complex variants do not use rwork.
this->lrwork = 0; // rwork is not used for real types
}
this->liwork = static_cast<lapack_int_t>(iwork_query);
}
@@ -212,22 +220,26 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
~matxDnEigHostPlan_t() {}

private:
/**
* Templated on AType and WType rather than using T1 and T2, since workspace queries always use the double-precision variants
*/
template <typename AType, typename WType>
void syevd_dispatch(const char* jobz, const char* uplo, const lapack_int_t* n,
T1* a, const lapack_int_t* lda, T2* w, T1* work_in,
const lapack_int_t* lwork_in, [[maybe_unused]] T2* rwork_in,
AType* a, const lapack_int_t* lda, WType* w, AType* work_in,
const lapack_int_t* lwork_in, [[maybe_unused]] WType* rwork_in,
[[maybe_unused]] const lapack_int_t* lrwork_in, lapack_int_t* iwork_in,
const lapack_int_t* liwork_in, lapack_int_t* info)
{
// TODO: remove warning suppression once syevd is optimized in NVPL LAPACK
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
if constexpr (std::is_same_v<T1, float>) {
if constexpr (std::is_same_v<AType, float>) {
LAPACK_CALL(ssyevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, iwork_in, liwork_in, info);
} else if constexpr (std::is_same_v<T1, double>) {
} else if constexpr (std::is_same_v<AType, double>) {
LAPACK_CALL(dsyevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, iwork_in, liwork_in, info);
} else if constexpr (std::is_same_v<T1, cuda::std::complex<float>>) {
} else if constexpr (std::is_same_v<AType, cuda::std::complex<float>>) {
LAPACK_CALL(cheevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, rwork_in, lrwork_in, iwork_in, liwork_in, info);
} else if constexpr (std::is_same_v<T1, cuda::std::complex<double>>) {
} else if constexpr (std::is_same_v<AType, cuda::std::complex<double>>) {
LAPACK_CALL(zheevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, rwork_in, lrwork_in, iwork_in, liwork_in, info);
}
#pragma GCC diagnostic pop
@@ -260,7 +272,7 @@ struct DnEigHostParamsKeyEq {
}
};

using eig_Host_cache_t = std::unordered_map<DnEigHostParams_t, std::any, DnEigHostParamsKeyHash, DnEigHostParamsKeyEq>;
using eig_host_cache_t = std::unordered_map<DnEigHostParams_t, std::any, DnEigHostParamsKeyHash, DnEigHostParamsKeyEq>;
#endif

} // end namespace detail
@@ -335,8 +347,8 @@ void eig_impl([[maybe_unused]] OutputTensor &&out,

// Get cache or new eigen plan if it doesn't exist
using cache_val_type = detail::matxDnEigHostPlan_t<OutputTensor, decltype(w_new), decltype(a_new)>;
detail::GetCache().LookupAndExec<detail::eig_Host_cache_t>(
detail::GetCacheIdFromType<detail::eig_Host_cache_t>(),
detail::GetCache().LookupAndExec<detail::eig_host_cache_t>(
detail::GetCacheIdFromType<detail::eig_host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(w_new, tv, jobz_lapack, uplo_lapack);
45 changes: 4 additions & 41 deletions include/matx/transforms/lu/lu_lapack.h
@@ -188,32 +188,6 @@ class matxDnLUHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
DnLUHostParams_t params;
};

/**
* Crude hash to get a reasonably good delta for collisions. This doesn't need
* to be perfect, but fast enough to not slow down lookups, and different enough
* so the common solver parameters change
*/
struct DnLUHostParamsKeyHash {
std::size_t operator()(const DnLUHostParams_t &k) const noexcept
{
return (std::hash<uint64_t>()(k.m)) + (std::hash<uint64_t>()(k.n)) +
(std::hash<uint64_t>()(k.batch_size));
}
};

/**
* Test LU parameters for equality. Unlike the hash, all parameters must match.
*/
struct DnLUHostParamsKeyEq {
bool operator()(const DnLUHostParams_t &l, const DnLUHostParams_t &t) const noexcept
{
return l.n == t.n && l.m == t.m && l.batch_size == t.batch_size &&
l.dtype == t.dtype;
}
};

// Static caches of LU this->handles
using lu_Host_cache_t = std::unordered_map<DnLUHostParams_t, std::any, DnLUHostParamsKeyHash, DnLUHostParamsKeyEq>;
#endif

} // end namespace detail
@@ -227,6 +201,8 @@ using lu_Host_cache_t = std::unordered_map<DnLUHostParams_t, std::any, DnLUHostP
* library by deducing all parameters needed to perform an LU decomposition from
* only the matrix A. The input and output parameters may be the same tensor. In
* that case, the input is destroyed and the output is stored in-place.
*
* No caching is performed, since no LAPACK allocation or setup is needed.
*
* @tparam T1
* Data type of matrix A
@@ -272,21 +248,8 @@ void lu_impl([[maybe_unused]] OutputTensor &&out,
auto tv = TransposeCopy(tp, a_new, exec);
auto tvt = tv.PermuteMatrix();

// Get parameters required by these tensors
auto params = detail::matxDnLUHostPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)>::GetLUParams(piv_new, tvt);

// Get cache or new LU plan if it doesn't exist
using cache_val_type = detail::matxDnLUHostPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)>;
detail::GetCache().LookupAndExec<detail::lu_Host_cache_t>(
detail::GetCacheIdFromType<detail::lu_Host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(piv_new, tvt);
},
[&](std::shared_ptr<cache_val_type> ctype) {
ctype->Exec(tvt, piv_new, tvt, exec);
}
);
detail::matxDnLUHostPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)> lu_plan(piv_new, tvt);
lu_plan.Exec(tvt, piv_new, tvt, exec);

/* Temporary WAR
* Copy and free async buffer for transpose */
6 changes: 3 additions & 3 deletions include/matx/transforms/qr/qr_lapack.h
@@ -234,7 +234,7 @@ struct DnQRHostParamsKeyEq {
}
};

using qr_Host_cache_t = std::unordered_map<DnQRHostParams_t, std::any, DnQRHostParamsKeyHash, DnQRHostParamsKeyEq>;
using qr_host_cache_t = std::unordered_map<DnQRHostParams_t, std::any, DnQRHostParamsKeyHash, DnQRHostParamsKeyEq>;
#endif

} // end namespace detail
@@ -297,8 +297,8 @@ void qr_solver_impl([[maybe_unused]] OutTensor &&out,

// Get cache or new QR plan if it doesn't exist
using cache_val_type = detail::matxDnQRHostPlan_t<OutTensor, decltype(tau_new), decltype(a_new)>;
detail::GetCache().LookupAndExec<detail::qr_Host_cache_t>(
detail::GetCacheIdFromType<detail::qr_Host_cache_t>(),
detail::GetCache().LookupAndExec<detail::qr_host_cache_t>(
detail::GetCacheIdFromType<detail::qr_host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(tau_new, tvt);
41 changes: 27 additions & 14 deletions include/matx/transforms/svd/svd_lapack.h
@@ -141,11 +141,11 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
// larger workspace size that works for all modes

lapack_int_t info;
T1 work_query;
lapack_int_t mn = cuda::std::min(params.m, params.n);
lapack_int_t mx = cuda::std::max(params.m, params.n);

if (params.algo == SVDHostAlgo::QR) {
T1 work_query;
gesvd_dispatch("A", "A", &params.m, &params.n, nullptr,
&params.m, nullptr, nullptr, &params.m, nullptr, &params.n,
&work_query, &this->lwork, nullptr, &info);
@@ -163,7 +163,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
this->lrwork = 0;
}
} else if (params.algo == SVDHostAlgo::DC) {
gesdd_dispatch("A", &params.m, &params.n, nullptr, &params.m, nullptr,
// Due to a LAPACK bug that yields incorrect workspace sizes for single-precision
// routines at large matrix sizes (caused by float-to-int conversions), use double
// precision for the query. This only affects routines that may need O(n^2) memory.
using WorkQuery = std::conditional_t<std::is_same_v<T1, float>, double,
std::conditional_t<std::is_same_v<T1, cuda::std::complex<float>>,
cuda::std::complex<double>, T1>>;
using RworkQuery = std::conditional_t<std::is_same_v<T3, float>, double, T3>;
WorkQuery work_query;

gesdd_dispatch<WorkQuery, RworkQuery>("A", &params.m, &params.n, nullptr, &params.m, nullptr,
nullptr, &params.m, nullptr, &params.n, &work_query,
&this->lwork, nullptr, nullptr, &info);

@@ -173,11 +182,11 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
this->liwork = 8 * mn; // iwork has size 8*min(M,N) and is used for all types

// the real part of the first elem of work holds the optimal lwork.
if constexpr (is_complex_v<T1>) {
if constexpr (is_complex_v<WorkQuery>) {
this->lwork = static_cast<lapack_int_t>(work_query.real());

lapack_int_t mnthr = (mn * 5) / 3;
if (mx >= mnthr) {
if (mx >= mnthr) { // mx >> mn condition in LAPACK
this->lrwork = 5*mn*mn + 5*mn;
} else {
this->lrwork = cuda::std::max(5*mn*mn + 5*mn, 2*mx*mn + 2*mn*mn + mn);
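The complex-path lrwork sizing above follows the LRWORK formulas documented for netlib's zgesdd with jobz='A'. Restated as a small standalone helper for readability (an illustration, not code from this PR):

#include <algorithm>
#include <cstdint>

// lrwork rule for complex gesdd with jobz='A', as used in the diff above.
// mnthr = 5*mn/3 is LAPACK's crossover to the "mx much larger than mn" path.
int64_t gesdd_lrwork_complex(int64_t m, int64_t n) {
  const int64_t mn = std::min(m, n);
  const int64_t mx = std::max(m, n);
  const int64_t mnthr = (mn * 5) / 3;
  if (mx >= mnthr) {
    return 5*mn*mn + 5*mn;
  }
  return std::max<int64_t>(5*mn*mn + 5*mn, 2*mx*mn + 2*mn*mn + mn);
}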
@@ -297,18 +306,22 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
#pragma GCC diagnostic pop
}

/**
* Templated on T and S rather than using T1 and T3, since workspace queries always use the double-precision variants
*/
template <typename T, typename S>
void gesdd_dispatch(const char *jobz, const lapack_int_t *m, const lapack_int_t *n,
T1 *a, const lapack_int_t *lda, T3 *s, T1 *u, const lapack_int_t *ldu,
T1 *vt, const lapack_int_t *ldvt, T1 *work_in, const lapack_int_t *lwork_in,
[[maybe_unused]] T3 *rwork_in, lapack_int_t *iwork_in, lapack_int_t *info)
T *a, const lapack_int_t *lda, S *s, T *u, const lapack_int_t *ldu,
T *vt, const lapack_int_t *ldvt, T *work_in, const lapack_int_t *lwork_in,
[[maybe_unused]] S *rwork_in, lapack_int_t *iwork_in, lapack_int_t *info)
{
if constexpr (std::is_same_v<T1, float>) {
if constexpr (std::is_same_v<T, float>) {
LAPACK_CALL(sgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, iwork_in, info);
} else if constexpr (std::is_same_v<T1, double>) {
} else if constexpr (std::is_same_v<T, double>) {
LAPACK_CALL(dgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, iwork_in, info);
} else if constexpr (std::is_same_v<T1, cuda::std::complex<float>>) {
} else if constexpr (std::is_same_v<T, cuda::std::complex<float>>) {
LAPACK_CALL(cgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, rwork_in, iwork_in, info);
} else if constexpr (std::is_same_v<T1, cuda::std::complex<double>>) {
} else if constexpr (std::is_same_v<T, cuda::std::complex<double>>) {
LAPACK_CALL(zgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, rwork_in, iwork_in, info);
}
}
@@ -343,7 +356,7 @@ struct DnSVDHostParamsKeyEq {
}
};

using svd_Host_cache_t = std::unordered_map<DnSVDHostParams_t, std::any, DnSVDHostParamsKeyHash, DnSVDHostParamsKeyEq>;
using svd_host_cache_t = std::unordered_map<DnSVDHostParams_t, std::any, DnSVDHostParamsKeyHash, DnSVDHostParamsKeyEq>;
#endif

}
@@ -424,8 +437,8 @@ void svd_impl([[maybe_unused]] UTensor &&u,

// Get cache or new SVD plan if it doesn't exist
using cache_val_type = detail::matxDnSVDHostPlan_t<decltype(u_in), decltype(s_new), decltype(vt_in), decltype(at_col_maj)>;
detail::GetCache().LookupAndExec<detail::svd_Host_cache_t>(
detail::GetCacheIdFromType<detail::svd_Host_cache_t>(),
detail::GetCache().LookupAndExec<detail::svd_host_cache_t>(
detail::GetCacheIdFromType<detail::svd_host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(u_in, s_new, vt_in, at_col_maj, job_lapack, algo);