From c33d749732d747115557c3d54e4fff3e78bc7302 Mon Sep 17 00:00:00 2001
From: Cliff Burdick <30670611+cliffburdick@users.noreply.github.com>
Date: Thu, 21 Nov 2024 17:15:54 -0800
Subject: [PATCH] Faster batched SVD for small sizes (#805)

---
 include/matx/transforms/solver_common.h |   6 +-
 include/matx/transforms/svd/svd_cuda.h  | 256 ++++++++++++++++++++----
 test/00_solver/SVD.cu                   | 120 +++++++++++
 3 files changed, 338 insertions(+), 44 deletions(-)

diff --git a/include/matx/transforms/solver_common.h b/include/matx/transforms/solver_common.h
index 7e065ab1..67d8f46a 100644
--- a/include/matx/transforms/solver_common.h
+++ b/include/matx/transforms/solver_common.h
@@ -118,10 +118,10 @@ __MATX_INLINE__ char SVDModeToChar(SVDMode jobz) {
 
 template <typename Op, typename Executor>
-__MATX_INLINE__ auto getSolverSupportedTensor(const Op &in, const Executor &exec) {
+__MATX_INLINE__ auto getSolverSupportedTensor(const Op &in, const Executor &exec, bool force = false) {
   constexpr int RANK = Op::Rank();
 
-  bool supported = true;
+  bool supported = !force; // If we're forcing a new tensor, just mark the input unsupported
   if constexpr (!is_tensor_view_v<Op>) {
     supported = false;
   } else {
@@ -274,7 +274,7 @@ class matxDnCUDASolver_t {
       }
     }
     else {
-#endif
+#endif
       if (dspace > 0) {
         matxAlloc(&d_workspace, batches * dspace, MATX_DEVICE_MEMORY);
       }
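The new `force` parameter short-circuits the usual "is this already a solver-compatible tensor?" check so that a temporary is always allocated. A minimal sketch of the contract as implied by `supported = !force`; `op` and `exec` are hypothetical stand-ins, and the real call site appears in svd_cuda.h below:

    // force == false: returns `op` itself when it is already a dense tensor
    //                 view the cuSolver wrappers can consume; otherwise a
    //                 freshly allocated temporary.
    // force == true:  always returns a freshly allocated temporary, even when
    //                 `op` could have been used directly. The gesvdj path
    //                 below uses this so it can post-process U afterwards.
    auto work = getSolverSupportedTensor(op, exec, /*force=*/true);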
diff --git a/include/matx/transforms/svd/svd_cuda.h b/include/matx/transforms/svd/svd_cuda.h
index c8e513c9..0ef4f8b5 100644
--- a/include/matx/transforms/svd/svd_cuda.h
+++ b/include/matx/transforms/svd/svd_cuda.h
@@ -50,6 +50,12 @@ namespace matx {
 
 namespace detail {
 
+enum class SVDMethod {
+  GESVD,
+  GESVDJ_BATCHED,
+  POW_ITER,
+};
+
 template <typename AType>
 inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
   using ATypeS = typename AType::value_type;
@@ -536,8 +542,44 @@ struct DnSVDCUDAParams_t {
   void *S;
   size_t batch_size;
   MatXDataType_t dtype;
+  SVDMethod method;
 };
 
+template <typename ATensor>
+static __MATX_INLINE__ SVDMethod GetCUDASVDMethod(const ATensor &a) {
+  static constexpr int RANK = ATensor::Rank();
+  const index_t m = a.Size(RANK - 2);
+  const index_t n = a.Size(RANK - 1);
+  SVDMethod method = detail::SVDMethod::GESVD;
+
+  // A rank-2 input is a single, typically larger matrix, in which case gesvd should win out on speed
+  if (a.Rank() == 2) {
+    method = detail::SVDMethod::GESVD;
+  }
+  else {
+    if (m <= 32 && n <= 32) {
+      if constexpr (is_tensor_view_v<ATensor>) {
+        #if !defined(INDEX_32_BIT)
+          // gesvdjBatched uses 32-bit indexing, so the batch stride must fit in an int
+          if (a.Stride(0) < std::numeric_limits<int>::max()) {
+            method = detail::SVDMethod::GESVDJ_BATCHED;
+          }
+        #else
+          method = detail::SVDMethod::GESVDJ_BATCHED;
+        #endif
+      }
+      else {
+        method = detail::SVDMethod::GESVDJ_BATCHED;
+      }
+    }
+    else {
+      method = detail::SVDMethod::GESVD;
+    }
+  }
+
+  return method;
+}
+
 template <typename UTensor, typename STensor, typename VtTensor, typename ATensor>
 class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
   using T1 = typename ATensor::value_type;
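To make the heuristic concrete: batched inputs whose two trailing dimensions are both at most 32 take the new Jacobi path (cuSolver's gesvdjBatched documents the same 32x32 size limit); everything else stays on gesvd. A minimal end-to-end sketch against MatX's public API, assuming the random() operator and cudaExecutor shown in MatX's examples; any other data/executor setup works the same way:

    #include "matx.h"
    using namespace matx;

    int main() {
      cudaExecutor exec{};
      // 1000 independent 16x16 SVDs: rank > 2 and m, n <= 32, so
      // GetCUDASVDMethod() selects SVDMethod::GESVDJ_BATCHED.
      auto A  = make_tensor<float>({1000, 16, 16});
      auto U  = make_tensor<float>({1000, 16, 16});
      auto S  = make_tensor<float>({1000, 16});
      auto VT = make_tensor<float>({1000, 16, 16});
      (A = random<float>(A.Shape(), UNIFORM)).run(exec);
      (mtie(U, S, VT) = svd(A)).run(exec);   // dispatches to the batched plan
      exec.sync();
      return 0;
    }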
@@ -583,6 +625,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
                       STensor &s,
                       VtTensor &vt,
                       const ATensor &a,
+                      SVDMethod method,
                       const char jobz = 'A') {
     MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
@@ -600,24 +643,83 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
     MATX_STATIC_ASSERT_STR((std::is_same_v<typename inner_op_type_t<T1>::type, T3>), matxInvalidType, "A and S inner types must match");
 
     params = GetSVDParams(u, s, vt, a, jobz);
+    params.method = method;
+
+    if (params.method == SVDMethod::GESVDJ_BATCHED) {
+      batched = true;
+
+      [[maybe_unused]] cusolverStatus_t ret;
+      ret = cusolverDnCreateGesvdjInfo(&batch_params);
+      MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnCreateGesvdjInfo");
+
+      ret = cusolverDnXgesvdjSetTolerance(batch_params, 1e-9);
+      MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetTolerance");
+
+      ret = cusolverDnXgesvdjSetMaxSweeps(batch_params, 15);
+      MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetMaxSweeps");
+    }
+
     this->GetWorkspaceSize();
-    this->AllocateWorkspace(params.batch_size, false);
+    this->AllocateWorkspace(params.batch_size, params.method == SVDMethod::GESVDJ_BATCHED);
   }
 
   void GetWorkspaceSize() override
   {
+    cusolverStatus_t ret;
+
     // Use 'A' (all) mode for a larger workspace size that works for all modes
-    cusolverStatus_t ret =
-        cusolverDnXgesvd_bufferSize(
-            this->handle, this->dn_params, 'A', 'A', params.m, params.n,
-            MatXTypeToCudaType<T1>(), params.A, params.m,
-            MatXTypeToCudaType<T3>(), params.S, MatXTypeToCudaType<T1>(),
-            params.U, params.m, MatXTypeToCudaType<T1>(), params.VT, params.n,
-            MatXTypeToCudaType<T1>(), &this->dspace, &this->hspace);
+    if (!batched) {
+      ret = cusolverDnXgesvd_bufferSize(
+          this->handle, this->dn_params, 'A', 'A', params.m, params.n,
+          MatXTypeToCudaType<T1>(), params.A, params.m,
+          MatXTypeToCudaType<T3>(), params.S, MatXTypeToCudaType<T1>(),
+          params.U, params.m, MatXTypeToCudaType<T1>(), params.VT, params.n,
+          MatXTypeToCudaType<T1>(), &this->dspace, &this->hspace);
+    }
+    else {
+      if constexpr (std::is_same_v<float, T1>) {
+        ret = cusolverDnSgesvdjBatched_bufferSize(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<float *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.S), reinterpret_cast<float *>(params.U), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<int *>(&this->dspace), batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<double, T1>) {
+        ret = cusolverDnDgesvdjBatched_bufferSize(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<double *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<double *>(params.S), reinterpret_cast<double *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<double *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<int *>(&this->dspace), batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<cuda::std::complex<float>, T1>) {
+        ret = cusolverDnCgesvdjBatched_bufferSize(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<cuComplex *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.S), reinterpret_cast<cuComplex *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<cuComplex *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<int *>(&this->dspace), batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<cuda::std::complex<double>, T1>) {
+        ret = cusolverDnZgesvdjBatched_bufferSize(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<cuDoubleComplex *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<double *>(params.S), reinterpret_cast<cuDoubleComplex *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<cuDoubleComplex *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<int *>(&this->dspace), batch_params, static_cast<int>(params.batch_size));
+      }
+      else {
+        MATX_THROW(matxInvalidType, "Invalid data type passed to svd()");
+      }
+
+      // The gesvdjBatched workspace query writes an int into the low bytes of
+      // dspace, so pull it back out as an int before using it as a size
+      int dspace_tmp = *reinterpret_cast<int *>(&this->dspace);
+      this->dspace = dspace_tmp;
+      this->hspace = 0;
+    }
 
     MATX_ASSERT(ret == CUSOLVER_STATUS_SUCCESS, matxSolverError);
   }
 
+
   static DnSVDCUDAParams_t GetSVDParams(UTensor &u,
                                         STensor &s,
                                         VtTensor &vt,
                                         const ATensor &a,
@@ -633,7 +735,6 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
     params.S = s.Data();
     params.jobz = jobz;
     params.dtype = TypeToInt<T1>();
-
     return params;
   }
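One subtlety worth calling out above: the gesvdjBatched _bufferSize functions report the workspace length through an int, counted in elements of the data type, while the plan stores dspace in a wider integer field, hence the reinterpret/copy-back. The underlying cuSolver call shape, shown standalone for the float case (real cuSolver API; handle, pointers, and sizes are placeholders supplied by the caller):

    // m, n <= 32; all matrices share one size; lda = m, ldu = m, ldv = n.
    int lwork = 0;
    cusolverStatus_t st = cusolverDnSgesvdjBatched_bufferSize(
        handle, CUSOLVER_EIG_MODE_VECTOR,
        m, n,
        d_A, m,             // packed column-major batch of A matrices
        d_S,                // min(m,n) singular values per matrix
        d_U, m,             // left singular vectors
        d_V, n,             // right singular vectors: V, not V^H
        &lwork,             // workspace length, in elements (int!)
        gesvdj_params, batch_size);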
@@ -643,6 +744,8 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
   {
     MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
+    cusolverStatus_t ret;
+
     // Batch size checks
     for(int i = 0 ; i < RANK-2; i++) {
       MATX_ASSERT_STR(u.Size(i) == a.Size(i), matxInvalidDim, "U and A must have the same batch sizes");
@@ -661,30 +764,78 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
     }
     MATX_ASSERT_STR(s.Size(RANK-2) == cuda::std::min(params.m, params.n), matxInvalidSize, "S must be ... x min(m,n)");
 
+    const auto stream = exec.getStream();
+    cusolverDnSetStream(this->handle, stream);
+
     SetBatchPointers<BatchType::MATRIX>(a, this->batch_a_ptrs);
     SetBatchPointers<BatchType::MATRIX>(u, this->batch_u_ptrs);
     SetBatchPointers<BatchType::MATRIX>(vt, this->batch_vt_ptrs);
     SetBatchPointers<BatchType::VECTOR>(s, this->batch_s_ptrs);
 
-    const auto stream = exec.getStream();
-    cusolverDnSetStream(this->handle, stream);
-
     const int64_t ldvt = vt.Size(RANK-2);
 
-    // At this time cuSolver does not have a batched 64-bit SVD interface. Change
-    // this to use the batched version once available.
-    for (size_t i = 0; i < this->batch_a_ptrs.size(); i++) {
-
-      auto ret = cusolverDnXgesvd(
-          this->handle, this->dn_params, jobz, jobz, params.m, params.n,
-          MatXTypeToCudaType<T1>(), this->batch_a_ptrs[i], params.m,
-          MatXTypeToCudaType<T3>(), this->batch_s_ptrs[i], MatXTypeToCudaType<T1>(),
-          this->batch_u_ptrs[i], params.m, MatXTypeToCudaType<T1>(), this->batch_vt_ptrs[i],
-          ldvt, MatXTypeToCudaType<T1>(),
-          reinterpret_cast<uint8_t *>(this->d_workspace) + i * this->dspace, this->dspace,
-          reinterpret_cast<uint8_t *>(this->h_workspace) + i * this->hspace, this->hspace,
-          this->d_info + i);
-
-      MATX_ASSERT(ret == CUSOLVER_STATUS_SUCCESS, matxSolverError);
+    if (params.method == SVDMethod::GESVD) {
+      // At this time cuSolver does not have a batched 64-bit SVD interface. Change
+      // this to use the batched version once available.
+      for (size_t i = 0; i < this->batch_a_ptrs.size(); i++) {
+        ret = cusolverDnXgesvd(
+            this->handle, this->dn_params, jobz, jobz, params.m, params.n,
+            MatXTypeToCudaType<T1>(), this->batch_a_ptrs[i], params.m,
+            MatXTypeToCudaType<T3>(), this->batch_s_ptrs[i], MatXTypeToCudaType<T1>(),
+            this->batch_u_ptrs[i], params.m, MatXTypeToCudaType<T1>(), this->batch_vt_ptrs[i],
+            ldvt, MatXTypeToCudaType<T1>(),
+            reinterpret_cast<uint8_t *>(this->d_workspace) + i * this->dspace, this->dspace,
+            reinterpret_cast<uint8_t *>(this->h_workspace) + i * this->hspace, this->hspace,
+            this->d_info + i);
+
+        MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "cusolverDnXgesvd failed");
+      }
+    }
+    else if (params.method == SVDMethod::GESVDJ_BATCHED) {
+      if constexpr (std::is_same_v<float, T1>) {
+        ret = cusolverDnSgesvdjBatched(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<float *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.S), reinterpret_cast<float *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<float *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<float *>(this->d_workspace), static_cast<int>(this->dspace),
+            this->d_info, batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<double, T1>) {
+        ret = cusolverDnDgesvdjBatched(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<double *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<double *>(params.S), reinterpret_cast<double *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<double *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<double *>(this->d_workspace), static_cast<int>(this->dspace),
+            this->d_info, batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<cuda::std::complex<float>, T1>) {
+        ret = cusolverDnCgesvdjBatched(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<cuComplex *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.S), reinterpret_cast<cuComplex *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<cuComplex *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<cuComplex *>(this->d_workspace), static_cast<int>(this->dspace),
+            this->d_info, batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<cuda::std::complex<double>, T1>) {
+        ret = cusolverDnZgesvdjBatched(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<cuDoubleComplex *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<double *>(params.S), reinterpret_cast<cuDoubleComplex *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<cuDoubleComplex *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<cuDoubleComplex *>(this->d_workspace), static_cast<int>(this->dspace),
+            this->d_info, batch_params, static_cast<int>(params.batch_size));
+      }
+      else {
+        MATX_THROW(matxInvalidType, "Invalid data type passed to svd()");
+      }
+
+      MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "cusolverDn<t>gesvdjBatched failed");
+    }
+    else {
+      MATX_THROW(matxInvalidType, "Invalid backend for SVD");
     }
 
     std::vector<int> h_info(this->batch_a_ptrs.size());
@@ -701,7 +852,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
       MATX_ASSERT_STR_EXP(info, 0, matxSolverError,
         (std::to_string(info) + " superdiagonals of an intermediate bidiagonal form did not converge to zero in cuSolver Xgesvd").c_str());
     }
-  }
+  }
 }
 
 /**
@@ -717,7 +868,9 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
   std::vector<T1 *> batch_u_ptrs;
   std::vector<T3 *> batch_s_ptrs;
   std::vector<T1 *> batch_vt_ptrs;
+  gesvdjInfo_t batch_params = nullptr;
   DnSVDCUDAParams_t params;
+  bool batched = false;
 };
 
 /**
@@ -800,15 +953,22 @@ void svd_impl(UTensor &&u, STensor &&s,
      Eventually these limitations may be
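A note on the convention difference driving the fixups that follow. Both backends compute the factorization

    A = U \Sigma V^{H}

but gesvd returns V^{H} directly, while gesvdj and gesvdjBatched return V. A caller that documents its output as VT therefore has to apply

    V^{H} = conj(V)^{T}

before storing, which is exactly the conj()/transpose_matrix() post-processing svd_impl() performs below on the GESVDJ_BATCHED path (for real types the conjugation is a no-op, leaving a plain transpose).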
     fixed in cuSolver.
   */
-
+
+  const char job_cusolver = detail::SVDModeToChar(jobz);
+  const bool m_leq_n = a.Size(RANK-2) <= a.Size(RANK-1);
+  const auto method = GetCUDASVDMethod(a);
+
+  // The power iteration method is a custom kernel, so we don't need to check the types
+  if (method == detail::SVDMethod::POW_ITER) {
+    svdbpi_impl(u, s, vt, a, 10, 0.f, exec);
+    return;
+  }
+
   // cuSolver destroys the input, so we need to make a copy of A regardless
   T1 *tp;
   auto a_shape = a.Shape();
   auto a_total_size = std::accumulate(a_shape.begin(), a_shape.begin() + ATensor::Rank(), 1, std::multiplies<index_t>());
-  matxAlloc(reinterpret_cast<void **>(&tp), sizeof(T1) * a_total_size, MATX_ASYNC_DEVICE_MEMORY, stream);
-
-  const char job_cusolver = detail::SVDModeToChar(jobz);
-  const bool m_leq_n = a.Size(RANK-2) <= a.Size(RANK-1);
+  matxAlloc(reinterpret_cast<void **>(&tp), sizeof(T1) * a_total_size, MATX_ASYNC_DEVICE_MEMORY, stream);
 
@@ -816,10 +976,13 @@ void svd_impl(UTensor &&u, STensor &&s,
   if (m_leq_n) {
     // get col-major AT
     auto a_new = make_tensor<T1>(a_shape, MATX_ASYNC_DEVICE_MEMORY, stream);
     (a_new = a).run(exec);
     auto at_col_maj = transpose_matrix(a_new);
 
-    auto u_new = getSolverSupportedTensor(u, exec);
+    // gesvdj returns V rather than VT as gesvd does, so we need to transpose U before storing.
+    // Force an allocation of a temporary tensor here
+    auto u_new = getSolverSupportedTensor(u, exec, method == detail::SVDMethod::GESVDJ_BATCHED);
     auto vt_new = getSolverSupportedTensor(vt, exec);
 
     // Swap U and VT. svd(AT) = V*S*UT
     // Need the tensors to appear like V and UT since that is expected based on the AT view
     // inputted, although the results when read as row-major are VT and U.
     auto u_in = transpose_matrix(vt_new);
@@ -835,7 +998,7 @@ void svd_impl(UTensor &&u, STensor &&s,
         detail::GetCacheIdFromType<detail::svd_cuda_cache_t>(),
         params,
         [&]() {
-          return std::make_shared<cache_val_type>(u_in, s_new, vt_in, at_col_maj, job_cusolver);
+          return std::make_shared<cache_val_type>(u_in, s_new, vt_in, at_col_maj, method, job_cusolver);
         },
         [&](std::shared_ptr<cache_val_type> ctype) {
           ctype->Exec(u_in, s_new, vt_in, at_col_maj, exec, job_cusolver);
         }
     );
 
@@ -843,7 +1006,12 @@ void svd_impl(UTensor &&u, STensor &&s,
     if(!u_new.isSameView(u)) {
-      (u = u_new).run(exec);
+      if (method == detail::SVDMethod::GESVDJ_BATCHED) {
+        // gesvdj returned V rather than VT, so conjugate-transpose when copying back
+        (u = conj(transpose_matrix(u_new))).run(exec);
+      }
+      else {
+        (u = u_new).run(exec);
+      }
     }
     if(!vt_new.isSameView(vt)) {
       (vt = vt_new).run(exec);
     }
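The m <= n branch above leans on a standard transpose identity: cuSolver's gesvd requires m >= n, so a wide matrix is factored through its (column-major) transpose and the outputs swap roles. For complex A,

    A = U \Sigma V^{H}  ==>  A^{T} = conj(V) \Sigma U^{T}

which for real types reads svd(A^T) = V * S * U^T, matching the inline comment: the planner's U slot is fed from vt_new (u_in = transpose_matrix(vt_new)), and the factors are swapped back when the results are copied out.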
@@ -853,6 +1021,7 @@ void svd_impl(UTensor &&u, STensor &&s,
     auto tv = TransposeCopy(tp, a, exec);
     auto tvt = tv.PermuteMatrix();
 
+    auto u_col_maj = make_tensor<T1>(u.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
     auto vt_col_maj = make_tensor<T1>(vt.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
 
@@ -866,7 +1035,7 @@ void svd_impl(UTensor &&u, STensor &&s,
         detail::GetCacheIdFromType<detail::svd_cuda_cache_t>(),
         params,
         [&]() {
-          return std::make_shared<cache_val_type>(u_col_maj, s_new, vt_col_maj, tvt, job_cusolver);
+          return std::make_shared<cache_val_type>(u_col_maj, s_new, vt_col_maj, tvt, method, job_cusolver);
         },
         [&](std::shared_ptr<cache_val_type> ctype) {
           ctype->Exec(u_col_maj, s_new, vt_col_maj, tvt, exec, job_cusolver);
         }
     );
 
@@ -878,14 +1047,19 @@ void svd_impl(UTensor &&u, STensor &&s,
     // only want to swap the shapes, not the strides. Once we have a row-major transposed view,
     // we can set it to the original tensor.
     auto utShape = u.Shape();
-    std::swap(utShape[RANK - 2], utShape[RANK - 1]);
+    cuda::std::swap(utShape[RANK - 2], utShape[RANK - 1]);
     auto ut = u_col_maj.View(utShape);
     (u = transpose_matrix(ut)).run(exec);
 
     auto vShape = vt.Shape();
-    std::swap(vShape[RANK - 2], vShape[RANK - 1]);
+    cuda::std::swap(vShape[RANK - 2], vShape[RANK - 1]);
     auto v = vt_col_maj.View(vShape);
-    (vt = transpose_matrix(v)).run(exec);
+    if (method == detail::SVDMethod::GESVDJ_BATCHED) {
+      // gesvdj returned V; read as a row-major view it is already VT, so only conjugate
+      (vt = conj(v)).run(exec);
+    }
+    else {
+      (vt = transpose_matrix(v)).run(exec);
+    }
   }
 
   if(!s_new.isSameView(s)) {
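One design note on the two dispatch paths in this file: the per-matrix Xgesvd loop walks batch_a_ptrs one matrix at a time, while the gesvdjBatched entry points take a single base pointer per argument and assume a densely packed batch, consistent with the Stride(0) range check in GetCUDASVDMethod(). A self-contained sketch of the full query-plus-execute sequence for the float case (real cuSolver API; all device pointers are assumed to be caller-allocated, and error handling is trimmed for brevity):

    #include <cuda_runtime.h>
    #include <cusolverDn.h>

    // Batched Jacobi SVD of `batch` packed column-major m x n float matrices.
    // Matrix i is read from d_A + i*m*n; the whole batch runs as one launch.
    void run_gesvdj_batched(cusolverDnHandle_t handle, int m, int n, int batch,
                            float *d_A, float *d_S, float *d_U, float *d_V,
                            int *d_info) {
      gesvdjInfo_t params;
      cusolverDnCreateGesvdjInfo(&params);

      int lwork = 0;  // workspace length in elements, reported as an int
      cusolverDnSgesvdjBatched_bufferSize(handle, CUSOLVER_EIG_MODE_VECTOR,
                                          m, n, d_A, m, d_S, d_U, m, d_V, n,
                                          &lwork, params, batch);
      float *d_work;
      cudaMalloc(&d_work, sizeof(float) * lwork);

      // Note: the input A is overwritten, which is why svd_impl() always
      // copies A first; d_info holds one convergence status per matrix.
      cusolverDnSgesvdjBatched(handle, CUSOLVER_EIG_MODE_VECTOR, m, n,
                               d_A, m, d_S, d_U, m, d_V, n,
                               d_work, lwork, d_info, params, batch);

      cudaFree(d_work);
      cusolverDnDestroyGesvdjInfo(params);
    }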
diff --git a/test/00_solver/SVD.cu b/test/00_solver/SVD.cu
index a6d21863..2cbb9b48 100644
--- a/test/00_solver/SVD.cu
+++ b/test/00_solver/SVD.cu
@@ -363,6 +363,125 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched)
   MATX_EXIT_HANDLER();
 }
 
+
+TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatchedSmallMGTN)
+{
+  MATX_ENTER_HANDLER();
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ExecType = cuda::std::tuple_element_t<1, TypeParam>;
+  using value_type = typename inner_op_type_t<TestType>::type;
+
+  constexpr index_t batches = 10;
+  constexpr index_t m = 8;
+  constexpr index_t n = 4;
+
+  auto Av = make_tensor<TestType>({batches, m, n});
+  auto Sv = make_tensor<value_type>({batches, std::min(m, n)});
+  auto Uv = make_tensor<TestType>({batches, m, m});
+  auto VTv = make_tensor<TestType>({batches, n, n});
+
+  auto Dv = make_tensor<TestType>({batches, m, n});
+  auto UDVTv = make_tensor<TestType>({batches, m, n});
+
+  this->pb->template InitAndRunTVGenerator<TestType>("00_solver", "svd", "run", {batches, m, n});
+  this->pb->NumpyToTensorView(Av, "A");
+
+  (mtie(Uv, Sv, VTv) = svd(Av)).run(this->exec);
+
+  this->exec.sync();
+
+  // Since SVD produces a solution that's not necessarily unique, we cannot
+  // compare against Python output. Instead, we just make sure that A = U*S*V'.
+
+  // Construct the batched diagonal matrix D from the vector of singular values S
+  (Dv = zeros<TestType>(Dv.Shape())).run(this->exec);
+  this->exec.sync();
+
+  for (index_t b = 0; b < batches; b++) {
+    for (index_t i = 0; i < Sv.Size(1); i++) {
+      Dv(b, i, i) = Sv(b, i);
+    }
+  }
+
+  (UDVTv = matmul(matmul(Uv, Dv), VTv)).run(this->exec); // (U * S) * V'
+  this->exec.sync();
+
+  for (index_t b = 0; b < batches; b++) {
+    for (index_t i = 0; i < Av.Size(1); i++) {
+      for (index_t j = 0; j < Av.Size(2); j++) {
+        if constexpr (is_complex_v<TestType>) {
+          ASSERT_NEAR(Av(b, i, j).real(), UDVTv(b, i, j).real(), this->thresh) << i << " " << j;
+          ASSERT_NEAR(Av(b, i, j).imag(), UDVTv(b, i, j).imag(), this->thresh) << i << " " << j;
+        }
+        else {
+          ASSERT_NEAR(Av(b, i, j), UDVTv(b, i, j), this->thresh) << i << " " << j;
+        }
+      }
+    }
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatchedSmallMEQN)
+{
+  MATX_ENTER_HANDLER();
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ExecType = cuda::std::tuple_element_t<1, TypeParam>;
+  using value_type = typename inner_op_type_t<TestType>::type;
+
+  constexpr index_t batches = 10;
+  constexpr index_t m = 32;
+  constexpr index_t n = 32;
+
+  auto Av = make_tensor<TestType>({batches, m, n});
+  auto Sv = make_tensor<value_type>({batches, std::min(m, n)});
+  auto Uv = make_tensor<TestType>({batches, m, m});
+  auto VTv = make_tensor<TestType>({batches, n, n});
+
+  auto Dv = make_tensor<TestType>({batches, m, n});
+  auto UDVTv = make_tensor<TestType>({batches, m, n});
+
+  this->pb->template InitAndRunTVGenerator<TestType>("00_solver", "svd", "run", {batches, m, n});
+  this->pb->NumpyToTensorView(Av, "A");
+
+  (mtie(Uv, Sv, VTv) = svd(Av)).run(this->exec);
+
+  this->exec.sync();
+
+  // Since SVD produces a solution that's not necessarily unique, we cannot
+  // compare against Python output. Instead, we just make sure that A = U*S*V'.
+
+  // Construct the batched diagonal matrix D from the vector of singular values S
+  (Dv = zeros<TestType>(Dv.Shape())).run(this->exec);
+  this->exec.sync();
+
+  for (index_t b = 0; b < batches; b++) {
+    for (index_t i = 0; i < Sv.Size(1); i++) {
+      Dv(b, i, i) = Sv(b, i);
+    }
+  }
+
+  (UDVTv = matmul(matmul(Uv, Dv), VTv)).run(this->exec); // (U * S) * V'
+  this->exec.sync();
+
+  for (index_t b = 0; b < batches; b++) {
+    for (index_t i = 0; i < Av.Size(1); i++) {
+      for (index_t j = 0; j < Av.Size(2); j++) {
+        if constexpr (is_complex_v<TestType>) {
+          ASSERT_NEAR(Av(b, i, j).real(), UDVTv(b, i, j).real(), this->thresh) << i << " " << j;
+          ASSERT_NEAR(Av(b, i, j).imag(), UDVTv(b, i, j).imag(), this->thresh) << i << " " << j;
+        }
+        else {
+          ASSERT_NEAR(Av(b, i, j), UDVTv(b, i, j), this->thresh) << i << " " << j;
+        }
+      }
+    }
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
 template <typename TypeParam, int RANK, typename Executor>
 void svdpi_test( const index_t (&AshapeA)[RANK], Executor exec) {
   using AType = TypeParam;
@@ -605,6 +724,7 @@ void svdbpi_test( const index_t (&AshapeA)[RANK], Executor exec) {
   ASSERT_NEAR( mdiffA(), SType(0), .00001);
 
   exec.sync();
+
 }
 
 TYPED_TEST(SVDPISolverTestNonHalfTypes, SVDBPI)