Host Solver workspace query fix #733

Merged: 2 commits, merged Aug 26, 2024

Changes from all commits
47 changes: 6 additions & 41 deletions include/matx/transforms/chol/chol_lapack.h
@@ -77,7 +77,7 @@ class matxDnCholHostPlan_t : matxDnHostSolver_t<typename remove_cvref_t<ATensor>
* matrix using the Cholesky method, where M is either the upper or lower
* triangular portion of A. Input matrix A must be a square, positive-definite
* Hermitian matrix of which only the upper or lower triangle is used. This does
* require a workspace.
* not require a workspace.
*
* @tparam T1
* Data type of A matrix
@@ -181,45 +181,22 @@ class matxDnCholHostPlan_t : matxDnHostSolver_t<typename remove_cvref_t<ATensor>
DnCholHostParams_t params;
};

/**
* Crude hash to get a reasonably good delta for collisions. This doesn't need
* to be perfect, but fast enough to not slow down lookups, and different enough
* so the common solver parameters change
*/
struct DnCholHostParamsKeyHash {
std::size_t operator()(const DnCholHostParams_t &k) const noexcept
{
return (std::hash<uint64_t>()(k.n)) + (std::hash<uint64_t>()(k.batch_size));
}
};

/**
* Test cholesky parameters for equality. Unlike the hash, all parameters must
* match.
*/
struct DnCholHostParamsKeyEq {
bool operator()(const DnCholHostParams_t &l, const DnCholHostParams_t &t) const
noexcept
{
return l.n == t.n && l.batch_size == t.batch_size && l.dtype == t.dtype;
}
};

using chol_Host_cache_t = std::unordered_map<DnCholHostParams_t, std::any, DnCholHostParamsKeyHash, DnCholHostParamsKeyEq>;
#endif

} // end namespace detail


/**
* Perform a Cholesky decomposition using a cached plan
* Perform a Cholesky decomposition.
*
* See documentation of matxDnCholHostPlan_t for a description of how the
* algorithm works. This function provides a simple interface to the LAPACK
* library by deducing all parameters needed to perform a Cholesky decomposition
* from only the matrix A. The input and output parameters may be the same
* tensor. In that case, the input is destroyed and the output is stored
* in-place. Input must be a positive-definite Hermitian or real symmetric matrix.
*
* No caching is performed, since no LAPACK allocation or setup is needed.
*
* @tparam T1
* Data type of matrix A
@@ -280,20 +257,8 @@ void chol_impl([[maybe_unused]] OutputTensor &&out,

const char uplo_lapack = (uplo == SolverFillMode::UPPER)? 'U' : 'L';

// Get parameters required by these tensors
auto params = detail::matxDnCholHostPlan_t<OutputTensor, decltype(tv)>::GetCholParams(tv, uplo_lapack);

using cache_val_type = detail::matxDnCholHostPlan_t<OutputTensor, decltype(a_new)>;
detail::GetCache().LookupAndExec<detail::chol_Host_cache_t>(
detail::GetCacheIdFromType<detail::chol_Host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(tv, uplo_lapack);
},
[&](std::shared_ptr<cache_val_type> ctype) {
ctype->Exec(tv, tv, exec, uplo_lapack);
}
);
detail::matxDnCholHostPlan_t<OutputTensor, decltype(a_new)> chol_plan(tv, uplo_lapack);
chol_plan.Exec(tv, tv, exec, uplo_lapack);

if (! allContiguous) {
matx::copy(out, tv, exec);
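For context, a hedged usage sketch of the host Cholesky path this diff simplifies. The public names below (matx::chol, matx::SolverFillMode, matx::HostExecutor) are assumed from the MatX documentation and may differ across versions; this is a sketch, not code from the PR:

#include "matx.h"

int main() {
  MATX_ENTER_HANDLER();
  // Assumed public API: matx::chol(), matx::SolverFillMode, matx::HostExecutor.
  auto A = matx::make_tensor<float>({4, 4});
  // ... fill A with a positive-definite Hermitian/symmetric matrix ...
  matx::HostExecutor exec{};
  // With this PR, each call constructs the LAPACK plan directly; there is no
  // cache lookup, since potrf needs no workspace allocation or setup.
  (A = matx::chol(A, matx::SolverFillMode::LOWER)).run(exec);
  MATX_EXIT_HANDLER();
  return 0;
}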
40 changes: 26 additions & 14 deletions include/matx/transforms/eig/eig_lapack.h
@@ -71,7 +71,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
using T2 = typename WTensor::value_type;
static constexpr int RANK = OutTensor_t::Rank();
static_assert(RANK >= 2, "Input/Output tensor must be rank 2 or higher");

/**
* Plan computing eigenvalues/vectors on square Hermitian A such that:
*
@@ -122,12 +122,20 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
{
// Perform a workspace query with lwork = -1.
lapack_int_t info;
T1 work_query;
T2 rwork_query;

// Due to a LAPACK bug that yields incorrect workspace sizes for single-precision
// routines at large matrix sizes (caused by float-to-int conversions), use double
// precision for the query. This only affects routines that may need O(n^2) memory.
using WorkQuery = std::conditional_t<std::is_same_v<T1, float>, double,
std::conditional_t<std::is_same_v<T1, cuda::std::complex<float>>,
cuda::std::complex<double>, T1>>;
using RworkQuery = std::conditional_t<std::is_same_v<T2, float>, double, T2>;
WorkQuery work_query;
RworkQuery rwork_query;
lapack_int_t iwork_query;

// Use vector mode for a larger workspace size that works for both modes
syevd_dispatch("V", &params.uplo, &params.n, nullptr, &params.n,
syevd_dispatch<WorkQuery, RworkQuery>("V", &params.uplo, &params.n, nullptr, &params.n,
nullptr, &work_query, &this->lwork, &rwork_query,
&this->lrwork, &iwork_query, &this->liwork, &info);
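An aside on why the query is promoted to double precision (an illustrative standalone snippet, not code from this PR): single-precision routines report the optimal lwork through a float, whose 24-bit mantissa cannot represent every integer above 2^24, so the float-to-int conversion can round a large workspace size down. A minimal C++ demonstration:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical optimal lwork for an n = 20,000 query: 1 + 2*n + n*n.
  const int64_t lwork_exact = 1 + 2*20000LL + 20000LL*20000LL; // 400,040,001

  // A single-precision routine returns this through a float; above 2^24 the
  // value is rounded to the nearest representable float.
  const float  via_float  = static_cast<float>(lwork_exact);
  const double via_double = static_cast<double>(lwork_exact);

  std::printf("exact:      %lld\n", static_cast<long long>(lwork_exact)); // 400040001
  std::printf("via float:  %lld\n", static_cast<long long>(via_float));   // 400040000, one element short
  std::printf("via double: %lld\n", static_cast<long long>(via_double));  // 400040001, exact
  return 0;
}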

Expand All @@ -140,7 +148,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
this->lrwork = static_cast<lapack_int_t>(rwork_query);
} else {
this->lwork = static_cast<lapack_int_t>(work_query);
this->lrwork = 0; // Complex variants do not use rwork.
this->lrwork = 0; // rwork is not used for real types
}
this->liwork = static_cast<lapack_int_t>(iwork_query);
}
@@ -212,22 +220,26 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
~matxDnEigHostPlan_t() {}

private:
/**
* Templated on AType and WType rather than using T1 and T2, since workspace queries always use the double-precision variants
*/
template <typename AType, typename WType>
void syevd_dispatch(const char* jobz, const char* uplo, const lapack_int_t* n,
T1* a, const lapack_int_t* lda, T2* w, T1* work_in,
const lapack_int_t* lwork_in, [[maybe_unused]] T2* rwork_in,
AType* a, const lapack_int_t* lda, WType* w, AType* work_in,
const lapack_int_t* lwork_in, [[maybe_unused]] WType* rwork_in,
[[maybe_unused]] const lapack_int_t* lrwork_in, lapack_int_t* iwork_in,
const lapack_int_t* liwork_in, lapack_int_t* info)
{
// TODO: remove warning suppression once syevd is optimized in NVPL LAPACK
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
if constexpr (std::is_same_v<T1, float>) {
if constexpr (std::is_same_v<AType, float>) {
LAPACK_CALL(ssyevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, iwork_in, liwork_in, info);
} else if constexpr (std::is_same_v<T1, double>) {
} else if constexpr (std::is_same_v<AType, double>) {
LAPACK_CALL(dsyevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, iwork_in, liwork_in, info);
} else if constexpr (std::is_same_v<T1, cuda::std::complex<float>>) {
} else if constexpr (std::is_same_v<AType, cuda::std::complex<float>>) {
LAPACK_CALL(cheevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, rwork_in, lrwork_in, iwork_in, liwork_in, info);
} else if constexpr (std::is_same_v<T1, cuda::std::complex<double>>) {
} else if constexpr (std::is_same_v<AType, cuda::std::complex<double>>) {
LAPACK_CALL(zheevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, rwork_in, lrwork_in, iwork_in, liwork_in, info);
}
#pragma GCC diagnostic pop
@@ -260,7 +272,7 @@ struct DnEigHostParamsKeyEq {
}
};

using eig_Host_cache_t = std::unordered_map<DnEigHostParams_t, std::any, DnEigHostParamsKeyHash, DnEigHostParamsKeyEq>;
using eig_host_cache_t = std::unordered_map<DnEigHostParams_t, std::any, DnEigHostParamsKeyHash, DnEigHostParamsKeyEq>;
#endif

} // end namespace detail
@@ -335,8 +347,8 @@ void eig_impl([[maybe_unused]] OutputTensor &&out,

// Get cache or new eigen plan if it doesn't exist
using cache_val_type = detail::matxDnEigHostPlan_t<OutputTensor, decltype(w_new), decltype(a_new)>;
detail::GetCache().LookupAndExec<detail::eig_Host_cache_t>(
detail::GetCacheIdFromType<detail::eig_Host_cache_t>(),
detail::GetCache().LookupAndExec<detail::eig_host_cache_t>(
detail::GetCacheIdFromType<detail::eig_host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(w_new, tv, jobz_lapack, uplo_lapack);
45 changes: 4 additions & 41 deletions include/matx/transforms/lu/lu_lapack.h
@@ -188,32 +188,6 @@ class matxDnLUHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
DnLUHostParams_t params;
};

/**
* Crude hash to get a reasonably good delta for collisions. This doesn't need
* to be perfect, but fast enough to not slow down lookups, and different enough
* so the common solver parameters change
*/
struct DnLUHostParamsKeyHash {
std::size_t operator()(const DnLUHostParams_t &k) const noexcept
{
return (std::hash<uint64_t>()(k.m)) + (std::hash<uint64_t>()(k.n)) +
(std::hash<uint64_t>()(k.batch_size));
}
};

/**
* Test LU parameters for equality. Unlike the hash, all parameters must match.
*/
struct DnLUHostParamsKeyEq {
bool operator()(const DnLUHostParams_t &l, const DnLUHostParams_t &t) const noexcept
{
return l.n == t.n && l.m == t.m && l.batch_size == t.batch_size &&
l.dtype == t.dtype;
}
};

// Static caches of LU this->handles
using lu_Host_cache_t = std::unordered_map<DnLUHostParams_t, std::any, DnLUHostParamsKeyHash, DnLUHostParamsKeyEq>;
#endif

} // end namespace detail
@@ -227,6 +201,8 @@ using lu_Host_cache_t = std::unordered_map<DnLUHostParams_t, std::any, DnLUHostP
* library by deducing all parameters needed to perform an LU decomposition from
* only the matrix A. The input and output parameters may be the same tensor. In
* that case, the input is destroyed and the output is stored in-place.
*
* No caching is performed, since no LAPACK allocation or setup is needed.
*
* @tparam T1
* Data type of matrix A
@@ -272,21 +248,8 @@ void lu_impl([[maybe_unused]] OutputTensor &&out,
auto tv = TransposeCopy(tp, a_new, exec);
auto tvt = tv.PermuteMatrix();

// Get parameters required by these tensors
auto params = detail::matxDnLUHostPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)>::GetLUParams(piv_new, tvt);

// Get cache or new LU plan if it doesn't exist
using cache_val_type = detail::matxDnLUHostPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)>;
detail::GetCache().LookupAndExec<detail::lu_Host_cache_t>(
detail::GetCacheIdFromType<detail::lu_Host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(piv_new, tvt);
},
[&](std::shared_ptr<cache_val_type> ctype) {
ctype->Exec(tvt, piv_new, tvt, exec);
}
);
detail::matxDnLUHostPlan_t<OutputTensor, decltype(piv_new), decltype(a_new)> lu_plan(piv_new, tvt);
lu_plan.Exec(tvt, piv_new, tvt, exec);

/* Temporary WAR
* Copy and free async buffer for transpose */
6 changes: 3 additions & 3 deletions include/matx/transforms/qr/qr_lapack.h
@@ -234,7 +234,7 @@ struct DnQRHostParamsKeyEq {
}
};

using qr_Host_cache_t = std::unordered_map<DnQRHostParams_t, std::any, DnQRHostParamsKeyHash, DnQRHostParamsKeyEq>;
using qr_host_cache_t = std::unordered_map<DnQRHostParams_t, std::any, DnQRHostParamsKeyHash, DnQRHostParamsKeyEq>;
#endif

} // end namespace detail
@@ -297,8 +297,8 @@ void qr_solver_impl([[maybe_unused]] OutTensor &&out,

// Get cache or new QR plan if it doesn't exist
using cache_val_type = detail::matxDnQRHostPlan_t<OutTensor, decltype(tau_new), decltype(a_new)>;
detail::GetCache().LookupAndExec<detail::qr_Host_cache_t>(
detail::GetCacheIdFromType<detail::qr_Host_cache_t>(),
detail::GetCache().LookupAndExec<detail::qr_host_cache_t>(
detail::GetCacheIdFromType<detail::qr_host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(tau_new, tvt);
41 changes: 27 additions & 14 deletions include/matx/transforms/svd/svd_lapack.h
@@ -141,11 +141,11 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
// larger workspace size that works for all modes

lapack_int_t info;
T1 work_query;
lapack_int_t mn = cuda::std::min(params.m, params.n);
lapack_int_t mx = cuda::std::max(params.m, params.n);

if (params.algo == SVDHostAlgo::QR) {
T1 work_query;
gesvd_dispatch("A", "A", &params.m, &params.n, nullptr,
&params.m, nullptr, nullptr, &params.m, nullptr, &params.n,
&work_query, &this->lwork, nullptr, &info);
@@ -163,7 +163,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
this->lrwork = 0;
}
} else if (params.algo == SVDHostAlgo::DC) {
gesdd_dispatch("A", &params.m, &params.n, nullptr, &params.m, nullptr,
// Due to a LAPACK bug that yields incorrect workspace sizes for single-precision
// routines at large matrix sizes (caused by float-to-int conversions), use double
// precision for the query. This only affects routines that may need O(n^2) memory.
using WorkQuery = std::conditional_t<std::is_same_v<T1, float>, double,
std::conditional_t<std::is_same_v<T1, cuda::std::complex<float>>,
cuda::std::complex<double>, T1>>;
using RworkQuery = std::conditional_t<std::is_same_v<T3, float>, double, T3>;
WorkQuery work_query;

gesdd_dispatch<WorkQuery, RworkQuery>("A", &params.m, &params.n, nullptr, &params.m, nullptr,
nullptr, &params.m, nullptr, &params.n, &work_query,
&this->lwork, nullptr, nullptr, &info);

@@ -173,11 +182,11 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
this->liwork = 8 * mn; // iwork has size 8*min(M,N) and is used for all types

// the real part of the first elem of work holds the optimal lwork.
if constexpr (is_complex_v<T1>) {
if constexpr (is_complex_v<WorkQuery>) {
this->lwork = static_cast<lapack_int_t>(work_query.real());

lapack_int_t mnthr = (mn * 5) / 3;
if (mx >= mnthr) {
if (mx >= mnthr) { // mx >> mn condition in LAPACK
this->lrwork = 5*mn*mn + 5*mn;
} else {
this->lrwork = cuda::std::max(5*mn*mn + 5*mn, 2*mx*mn + 2*mn*mn + mn);
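The complex-path lrwork sizing above follows the LRWORK formulas documented for netlib's zgesdd with jobz='A'. Restated as a small standalone helper for readability (an illustration, not code from this PR):

#include <algorithm>
#include <cstdint>

// lrwork rule for complex gesdd with jobz='A', as used in the diff above.
// mnthr = 5*mn/3 is LAPACK's crossover to the "mx much larger than mn" path.
int64_t gesdd_lrwork_complex(int64_t m, int64_t n) {
  const int64_t mn = std::min(m, n);
  const int64_t mx = std::max(m, n);
  const int64_t mnthr = (mn * 5) / 3;
  if (mx >= mnthr) {
    return 5*mn*mn + 5*mn;
  }
  return std::max<int64_t>(5*mn*mn + 5*mn, 2*mx*mn + 2*mn*mn + mn);
}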
@@ -297,18 +306,22 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t<typename ATensor::value_type> {
#pragma GCC diagnostic pop
}

/**
* Templated on T and S rather than using T1 and T3, since workspace queries always use the double-precision variants
*/
template <typename T, typename S>
void gesdd_dispatch(const char *jobz, const lapack_int_t *m, const lapack_int_t *n,
T1 *a, const lapack_int_t *lda, T3 *s, T1 *u, const lapack_int_t *ldu,
T1 *vt, const lapack_int_t *ldvt, T1 *work_in, const lapack_int_t *lwork_in,
[[maybe_unused]] T3 *rwork_in, lapack_int_t *iwork_in, lapack_int_t *info)
T *a, const lapack_int_t *lda, S *s, T *u, const lapack_int_t *ldu,
T *vt, const lapack_int_t *ldvt, T *work_in, const lapack_int_t *lwork_in,
[[maybe_unused]] S *rwork_in, lapack_int_t *iwork_in, lapack_int_t *info)
{
if constexpr (std::is_same_v<T1, float>) {
if constexpr (std::is_same_v<T, float>) {
LAPACK_CALL(sgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, iwork_in, info);
} else if constexpr (std::is_same_v<T1, double>) {
} else if constexpr (std::is_same_v<T, double>) {
LAPACK_CALL(dgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, iwork_in, info);
} else if constexpr (std::is_same_v<T1, cuda::std::complex<float>>) {
} else if constexpr (std::is_same_v<T, cuda::std::complex<float>>) {
LAPACK_CALL(cgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, rwork_in, iwork_in, info);
} else if constexpr (std::is_same_v<T1, cuda::std::complex<double>>) {
} else if constexpr (std::is_same_v<T, cuda::std::complex<double>>) {
LAPACK_CALL(zgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, rwork_in, iwork_in, info);
}
}
@@ -343,7 +356,7 @@ struct DnSVDHostParamsKeyEq {
}
};

using svd_Host_cache_t = std::unordered_map<DnSVDHostParams_t, std::any, DnSVDHostParamsKeyHash, DnSVDHostParamsKeyEq>;
using svd_host_cache_t = std::unordered_map<DnSVDHostParams_t, std::any, DnSVDHostParamsKeyHash, DnSVDHostParamsKeyEq>;
#endif

}
@@ -424,8 +437,8 @@ void svd_impl([[maybe_unused]] UTensor &&u,

// Get cache or new SVD plan if it doesn't exist
using cache_val_type = detail::matxDnSVDHostPlan_t<decltype(u_in), decltype(s_new), decltype(vt_in), decltype(at_col_maj)>;
detail::GetCache().LookupAndExec<detail::svd_Host_cache_t>(
detail::GetCacheIdFromType<detail::svd_Host_cache_t>(),
detail::GetCache().LookupAndExec<detail::svd_host_cache_t>(
detail::GetCacheIdFromType<detail::svd_host_cache_t>(),
params,
[&]() {
return std::make_shared<cache_val_type>(u_in, s_new, vt_in, at_col_maj, job_lapack, algo);