From c33d749732d747115557c3d54e4fff3e78bc7302 Mon Sep 17 00:00:00 2001
From: Cliff Burdick <30670611+cliffburdick@users.noreply.github.com>
Date: Thu, 21 Nov 2024 17:15:54 -0800
Subject: [PATCH] Faster batched SVD for small sizes (#805)

---
 include/matx/transforms/solver_common.h |   6 +-
 include/matx/transforms/svd/svd_cuda.h  | 256 ++++++++++++++++++++----
 test/00_solver/SVD.cu                   | 120 +++++++++++
 3 files changed, 338 insertions(+), 44 deletions(-)

diff --git a/include/matx/transforms/solver_common.h b/include/matx/transforms/solver_common.h
index 7e065ab1..67d8f46a 100644
--- a/include/matx/transforms/solver_common.h
+++ b/include/matx/transforms/solver_common.h
@@ -118,10 +118,10 @@ __MATX_INLINE__ char SVDModeToChar(SVDMode jobz) {
 
 template <typename Op, typename Executor>
-__MATX_INLINE__ auto getSolverSupportedTensor(const Op &in, const Executor &exec) {
+__MATX_INLINE__ auto getSolverSupportedTensor(const Op &in, const Executor &exec, bool force = false) {
   constexpr int RANK = Op::Rank();
 
-  bool supported = true;
+  bool supported = !force; // If we're forcing a new tensor, just mark the input unsupported
   if constexpr (!is_tensor_view_v<Op>) {
     supported = false;
   } else {
@@ -274,7 +274,7 @@ class matxDnCUDASolver_t {
       }
     }
     else {
-#endif
+#endif
       if (dspace > 0) {
         matxAlloc(&d_workspace, batches * dspace, MATX_DEVICE_MEMORY);
       }
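The new `force` parameter short-circuits the usual "is this already a solver-compatible tensor?" check so that a temporary is always allocated. A minimal sketch of the contract as implied by `supported = !force`; `op` and `exec` are hypothetical stand-ins, and the real call site appears in svd_cuda.h below:

    // force == false: returns `op` itself when it is already a dense tensor
    //                 view the cuSolver wrappers can consume; otherwise a
    //                 freshly allocated temporary.
    // force == true:  always returns a freshly allocated temporary, even when
    //                 `op` could have been used directly. The gesvdj path
    //                 below uses this so it can post-process U afterwards.
    auto work = getSolverSupportedTensor(op, exec, /*force=*/true);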
diff --git a/include/matx/transforms/svd/svd_cuda.h b/include/matx/transforms/svd/svd_cuda.h
index c8e513c9..0ef4f8b5 100644
--- a/include/matx/transforms/svd/svd_cuda.h
+++ b/include/matx/transforms/svd/svd_cuda.h
@@ -50,6 +50,12 @@ namespace matx {
 
 namespace detail {
 
+enum class SVDMethod {
+  GESVD,
+  GESVDJ_BATCHED,
+  POW_ITER,
+};
+
 template <typename AType>
 inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) {
   using ATypeS = typename AType::value_type;
@@ -536,8 +542,44 @@ struct DnSVDCUDAParams_t {
   void *S;
   size_t batch_size;
   MatXDataType_t dtype;
+  SVDMethod method;
 };
 
+template <typename ATensor>
+static __MATX_INLINE__ SVDMethod GetCUDASVDMethod(const ATensor &a) {
+  static constexpr int RANK = ATensor::Rank();
+  const index_t m = a.Size(RANK - 2);
+  const index_t n = a.Size(RANK - 1);
+  SVDMethod method = detail::SVDMethod::GESVD;
+
+  // A rank-2 input is a single, typically larger matrix, in which case gesvd should win out on speed
+  if (a.Rank() == 2) {
+    method = detail::SVDMethod::GESVD;
+  }
+  else {
+    if (m <= 32 && n <= 32) {
+      if constexpr (is_tensor_view_v<ATensor>) {
+        #if !defined(INDEX_32_BIT)
+          // gesvdjBatched uses 32-bit indexing, so the batch stride must fit in an int
+          if (a.Stride(0) < std::numeric_limits<int>::max()) {
+            method = detail::SVDMethod::GESVDJ_BATCHED;
+          }
+        #else
+          method = detail::SVDMethod::GESVDJ_BATCHED;
+        #endif
+      }
+      else {
+        method = detail::SVDMethod::GESVDJ_BATCHED;
+      }
+    }
+    else {
+      method = detail::SVDMethod::GESVD;
+    }
+  }
+
+  return method;
+}
+
 template <typename UTensor, typename STensor, typename VtTensor, typename ATensor>
 class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
   using T1 = typename ATensor::value_type;
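To make the heuristic concrete: batched inputs whose two trailing dimensions are both at most 32 take the new Jacobi path (cuSolver's gesvdjBatched documents the same 32x32 size limit); everything else stays on gesvd. A minimal end-to-end sketch against MatX's public API, assuming the random() operator and cudaExecutor shown in MatX's examples; any other data/executor setup works the same way:

    #include "matx.h"
    using namespace matx;

    int main() {
      cudaExecutor exec{};
      // 1000 independent 16x16 SVDs: rank > 2 and m, n <= 32, so
      // GetCUDASVDMethod() selects SVDMethod::GESVDJ_BATCHED.
      auto A  = make_tensor<float>({1000, 16, 16});
      auto U  = make_tensor<float>({1000, 16, 16});
      auto S  = make_tensor<float>({1000, 16});
      auto VT = make_tensor<float>({1000, 16, 16});
      (A = random<float>(A.Shape(), UNIFORM)).run(exec);
      (mtie(U, S, VT) = svd(A)).run(exec);   // dispatches to the batched plan
      exec.sync();
      return 0;
    }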
@@ -583,6 +625,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
                       STensor &s,
                       VtTensor &vt,
                       const ATensor &a,
+                      SVDMethod method,
                       const char jobz = 'A') {
     MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
@@ -600,24 +643,83 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
     MATX_STATIC_ASSERT_STR((std::is_same_v<typename inner_op_type_t<T1>::type, T3>), matxInvalidType, "A and S inner types must match");
 
     params = GetSVDParams(u, s, vt, a, jobz);
+    params.method = method;
+
+    if (params.method == SVDMethod::GESVDJ_BATCHED) {
+      batched = true;
+
+      [[maybe_unused]] cusolverStatus_t ret;
+      ret = cusolverDnCreateGesvdjInfo(&batch_params);
+      MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnCreateGesvdjInfo");
+
+      ret = cusolverDnXgesvdjSetTolerance(batch_params, 1e-9);
+      MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetTolerance");
+
+      ret = cusolverDnXgesvdjSetMaxSweeps(batch_params, 15);
+      MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetMaxSweeps");
+    }
+
     this->GetWorkspaceSize();
-    this->AllocateWorkspace(params.batch_size, false);
+    this->AllocateWorkspace(params.batch_size, params.method == SVDMethod::GESVDJ_BATCHED);
   }
 
   void GetWorkspaceSize() override
   {
+    cusolverStatus_t ret;
+
     // Use 'A' (all) mode for a larger workspace size that works for all modes
-    cusolverStatus_t ret =
-        cusolverDnXgesvd_bufferSize(
-            this->handle, this->dn_params, 'A', 'A', params.m, params.n,
-            MatXTypeToCudaType<T1>(), params.A, params.m,
-            MatXTypeToCudaType<T3>(), params.S, MatXTypeToCudaType<T1>(),
-            params.U, params.m, MatXTypeToCudaType<T1>(), params.VT, params.n,
-            MatXTypeToCudaType<T1>(), &this->dspace, &this->hspace);
+    if (!batched) {
+      ret = cusolverDnXgesvd_bufferSize(
+          this->handle, this->dn_params, 'A', 'A', params.m, params.n,
+          MatXTypeToCudaType<T1>(), params.A, params.m,
+          MatXTypeToCudaType<T3>(), params.S, MatXTypeToCudaType<T1>(),
+          params.U, params.m, MatXTypeToCudaType<T1>(), params.VT, params.n,
+          MatXTypeToCudaType<T1>(), &this->dspace, &this->hspace);
+    }
+    else {
+      if constexpr (std::is_same_v<float, T1>) {
+        ret = cusolverDnSgesvdjBatched_bufferSize(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<float *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.S), reinterpret_cast<float *>(params.U), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<int *>(&this->dspace), batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<double, T1>) {
+        ret = cusolverDnDgesvdjBatched_bufferSize(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<double *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<double *>(params.S), reinterpret_cast<double *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<double *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<int *>(&this->dspace), batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<cuda::std::complex<float>, T1>) {
+        ret = cusolverDnCgesvdjBatched_bufferSize(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<cuComplex *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.S), reinterpret_cast<cuComplex *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<cuComplex *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<int *>(&this->dspace), batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<cuda::std::complex<double>, T1>) {
+        ret = cusolverDnZgesvdjBatched_bufferSize(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<cuDoubleComplex *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<double *>(params.S), reinterpret_cast<cuDoubleComplex *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<cuDoubleComplex *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<int *>(&this->dspace), batch_params, static_cast<int>(params.batch_size));
+      }
+      else {
+        MATX_THROW(matxInvalidType, "Invalid data type passed to svd()");
+      }
+
+      // The gesvdjBatched workspace query writes an int into the low bytes of
+      // dspace, so pull it back out as an int before using it as a size
+      int dspace_tmp = *reinterpret_cast<int *>(&this->dspace);
+      this->dspace = dspace_tmp;
+      this->hspace = 0;
+    }
 
     MATX_ASSERT(ret == CUSOLVER_STATUS_SUCCESS, matxSolverError);
   }
 
+
   static DnSVDCUDAParams_t GetSVDParams(UTensor &u,
                                         STensor &s,
                                         VtTensor &vt,
                                         const ATensor &a,
@@ -633,7 +735,6 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
     params.S = s.Data();
     params.jobz = jobz;
     params.dtype = TypeToInt<T1>();
-
     return params;
   }
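One subtlety worth calling out above: the gesvdjBatched _bufferSize functions report the workspace length through an int, counted in elements of the data type, while the plan stores dspace in a wider integer field, hence the reinterpret/copy-back. The underlying cuSolver call shape, shown standalone for the float case (real cuSolver API; handle, pointers, and sizes are placeholders supplied by the caller):

    // m, n <= 32; all matrices share one size; lda = m, ldu = m, ldv = n.
    int lwork = 0;
    cusolverStatus_t st = cusolverDnSgesvdjBatched_bufferSize(
        handle, CUSOLVER_EIG_MODE_VECTOR,
        m, n,
        d_A, m,             // packed column-major batch of A matrices
        d_S,                // min(m,n) singular values per matrix
        d_U, m,             // left singular vectors
        d_V, n,             // right singular vectors: V, not V^H
        &lwork,             // workspace length, in elements (int!)
        gesvdj_params, batch_size);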
@@ -643,6 +744,8 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
   {
     MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
 
+    cusolverStatus_t ret;
+
     // Batch size checks
     for(int i = 0 ; i < RANK-2; i++) {
       MATX_ASSERT_STR(u.Size(i) == a.Size(i), matxInvalidDim, "U and A must have the same batch sizes");
@@ -661,30 +764,78 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
     }
     MATX_ASSERT_STR(s.Size(RANK-2) == cuda::std::min(params.m, params.n), matxInvalidSize, "S must be ... x min(m,n)");
 
+    const auto stream = exec.getStream();
+    cusolverDnSetStream(this->handle, stream);
+
     SetBatchPointers<BatchType::MATRIX>(a, this->batch_a_ptrs);
     SetBatchPointers<BatchType::MATRIX>(u, this->batch_u_ptrs);
     SetBatchPointers<BatchType::MATRIX>(vt, this->batch_vt_ptrs);
     SetBatchPointers<BatchType::VECTOR>(s, this->batch_s_ptrs);
 
-    const auto stream = exec.getStream();
-    cusolverDnSetStream(this->handle, stream);
-
     const int64_t ldvt = vt.Size(RANK-2);
 
-    // At this time cuSolver does not have a batched 64-bit SVD interface. Change
-    // this to use the batched version once available.
-    for (size_t i = 0; i < this->batch_a_ptrs.size(); i++) {
-
-      auto ret = cusolverDnXgesvd(
-          this->handle, this->dn_params, jobz, jobz, params.m, params.n,
-          MatXTypeToCudaType<T1>(), this->batch_a_ptrs[i], params.m,
-          MatXTypeToCudaType<T3>(), this->batch_s_ptrs[i], MatXTypeToCudaType<T1>(),
-          this->batch_u_ptrs[i], params.m, MatXTypeToCudaType<T1>(), this->batch_vt_ptrs[i],
-          ldvt, MatXTypeToCudaType<T1>(),
-          reinterpret_cast<uint8_t *>(this->d_workspace) + i * this->dspace, this->dspace,
-          reinterpret_cast<uint8_t *>(this->h_workspace) + i * this->hspace, this->hspace,
-          this->d_info + i);
-
-      MATX_ASSERT(ret == CUSOLVER_STATUS_SUCCESS, matxSolverError);
+    if (params.method == SVDMethod::GESVD) {
+      // At this time cuSolver does not have a batched 64-bit SVD interface. Change
+      // this to use the batched version once available.
+      for (size_t i = 0; i < this->batch_a_ptrs.size(); i++) {
+        ret = cusolverDnXgesvd(
+            this->handle, this->dn_params, jobz, jobz, params.m, params.n,
+            MatXTypeToCudaType<T1>(), this->batch_a_ptrs[i], params.m,
+            MatXTypeToCudaType<T3>(), this->batch_s_ptrs[i], MatXTypeToCudaType<T1>(),
+            this->batch_u_ptrs[i], params.m, MatXTypeToCudaType<T1>(), this->batch_vt_ptrs[i],
+            ldvt, MatXTypeToCudaType<T1>(),
+            reinterpret_cast<uint8_t *>(this->d_workspace) + i * this->dspace, this->dspace,
+            reinterpret_cast<uint8_t *>(this->h_workspace) + i * this->hspace, this->hspace,
+            this->d_info + i);
+
+        MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "cusolverDnXgesvd failed");
+      }
+    }
+    else if (params.method == SVDMethod::GESVDJ_BATCHED) {
+      if constexpr (std::is_same_v<float, T1>) {
+        ret = cusolverDnSgesvdjBatched(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<float *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.S), reinterpret_cast<float *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<float *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<float *>(this->d_workspace), static_cast<int>(this->dspace),
+            this->d_info, batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<double, T1>) {
+        ret = cusolverDnDgesvdjBatched(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<double *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<double *>(params.S), reinterpret_cast<double *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<double *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<double *>(this->d_workspace), static_cast<int>(this->dspace),
+            this->d_info, batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<cuda::std::complex<float>, T1>) {
+        ret = cusolverDnCgesvdjBatched(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<cuComplex *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<float *>(params.S), reinterpret_cast<cuComplex *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<cuComplex *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<cuComplex *>(this->d_workspace), static_cast<int>(this->dspace),
+            this->d_info, batch_params, static_cast<int>(params.batch_size));
+      }
+      else if constexpr (std::is_same_v<cuda::std::complex<double>, T1>) {
+        ret = cusolverDnZgesvdjBatched(
+            this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast<int>(params.m), static_cast<int>(params.n),
+            reinterpret_cast<cuDoubleComplex *>(params.A), static_cast<int>(params.m),
+            reinterpret_cast<double *>(params.S), reinterpret_cast<cuDoubleComplex *>(params.U),
+            static_cast<int>(params.m), reinterpret_cast<cuDoubleComplex *>(params.VT), static_cast<int>(params.n),
+            reinterpret_cast<cuDoubleComplex *>(this->d_workspace), static_cast<int>(this->dspace),
+            this->d_info, batch_params, static_cast<int>(params.batch_size));
+      }
+      else {
+        MATX_THROW(matxInvalidType, "Invalid data type passed to svd()");
+      }
+
+      MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "cusolverDn<t>gesvdjBatched failed");
+    }
+    else {
+      MATX_THROW(matxInvalidType, "Invalid backend for SVD");
     }
 
     std::vector<int> h_info(this->batch_a_ptrs.size());
@@ -701,7 +852,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
       MATX_ASSERT_STR_EXP(info, 0, matxSolverError,
         (std::to_string(info) + " superdiagonals of an intermediate bidiagonal form did not converge to zero in cuSolver Xgesvd").c_str());
     }
-  }
+  }
 }
 
 /**
@@ -717,7 +868,9 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t {
   std::vector<T1 *> batch_u_ptrs;
   std::vector<T3 *> batch_s_ptrs;
   std::vector<T1 *> batch_vt_ptrs;
+  gesvdjInfo_t batch_params = nullptr;
   DnSVDCUDAParams_t params;
+  bool batched = false;
 };
 
 /**
@@ -800,15 +953,22 @@ void svd_impl(UTensor &&u, STensor &&s,
      Eventually these limitations may be
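A note on the convention difference driving the fixups that follow. Both backends compute the factorization

    A = U \Sigma V^{H}

but gesvd returns V^{H} directly, while gesvdj and gesvdjBatched return V. A caller that documents its output as VT therefore has to apply

    V^{H} = conj(V)^{T}

before storing, which is exactly the conj()/transpose_matrix() post-processing svd_impl() performs below on the GESVDJ_BATCHED path (for real types the conjugation is a no-op, leaving a plain transpose).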
     fixed in cuSolver.
   */
-
+
+  const char job_cusolver = detail::SVDModeToChar(jobz);
+  const bool m_leq_n = a.Size(RANK-2) <= a.Size(RANK-1);
+  const auto method = GetCUDASVDMethod(a);
+
+  // The power iteration method is a custom kernel, so we don't need to check the types
+  if (method == detail::SVDMethod::POW_ITER) {
+    svdbpi_impl(u, s, vt, a, 10, 0.f, exec);
+    return;
+  }
+
   // cuSolver destroys the input, so we need to make a copy of A regardless
   T1 *tp;
   auto a_shape = a.Shape();
   auto a_total_size = std::accumulate(a_shape.begin(), a_shape.begin() + ATensor::Rank(), 1, std::multiplies<index_t>());
-  matxAlloc(reinterpret_cast<void **>(&tp), sizeof(T1) * a_total_size, MATX_ASYNC_DEVICE_MEMORY, stream);
-
-  const char job_cusolver = detail::SVDModeToChar(jobz);
-  const bool m_leq_n = a.Size(RANK-2) <= a.Size(RANK-1);
+  matxAlloc(reinterpret_cast<void **>(&tp), sizeof(T1) * a_total_size, MATX_ASYNC_DEVICE_MEMORY, stream);
 
@@ -816,10 +976,13 @@ void svd_impl(UTensor &&u, STensor &&s,
   if (m_leq_n) {
     // get col-major AT
     auto a_new = make_tensor<T1>(a_shape, MATX_ASYNC_DEVICE_MEMORY, stream);
     (a_new = a).run(exec);
     auto at_col_maj = transpose_matrix(a_new);
 
-    auto u_new = getSolverSupportedTensor(u, exec);
+    // gesvdj returns V rather than VT as gesvd does, so we need to transpose U before storing.
+    // Force an allocation of a temporary tensor here
+    auto u_new = getSolverSupportedTensor(u, exec, method == detail::SVDMethod::GESVDJ_BATCHED);
     auto vt_new = getSolverSupportedTensor(vt, exec);
 
     // Swap U and VT. svd(AT) = V*S*UT
     // Need the tensors to appear like V and UT since that is expected based on the AT view
     // inputted, although the results when read as row-major are VT and U.
     auto u_in = transpose_matrix(vt_new);
@@ -835,7 +998,7 @@ void svd_impl(UTensor &&u, STensor &&s,
         detail::GetCacheIdFromType<detail::svd_cuda_cache_t>(),
         params,
         [&]() {
-          return std::make_shared<cache_val_type>(u_in, s_new, vt_in, at_col_maj, job_cusolver);
+          return std::make_shared<cache_val_type>(u_in, s_new, vt_in, at_col_maj, method, job_cusolver);
         },
         [&](std::shared_ptr<cache_val_type> ctype) {
           ctype->Exec(u_in, s_new, vt_in, at_col_maj, exec, job_cusolver);
         }
     );
 
@@ -843,7 +1006,12 @@ void svd_impl(UTensor &&u, STensor &&s,
     if(!u_new.isSameView(u)) {
-      (u = u_new).run(exec);
+      if (method == detail::SVDMethod::GESVDJ_BATCHED) {
+        // gesvdj returned V rather than VT, so conjugate-transpose when copying back
+        (u = conj(transpose_matrix(u_new))).run(exec);
+      }
+      else {
+        (u = u_new).run(exec);
+      }
     }
     if(!vt_new.isSameView(vt)) {
       (vt = vt_new).run(exec);
     }
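The m <= n branch above leans on a standard transpose identity: cuSolver's gesvd requires m >= n, so a wide matrix is factored through its (column-major) transpose and the outputs swap roles. For complex A,

    A = U \Sigma V^{H}  ==>  A^{T} = conj(V) \Sigma U^{T}

which for real types reads svd(A^T) = V * S * U^T, matching the inline comment: the planner's U slot is fed from vt_new (u_in = transpose_matrix(vt_new)), and the factors are swapped back when the results are copied out.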
@@ -853,6 +1021,7 @@ void svd_impl(UTensor &&u, STensor &&s,
     auto tv = TransposeCopy(tp, a, exec);
     auto tvt = tv.PermuteMatrix();
 
+    auto u_col_maj = make_tensor<T1>(u.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
     auto vt_col_maj = make_tensor<T1>(vt.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
 
@@ -866,7 +1035,7 @@ void svd_impl(UTensor &&u, STensor &&s,
         detail::GetCacheIdFromType<detail::svd_cuda_cache_t>(),
         params,
         [&]() {
-          return std::make_shared<cache_val_type>(u_col_maj, s_new, vt_col_maj, tvt, job_cusolver);
+          return std::make_shared<cache_val_type>(u_col_maj, s_new, vt_col_maj, tvt, method, job_cusolver);
         },
         [&](std::shared_ptr<cache_val_type> ctype) {
           ctype->Exec(u_col_maj, s_new, vt_col_maj, tvt, exec, job_cusolver);
         }
     );
 
@@ -878,14 +1047,19 @@ void svd_impl(UTensor &&u, STensor &&s,
     // only want to swap the shapes, not the strides. Once we have a row-major transposed view,
     // we can set it to the original tensor.
     auto utShape = u.Shape();
-    std::swap(utShape[RANK - 2], utShape[RANK - 1]);
+    cuda::std::swap(utShape[RANK - 2], utShape[RANK - 1]);
     auto ut = u_col_maj.View(utShape);
     (u = transpose_matrix(ut)).run(exec);
 
     auto vShape = vt.Shape();
-    std::swap(vShape[RANK - 2], vShape[RANK - 1]);
+    cuda::std::swap(vShape[RANK - 2], vShape[RANK - 1]);
     auto v = vt_col_maj.View(vShape);
-    (vt = transpose_matrix(v)).run(exec);
+    if (method == detail::SVDMethod::GESVDJ_BATCHED) {
+      // gesvdj returned V; read as a row-major view it is already VT, so only conjugate
+      (vt = conj(v)).run(exec);
+    }
+    else {
+      (vt = transpose_matrix(v)).run(exec);
+    }
   }
 
   if(!s_new.isSameView(s)) {
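One design note on the two dispatch paths in this file: the per-matrix Xgesvd loop walks batch_a_ptrs one matrix at a time, while the gesvdjBatched entry points take a single base pointer per argument and assume a densely packed batch, consistent with the Stride(0) range check in GetCUDASVDMethod(). A self-contained sketch of the full query-plus-execute sequence for the float case (real cuSolver API; all device pointers are assumed to be caller-allocated, and error handling is trimmed for brevity):

    #include <cuda_runtime.h>
    #include <cusolverDn.h>

    // Batched Jacobi SVD of `batch` packed column-major m x n float matrices.
    // Matrix i is read from d_A + i*m*n; the whole batch runs as one launch.
    void run_gesvdj_batched(cusolverDnHandle_t handle, int m, int n, int batch,
                            float *d_A, float *d_S, float *d_U, float *d_V,
                            int *d_info) {
      gesvdjInfo_t params;
      cusolverDnCreateGesvdjInfo(&params);

      int lwork = 0;  // workspace length in elements, reported as an int
      cusolverDnSgesvdjBatched_bufferSize(handle, CUSOLVER_EIG_MODE_VECTOR,
                                          m, n, d_A, m, d_S, d_U, m, d_V, n,
                                          &lwork, params, batch);
      float *d_work;
      cudaMalloc(&d_work, sizeof(float) * lwork);

      // Note: the input A is overwritten, which is why svd_impl() always
      // copies A first; d_info holds one convergence status per matrix.
      cusolverDnSgesvdjBatched(handle, CUSOLVER_EIG_MODE_VECTOR, m, n,
                               d_A, m, d_S, d_U, m, d_V, n,
                               d_work, lwork, d_info, params, batch);

      cudaFree(d_work);
      cusolverDnDestroyGesvdjInfo(params);
    }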
diff --git a/test/00_solver/SVD.cu b/test/00_solver/SVD.cu
index a6d21863..2cbb9b48 100644
--- a/test/00_solver/SVD.cu
+++ b/test/00_solver/SVD.cu
@@ -363,6 +363,125 @@ TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatched)
   MATX_EXIT_HANDLER();
 }
 
+
+TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatchedSmallMGTN)
+{
+  MATX_ENTER_HANDLER();
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ExecType = cuda::std::tuple_element_t<1, TypeParam>;
+  using value_type = typename inner_op_type_t<TestType>::type;
+
+  constexpr index_t batches = 10;
+  constexpr index_t m = 8;
+  constexpr index_t n = 4;
+
+  auto Av = make_tensor<TestType>({batches, m, n});
+  auto Sv = make_tensor<value_type>({batches, std::min(m, n)});
+  auto Uv = make_tensor<TestType>({batches, m, m});
+  auto VTv = make_tensor<TestType>({batches, n, n});
+
+  auto Dv = make_tensor<TestType>({batches, m, n});
+  auto UDVTv = make_tensor<TestType>({batches, m, n});
+
+  this->pb->template InitAndRunTVGenerator<TestType>("00_solver", "svd", "run", {batches, m, n});
+  this->pb->NumpyToTensorView(Av, "A");
+
+  (mtie(Uv, Sv, VTv) = svd(Av)).run(this->exec);
+
+  this->exec.sync();
+
+  // Since SVD produces a solution that's not necessarily unique, we cannot
+  // compare against Python output. Instead, we just make sure that A = U*S*V'.
+
+  // Construct the batched diagonal matrix D from the vector of singular values S
+  (Dv = zeros<TestType>(Dv.Shape())).run(this->exec);
+  this->exec.sync();
+
+  for (index_t b = 0; b < batches; b++) {
+    for (index_t i = 0; i < Sv.Size(1); i++) {
+      Dv(b, i, i) = Sv(b, i);
+    }
+  }
+
+  (UDVTv = matmul(matmul(Uv, Dv), VTv)).run(this->exec); // (U * S) * V'
+  this->exec.sync();
+
+  for (index_t b = 0; b < batches; b++) {
+    for (index_t i = 0; i < Av.Size(1); i++) {
+      for (index_t j = 0; j < Av.Size(2); j++) {
+        if constexpr (is_complex_v<TestType>) {
+          ASSERT_NEAR(Av(b, i, j).real(), UDVTv(b, i, j).real(), this->thresh) << i << " " << j;
+          ASSERT_NEAR(Av(b, i, j).imag(), UDVTv(b, i, j).imag(), this->thresh) << i << " " << j;
+        }
+        else {
+          ASSERT_NEAR(Av(b, i, j), UDVTv(b, i, j), this->thresh) << i << " " << j;
+        }
+      }
+    }
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+TYPED_TEST(SVDSolverTestNonHalfTypes, SVDBasicBatchedSmallMEQN)
+{
+  MATX_ENTER_HANDLER();
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ExecType = cuda::std::tuple_element_t<1, TypeParam>;
+  using value_type = typename inner_op_type_t<TestType>::type;
+
+  constexpr index_t batches = 10;
+  constexpr index_t m = 32;
+  constexpr index_t n = 32;
+
+  auto Av = make_tensor<TestType>({batches, m, n});
+  auto Sv = make_tensor<value_type>({batches, std::min(m, n)});
+  auto Uv = make_tensor<TestType>({batches, m, m});
+  auto VTv = make_tensor<TestType>({batches, n, n});
+
+  auto Dv = make_tensor<TestType>({batches, m, n});
+  auto UDVTv = make_tensor<TestType>({batches, m, n});
+
+  this->pb->template InitAndRunTVGenerator<TestType>("00_solver", "svd", "run", {batches, m, n});
+  this->pb->NumpyToTensorView(Av, "A");
+
+  (mtie(Uv, Sv, VTv) = svd(Av)).run(this->exec);
+
+  this->exec.sync();
+
+  // Since SVD produces a solution that's not necessarily unique, we cannot
+  // compare against Python output. Instead, we just make sure that A = U*S*V'.
+
+  // Construct the batched diagonal matrix D from the vector of singular values S
+  (Dv = zeros<TestType>(Dv.Shape())).run(this->exec);
+  this->exec.sync();
+
+  for (index_t b = 0; b < batches; b++) {
+    for (index_t i = 0; i < Sv.Size(1); i++) {
+      Dv(b, i, i) = Sv(b, i);
+    }
+  }
+
+  (UDVTv = matmul(matmul(Uv, Dv), VTv)).run(this->exec); // (U * S) * V'
+  this->exec.sync();
+
+  for (index_t b = 0; b < batches; b++) {
+    for (index_t i = 0; i < Av.Size(1); i++) {
+      for (index_t j = 0; j < Av.Size(2); j++) {
+        if constexpr (is_complex_v<TestType>) {
+          ASSERT_NEAR(Av(b, i, j).real(), UDVTv(b, i, j).real(), this->thresh) << i << " " << j;
+          ASSERT_NEAR(Av(b, i, j).imag(), UDVTv(b, i, j).imag(), this->thresh) << i << " " << j;
+        }
+        else {
+          ASSERT_NEAR(Av(b, i, j), UDVTv(b, i, j), this->thresh) << i << " " << j;
+        }
+      }
+    }
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
 template <typename TypeParam, int RANK, typename Executor>
 void svdpi_test( const index_t (&AshapeA)[RANK], Executor exec) {
   using AType = TypeParam;
@@ -605,6 +724,7 @@ void svdbpi_test( const index_t (&AshapeA)[RANK], Executor exec) {
   ASSERT_NEAR( mdiffA(), SType(0), .00001);
 
   exec.sync();
+
 }
 
 TYPED_TEST(SVDPISolverTestNonHalfTypes, SVDBPI)