
Commit

use new async allocator
luitjens committed Sep 7, 2022
1 parent 1be82c8 commit ab64d60
Showing 5 changed files with 30 additions and 93 deletions.
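Every file in this commit follows the same pattern: a temporary that used to be carved out with an explicit matxAlloc()/matxFree() pair and then wrapped via make_tensor(pointer, shape) is now created with the make_tensor overloads that take a memory space and a CUDA stream, so the tensor itself owns stream-ordered (async) device memory. A minimal sketch of the before/after, assuming the MatX headers; the helper name scaled_copy and the float element type are illustrative, not from the commit:

    #include "matx.h"

    using namespace matx;

    // Illustrative helper: stage work through a temporary device buffer.
    void scaled_copy(tensor_t<float, 1> &out, tensor_t<float, 1> &in,
                     cudaStream_t stream)
    {
      // Old pattern (what this commit removes):
      //   float *buf;
      //   matxAlloc(reinterpret_cast<void **>(&buf), sizeof(float) * in.Size(0),
      //             MATX_ASYNC_DEVICE_MEMORY, stream);
      //   auto tmp = make_tensor<float>(buf, in.Shape());
      //   ...
      //   matxFree(buf);

      // New pattern: the tensor allocates async device memory on this stream
      // and is expected to release it when the tensor is destroyed.
      auto tmp = make_tensor<float>(in.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);

      (tmp = in * 2.0f).run(stream); // arbitrary work on the temporary
      matx::copy(out, tmp, stream);  // write the result to the caller's tensor
    }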
71 changes: 14 additions & 57 deletions include/matx/transforms/ambgfun.h
@@ -167,20 +167,12 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
using T1 = typename XTensor::scalar_type;
using T2 = typename AMFTensor::scalar_type;

- T1 *x_normdiv, *y_normdiv;
- float *x_norm, *y_norm;

MATX_STATIC_ASSERT(is_cuda_complex_v<T1>, matxInvalidType);
auto ry = x.View();
//tensor_t<T1, RANK> ry(x);

- matxAlloc(reinterpret_cast<void **>(&x_normdiv),
- sizeof(T1) * x.Size(RANK - 1), MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&x_norm), sizeof(*x_norm),
- MATX_ASYNC_DEVICE_MEMORY, stream);

- auto x_normdiv_v = make_tensor<T1>(x_normdiv, x.Shape());
- auto x_norm_v = make_tensor<float>(x_norm);
+ auto x_normdiv_v = make_tensor<T1>(x.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
+ auto x_norm_v = make_tensor<float>(MATX_ASYNC_DEVICE_MEMORY, stream);

sum(x_norm_v, norm(x), stream);
(x_norm_v = sqrt(x_norm_v)).run(stream);
@@ -190,13 +182,8 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,

if (y) {
ry.Reset(y.value().Data(), y.value().Shape());
- matxAlloc(reinterpret_cast<void **>(&y_normdiv),
- sizeof(T1) * ry.Size(RANK - 1), MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&y_norm), sizeof(*y_norm),
- MATX_ASYNC_DEVICE_MEMORY, stream);

- y_normdiv_v.Reset(y_normdiv, ry.Shape());
- auto y_norm_v = make_tensor<float>(y_norm);
+ y_normdiv_v.Shallow(make_tensor<T1>(y_normdiv_v.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream));
+ auto y_norm_v = make_tensor<float>(MATX_ASYNC_DEVICE_MEMORY, stream);

sum(y_norm_v, norm(ry), stream);
(y_normdiv_v = ry / y_norm_v).run(stream);
@@ -208,21 +195,12 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
index_t xlen = x_normdiv_v.Size(RANK - 1);

if (cut == ::matx::AMGBFUN_CUT_TYPE_2D) {
- T1 *new_ynorm;
- matxAlloc(reinterpret_cast<void **>(&new_ynorm),
- sizeof(T1) * (len_seq - 1) * xlen, MATX_ASYNC_DEVICE_MEMORY,
- stream);

- auto new_ynorm_v = make_tensor<T1>(new_ynorm, {len_seq - 1, xlen});
+ auto new_ynorm_v = make_tensor<T1>({len_seq - 1, xlen}, MATX_ASYNC_DEVICE_MEMORY, stream);

newYNorm(new_ynorm_v, x_normdiv_v, y_normdiv_v).run(stream);

- T1 *fft_data, *amf_tmp;
- matxAlloc(reinterpret_cast<void **>(&fft_data),
- sizeof(*fft_data) * nfreq * (len_seq - 1),
- MATX_ASYNC_DEVICE_MEMORY, stream);

- auto fullfft = make_tensor<T1>(fft_data, {(len_seq - 1), nfreq});
+ auto fullfft = make_tensor<T1>({(len_seq - 1), nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);
auto partfft = fullfft.Slice({0, 0}, {(len_seq - 1), xlen});

(fullfft = 0).run(stream);
@@ -232,21 +210,13 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,

// We need to temporarily allocate a complex output version of AMF since we
// have no way to convert complex to real in an operator currently
- matxAlloc(reinterpret_cast<void **>(&amf_tmp),
- sizeof(*amf_tmp) * nfreq * (len_seq - 1),
- MATX_ASYNC_DEVICE_MEMORY, stream);

- auto amf_tmp_v = make_tensor<T1>(amf_tmp, {(len_seq - 1), nfreq});
+ auto amf_tmp_v = make_tensor<T1>({(len_seq - 1), nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);

(amf_tmp_v = (float)nfreq * abs(fftshift1D(fullfft))).run(stream);
matx::copy(amf, amf_tmp_v.RealView(), stream);
}
else if (cut == ::matx::AMGBFUN_CUT_TYPE_DELAY) {
- T1 *fft_data_x, *fft_data_y, *amf_tmp;
- matxAlloc(reinterpret_cast<void **>(&fft_data_x),
- sizeof(*fft_data_x) * nfreq, MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&fft_data_y),
- sizeof(*fft_data_y) * nfreq, MATX_ASYNC_DEVICE_MEMORY, stream);
- auto fullfft_x = make_tensor<T1>(fft_data_x, {nfreq});
+ auto fullfft_x = make_tensor<T1>({nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);
auto partfft_x = fullfft_x.Slice({0}, {xlen});
(fullfft_x = 0).run(stream);
matx::copy(partfft_x, x_normdiv_v, stream);
@@ -255,7 +225,7 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
AmbgFftXOp(fullfft_x, fullfft_x, fs, cut_val, (float)nfreq).run(stream);
ifft(fullfft_x, fullfft_x, 0, stream);

- auto fullfft_y = make_tensor<T1>(fft_data_y, {nfreq});
+ auto fullfft_y = make_tensor<T1>({nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_y = 0).run(stream);

auto partfft_y = fullfft_y.Slice({0}, {xlen});
@@ -265,10 +235,7 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,

// This allocation should not be necessary, but we're getting compiler
// errors when cloning/slicing
- matxAlloc(reinterpret_cast<void **>(&amf_tmp),
- sizeof(*amf_tmp) * fullfft_y.Size(0), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- auto amf_tmp_v = make_tensor<T1>(amf_tmp, {fullfft_y.Size(0)});
+ auto amf_tmp_v = make_tensor<T1>({fullfft_y.Size(0)}, MATX_ASYNC_DEVICE_MEMORY, stream);

(amf_tmp_v = (float)nfreq * abs(ifftshift1D(fullfft_y))).run(stream);

@@ -277,21 +244,14 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
matx::copy(amf, amfv.RealView(), stream);
}
else if (cut == ::matx::AMGBFUN_CUT_TYPE_DOPPLER) {
- T1 *fft_data_x, *fft_data_y, *amf_tmp;
- matxAlloc(reinterpret_cast<void **>(&fft_data_x),
- sizeof(*fft_data_x) * (len_seq - 1), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- matxAlloc(reinterpret_cast<void **>(&fft_data_y),
- sizeof(*fft_data_y) * (len_seq - 1), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- auto fullfft_y = make_tensor<T1>(fft_data_y, {len_seq - 1});
+ auto fullfft_y = make_tensor<T1>({len_seq - 1}, MATX_ASYNC_DEVICE_MEMORY, stream);
auto partfft_y = fullfft_y.Slice({0}, {y_normdiv_v.Size(0)});

(fullfft_y = 0).run(stream);
matx::copy(partfft_y, y_normdiv_v, stream);
fft(fullfft_y, fullfft_y, 0, stream);

- auto fullfft_x = make_tensor<T1>(fft_data_x, {len_seq - 1});
+ auto fullfft_x = make_tensor<T1>({len_seq - 1}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_x = 0).run(stream);

std::array<index_t, 1> xnd_size = {x_normdiv_v.Size(0)};
@@ -302,10 +262,7 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,

// This allocation should not be necessary, but we're getting compiler
// errors when cloning/slicing
- matxAlloc(reinterpret_cast<void **>(&amf_tmp),
- sizeof(*amf_tmp) * fullfft_x.Size(0), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- auto amf_tmp_v = make_tensor<T1>(amf_tmp, {fullfft_x.Size(0)});
+ auto amf_tmp_v = make_tensor<T1>({fullfft_x.Size(0)}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_y = fullfft_y * conj(fullfft_x)).run(stream);
ifft(fullfft_y, fullfft_y, 0, stream);

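Two details in this file are worth calling out. First, in the y-branch the pre-declared y_normdiv_v is no longer Reset() onto a raw pointer; it is shallow-rebound to a freshly made async tensor via Shallow(). Second, the comment in the 2D-cut branch ("we have no way to convert complex to real in an operator currently") is why a temporary survives even with the new allocator: the complex expression result is staged in a complex tensor and only its real part is copied out. A hedged sketch of that idiom, reusing only calls that appear above (T1, nfreq, fullfft, amf, and stream are the function's own context):

    // Stage the complex result, then copy just the real plane to the output.
    auto amf_tmp_v = make_tensor<T1>(fullfft.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);

    (amf_tmp_v = (float)nfreq * abs(fftshift1D(fullfft))).run(stream);
    matx::copy(amf, amf_tmp_v.RealView(), stream);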
7 changes: 1 addition & 6 deletions include/matx/transforms/fft.h
@@ -708,7 +708,6 @@ auto GetFFTInputView([[maybe_unused]] OutputTensor &o,
return i.Slice(starts, ends);
}
else { // FFT length is longer than the input. Pad input
- T2 *i_pad;

// If the input needs to be padded we have to temporarily allocate a new
// buffer, zero the output, then copy our input buffer. This is not very
@@ -721,11 +720,7 @@ auto GetFFTInputView([[maybe_unused]] OutputTensor &o,
auto tot = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<typename decltype(shape)::value_type>());

// Make a new buffer large enough for our input
- matxAlloc(reinterpret_cast<void **>(&i_pad),
- sizeof(T1) * tot, MATX_ASYNC_DEVICE_MEMORY,
- stream);

- auto i_new = make_tensor<T2>(i_pad, shape);
+ auto i_new = make_tensor<T2>(shape, MATX_ASYNC_DEVICE_MEMORY, stream);
ends[RANK - 1] = i.Lsize();
auto i_pad_part_v = i_new.Slice(starts, ends);

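For context, GetFFTInputView only takes this path when the requested FFT length is longer than the input, so the input has to be zero-padded. With the change above, the padded buffer is created directly in async device memory, zeroed, and the original samples are copied into a leading slice of it. A rough rank-1 sketch of the idiom under those assumptions (fft_len is an illustrative name; the real code computes the padded shape for arbitrary rank):

    // Pad a length-i.Lsize() input `i` of type T2 up to fft_len samples.
    auto i_new = make_tensor<T2>({fft_len}, MATX_ASYNC_DEVICE_MEMORY, stream);

    (i_new = 0).run(stream);                           // zero the whole padded buffer
    auto i_pad_part_v = i_new.Slice({0}, {i.Lsize()}); // view over the leading samples
    matx::copy(i_pad_part_v, i, stream);               // copy the original input in
    // i_new is then used as the zero-padded FFT input view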
28 changes: 12 additions & 16 deletions include/matx/transforms/matmul.h
@@ -626,32 +626,28 @@ class matxMatMulHandle_t {
// If the tensors are complex half precision, we need to do a planar
// transform since all libraries expect this format at the moment.
if constexpr (is_complex_half_v<T1>) {
- typename T1::value_type *A;
- typename T2::value_type *B;
- typename T3::value_type *C;

- matxAlloc(reinterpret_cast<void **>(&A), a.Bytes(),
- MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&B), b.Bytes(),
- MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&C), c.Bytes(),
- MATX_ASYNC_DEVICE_MEMORY, stream);


auto a_shape = a.Shape();
*(a_shape.begin() + a.Rank() - 2) = a.Size(a.Rank() - 2) * 2;
- auto a_planar = make_tensor<typename T1::value_type>(A, a_shape);
+ auto a_planar = make_tensor<typename T1::value_type>(a_shape, MATX_ASYNC_DEVICE_MEMORY, stream);

auto b_shape = b.Shape();
*(b_shape.begin() + b.Rank() - 2) = b.Size(b.Rank() - 2) * 2;
- auto b_planar = make_tensor<typename T1::value_type>(B, b_shape);
+ auto b_planar = make_tensor<typename T2::value_type>(b_shape, MATX_ASYNC_DEVICE_MEMORY, stream);

+ auto c_shape = c.Shape();
+ *(c_shape.begin() + c.Rank() - 2) = c.Size(c.Rank() - 2) * 2;
+ auto c_planar = make_tensor<typename T3::value_type>(c_shape, MATX_ASYNC_DEVICE_MEMORY, stream);

// Convert A/B to planar layout
(a_planar = planar(a)).run(stream);
(b_planar = planar(b)).run(stream);

- a_adj.Reset(reinterpret_cast<T1 *>(A));
- b_adj.Reset(reinterpret_cast<T2 *>(B));
- c_adj.Reset(reinterpret_cast<T3 *>(C));
+ // update pointers to planar data.
+ // must use Reset because types for planar are different
+ a_adj.Reset(reinterpret_cast<T1 *>(a_planar.Data()));
+ b_adj.Reset(reinterpret_cast<T2 *>(b_planar.Data()));
+ c_adj.Reset(reinterpret_cast<T3 *>(c_planar.Data()));
}

// Prep for batch looping
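The complex-half branch converts the interleaved complex inputs to planar layout (real and imaginary parts stored as two separate planes), which is why the second-to-last dimension is doubled before the allocation; the handle's adjusted tensors are then pointed at the planar storage with Reset(). A sketch of that conversion for one operand, using only calls visible in the diff (rank and shapes simplified):

    // `a` has complex-half scalar type T1; double the second-to-last dim so the
    // planar buffer can hold both planes back to back.
    auto a_shape = a.Shape();
    *(a_shape.begin() + a.Rank() - 2) = a.Size(a.Rank() - 2) * 2;

    // The planar temporary now lives in stream-ordered device memory.
    auto a_planar = make_tensor<typename T1::value_type>(a_shape, MATX_ASYNC_DEVICE_MEMORY, stream);

    // planar() de-interleaves the complex values into the two planes.
    (a_planar = planar(a)).run(stream);

    // Rebind the handle's view of A to the planar data (types differ, hence Reset).
    a_adj.Reset(reinterpret_cast<T1 *>(a_planar.Data()));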
11 changes: 2 additions & 9 deletions include/matx/transforms/reduce.h
@@ -1180,11 +1180,7 @@ void __MATX_INLINE__ median(OutType dest,
constexpr int RANK_IN = TensorInType::Rank();
static_assert(RANK_IN <= 2 && (RANK_IN == OutType::Rank() + 1));

- T *tmp_alloc;
- matxAlloc(reinterpret_cast<void **>(&tmp_alloc),
- sizeof(T) * TotalSize(in), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- auto tmp_sort = make_tensor<T>(tmp_alloc, in.Shape());
+ auto tmp_sort = make_tensor<T>(in.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);

// If the rank is 0 we're finding the median of a vector
if constexpr (RANK_IN == 1) {
Expand Down Expand Up @@ -1223,7 +1219,6 @@ void __MATX_INLINE__ median(OutType dest,
}
}

- matxFree(tmp_alloc);
#endif
}

@@ -1503,11 +1498,9 @@ template <typename OutType, typename InType>
void __MATX_INLINE__ var(OutType dest, const InType &in, cudaStream_t stream = 0)
{
#ifdef __CUDACC__
- typename InType::scalar_type *tmps;
using inner_type = typename inner_op_type_t<typename InType::scalar_type>::type;

- matxAlloc((void **)&tmps, TotalSize(dest)*sizeof(decltype(*tmps)), MATX_ASYNC_DEVICE_MEMORY, stream);
- auto mean_tns = make_tensor<typename InType::scalar_type>(tmps, dest.Descriptor());
+ auto mean_tns = make_tensor<typename InType::scalar_type>(dest.Descriptor(), MATX_ASYNC_DEVICE_MEMORY, stream);

// Compute mean of each dimension
mean(mean_tns, in, stream);
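Two small consequences show up in this file: median() no longer needs the trailing matxFree(), since the async-allocated tmp_sort is expected to release its memory when it goes out of scope, and var() now sizes its temporary straight from the output's descriptor, so its shape and strides match dest without any manual byte counting. A short sketch of the var() pattern under those assumptions:

    // `dest` is the reduction output, `in` the input, both as in var() above.
    auto mean_tns = make_tensor<typename InType::scalar_type>(
        dest.Descriptor(), MATX_ASYNC_DEVICE_MEMORY, stream);

    mean(mean_tns, in, stream);   // per-dimension mean, as in the surrounding code
    // ...variance is then accumulated against mean_tns; no explicit free needed.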
6 changes: 1 addition & 5 deletions include/matx/transforms/solver.h
@@ -709,11 +709,7 @@ class matxDnSVDSolverPlan_t : public matxDnSolver_t {
{
static_assert(RANK >= 2);

- T1 *tmp;
- matxAlloc(reinterpret_cast<void **>(&tmp), a.Bytes(), MATX_DEVICE_MEMORY);
- MATX_ASSERT(tmp != nullptr, matxOutOfMemory);

- scratch = make_tensor_p<T1>(tmp, a.Shape());
+ scratch = make_tensor_p<T1>(a.Shape(), MATX_DEVICE_MEMORY);
params = GetSVDParams(u, s, v, *scratch, jobu, jobvt);

GetWorkspaceSize(&hspace, &dspace);
