
Commit

use new async allocator
luitjens committed Sep 7, 2022
1 parent 1be82c8 commit ab64d60
Showing 5 changed files with 30 additions and 93 deletions.
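Every file in this commit follows the same pattern: a temporary that used to be carved out with an explicit matxAlloc()/matxFree() pair and then wrapped via make_tensor(pointer, shape) is now created with the make_tensor overloads that take a memory space and a CUDA stream, so the tensor itself owns stream-ordered (async) device memory. A minimal sketch of the before/after, assuming the MatX headers; the helper name scaled_copy and the float element type are illustrative, not from the commit:

    #include "matx.h"

    using namespace matx;

    // Illustrative helper: stage work through a temporary device buffer.
    void scaled_copy(tensor_t<float, 1> &out, tensor_t<float, 1> &in,
                     cudaStream_t stream)
    {
      // Old pattern (what this commit removes):
      //   float *buf;
      //   matxAlloc(reinterpret_cast<void **>(&buf), sizeof(float) * in.Size(0),
      //             MATX_ASYNC_DEVICE_MEMORY, stream);
      //   auto tmp = make_tensor<float>(buf, in.Shape());
      //   ...
      //   matxFree(buf);

      // New pattern: the tensor allocates async device memory on this stream
      // and is expected to release it when the tensor is destroyed.
      auto tmp = make_tensor<float>(in.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);

      (tmp = in * 2.0f).run(stream); // arbitrary work on the temporary
      matx::copy(out, tmp, stream);  // write the result to the caller's tensor
    }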
71 changes: 14 additions & 57 deletions include/matx/transforms/ambgfun.h
@@ -167,20 +167,12 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
using T1 = typename XTensor::scalar_type;
using T2 = typename AMFTensor::scalar_type;

- T1 *x_normdiv, *y_normdiv;
- float *x_norm, *y_norm;

MATX_STATIC_ASSERT(is_cuda_complex_v<T1>, matxInvalidType);
auto ry = x.View();
//tensor_t<T1, RANK> ry(x);

- matxAlloc(reinterpret_cast<void **>(&x_normdiv),
- sizeof(T1) * x.Size(RANK - 1), MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&x_norm), sizeof(*x_norm),
- MATX_ASYNC_DEVICE_MEMORY, stream);

- auto x_normdiv_v = make_tensor<T1>(x_normdiv, x.Shape());
- auto x_norm_v = make_tensor<float>(x_norm);
+ auto x_normdiv_v = make_tensor<T1>(x.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
+ auto x_norm_v = make_tensor<float>(MATX_ASYNC_DEVICE_MEMORY, stream);

sum(x_norm_v, norm(x), stream);
(x_norm_v = sqrt(x_norm_v)).run(stream);
@@ -190,13 +182,8 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,

if (y) {
ry.Reset(y.value().Data(), y.value().Shape());
- matxAlloc(reinterpret_cast<void **>(&y_normdiv),
- sizeof(T1) * ry.Size(RANK - 1), MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&y_norm), sizeof(*y_norm),
- MATX_ASYNC_DEVICE_MEMORY, stream);

- y_normdiv_v.Reset(y_normdiv, ry.Shape());
- auto y_norm_v = make_tensor<float>(y_norm);
+ y_normdiv_v.Shallow(make_tensor<T1>(y_normdiv_v.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream));
+ auto y_norm_v = make_tensor<float>(MATX_ASYNC_DEVICE_MEMORY, stream);

sum(y_norm_v, norm(ry), stream);
(y_normdiv_v = ry / y_norm_v).run(stream);
@@ -208,21 +195,12 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
index_t xlen = x_normdiv_v.Size(RANK - 1);

if (cut == ::matx::AMGBFUN_CUT_TYPE_2D) {
- T1 *new_ynorm;
- matxAlloc(reinterpret_cast<void **>(&new_ynorm),
- sizeof(T1) * (len_seq - 1) * xlen, MATX_ASYNC_DEVICE_MEMORY,
- stream);

- auto new_ynorm_v = make_tensor<T1>(new_ynorm, {len_seq - 1, xlen});
+ auto new_ynorm_v = make_tensor<T1>({len_seq - 1, xlen}, MATX_ASYNC_DEVICE_MEMORY, stream);

newYNorm(new_ynorm_v, x_normdiv_v, y_normdiv_v).run(stream);

- T1 *fft_data, *amf_tmp;
- matxAlloc(reinterpret_cast<void **>(&fft_data),
- sizeof(*fft_data) * nfreq * (len_seq - 1),
- MATX_ASYNC_DEVICE_MEMORY, stream);

- auto fullfft = make_tensor<T1>(fft_data, {(len_seq - 1), nfreq});
+ auto fullfft = make_tensor<T1>({(len_seq - 1), nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);
auto partfft = fullfft.Slice({0, 0}, {(len_seq - 1), xlen});

(fullfft = 0).run(stream);
@@ -232,21 +210,13 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,

// We need to temporarily allocate a complex output version of AMF since we
// have no way to convert complex to real in an operator currently
- matxAlloc(reinterpret_cast<void **>(&amf_tmp),
- sizeof(*amf_tmp) * nfreq * (len_seq - 1),
- MATX_ASYNC_DEVICE_MEMORY, stream);

- auto amf_tmp_v = make_tensor<T1>(amf_tmp, {(len_seq - 1), nfreq});
+ auto amf_tmp_v = make_tensor<T1>({(len_seq - 1), nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);

(amf_tmp_v = (float)nfreq * abs(fftshift1D(fullfft))).run(stream);
matx::copy(amf, amf_tmp_v.RealView(), stream);
}
else if (cut == ::matx::AMGBFUN_CUT_TYPE_DELAY) {
- T1 *fft_data_x, *fft_data_y, *amf_tmp;
- matxAlloc(reinterpret_cast<void **>(&fft_data_x),
- sizeof(*fft_data_x) * nfreq, MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&fft_data_y),
- sizeof(*fft_data_y) * nfreq, MATX_ASYNC_DEVICE_MEMORY, stream);
- auto fullfft_x = make_tensor<T1>(fft_data_x, {nfreq});
+ auto fullfft_x = make_tensor<T1>({nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);
auto partfft_x = fullfft_x.Slice({0}, {xlen});
(fullfft_x = 0).run(stream);
matx::copy(partfft_x, x_normdiv_v, stream);
@@ -255,7 +225,7 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
AmbgFftXOp(fullfft_x, fullfft_x, fs, cut_val, (float)nfreq).run(stream);
ifft(fullfft_x, fullfft_x, 0, stream);

- auto fullfft_y = make_tensor<T1>(fft_data_y, {nfreq});
+ auto fullfft_y = make_tensor<T1>({nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_y = 0).run(stream);

auto partfft_y = fullfft_y.Slice({0}, {xlen});
@@ -265,10 +235,7 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,

// This allocation should not be necessary, but we're getting compiler
// errors when cloning/slicing
- matxAlloc(reinterpret_cast<void **>(&amf_tmp),
- sizeof(*amf_tmp) * fullfft_y.Size(0), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- auto amf_tmp_v = make_tensor<T1>(amf_tmp, {fullfft_y.Size(0)});
+ auto amf_tmp_v = make_tensor<T1>({fullfft_y.Size(0)}, MATX_ASYNC_DEVICE_MEMORY, stream);

(amf_tmp_v = (float)nfreq * abs(ifftshift1D(fullfft_y))).run(stream);

@@ -277,21 +244,14 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
matx::copy(amf, amfv.RealView(), stream);
}
else if (cut == ::matx::AMGBFUN_CUT_TYPE_DOPPLER) {
- T1 *fft_data_x, *fft_data_y, *amf_tmp;
- matxAlloc(reinterpret_cast<void **>(&fft_data_x),
- sizeof(*fft_data_x) * (len_seq - 1), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- matxAlloc(reinterpret_cast<void **>(&fft_data_y),
- sizeof(*fft_data_y) * (len_seq - 1), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- auto fullfft_y = make_tensor<T1>(fft_data_y, {len_seq - 1});
+ auto fullfft_y = make_tensor<T1>({len_seq - 1}, MATX_ASYNC_DEVICE_MEMORY, stream);
auto partfft_y = fullfft_y.Slice({0}, {y_normdiv_v.Size(0)});

(fullfft_y = 0).run(stream);
matx::copy(partfft_y, y_normdiv_v, stream);
fft(fullfft_y, fullfft_y, 0, stream);

- auto fullfft_x = make_tensor<T1>(fft_data_x, {len_seq - 1});
+ auto fullfft_x = make_tensor<T1>({len_seq - 1}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_x = 0).run(stream);

std::array<index_t, 1> xnd_size = {x_normdiv_v.Size(0)};
@@ -302,10 +262,7 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,

// This allocation should not be necessary, but we're getting compiler
// errors when cloning/slicing
- matxAlloc(reinterpret_cast<void **>(&amf_tmp),
- sizeof(*amf_tmp) * fullfft_x.Size(0), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- auto amf_tmp_v = make_tensor<T1>(amf_tmp, {fullfft_x.Size(0)});
+ auto amf_tmp_v = make_tensor<T1>({fullfft_x.Size(0)}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_y = fullfft_y * conj(fullfft_x)).run(stream);
ifft(fullfft_y, fullfft_y, 0, stream);

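Two details in this file are worth calling out. First, in the y-branch the pre-declared y_normdiv_v is no longer Reset() onto a raw pointer; it is shallow-rebound to a freshly made async tensor via Shallow(). Second, the comment in the 2D-cut branch ("we have no way to convert complex to real in an operator currently") is why a temporary survives even with the new allocator: the complex expression result is staged in a complex tensor and only its real part is copied out. A hedged sketch of that idiom, reusing only calls that appear above (T1, nfreq, fullfft, amf, and stream are the function's own context):

    // Stage the complex result, then copy just the real plane to the output.
    auto amf_tmp_v = make_tensor<T1>(fullfft.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);

    (amf_tmp_v = (float)nfreq * abs(fftshift1D(fullfft))).run(stream);
    matx::copy(amf, amf_tmp_v.RealView(), stream);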
7 changes: 1 addition & 6 deletions include/matx/transforms/fft.h
@@ -708,7 +708,6 @@ auto GetFFTInputView([[maybe_unused]] OutputTensor &o,
return i.Slice(starts, ends);
}
else { // FFT length is longer than the input. Pad input
- T2 *i_pad;

// If the input needs to be padded we have to temporarily allocate a new
// buffer, zero the output, then copy our input buffer. This is not very
@@ -721,11 +720,7 @@ auto GetFFTInputView([[maybe_unused]] OutputTensor &o,
auto tot = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<typename decltype(shape)::value_type>());

// Make a new buffer large enough for our input
- matxAlloc(reinterpret_cast<void **>(&i_pad),
- sizeof(T1) * tot, MATX_ASYNC_DEVICE_MEMORY,
- stream);

- auto i_new = make_tensor<T2>(i_pad, shape);
+ auto i_new = make_tensor<T2>(shape, MATX_ASYNC_DEVICE_MEMORY, stream);
ends[RANK - 1] = i.Lsize();
auto i_pad_part_v = i_new.Slice(starts, ends);

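For context, GetFFTInputView only takes this path when the requested FFT length is longer than the input, so the input has to be zero-padded. With the change above, the padded buffer is created directly in async device memory, zeroed, and the original samples are copied into a leading slice of it. A rough rank-1 sketch of the idiom under those assumptions (fft_len is an illustrative name; the real code computes the padded shape for arbitrary rank):

    // Pad a length-i.Lsize() input `i` of type T2 up to fft_len samples.
    auto i_new = make_tensor<T2>({fft_len}, MATX_ASYNC_DEVICE_MEMORY, stream);

    (i_new = 0).run(stream);                           // zero the whole padded buffer
    auto i_pad_part_v = i_new.Slice({0}, {i.Lsize()}); // view over the leading samples
    matx::copy(i_pad_part_v, i, stream);               // copy the original input in
    // i_new is then used as the zero-padded FFT input view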
28 changes: 12 additions & 16 deletions include/matx/transforms/matmul.h
@@ -626,32 +626,28 @@ class matxMatMulHandle_t {
// If the tensors are complex half precision, we need to do a planar
// transform since all libraries expect this format at the moment.
if constexpr (is_complex_half_v<T1>) {
- typename T1::value_type *A;
- typename T2::value_type *B;
- typename T3::value_type *C;

- matxAlloc(reinterpret_cast<void **>(&A), a.Bytes(),
- MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&B), b.Bytes(),
- MATX_ASYNC_DEVICE_MEMORY, stream);
- matxAlloc(reinterpret_cast<void **>(&C), c.Bytes(),
- MATX_ASYNC_DEVICE_MEMORY, stream);


auto a_shape = a.Shape();
*(a_shape.begin() + a.Rank() - 2) = a.Size(a.Rank() - 2) * 2;
- auto a_planar = make_tensor<typename T1::value_type>(A, a_shape);
+ auto a_planar = make_tensor<typename T1::value_type>(a_shape, MATX_ASYNC_DEVICE_MEMORY, stream);

auto b_shape = b.Shape();
*(b_shape.begin() + b.Rank() - 2) = b.Size(b.Rank() - 2) * 2;
- auto b_planar = make_tensor<typename T1::value_type>(B, b_shape);
+ auto b_planar = make_tensor<typename T2::value_type>(b_shape, MATX_ASYNC_DEVICE_MEMORY, stream);

+ auto c_shape = c.Shape();
+ *(c_shape.begin() + c.Rank() - 2) = c.Size(c.Rank() - 2) * 2;
+ auto c_planar = make_tensor<typename T3::value_type>(c_shape, MATX_ASYNC_DEVICE_MEMORY, stream);

// Convert A/B to planar layout
(a_planar = planar(a)).run(stream);
(b_planar = planar(b)).run(stream);

- a_adj.Reset(reinterpret_cast<T1 *>(A));
- b_adj.Reset(reinterpret_cast<T2 *>(B));
- c_adj.Reset(reinterpret_cast<T3 *>(C));
+ // update pointers to planar data.
+ // must use Reset because types for planar are different
+ a_adj.Reset(reinterpret_cast<T1 *>(a_planar.Data()));
+ b_adj.Reset(reinterpret_cast<T2 *>(b_planar.Data()));
+ c_adj.Reset(reinterpret_cast<T3 *>(c_planar.Data()));
}

// Prep for batch looping
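The complex-half branch converts the interleaved complex inputs to planar layout (real and imaginary parts stored as two separate planes), which is why the second-to-last dimension is doubled before the allocation; the handle's adjusted tensors are then pointed at the planar storage with Reset(). A sketch of that conversion for one operand, using only calls visible in the diff (rank and shapes simplified):

    // `a` has complex-half scalar type T1; double the second-to-last dim so the
    // planar buffer can hold both planes back to back.
    auto a_shape = a.Shape();
    *(a_shape.begin() + a.Rank() - 2) = a.Size(a.Rank() - 2) * 2;

    // The planar temporary now lives in stream-ordered device memory.
    auto a_planar = make_tensor<typename T1::value_type>(a_shape, MATX_ASYNC_DEVICE_MEMORY, stream);

    // planar() de-interleaves the complex values into the two planes.
    (a_planar = planar(a)).run(stream);

    // Rebind the handle's view of A to the planar data (types differ, hence Reset).
    a_adj.Reset(reinterpret_cast<T1 *>(a_planar.Data()));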
11 changes: 2 additions & 9 deletions include/matx/transforms/reduce.h
@@ -1180,11 +1180,7 @@ void __MATX_INLINE__ median(OutType dest,
constexpr int RANK_IN = TensorInType::Rank();
static_assert(RANK_IN <= 2 && (RANK_IN == OutType::Rank() + 1));

- T *tmp_alloc;
- matxAlloc(reinterpret_cast<void **>(&tmp_alloc),
- sizeof(T) * TotalSize(in), MATX_ASYNC_DEVICE_MEMORY,
- stream);
- auto tmp_sort = make_tensor<T>(tmp_alloc, in.Shape());
+ auto tmp_sort = make_tensor<T>(in.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);

// If the rank is 0 we're finding the median of a vector
if constexpr (RANK_IN == 1) {
Expand Down Expand Up @@ -1223,7 +1219,6 @@ void __MATX_INLINE__ median(OutType dest,
}
}

- matxFree(tmp_alloc);
#endif
}

@@ -1503,11 +1498,9 @@ template <typename OutType, typename InType>
void __MATX_INLINE__ var(OutType dest, const InType &in, cudaStream_t stream = 0)
{
#ifdef __CUDACC__
- typename InType::scalar_type *tmps;
using inner_type = typename inner_op_type_t<typename InType::scalar_type>::type;

- matxAlloc((void **)&tmps, TotalSize(dest)*sizeof(decltype(*tmps)), MATX_ASYNC_DEVICE_MEMORY, stream);
- auto mean_tns = make_tensor<typename InType::scalar_type>(tmps, dest.Descriptor());
+ auto mean_tns = make_tensor<typename InType::scalar_type>(dest.Descriptor(), MATX_ASYNC_DEVICE_MEMORY, stream);

// Compute mean of each dimension
mean(mean_tns, in, stream);
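Two small consequences show up in this file: median() no longer needs the trailing matxFree(), since the async-allocated tmp_sort is expected to release its memory when it goes out of scope, and var() now sizes its temporary straight from the output's descriptor, so its shape and strides match dest without any manual byte counting. A short sketch of the var() pattern under those assumptions:

    // `dest` is the reduction output, `in` the input, both as in var() above.
    auto mean_tns = make_tensor<typename InType::scalar_type>(
        dest.Descriptor(), MATX_ASYNC_DEVICE_MEMORY, stream);

    mean(mean_tns, in, stream);   // per-dimension mean, as in the surrounding code
    // ...variance is then accumulated against mean_tns; no explicit free needed.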
6 changes: 1 addition & 5 deletions include/matx/transforms/solver.h
@@ -709,11 +709,7 @@ class matxDnSVDSolverPlan_t : public matxDnSolver_t {
{
static_assert(RANK >= 2);

- T1 *tmp;
- matxAlloc(reinterpret_cast<void **>(&tmp), a.Bytes(), MATX_DEVICE_MEMORY);
- MATX_ASSERT(tmp != nullptr, matxOutOfMemory);

- scratch = make_tensor_p<T1>(tmp, a.Shape());
+ scratch = make_tensor_p<T1>(a.Shape(), MATX_DEVICE_MEMORY);
params = GetSVDParams(u, s, v, *scratch, jobu, jobvt);

GetWorkspaceSize(&hspace, &dspace);
