From a3f19e3be873a8c9a902d21894dc3f10d99b2e7f Mon Sep 17 00:00:00 2001
From: cliffburdick
Date: Mon, 20 Mar 2023 11:06:48 -0700
Subject: [PATCH] Adding ref-count for DLPack for clients to let local tensors
 go out of scope

---
 include/matx/core/tensor.h         | 38 ++++++++++++++++++------------
 test/00_tensor/BasicTensorTests.cu |  2 ++
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h
index 81b9dbda..52cfa5c0 100644
--- a/include/matx/core/tensor.h
+++ b/include/matx/core/tensor.h
@@ -1034,7 +1034,6 @@ class tensor_t : public detail::tensor_impl_t<T, RANK, Desc> {
     return storage_.use_count();
   }
 
-
   /**
    * Create an overlapping tensor view
    *
@@ -1752,6 +1751,10 @@ class tensor_t : public detail::tensor_impl_t<T, RANK, Desc> {
    * returns a DLPack structure based on a tensor_t. The caller is responsible for freeing the memory
    * by calling ->deleter(self).
    *
+   * **Note**: This function will increment the reference count of the tensor. It is expected that once a tensor
+   * is converted to DLPack someone will eventually call deleter(). If that does not happen a memory leak
+   * will occur.
+   *
    * @returns Pointer to new DLManagedTensorVersioned pointer. The caller must call the deleter function when finished.
    */
   DLManagedTensor *GetDLPackTensor() const {
@@ -1762,14 +1765,14 @@ class tensor_t : public detail::tensor_impl_t<T, RANK, Desc> {
     CUpointer_attribute attr[] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL};
     CUmemorytype mem_type;
     int dev_ord;
-    void *data[2] = {&mem_type, &dev_ord};
+    void *data[2]       = {&mem_type, &dev_ord};
 
-    t->data = static_cast<void*>(this->ldata_);
+    t->data             = static_cast<void*>(this->ldata_);
     t->device.device_id = 0;
 
     // Determine where this memory resides
-    auto kind = GetPointerKind(this->ldata_);
-    auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->ldata_));
+    auto kind     = GetPointerKind(this->ldata_);
+    auto mem_res  = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->ldata_));
     MATX_ASSERT_STR_EXP(mem_res, CUDA_SUCCESS, matxCudaError, "Error returned from cuPointerGetAttributes");
     if (kind == MATX_INVALID_MEMORY) {
       if (mem_type == CU_MEMORYTYPE_DEVICE) {
@@ -1802,28 +1805,33 @@ class tensor_t : public detail::tensor_impl_t<T, RANK, Desc> {
       }
     }
 
-    t->ndim = RANK;
-    t->dtype = detail::TypeToDLPackType<T>();
-    t->shape = new int64_t[RANK];
-    t->strides = new int64_t[RANK];
+    t->ndim             = RANK;
+    t->dtype            = detail::TypeToDLPackType<T>();
+    t->shape            = new int64_t[RANK];
+    t->strides          = new int64_t[RANK];
     for (int r = 0; r < RANK; r++) {
-      t->shape[r] = this->Size(r);
+      t->shape[r]   = this->Size(r);
       t->strides[r] = this->Stride(r);
     }
 
-    t->byte_offset = 0;
+    t->byte_offset      = 0;
 
-    mt->manager_ctx = nullptr;
+    // Increment reference count by making a copy of the shared_ptr by allocating on the heap and
+    // setting it as the context
+    auto t_copy = new self_type{*this};
+    //*t_copy = *this;
+    mt->manager_ctx = t_copy;
+
     //mt->flags = 0; // Only for v1.0
     //auto deleter = [](struct DLManagedTensorVersioned *mtv) { // v1.0
     auto deleter = [](struct DLManagedTensor *mtv) {
       delete [] mtv->dl_tensor.shape;
       delete [] mtv->dl_tensor.strides;
+      delete static_cast<self_type *>(mtv->manager_ctx);
       delete mtv;
-
-      mtv->dl_tensor.shape = nullptr;
-      mtv->dl_tensor.strides = nullptr;
-      mtv = nullptr;
+      mtv->dl_tensor.shape   = nullptr;
+      mtv->dl_tensor.strides = nullptr;
+      mtv                    = nullptr;
     };
 
     mt->deleter = deleter;
diff --git a/test/00_tensor/BasicTensorTests.cu b/test/00_tensor/BasicTensorTests.cu
index 56bc8ccf..d26674bb 100644
--- a/test/00_tensor/BasicTensorTests.cu
+++ b/test/00_tensor/BasicTensorTests.cu
@@ -471,9 +471,11 @@ TYPED_TEST(BasicTensorTestsAll, DLPack)
   ASSERT_EQ(dl->dl_tensor.strides[0], t.Stride(0));
   ASSERT_EQ(dl->dl_tensor.strides[1], t.Stride(1));
   ASSERT_EQ(dl->dl_tensor.strides[2], t.Stride(2));
+  ASSERT_EQ(t.GetRefCount(), 2);
   dl->deleter(dl);
   ASSERT_EQ(dl->dl_tensor.shape, nullptr);
   ASSERT_EQ(dl->dl_tensor.strides, nullptr);
+  ASSERT_EQ(t.GetRefCount(), 1);
 
   MATX_EXIT_HANDLER();
 }