From ea706628e841846ef08fda833fab748705971c70 Mon Sep 17 00:00:00 2001
From: cliffburdick <cburdick@nvidia.com>
Date: Mon, 28 Aug 2023 08:33:42 -0700
Subject: [PATCH] Added at() and percentile() operators

---
 docs_input/api/manipulation/selecting/at.rst  |  31 ++++
 docs_input/api/stats/misc/index.rst           |   0
 docs_input/api/stats/misc/percentile.rst      |  24 +++
 examples/fft_conv.cu                          |   1 -
 include/matx/core/make_tensor.h               |   6 +-
 include/matx/operators/at.h                   |  81 +++++++++
 include/matx/operators/flatten.h              |   7 +-
 include/matx/operators/operators.h            |   2 +
 include/matx/operators/percentile.h           | 160 ++++++++++++++++
 include/matx/transforms/percentile.h          | 171 ++++++++++++++++++
 include/matx/transforms/reduce.h              |  69 +++++++
 test/00_operators/OperatorTests.cu            |  32 ++++
 test/00_operators/ReductionTests.cu           |  75 ++++++++
 test/test_vectors/generators/00_reductions.py |  26 +++
 14 files changed, 680 insertions(+), 5 deletions(-)
 create mode 100644 docs_input/api/manipulation/selecting/at.rst
 create mode 100644 docs_input/api/stats/misc/index.rst
 create mode 100644 docs_input/api/stats/misc/percentile.rst
 create mode 100644 include/matx/operators/at.h
 create mode 100644 include/matx/operators/percentile.h
 create mode 100644 include/matx/transforms/percentile.h

diff --git a/docs_input/api/manipulation/selecting/at.rst b/docs_input/api/manipulation/selecting/at.rst
new file mode 100644
index 000000000..a69c7652f
--- /dev/null
+++ b/docs_input/api/manipulation/selecting/at.rst
@@ -0,0 +1,31 @@
+.. _at_func:
+
+at
+==
+
+Selects a single value from an operator. Since `at` is a lazily-evaluated operator, it should be used
+in situations where `operator()` cannot be used. For instance:
+
+.. code-block:: cpp
+
+    (a = b(5)).run();
+
+The code above creates a race condition: `b(5)` is evaluated on the host before launch, when the value may
+not yet have been computed by a previous operation. Instead, the `at()` operator can be used to defer the load
+until the operation is launched:
+
+.. code-block:: cpp
+
+    (a = at(b, 5)).run();
+
+.. doxygenfunction:: at(const Op op, Is... indices)
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../../test/00_operators/OperatorTests.cu
+   :language: cpp
+   :start-after: example-begin at-test-1
+   :end-before: example-end at-test-1
+   :dedent:
+
diff --git a/docs_input/api/stats/misc/index.rst b/docs_input/api/stats/misc/index.rst
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs_input/api/stats/misc/percentile.rst b/docs_input/api/stats/misc/percentile.rst
new file mode 100644
index 000000000..76711ed00
--- /dev/null
+++ b/docs_input/api/stats/misc/percentile.rst
@@ -0,0 +1,24 @@
+.. _percentile_func:
+
+percentile
+##########
+
+Find the q-th percentile of an input sequence. ``q`` is a value between 0 and 100 representing the percentile. A value
+of 0 is equivalent to min, 100 is max, and 50 is the median when using the ``LINEAR`` method.
+
+.. note::
+    Multiple q values are not supported yet
+
+Supported methods for interpolation are: LINEAR, HAZEN, WEIBULL, LOWER, HIGHER, MIDPOINT, NEAREST, MEDIAN_UNBIASED, and NORMAL_UNBIASED
+
+.. doxygenfunction:: percentile(const InType &in, unsigned char q, PercentileMethod method = PercentileMethod::LINEAR)
+.. doxygenfunction:: percentile(const InType &in, unsigned char q, const int (&dims)[D], PercentileMethod method = PercentileMethod::LINEAR)
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../../test/00_operators/ReductionTests.cu
+   :language: cpp
+   :start-after: example-begin percentile-test-1
+   :end-before: example-end percentile-test-1
+   :dedent:
diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu
index 1f00b0572..1780fe624 100644
--- a/examples/fft_conv.cu
+++ b/examples/fft_conv.cu
@@ -169,7 +169,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
     }
   }
 
-
   std::cout << "Verification successful" << std::endl;
 
   CUDA_CHECK_LAST_ERROR();
diff --git a/include/matx/core/make_tensor.h b/include/matx/core/make_tensor.h
index a51767cdc..42d5e094d 100644
--- a/include/matx/core/make_tensor.h
+++ b/include/matx/core/make_tensor.h
@@ -72,9 +72,9 @@ auto make_tensor( const index_t (&shape)[RANK],
  * @param stream cuda stream to allocate in (only applicable to async allocations)
  **/
 template <typename TensorType, std::enable_if_t< is_tensor_view_v<TensorType>, bool> = true>
-void make_tensor( TensorType &tensor, 
-                  const index_t (&shape)[TensorType::Rank()], 
-                  matxMemorySpace_t space = MATX_MANAGED_MEMORY, 
+void make_tensor( TensorType &tensor,
+                  const index_t (&shape)[TensorType::Rank()],
+                  matxMemorySpace_t space = MATX_MANAGED_MEMORY,
                   cudaStream_t stream = 0) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
diff --git a/include/matx/operators/at.h b/include/matx/operators/at.h
new file mode 100644
index 000000000..b6d8da449
--- /dev/null
+++ b/include/matx/operators/at.h
@@ -0,0 +1,81 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2021, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "matx/operators/base_operator.h"
+
+namespace matx
+{
+
+  /**
+   * Lazily selects a single element of an operator at a fixed set of indices.
+   */
+  namespace detail {
+    template <typename Op, typename... Is>
+    class AtOp : public BaseOp<AtOp<Op, Is...>>
+    {
+      private:
+        Op op_;                                   // operator the element is read from
+        std::array<index_t, sizeof...(Is)> idx_;  // fixed indices, one per constructor argument
+
+      public:
+        using matxop = bool;
+        using scalar_type = typename Op::scalar_type;
+
+        __MATX_INLINE__ std::string str() const { return "at()"; }
+        __MATX_INLINE__ AtOp(Op op, Is... is) : op_(op), idx_{static_cast<index_t>(is)...} {} // cast avoids narrowing in the braced init
+        // The indices passed by the caller are ignored; op_ is always read at the stored indices
+        template <typename... Is2>
+        __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()([[maybe_unused]] Is2... indices) const
+        {
+          return mapply(op_, idx_);
+        }
+
+        static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
+        {
+          return 0;
+        }
+
+        constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size([[maybe_unused]] int dim) const
+        {
+          return index_t(0);
+        }
+    };
+  }
+  /// Lazily selects the element of `op` at `indices`: the read is deferred until the expression
+  /// is executed instead of happening on the host when the expression is built.
+  template <typename Op, typename... Is, std::enable_if_t<((std::is_integral_v<Is>) && ...), bool> = true>
+  __MATX_INLINE__ auto at(const Op op, Is... indices) {
+    return detail::AtOp(op, indices...);
+  }
+} // end namespace matx
diff --git a/include/matx/operators/flatten.h b/include/matx/operators/flatten.h
index 342383394..a8107fe9c 100644
--- a/include/matx/operators/flatten.h
+++ b/include/matx/operators/flatten.h
@@ -103,7 +103,12 @@ namespace matx
   template <typename T1>
     auto __MATX_INLINE__ flatten(const T1 &a)
     {
-      return detail::FlattenOp<T1>(a);
+      if constexpr (T1::Rank() <= 1) {  // rank 0/1 input: nothing to flatten, return it as-is
+        return a;
+      }
+      else {  // higher ranks are wrapped in a FlattenOp
+        return detail::FlattenOp<T1>(a);
+      }
     };
 
 } // end namespace matx
diff --git a/include/matx/operators/operators.h b/include/matx/operators/operators.h
index 2d1f68691..bbc735bed 100644
--- a/include/matx/operators/operators.h
+++ b/include/matx/operators/operators.h
@@ -36,6 +36,7 @@
 #include "matx/operators/binary_operators.h"
 
 #include "matx/operators/ambgfun.h"
+#include "matx/operators/at.h"
 #include "matx/operators/cart2sph.h"
 #include "matx/operators/collapse.h"
 #include "matx/operators/concat.h"
@@ -73,6 +74,7 @@
 #include "matx/operators/legendre.h"
 #include "matx/operators/lu.h"
 #include "matx/operators/matmul.h"
+#include "matx/operators/percentile.h"
 #include "matx/operators/permute.h"
 #include "matx/operators/planar.h"
 #include "matx/operators/qr.h"
diff --git a/include/matx/operators/percentile.h b/include/matx/operators/percentile.h
new file mode 100644
index 000000000..02a5f1b6f
--- /dev/null
+++ b/include/matx/operators/percentile.h
@@ -0,0 +1,160 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2021, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "matx/core/type_utils.h"
+#include "matx/operators/base_operator.h"
+#include "matx/operators/permute.h"
+#include "matx/transforms/percentile.h"
+
+namespace matx {
+
+
+namespace detail {
+  template<typename OpA, int ORank>
+  class PercentileOp : public BaseOp<PercentileOp<OpA,ORank>>
+  {
+    private:
+      OpA a_;                                // input operator
+      uint32_t q_;                           // percentile to compute
+      PercentileMethod method_;              // interpolation method
+      std::array<index_t, ORank> out_dims_;  // shape of the result
+      mutable matx::tensor_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_; // result storage, allocated in PreRun
+
+    public:
+      using matxop = bool;
+      using scalar_type = typename remove_cvref_t<OpA>::scalar_type;
+      using matx_transform_op = bool;
+      using prod_xform_op = bool; // NOTE(review): tag name looks copied from the prod operator - confirm
+
+      __MATX_INLINE__ std::string str() const { return "percentile(" + get_type_str(a_) + ")"; }
+      __MATX_INLINE__ PercentileOp(OpA a, unsigned char q, PercentileMethod method) : a_(a), q_(q), method_(method) {
+        for (int r = 0; r < ORank; r++) {
+          out_dims_[r] = a_.Size(r);  // result keeps the leading ORank dimensions of the input
+        }
+      }
+      // Reads the result that PreRun computed into tmp_out_
+      template <typename... Is>
+      __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const {
+        return tmp_out_(indices...);
+      }
+
+      template <typename Out, typename Executor>
+      void Exec(Out &&out, Executor &&ex) const {
+        percentile_impl(std::get<0>(out), a_, q_, method_, ex);
+      }
+
+      static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
+      {
+        return ORank;
+      }
+      // Runs the input's PreRun, allocates tmp_out_ for the executor, then computes the percentile into it
+      template <typename ShapeType, typename Executor>
+      __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
+      {
+        if constexpr (is_matx_op<OpA>()) {
+          a_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
+        }
+
+        if constexpr (is_device_executor_v<Executor>) {
+          make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
+        }
+        else {
+          make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY);
+        }
+
+        Exec(std::make_tuple(tmp_out_), std::forward<Executor>(ex));
+      }
+
+      constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const
+      {
+        return out_dims_[dim];
+      }
+
+  };
+}
+
+/**
+ * Compute the q-th percentile of the input along the given axes
+ *
+ * Returns an operator representing the q-th percentile over the reduced dimensions
+ *
+ * @tparam InType
+ *   Input data type
+ * @tparam D
+ *   Num of dimensions to reduce over
+ *
+ * @param in
+ *   Input data to reduce
+ * @param q
+ *   Percentile to compute (between 0-100 inclusive)
+ * @param dims
+ *   Array containing dimensions to compute over
+ * @param method
+ *   Method of interpolation
+ * @returns Operator with the q-th percentile of the reduced values
+ */
+template <typename InType, int D>
+__MATX_INLINE__ auto percentile(const InType &in, unsigned char q, const int (&dims)[D], PercentileMethod method = PercentileMethod::LINEAR)
+{
+  static_assert(D < InType::Rank(), "reduction dimensions must be < Rank of input");
+  MATX_ASSERT_STR(q <= 100, matxInvalidParameter, "Percentile must be <= 100");
+  auto perm = detail::getPermuteDims<InType::Rank()>(dims);
+  auto permop = permute(in, perm);
+
+  return detail::PercentileOp<decltype(permop), InType::Rank() - D>(permop, q, method);
+}
+
+/**
+ * Compute the q-th percentile over all elements of the input
+ *
+ * @tparam InType
+ *   Input data type
+ *
+ * @param in
+ *   Input data to reduce
+ * @param q
+ *   Percentile to compute (between 0-100 inclusive)
+ * @param method
+ *   Method of interpolation
+ * @returns Rank-0 operator holding the q-th percentile of all input values
+ */
+template <typename InType>
+__MATX_INLINE__ auto percentile(const InType &in, unsigned char q, PercentileMethod method = PercentileMethod::LINEAR)
+{
+  // Reject q > 100 here: percentile_impl would otherwise index past the end of the sorted data
+  MATX_ASSERT_STR(q <= 100, matxInvalidParameter, "Percentile must be <= 100");
+  return detail::PercentileOp<decltype(in), 0>(in, q, method);
+}
+
+}
\ No newline at end of file
diff --git a/include/matx/transforms/percentile.h b/include/matx/transforms/percentile.h
new file mode 100644
index 000000000..8b34ac8c5
--- /dev/null
+++ b/include/matx/transforms/percentile.h
@@ -0,0 +1,171 @@
+#pragma once
+
+#include <cfloat>
+
+#include "matx/core/cache.h"
+#include "matx/core/error.h"
+#include "matx/core/get_grid_dims.h"
+#include "matx/core/nvtx.h"
+#include "matx/core/tensor.h"
+#include "matx/core/type_utils.h"
+#include "matx/core/utils.h"
+#include "matx/transforms/cub.h"
+#include "matx/transforms/copy.h"
+#include "matx/core/half.h"
+
+namespace matx {
+
+  enum class PercentileMethod {
+    LINEAR,           // interpolate between neighboring sorted elements, alpha = beta = 1
+    LOWER,            // sorted element at the truncated (rounded-down) index
+    HIGHER,           // sorted element at the rounded-up index
+    HAZEN,            // interpolate with alpha = beta = 1/2
+    WEIBULL,          // interpolate with alpha = beta = 0
+    MEDIAN_UNBIASED,  // interpolate with alpha = beta = 1/3
+    NORMAL_UNBIASED,  // interpolate with alpha = beta = 3/8
+    MIDPOINT,         // mean of two neighboring sorted elements
+    NEAREST           // sorted element at the index rounded to nearest
+  };
+
+namespace detail {
+
+/**
+ * Calculate the q-th percentile of all values in an operator
+ *
+ * Sorts the flattened input ascending into a temporary rank-1 tensor, then selects or interpolates the
+ * percentile per `method`. Only rank-0 outputs are computed; any other output rank is currently a no-op.
+ *
+ * @tparam OutType
+ *   Output operator type
+ * @tparam InType
+ *   Input operator type
+ * @tparam Executor
+ *   Executor type
+ *
+ * @param dest
+ *   Destination view of reduction
+ * @param in
+ *   Input data to reduce
+ * @param q
+ *   Percentile to compute (between 0-100 inclusive)
+ * @param method
+ *   Method of interpolation
+ * @param exec
+ *   Executor used for the temporary allocation, the sort and the final selection
+ */
+template <typename OutType, typename InType, typename Executor>
+void __MATX_INLINE__ percentile_impl(OutType dest, const InType &in, uint32_t q, PercentileMethod method, Executor &&exec)
+{
+  MATX_NVTX_START("percentile_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
+
+  double alpha  = 0.0;
+  double beta   = 0.0;
+  double subidx = 0.0;
+
+  // Compute alpha and beta if used
+  switch (method) {
+    case PercentileMethod::LINEAR: {
+      alpha = 1.0;
+      beta  = 1.0;
+      break;
+    }
+    case PercentileMethod::HAZEN: {
+      alpha = 0.5;
+      beta  = 0.5;
+      break;
+    }
+    case PercentileMethod::WEIBULL: {
+      alpha = 0;
+      beta  = 0;
+      break;
+    }
+    case PercentileMethod::MEDIAN_UNBIASED: {
+      alpha = 1./3;
+      beta  = 1./3;
+      break;
+    }
+    case PercentileMethod::NORMAL_UNBIASED: {
+      alpha = 3./8;
+      beta  = 3./8;
+      break;
+    }
+    case PercentileMethod::LOWER: [[fallthrough]];
+    case PercentileMethod::HIGHER: [[fallthrough]];
+    case PercentileMethod::NEAREST: [[fallthrough]];
+    case PercentileMethod::MIDPOINT:
+      break;
+    default: {
+      MATX_ASSERT_STR(false, matxInvalidParameter, "Method for percentile() not supported yet");
+      return;
+    }
+  }
+
+  if constexpr (OutType::Rank() == 0) {
+    auto insize = TotalSize(in);
+    matx::tensor_t<typename InType::scalar_type, 1> sort_out;
+    if constexpr (is_device_executor_v<Executor>) {
+      make_tensor(sort_out, {insize}, MATX_ASYNC_DEVICE_MEMORY, exec.getStream());
+    }
+    else {
+      make_tensor(sort_out, {insize}, MATX_HOST_MEMORY);
+    }
+
+    sort_impl(sort_out, flatten(in), SORT_DIR_ASC, exec);
+
+    // If we're landing directly onto an index after the q multiplication we should make sure that's the case
+    // and not allow floating point error to move us to the wrong index.
+    double base_index = ((q * (insize - 1)) % 100) == 0 ? static_cast<double>(q * (insize - 1) / 100) : static_cast<double>(insize - 1) * q/100.;
+
+    if (q == 0) {
+      (dest = at(sort_out, 0)).run(exec);
+    }
+    else if (q == 100) {
+      (dest = at(sort_out, insize - 1)).run(exec);
+    }
+    else {
+      switch (method) {
+        case PercentileMethod::LINEAR: [[fallthrough]];
+        case PercentileMethod::HAZEN: [[fallthrough]];
+        case PercentileMethod::WEIBULL: [[fallthrough]];
+        case PercentileMethod::MEDIAN_UNBIASED: [[fallthrough]];
+        case PercentileMethod::NORMAL_UNBIASED:
+        {
+          // Clamp the fractional index to [0, insize - 1] so the alpha/beta methods never read out of bounds
+          const double last = static_cast<double>(insize - 1);
+          subidx = q/100. * (static_cast<double>(insize) - alpha - beta + 1) + alpha - 1;
+          subidx = (subidx < 0.) ? 0. : ((subidx > last) ? last : subidx);
+          const index_t lo = static_cast<index_t>(subidx);
+          const index_t hi = (static_cast<double>(lo) < last) ? lo + 1 : lo;
+          (dest = at(sort_out, lo) + as_type<typename InType::scalar_type>(
+                    (subidx - static_cast<double>(lo)) * as_type<double>(at(sort_out, hi) - at(sort_out, lo)))).run(exec);
+          break;
+        }
+        case PercentileMethod::LOWER: {
+          (dest = at(sort_out, static_cast<index_t>(base_index))).run(exec);
+          break;
+        }
+        case PercentileMethod::HIGHER: {
+          (dest = at(sort_out, static_cast<index_t>(cuda::std::ceil(base_index)))).run(exec);
+          break;
+        }
+        case PercentileMethod::MIDPOINT: {
+          // Mean of the sorted elements just below (truncated index) and just above (ceil) base_index
+          (dest = as_type<typename InType::scalar_type>(
+                  (at(sort_out, static_cast<index_t>(base_index)) +
+                   at(sort_out, static_cast<index_t>(cuda::std::ceil(base_index)))) /
+                   static_cast<typename InType::scalar_type>(2))).run(exec);
+          break;
+        }
+        case PercentileMethod::NEAREST: {
+          (dest = at(sort_out, static_cast<index_t>(cuda::std::round(base_index)))).run(exec);
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  } // NOTE(review): outputs with rank > 0 fall through here and `dest` is left untouched
+}
+
+  }
+}
\ No newline at end of file
diff --git a/include/matx/transforms/reduce.h b/include/matx/transforms/reduce.h
index 16da526ce..c646464c8 100644
--- a/include/matx/transforms/reduce.h
+++ b/include/matx/transforms/reduce.h
@@ -1841,6 +1841,75 @@ void __MATX_INLINE__ median_impl(OutType dest, const InType &in, [[maybe_unused]
 }
 
 
+// /**
+//  * Calculate the median of values in a tensor
+//  *
+//  * Calculates the median of rows in a tensor. The median is computed by sorting
+//  * the data into a temporary tensor, then picking the middle element of each
+//  * row. For an even number of items, the mean of the two middle elements is
+//  * selected. Currently only works on tensor views as input since it uses CUB
+//  * sorting as a backend, and the tensor views must be rank 2 reducing to rank 1,
+//  * or rank 1 reducing to rank 0.
+//  *
+//  * @tparam T
+//  *   Output data type
+//  * @tparam RANK
+//  *   Rank of output tensor
+//  * @tparam RANK_IN
+//  *   Input rank
+//  *
+//  * @param dest
+//  *   Destination view of reduction
+//  * @param in
+//  *   Input data to reduce
+//  * @param exec
+//  *   Single thread host executor
+//  */
+// template <typename OutType, typename InType>
+// void __MATX_INLINE__ percentile_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+// {
+//   MATX_NVTX_START("percentile_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
+//   auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) { 
+//     if constexpr (OutType::Rank() == 0) {
+//       auto insize = TotalSize(in);
+//       auto tin = new typename InType::scalar_type[insize];      
+//       std::partial_sort_copy( lin, 
+//                               lin + insize, 
+//                               tin, 
+//                               tin + insize);
+//       if ((insize % 2) == 0) {
+//         *lout = (tin[insize / 2] + tin[insize / 2 - 1]) / 2.0f;
+//       }
+//       else {
+//         *lout = tin[insize / 2];
+//       }
+
+//       delete [] tin;          
+//     }
+//     else {
+//       auto insize = lin.Size(1);
+//       auto tin = new typename InType::scalar_type[insize];      
+//       for (index_t b = 0; b < lin.Size(0); b++) {
+//         std::partial_sort_copy( lin + lbegin[b], 
+//                                 lin + lend[b], 
+//                                 tin, 
+//                                 tin + insize);
+
+//         if ((insize % 2) == 0) {        
+//           *(lout + b) = (tin[insize / 2] + tin[insize / 2 - 1]) / 2.0f;
+//         }
+//         else {
+//           *(lout + b) = tin[insize / 2];
+//         }     
+//       }
+
+//       delete [] tin;            
+//     }
+//   };
+  
+//   ReduceInput(ft, dest, in);  
+// }
+
 
 /**
  * Compute sum of numbers
diff --git a/test/00_operators/OperatorTests.cu b/test/00_operators/OperatorTests.cu
index dcb0cd427..20f8f3316 100644
--- a/test/00_operators/OperatorTests.cu
+++ b/test/00_operators/OperatorTests.cu
@@ -663,6 +663,38 @@ TYPED_TEST(OperatorTestsNumericAllExecs, CloneOp)
   MATX_EXIT_HANDLER();
 }
 
+TYPED_TEST(OperatorTestsNumericNonComplexAllExecs, AtOp)
+{
+  MATX_ENTER_HANDLER();
+  using TestType = std::tuple_element_t<0, TypeParam>;
+  using ExecType = std::tuple_element_t<1, TypeParam>;
+  // Checks that at() yields the same element as direct host-side indexing once the run has finished
+  ExecType exec{};
+  auto t2 = make_tensor<TestType>({2,10});
+
+  // example-begin at-test-1
+  auto t1 = make_tensor<TestType>({10});
+  auto t0 = make_tensor<TestType>({});
+
+  t1.SetVals({10, 20, 30, 40, 50, 60, 70, 80, 90, 100});
+  (t2 = t1).run(exec);
+
+  // Select the fourth element from `t1` as part of the execution. Value should match 
+  // `t1(3)` after execution
+  (t0 = at(t1, 3)).run(exec);
+  // example-end at-test-1
+  cudaStreamSynchronize(0); // wait before reading t0 on the host
+
+  ASSERT_EQ(t0(), t1(3));
+
+  (t0 = at(t2, 1, 4)).run(exec); // same check with one index per dimension of the rank-2 tensor
+  cudaStreamSynchronize(0);
+
+  ASSERT_EQ(t0(), t2(1, 4));  
+
+  MATX_EXIT_HANDLER();
+}
+
 
 
 TYPED_TEST(OperatorTestsFloatNonComplexAllExecs, SliceStrideOp)
diff --git a/test/00_operators/ReductionTests.cu b/test/00_operators/ReductionTests.cu
index 8ae243af6..4754d6c28 100644
--- a/test/00_operators/ReductionTests.cu
+++ b/test/00_operators/ReductionTests.cu
@@ -690,6 +690,81 @@ TYPED_TEST(ReductionTestsNumericNonComplexAllExecs, All)
   MATX_EXIT_HANDLER();
 }
 
+
+TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Percentile)
+{
+  using TestType = std::tuple_element_t<0, TypeParam>;
+  using ExecType = std::tuple_element_t<1, TypeParam>;
+  // Compares percentile() against the reference values produced by the "percentile" generator
+  auto pb = std::make_unique<detail::MatXPybind>();
+  const index_t dsize = 6; // t1e has dsize elements, t1o has dsize + 1
+  pb->InitAndRunTVGenerator<TestType>("00_reductions", "percentile", "run", {dsize});
+
+  ExecType exec{};
+
+  MATX_ENTER_HANDLER();
+  {
+    auto t1e = make_tensor<TestType>({dsize});
+    auto t1o = make_tensor<TestType>({dsize+1});
+    auto t0 = make_tensor<TestType>({});
+    pb->NumpyToTensorView(t1e, "t1e");
+    pb->NumpyToTensorView(t1o, "t1o");
+
+    // example-begin percentile-test-1
+    // Find the 50th percentile value in `t1e` using linear interpolation between midpoints
+    (t0 = percentile(t1e, 50, PercentileMethod::LINEAR)).run(exec);
+    // example-end percentile-test-1
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_linear50", 0.01);
+
+    (t0 = percentile(t1e, 80, PercentileMethod::LINEAR)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_linear80", 0.01);
+
+    (t0 = percentile(t1e, 50, PercentileMethod::LOWER)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_lower50", 0.01);
+
+    (t0 = percentile(t1e, 80, PercentileMethod::LOWER)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_lower80", 0.01);
+
+    (t0 = percentile(t1e, 50, PercentileMethod::HIGHER)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_higher50", 0.01);
+
+    (t0 = percentile(t1e, 80, PercentileMethod::HIGHER)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1e_higher80", 0.01);
+    // Same checks on the input with dsize + 1 elements
+    (t0 = percentile(t1o, 50, PercentileMethod::LINEAR)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_linear50", 0.01);
+
+    (t0 = percentile(t1o, 80, PercentileMethod::LINEAR)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_linear80", 0.01);
+
+    (t0 = percentile(t1o, 50, PercentileMethod::LOWER)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_lower50", 0.01);
+
+    (t0 = percentile(t1o, 80, PercentileMethod::LOWER)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_lower80", 0.01);
+
+    (t0 = percentile(t1o, 50, PercentileMethod::HIGHER)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_higher50", 0.01);
+
+    (t0 = percentile(t1o, 80, PercentileMethod::HIGHER)).run(exec);
+    cudaStreamSynchronize(0);
+    MATX_TEST_ASSERT_COMPARE(pb, t0, "t1o_higher80", 0.01);    
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
 TYPED_TEST(ReductionTestsFloatNonComplexNonHalfAllExecs, Median)
 {
   MATX_ENTER_HANDLER();
diff --git a/test/test_vectors/generators/00_reductions.py b/test/test_vectors/generators/00_reductions.py
index fceae996e..e2435b1b2 100644
--- a/test/test_vectors/generators/00_reductions.py
+++ b/test/test_vectors/generators/00_reductions.py
@@ -24,3 +24,29 @@ def run(self):
         self.res['t1_sm'] = special.softmax(self.t1)
         self.res['t3_sm_axis2'] = special.softmax(self.t3, axis=2)
         return self.res
+
+class percentile:
+    """Reference data for the percentile tests, computed with ``np.percentile``."""
+
+    def __init__(self, dtype: str, size: List[int]):
+        # Fixed seed so the generated vectors are reproducible between runs
+        np.random.seed(1234)
+        # Two 1D inputs: one with size[0] elements and one with size[0] + 1
+        self.t1e = matx_common.randn_ndarray((size[0],), dtype)
+        self.t1o = matx_common.randn_ndarray((size[0] + 1,), dtype)
+        self.res = {'t1e': self.t1e, 't1o': self.t1o}
+
+    def run(self):
+        """Store np.percentile results under keys '<input>_<method><q>' and return them."""
+        inputs = (('t1e', self.t1e), ('t1o', self.t1o))
+        methods = ('linear', 'lower', 'higher')
+        quantiles = (50, 80)
+
+        # Iteration order (input, then method, then q) keeps the original key insertion order
+        for name, data in inputs:
+            for method in methods:
+                for q in quantiles:
+                    key = f'{name}_{method}{q}'
+                    self.res[key] = np.percentile(data, q, interpolation=method)
+
+        return self.res