NVTX Macros (#276)
NVTX macros added to provide automated NVTX ranges for the MatX API. Enable with MATX_NVTX_FLAGS=ON in the CMake build.
tylera-nvidia authored Oct 6, 2022
1 parent 988414e commit e5de7bd
Showing 28 changed files with 731 additions and 52 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -11,6 +11,7 @@ endif()
option(MATX_BUILD_EXAMPLES "Build examples" OFF)
option(MATX_BUILD_TESTS "Build unit tests" OFF)
option(MATX_BUILD_BENCHMARKS "Build benchmarks" OFF)
option(MATX_NVTX_FLAGS "Enable NVTX Macros" OFF)
option(MATX_BUILD_DOCS "Build documentation. Mutually exclusive with all other options" OFF)
option(MATX_BUILD_32_BIT "Build with 32-bit indexing support" OFF)
option(MATX_MULTI_GPU "Multi-GPU support" OFF)
@@ -241,7 +242,9 @@ if (NOT_SUBPROJECT)
endif()



if (MATX_NVTX_FLAGS)
add_definitions(-DMATX_NVTX_FLAGS)
endif()
if (MATX_BUILD_32_BIT)
add_definitions(-DINDEX_32_BIT)
target_compile_definitions(matx INTERFACE INDEX_32_BIT)
11 changes: 6 additions & 5 deletions bench/00_transform/conv.cu
@@ -1,6 +1,7 @@
#include "matx.h"
#include <nvbench/nvbench.cuh>
#include "matx/core/half_complex.h"
#include "matx/core/nvtx.h"

using namespace matx;

@@ -12,20 +13,20 @@ template <typename ValueType>
void conv1d_4d_batch(nvbench::state &state,
nvbench::type_list<ValueType>)
{


auto out = make_tensor<ValueType>({4, 2, 14, 288 + 4096 + 133 - 1});
auto at = make_tensor<ValueType>({ 4, 2, 14, 133});
auto bt = make_tensor<ValueType>({ 4, 2, 14, 288 + 4096});

out.PrefetchDevice(0);
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();

cudaDeviceSynchronize();
MATX_NVTX_START_RANGE( "Exec", matx_nvxtLogLevels::MATX_NVTX_LOG_ALL, 1 )
state.exec(
[&out, &at, &bt](nvbench::launch &launch) { conv1d(out, at, bt, MATX_C_MODE_FULL, launch.get_stream()); });
MATX_NVTX_END_RANGE( 1 )

}
NVBENCH_BENCH_TYPES(conv1d_4d_batch, NVBENCH_TYPE_AXES(conv_types));

@@ -43,7 +44,7 @@ void conv1d_2d_batch(nvbench::state &state,
out.PrefetchDevice(0);
at.PrefetchDevice(0);
bt.PrefetchDevice(0);

cudaDeviceSynchronize();

state.exec(
1 change: 1 addition & 0 deletions docs_input/api/index.rst
@@ -15,3 +15,4 @@ API Reference
utilities.rst
type_traits.rst
einsum.rst
nvtx.rst
70 changes: 70 additions & 0 deletions docs_input/api/nvtx.rst
@@ -0,0 +1,70 @@
NVTX Profiling
########################

Overview
-----------------
MatX provides an NVTX API to enable native compile-in profiling capabilities. The MatX NVTX API enables a user to
easily profile all MatX calls using built-in NVTX ranges, while also providing a convenient API for the user to insert
custom ranges in their own code. This API provides many convenience features, such as:

- A convenient compile-in/compile-out macro-based API
- Verbosity levels allowing varying degrees of profiling detail
- Built-in color rotation
- Automatic scope management and range naming
- Overloaded API for manual range specification

The MatX NVTX API is implemented as a set of C++ macros, allowing all calls to be compiled out of the project for
maximum performance when profiling is not needed.
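
A hedged sketch of the general compile-in/compile-out pattern this implies (illustrative only; the real macro bodies live in ``matx/core/nvtx.h``, ``EXAMPLE_NVTX_RANGE`` is a hypothetical name, and the ``nvtx3/nvToolsExt.h`` header path is an assumption):

.. code-block:: cpp

   // Illustrative pattern only; not the actual MatX macro definitions.
   #ifdef MATX_NVTX_FLAGS
     #include <nvtx3/nvToolsExt.h>
     // Profiling build: the macro opens a named NVTX range on the calling thread.
     // The trailing semicolon lets the macro be used statement-style without one,
     // matching how the MatX macros appear in the examples in this commit.
     #define EXAMPLE_NVTX_RANGE(msg) nvtxRangePushA(msg);
   #else
     // Non-profiling build: the macro expands to nothing, so the call compiles out entirely.
     #define EXAMPLE_NVTX_RANGE(msg)
   #endif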

Enabling NVTX API and Setting Log Level
----------------------------------------
To enable the NVTX profiling API, simply compile with ``MATX_NVTX_FLAGS=ON`` in the CMake command.
Once the flag is enabled at compile time, the project defaults to logging at the API level, which provides NVTX
ranges for all MatX API calls. If another logging level is desired, it can be changed using the ``matx::setNVTXLogLevel()`` call.
Possible log levels are defined in ``matx_nvxtLogLevels``.
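
A minimal usage sketch, assuming the project was configured with ``-DMATX_NVTX_FLAGS=ON`` and mirroring the ``MATX_NVTX_SET_LOG_LEVEL`` call in ``examples/simple_pipeline.cu``:

.. code-block:: cpp

   #include "matx.h"

   using namespace matx;

   int main()
   {
     // Log everything: user ranges, MatX API ranges, and internal ranges
     MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL );

     // ... MatX calls made after this point are annotated with NVTX ranges ...
     return 0;
   }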

Using the NVTX API
--------------------
The MatX NVTX API consists of two modes: auto-managed and manual range. The auto-managed API automatically matches the NVTX range to
the scope in which it is declared, establishing the NVTX range from the call’s instantiation to the end of its parent scope. Only a single
call is needed, with optional inputs defined below. If no message is provided, the call defaults to using the calling function’s name as
the NVTX range’s message.

The manual range API requires a call to the NVTX API at both the beginning and end of the desired range. It uses a user-defined handle (int)
to reference the NVTX range, and a manual NVTX range must be fully qualified (message, log level, and handle) on every instantiation.
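
A brief sketch contrasting the two modes; the function and messages are hypothetical, and the macro signatures follow the examples in the table below:

.. code-block:: cpp

   #include "matx.h"

   using namespace matx;

   void process()  // hypothetical user function
   {
     // Auto-managed: the range opens here and closes when process() exits
     MATX_NVTX_START( "process", matx_nvxtLogLevels::MATX_NVTX_LOG_USER )

     // Manual: the range is opened and closed explicitly via a user-chosen handle (1)
     MATX_NVTX_START_RANGE( "work step", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1 )
     // ... work to be profiled ...
     MATX_NVTX_END_RANGE( 1 )
   }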

NVTX Examples
-----------------

.. list-table::
:widths: 60 40
:header-rows: 1

* - Command
- Result
* - MATX_NVTX_START("")
- NVTX range scoped to this function, named the same as the calling function, with a log level of Internal
* - MATX_NVTX_START("MY_MESSAGE")
- NVTX range scoped to this function, named “MY_MESSAGE” with log level of Internal
* - MATX_NVTX_START("MY_MESSAGE", matx::MATX_NVTX_LOG_API )
- NVTX range scoped to this function, named “MY_MESSAGE” with log level of API
* - MATX_NVTX_START_RANGE( "MY_MESSAGE", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1 )
- NVTX range with manual scope, named “MY_MESSAGE”, log level of USER, and handle ID of 1
* - MATX_NVTX_END_RANGE(1)
- Ends the NVTX range with handle 1 started by MATX_NVTX_START_RANGE

Code examples are provided in the ``simple_pipeline`` example to show how to use the MatX NVTX API.

MatX NVTX API
-----------------
.. doxygenfunction:: matx::setNVTXLogLevel
.. doxygenfunction:: matx::registerEvent
.. doxygenfunction:: matx::endEvent

MatX NVTX Logging Levels
------------------------------
.. doxygenenum:: matx::matx_nvxtLogLevels

MatX NVTX Auto Range Colors
------------------------------
.. doxygenvariable:: matx::nvtxColors
29 changes: 25 additions & 4 deletions examples/simple_pipeline.cu
@@ -50,29 +50,50 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// cuda stream to place work in
cudaStream_t stream;
cudaStreamCreate(&stream);


// manually set to log all NVTX levels
MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL );

// create some events for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);


MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1)
printf("Initializing data structures...\n");
auto radar =
RadarPipeline(numPulses, numSamples, waveformLength, numChannels, stream);
radar.GetInputView()->PrefetchDevice(stream);

MATX_NVTX_END_RANGE(1)

MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2)
printf("Running test...\n");
cudaStreamSynchronize(stream);
cudaEventRecord(start, stream);

for (uint32_t i = 0; i < iterations; i++) {
MATX_NVTX_START_RANGE("PulseCompression", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 21)
radar.PulseCompression();
MATX_NVTX_END_RANGE(21)

MATX_NVTX_START_RANGE("ThreePulseCanceller", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 22)
radar.ThreePulseCanceller();
MATX_NVTX_END_RANGE(22)

MATX_NVTX_START_RANGE("DopplerProcessing", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 23)
radar.DopplerProcessing();
MATX_NVTX_END_RANGE(23)

MATX_NVTX_START_RANGE("CFARDetections", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 24)
radar.CFARDetections();
MATX_NVTX_END_RANGE(24)
}

cudaEventRecord(stop, stream);
cudaStreamSynchronize(stream);
MATX_NVTX_END_RANGE(2)

MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3)
float time_ms;
cudaEventElapsedTime(&time_ms, start, stop);
float time_s = time_ms * .001f;
@@ -90,7 +111,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
matxPrintMemoryStatistics();

printf("Done\n");

MATX_NVTX_END_RANGE(3)
MATX_EXIT_HANDLER();
return 0;
}
4 changes: 3 additions & 1 deletion include/matx.h
@@ -33,15 +33,17 @@
#pragma once
#include <cuda_runtime_api.h>
#include <cuda/std/ccomplex>

#include "matx/core/defines.h"
#include "matx/core/error.h"
#include "matx/core/file_io.h"
#include "matx/core/half_complex.h"
#include "matx/core/half.h"
#include "matx/core/nvtx.h"
#include "matx/core/pybind.h"
#include "matx/core/tensor.h"
#include "matx/core/viz.h"
#include "matx/core/utils.h"
#include "matx/core/viz.h"

#include "matx/executors/executors.h"
#include "matx/generators/generators.h"
7 changes: 6 additions & 1 deletion include/matx/core/allocator.h
@@ -42,6 +42,7 @@
#endif

#include "matx/core/error.h"
#include "matx/core/nvtx.h"

#pragma once

@@ -201,7 +202,9 @@ inline void matxAlloc(void **ptr, size_t bytes,
cudaStream_t stream = 0)
{
[[maybe_unused]] cudaError_t err = cudaSuccess;


MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)

switch (space) {
case MATX_MANAGED_MEMORY:
err = cudaMallocManaged(ptr, bytes);
@@ -244,6 +247,8 @@
*/
inline void matxFree(void *ptr)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)

if (ptr == nullptr) {
return;
}
12 changes: 10 additions & 2 deletions include/matx/core/file_io.h
@@ -41,9 +41,10 @@
// #include <cudf/io/csv.hpp>
// #include <cudf/types.hpp>

#include "matx/core/error.h"
#include "matx/core/nvtx.h"
#include "matx/core/pybind.h"
#include "matx/core/tensor.h"
#include "matx/core/error.h"


#if MATX_ENABLE_FILEIO
@@ -126,6 +127,8 @@ template <typename TensorType>
void ReadCSV(TensorType &t, const std::string fname,
const std::string delimiter, bool header = true)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

if (TensorType::Rank() != 1 && TensorType::Rank() != 2) {
MATX_THROW(matxInvalidDim,
"CSV reading limited to tensors of rank 1 and 2");
@@ -152,6 +155,8 @@ template <typename TensorType>
void WriteCSV(const TensorType &t, const std::string fname,
const std::string delimiter)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

if (TensorType::Rank() != 1 && TensorType::Rank() != 2) {
MATX_THROW(matxInvalidDim,
"CSV reading limited to tensors of rank 1 and 2");
@@ -186,6 +191,8 @@ template <typename TensorType>
void ReadMAT(TensorType &t, const std::string fname,
const std::string var)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

auto pb = std::make_unique<detail::MatXPybind>();

auto sp = pybind11::module_::import("scipy.io");
@@ -213,7 +220,8 @@ template <typename TensorType>
void WriteMAT(const TensorType &t, const std::string fname,
const std::string var)
{

MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

auto pb = std::make_unique<detail::MatXPybind>();
auto np = pybind11::module_::import("numpy");
auto sp = pybind11::module_::import("scipy.io");