From eff3854a4bf3c5594eb75ff61e6acf239dc78cf3 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 29 Jan 2025 00:03:08 +0000
Subject: [PATCH] Fix: Lower PTX version for JIT

---
 .vscode/settings.json              |  6 ++++++
 CMakeLists.txt                     |  4 ++--
 less_slow.cpp                      | 19 ++++++++++---------
 less_slow.cu                       | 26 ++++++++++++++++++--------
 less_slow_ptx.ptx => less_slow.ptx |  9 ++++++---
 5 files changed, 42 insertions(+), 22 deletions(-)
 rename less_slow_ptx.ptx => less_slow.ptx (89%)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index facb58b..be69d44 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -9,6 +9,7 @@
     "bfloat",
     "bioinformatics",
     "BLAS",
+    "bmma",
     "Boccara",
     "bootcamps",
     "Byrne",
@@ -72,12 +73,15 @@
     "Peta",
     "Pikus",
     "pmf",
+    "POPCOUNT",
     "popcountll",
+    "Pranjal",
     "prefetcher",
     "pthread",
     "PTXAS",
     "RDMA",
     "reorderable",
+    "Shankhdhar",
     "simdjson",
     "sinf",
     "SLEEF",
@@ -101,7 +105,9 @@
     "VNNI",
     "VPCLMULQDQ",
     "Weis",
+    "WGMMA",
     "wmma",
+    "Worklog",
     "XCOMP",
     "XFEATURE",
     "XTILE",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6485dab..5a08898 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -326,10 +326,10 @@ if(USE_NVIDIA_CCCL)
   target_sources(less_slow PRIVATE less_slow.cu)
 
   # Copy the PTX Intermediate Representation file to the runtime directory
-  set_source_files_properties(less_slow_ptx.ptx PROPERTIES LANGUAGE "")
+  set_source_files_properties(less_slow.ptx PROPERTIES LANGUAGE "")
   add_custom_command(
     TARGET less_slow POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/less_slow_ptx.ptx ${CMAKE_CURRENT_BINARY_DIR}/less_slow_ptx.ptx
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/less_slow.ptx ${CMAKE_CURRENT_BINARY_DIR}/less_slow.ptx
   )
 endif()
 
diff --git a/less_slow.cpp b/less_slow.cpp
index 2254590..6491783 100644
--- a/less_slow.cpp
+++ b/less_slow.cpp
@@ -1983,8 +1983,8 @@ extern __global__ void tops_tf32f32_sm80tc_16x16x8_1024unroll_cuda_kernel();
 extern __global__ void tops_f64f64_sm80tc_8x8x4_1024unroll_cuda_kernel();
 extern __global__ void tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel();
 
-BENCHMARK_CAPTURE(                                                                       //
-    theoretic_tops_cuda, f16_sm70tc, tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel, //
+BENCHMARK_CAPTURE(                                                                          //
+    theoretic_tops_cuda, f16f16_sm70tc, tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel, //
     16, 16, 16, 1024, 70)
     ->MinTime(10);
 BENCHMARK_CAPTURE(                                                                          //
@@ -2026,10 +2026,10 @@ static void theoretic_tops_ptx(                  //
     bm::State &state,                            //
     std::string kernel_name,                     //
     std::size_t m, std::size_t n, std::size_t k, //
-    int required_capability = 70) {
+    std::size_t repetitions, int required_capability) {
 
     // Resolve the absolute path to the PTX file
-    std::string ptx_file = "less_slow_ptx.ptx";
+    std::string ptx_file = "less_slow.ptx";
     std::filesystem::path ptx_path = std::filesystem::absolute(ptx_file);
     if (!std::filesystem::exists(ptx_path)) {
         state.SkipWithError("Failed to find PTX file.");
@@ -2126,17 +2126,18 @@ static void theoretic_tops_ptx(                  //
         cuCtxSynchronize();
     }
 
-    std::size_t const tops_per_cycle = m * n * k * 2;
-    state.counters["TOP"] = benchmark::Counter(tops_per_cycle * state.iterations(), benchmark::Counter::kIsRate);
+    std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
+    std::size_t const tops_per_gpu = tops_per_cycle * num_sms; //? Warps compute each tile product collectively!
+    state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
 
     // Clean up
     cuModuleUnload(module_);
     cuCtxDestroy(context);
 }
 // Benchmark configurations with explicit compute capability requirements
-BENCHMARK_CAPTURE(theoretic_tops_ptx, f16_tc_sm70, "tops_f16_sm70tc_ptx_kernel", 16, 8, 8, 70)->MinTime(10);
-BENCHMARK_CAPTURE(theoretic_tops_ptx, bf16_tc_sm80, "tops_bf16_tc_ptx_kernel_sm80", 16, 8, 8, 80)->MinTime(10);
-BENCHMARK_CAPTURE(theoretic_tops_ptx, f8_tc_sm90, "tops_f8_tc_ptx_kernel_sm90", 16, 8, 8, 90)->MinTime(10);
+BENCHMARK_CAPTURE(theoretic_tops_ptx, f16f32_sm70tc, "tops_f16f32_sm70tc_16x16x16_1024loop_ptx_kernel", 16, 16, 16,
+                  1024, 70)
+    ->MinTime(10);
 
 #endif
 
diff --git a/less_slow.cu b/less_slow.cu
index cc59ffa..564d252 100644
--- a/less_slow.cu
+++ b/less_slow.cu
@@ -6,19 +6,22 @@
  *  The contents of this file complement the contents of the `less_slow.cpp`
  *  file with GPGPU kernels showcasing:
  *
- *  - How to coordinate CUDA cores within a single block or warp?
+ *  - How to use Tensor Cores for matrix multiplications?
+ *    What's the difference between `mma` and `wgmma` on Hopper?
+ *  - TODO: How to coordinate CUDA cores within a single block or warp?
  *    A.k.a. how to use shared memory, warp shuffle intrinsics, and reductions?
- *  - What are CUDA math intrinsics and how much faster are they?
+ *  - TODO: What are CUDA math intrinsics and how much faster are they?
  *    A.k.a. when to use `__sinf` over `sinf` or `__fdividef` over `a / b`?
- *  - What's the Physical Page Caching behavior on GPUs?
- *  - How to schedule advanced computational graphs on GPUs?
+ *  - TODO: What's the Physical Page Caching behavior on GPUs?
+ *  - TODO: How to schedule advanced computational graphs on GPUs?
  *    A.k.a. CUDA streams vs Graph Node API vs Cooperative Groups?
  *
  *  To compile this file, dump the SASS code, and check for Tensor Cores usage
  *  on Volta SM70 GPUs, use the following commands:
  *
- *      nvcc -arch=sm_70 -Xptxas -v -lineinfo -cubin -o less_slow_from_cu.cubin less_slow.cu
- *      cuobjdump -sass less_slow_from_cu.cubin | grep -i mma
+ *  $ nvcc -arch=sm_70 -Xptxas -v -lineinfo -ptx -o less_slow_from_cu.ptx less_slow.cu
+ *  $ nvcc -arch=sm_70 -Xptxas -v -lineinfo -cubin -o less_slow_from_cu.cubin less_slow.cu
+ *  $ cuobjdump -sass less_slow_from_cu.cubin | grep -i mma
  *
  *  Keep in mind the following TC generations:
  *
@@ -34,6 +37,9 @@
  *    Feature                              | V100     | A100     | H100
  *    -------------------------------------|----------|----------|----------
  *    Compute Capability                   | 7.0      | 8.0      | 9.0
+ *    PTX Version                          | 6+       | 7+       | 8+
+ *    CUDA Releases                        | 9-10     | 11+      | 12+
+ *    -------------------------------------|----------|----------|----------
  *    Threads / Warp                       | 32       | 32       | 32
  *    Max Warps / SM                       | 64       | 64       | 64
  *    Max Threads / SM                     | 2048     | 2048     | 2048
@@ -43,7 +49,7 @@
  *    Max Registers / Thread Block (CTA)   | 65536    | 65536    | 65536
  *    Max Registers / Thread               | 255      | 255      | 255
  *    Max Thread Block Size (# of threads) | 1024     | 1024     | 1024
- *    FP32 Cores / SM                      | 64       | 64       | 128
+ *    -------------------------------------|----------|----------|----------
  *    Ratio of SM Registers to FP32 Cores  | 1024     | 1024     | 512
  *    Shared Memory Size / SM              | ≤ 96 KB  | ≤ 164 KB | ≤ 228 KB
  *    Tensor Core Generation               | 1st      | 3rd      | 5th
@@ -151,6 +157,8 @@ __device__ inline void tops_tc_cuda_kernel() {
     if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_frag, 16, wmma::mem_row_major);
 }
 
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) //? Binary Matrices require SM75 or higher
+
 /**
  *  To process binary matrices we can't rely on addition and multiplication.
  *  A different set of mathematical operations is required, such as @b XOR or
@@ -170,6 +178,8 @@ __device__ inline void binary_tops_tc_cuda_kernel( //
     if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_frag, 16, wmma::mem_row_major);
 }
 
+#endif
+
 #pragma region Volta
 
 __global__ void tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel() {
@@ -281,5 +291,5 @@ __global__ void tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel() {
  *  @see "Fast Matrix-Multiplication with WGMMA on NVIDIA Hopper GPUs" by Colfax:
  *       https://research.colfax-intl.com/cutlass-tutorial-wgmma-hopper/
  *  @see "Outperforming cuBLAS on H100: a Worklog" by Pranjal Shankhdhar:
- *      https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog
+ *       https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog
  */
\ No newline at end of file
diff --git a/less_slow_ptx.ptx b/less_slow.ptx
similarity index 89%
rename from less_slow_ptx.ptx
rename to less_slow.ptx
index 49dbde7..e572f51 100644
--- a/less_slow_ptx.ptx
+++ b/less_slow.ptx
@@ -17,7 +17,8 @@
 // You can validate this file by asking the Nvidia PTX Assembler to compile it
 // to `.cubin` for some target architecture:
 //
-//   $ ptxas -o less_slow_from_ptx.cubin -arch=sm_70 less_slow.ptx
+// $ ptxas -o less_slow_from_ptx.cubin -arch=sm_70 less_slow.ptx
+// $ cuobjdump -sass less_slow_from_ptx.cubin | grep -i mma
 //
 // ## Register File
 //
@@ -31,7 +32,7 @@
 // To read from them, simple use the `mov` instruction.
 // ----------------------------------------------------------------------------
 
-.version 8.5                          // PTX version 8.5
+.version 6.5                          // PTX version 6.5 is enough for Volta GPUs
 .target sm_70                         // Target architecture (SM 7.0 - Volta GPUs)
 .address_size 64                      // 64-bit addressing
 
@@ -42,7 +43,9 @@
     .reg .b32  %r<3>;                 // General-purpose registers for loop counter and packed inputs
     .reg .pred %p;                    // Predicate register for conditional branching
 
-    // Initialize the loop counter
+    // Initialize the loop counter; but keep in mind, that an algorithm with a
+    // loop will be a lot slower than PTX kernels generated by the CUDA compiler.
+    // Those end up unrolling thousands of iterations into a single kernel!
     mov.u32    %r0, 0;                // %r0 = loop counter, start at 0
     mov.u32    %r1, 1024;             // %r1 = loop limit (1024 iterations)