Improve: Naming scheme for PTX kernels
ashvardanian committed Feb 7, 2025
1 parent b7eab6d commit c1b21c8
Showing 2 changed files with 30 additions and 26 deletions.
42 changes: 23 additions & 19 deletions less_slow.cpp
@@ -2148,12 +2148,12 @@ static void theoretic_tops_ptx( //
     ptx_file = ptx_path.string();
 
     CUdevice device = 0;
-    CUcontext context = 0;
-    CUmodule module_ = 0;
-    CUfunction kernel = 0;
+    CUcontext context = nullptr;
+    CUmodule module_ = nullptr;
+    CUfunction kernel = nullptr;
     CUresult result = CUDA_SUCCESS;
     auto last_error_string = [&result]() -> std::string {
-        char const *error_string;
+        char const *error_string = nullptr;
         cuGetErrorString(result, &error_string);
         return error_string;
     };
@@ -2195,7 +2195,7 @@ static void theoretic_tops_ptx( //
     }
 
     // Create context
-    int context_flags = CU_CTX_SCHED_SPIN | CU_CTX_LMEM_RESIZE_TO_MAX | CU_CTX_SYNC_MEMOPS;
+    int context_flags = 0; // CU_CTX_SCHED_SPIN | CU_CTX_LMEM_RESIZE_TO_MAX | CU_CTX_SYNC_MEMOPS;
     result = cuCtxCreate(&context, context_flags, device);
     if (result != CUDA_SUCCESS) {
         state.SkipWithError("Failed to create CUDA context: " + last_error_string());
@@ -2242,10 +2242,14 @@ static void theoretic_tops_ptx( //
             block_dim.x, block_dim.y, block_dim.z, //
             0, nullptr, kernel_args, nullptr);
         if (result != CUDA_SUCCESS) {
-            state.SkipWithError("Kernel launch failed: " + last_error_string());
+            state.SkipWithError("Failed to launch the kernel: " + last_error_string());
             break;
         }
-        cuCtxSynchronize();
+        result = cuCtxSynchronize();
+        if (result != CUDA_SUCCESS) {
+            state.SkipWithError("Failed while running the kernel: " + last_error_string());
+            break;
+        }
     }
 
     std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
@@ -2275,27 +2279,27 @@ BENCHMARK_CAPTURE( //
     8, 8, 4, 1024, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32tf32_sm90tc, //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_16x16x8_1024loop_ptx_kernel", //
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, tf32f32_sm90tc, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel", //
     16, 16, 8, 1024, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32tf32_sm90tc_wgmma_smallest, //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_m64n16k8_1024loop_ptx_kernel", //
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_smallest, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel", //
     64, 16, 8, 1024, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32tf32_sm90tc_wgmma_largest, //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_m64n256k8_1024loop_ptx_kernel", //
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_largest, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel", //
     64, 256, 8, 1024, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, b1b1and_sm90tc_wgmma, //
-    "less_slow_sm90a.ptx", "tops_b1b1and_sm90tc_m64n256k256_1024loop_ptx_kernel", //
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, b1i32and_sm90tc_wgmma, //
+    "less_slow_sm90a.ptx", "tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel", //
     64, 256, 256, 1024, 90)
     ->MinTime(10);

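A note on the error-handling pattern this file now follows: with the CUDA Driver API, `cuLaunchKernel` only reports launch-time problems, while failures inside the kernel typically surface at the next synchronization point, so both return codes deserve a check. Below is a minimal standalone sketch of that flow, assuming the PTX file and kernel name from this commit; the launch shape is illustrative, not the benchmark's actual configuration.

#include <cuda.h>
#include <cstdio>

// Print a human-readable Driver API error message.
static void report(char const *what, CUresult result) {
    char const *error_string = nullptr;
    cuGetErrorString(result, &error_string);
    std::fprintf(stderr, "%s: %s\n", what, error_string ? error_string : "unknown error");
}

int main() {
    CUdevice device = 0;
    CUcontext context = nullptr;
    CUmodule module_ = nullptr;
    CUfunction kernel = nullptr;

    if (cuInit(0) != CUDA_SUCCESS || cuDeviceGet(&device, 0) != CUDA_SUCCESS) return 1;
    if (cuCtxCreate(&context, 0, device) != CUDA_SUCCESS) return 1;
    if (cuModuleLoad(&module_, "less_slow_sm90a.ptx") != CUDA_SUCCESS) return 1;
    if (cuModuleGetFunction(&kernel, module_, "tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel") != CUDA_SUCCESS)
        return 1;

    // These kernels take no arguments and write into the global "sink" buffers,
    // so `kernelParams` can stay null; a single warp suffices for the warp-level variant.
    CUresult result = cuLaunchKernel(kernel, 1, 1, 1, 32, 1, 1, 0, nullptr, nullptr, nullptr);
    if (result != CUDA_SUCCESS) report("Failed to launch the kernel", result);
    result = cuCtxSynchronize(); // Errors raised on the device are reported here.
    if (result != CUDA_SUCCESS) report("Failed while running the kernel", result);

    cuModuleUnload(module_);
    cuCtxDestroy(context);
    return 0;
}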
14 changes: 7 additions & 7 deletions less_slow_sm90a.ptx
@@ -32,9 +32,9 @@
  * Let's define some global memory buffers, visible on both device and host
  * side, to output multiplication results.
  */
-.visible .global .align 4 .s32 dummy_sink_s32[4];
-.visible .global .align 4 .f32 dummy_sink_f32[4];
-.visible .global .align 8 .f64 dummy_sink_f64[4];
+.visible .global .align 8 .f64 dummy_sink_f64[32];
+.visible .global .align 4 .s32 dummy_sink_s32[32];
+.visible .global .align 4 .f32 dummy_sink_f32[32];
 
 /**
  * Our previous Volta kernel should work just fine here, but we can make it
@@ -209,7 +209,7 @@ loop_exit:
  * is confusingly 19 bits wide! The synchronous variant would look familiar:
  */
 
-.visible .entry tops_tf32tf32_sm90tc_16x16x8_1024loop_ptx_kernel()
+.visible .entry tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of the MMA operation
     .reg .b32 accum<8>;
Expand Down Expand Up @@ -325,7 +325,7 @@ loop_exit:
* The `scale` parameters can be used to either negate the inputs, or disable
* additive bias accumulation in the output.
*/
.visible .entry tops_tf32tf32_sm90tc_m64n16k8_1024loop_ptx_kernel()
.visible .entry tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel()
{
// Accumulator registers used for both input and output of this MMA
.reg .f32 accum<8>;
@@ -401,7 +401,7 @@ loop_exit:
  * dimension from 16 to 256? It would require 128 accumulators.
  */
 
-.visible .entry tops_tf32tf32_sm90tc_m64n256k8_1024loop_ptx_kernel()
+.visible .entry tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of this MMA
     .reg .f32 accum<128>;
@@ -518,7 +518,7 @@ loop_exit:
  * dimension by 2x, making the instructions no longer usable for small matrices.
  */
 
-.visible .entry tops_b1b1and_sm90tc_m64n256k256_1024loop_ptx_kernel()
+.visible .entry tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of the MMA operation
     .reg .s32 accum<128>;
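Taken together with the benchmark registrations in less_slow.cpp above, the new names appear to spell out the input type, the accumulator type, the tensor-core generation, the tile shape, and the unrolled loop length: `tf32f32` multiplies TF32 inputs into F32 accumulators, and `b1i32and` combines 1-bit inputs with AND and accumulates into signed 32-bit integers. A hypothetical host-side table mirroring those registrations could look like this:

#include <array>
#include <cstddef>
#include <string_view>

// Each entry pairs a PTX entry-point name with the MMA tile shape and loop
// count encoded in that name; the values are copied from the registrations above.
struct ptx_kernel_spec_t {
    std::string_view name;
    std::size_t m, n, k;
    std::size_t repetitions;
};

constexpr std::array<ptx_kernel_spec_t, 4> sm90a_ptx_kernels {{
    {"tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel", 16, 16, 8, 1024},
    {"tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel", 64, 16, 8, 1024},
    {"tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel", 64, 256, 8, 1024},
    {"tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel", 64, 256, 256, 1024},
}};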
