Improve: Naming scheme for PTX kernels
ashvardanian committed Feb 7, 2025
1 parent b7eab6d commit c1b21c8
Showing 2 changed files with 30 additions and 26 deletions.
42 changes: 23 additions & 19 deletions less_slow.cpp
@@ -2148,12 +2148,12 @@ static void theoretic_tops_ptx( //
     ptx_file = ptx_path.string();
 
     CUdevice device = 0;
-    CUcontext context = 0;
-    CUmodule module_ = 0;
-    CUfunction kernel = 0;
+    CUcontext context = nullptr;
+    CUmodule module_ = nullptr;
+    CUfunction kernel = nullptr;
     CUresult result = CUDA_SUCCESS;
     auto last_error_string = [&result]() -> std::string {
-        char const *error_string;
+        char const *error_string = nullptr;
         cuGetErrorString(result, &error_string);
         return error_string;
     };
@@ -2195,7 +2195,7 @@ static void theoretic_tops_ptx( //
     }
 
     // Create context
-    int context_flags = CU_CTX_SCHED_SPIN | CU_CTX_LMEM_RESIZE_TO_MAX | CU_CTX_SYNC_MEMOPS;
+    int context_flags = 0; // CU_CTX_SCHED_SPIN | CU_CTX_LMEM_RESIZE_TO_MAX | CU_CTX_SYNC_MEMOPS;
     result = cuCtxCreate(&context, context_flags, device);
     if (result != CUDA_SUCCESS) {
         state.SkipWithError("Failed to create CUDA context: " + last_error_string());
@@ -2242,10 +2242,14 @@ static void theoretic_tops_ptx( //
             block_dim.x, block_dim.y, block_dim.z, //
             0, nullptr, kernel_args, nullptr);
         if (result != CUDA_SUCCESS) {
-            state.SkipWithError("Kernel launch failed: " + last_error_string());
+            state.SkipWithError("Failed to launch the kernel: " + last_error_string());
             break;
         }
-        cuCtxSynchronize();
+        result = cuCtxSynchronize();
+        if (result != CUDA_SUCCESS) {
+            state.SkipWithError("Failed while running the kernel: " + last_error_string());
+            break;
+        }
     }
 
     std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
@@ -2275,27 +2279,27 @@ BENCHMARK_CAPTURE( //
     8, 8, 4, 1024, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32tf32_sm90tc, //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_16x16x8_1024loop_ptx_kernel", //
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, tf32f32_sm90tc, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel", //
     16, 16, 8, 1024, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32tf32_sm90tc_wgmma_smallest, //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_m64n16k8_1024loop_ptx_kernel", //
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_smallest, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel", //
     64, 16, 8, 1024, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32tf32_sm90tc_wgmma_largest, //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_m64n256k8_1024loop_ptx_kernel", //
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_largest, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel", //
     64, 256, 8, 1024, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, b1b1and_sm90tc_wgmma, //
-    "less_slow_sm90a.ptx", "tops_b1b1and_sm90tc_m64n256k256_1024loop_ptx_kernel", //
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, b1i32and_sm90tc_wgmma, //
+    "less_slow_sm90a.ptx", "tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel", //
     64, 256, 256, 1024, 90)
     ->MinTime(10);

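A note on the error-handling pattern this file now follows: with the CUDA Driver API, `cuLaunchKernel` only reports launch-time problems, while failures inside the kernel typically surface at the next synchronization point, so both return codes deserve a check. Below is a minimal standalone sketch of that flow, assuming the PTX file and kernel name from this commit; the launch shape is illustrative, not the benchmark's actual configuration.

#include <cuda.h>
#include <cstdio>

// Print a human-readable Driver API error message.
static void report(char const *what, CUresult result) {
    char const *error_string = nullptr;
    cuGetErrorString(result, &error_string);
    std::fprintf(stderr, "%s: %s\n", what, error_string ? error_string : "unknown error");
}

int main() {
    CUdevice device = 0;
    CUcontext context = nullptr;
    CUmodule module_ = nullptr;
    CUfunction kernel = nullptr;

    if (cuInit(0) != CUDA_SUCCESS || cuDeviceGet(&device, 0) != CUDA_SUCCESS) return 1;
    if (cuCtxCreate(&context, 0, device) != CUDA_SUCCESS) return 1;
    if (cuModuleLoad(&module_, "less_slow_sm90a.ptx") != CUDA_SUCCESS) return 1;
    if (cuModuleGetFunction(&kernel, module_, "tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel") != CUDA_SUCCESS)
        return 1;

    // These kernels take no arguments and write into the global "sink" buffers,
    // so `kernelParams` can stay null; a single warp suffices for the warp-level variant.
    CUresult result = cuLaunchKernel(kernel, 1, 1, 1, 32, 1, 1, 0, nullptr, nullptr, nullptr);
    if (result != CUDA_SUCCESS) report("Failed to launch the kernel", result);
    result = cuCtxSynchronize(); // Errors raised on the device are reported here.
    if (result != CUDA_SUCCESS) report("Failed while running the kernel", result);

    cuModuleUnload(module_);
    cuCtxDestroy(context);
    return 0;
}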
14 changes: 7 additions & 7 deletions less_slow_sm90a.ptx
@@ -32,9 +32,9 @@
  * Let's define some global memory buffers, visible on both device and host
  * side, to output multiplication results.
  */
-.visible .global .align 4 .s32 dummy_sink_s32[4];
-.visible .global .align 4 .f32 dummy_sink_f32[4];
-.visible .global .align 8 .f64 dummy_sink_f64[4];
+.visible .global .align 8 .f64 dummy_sink_f64[32];
+.visible .global .align 4 .s32 dummy_sink_s32[32];
+.visible .global .align 4 .f32 dummy_sink_f32[32];
 
 /**
  * Our previous Volta kernel should work just fine here, but we can make it
@@ -209,7 +209,7 @@ loop_exit:
  * is confusingly 19 bits wide! The synchronous variant would look familiar:
  */
 
-.visible .entry tops_tf32tf32_sm90tc_16x16x8_1024loop_ptx_kernel()
+.visible .entry tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of the MMA operation
     .reg .b32 accum<8>;
Expand Down Expand Up @@ -325,7 +325,7 @@ loop_exit:
* The `scale` parameters can be used to either negate the inputs, or disable
* additive bias accumulation in the output.
*/
.visible .entry tops_tf32tf32_sm90tc_m64n16k8_1024loop_ptx_kernel()
.visible .entry tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel()
{
// Accumulator registers used for both input and output of this MMA
.reg .f32 accum<8>;
@@ -401,7 +401,7 @@ loop_exit:
  * dimension from 16 to 256? It would require 128 accumulators.
  */
 
-.visible .entry tops_tf32tf32_sm90tc_m64n256k8_1024loop_ptx_kernel()
+.visible .entry tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of this MMA
     .reg .f32 accum<128>;
@@ -518,7 +518,7 @@ loop_exit:
  * dimension by 2x, making the instructions no longer usable for small matrices.
  */
 
-.visible .entry tops_b1b1and_sm90tc_m64n256k256_1024loop_ptx_kernel()
+.visible .entry tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of the MMA operation
     .reg .s32 accum<128>;
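Taken together with the benchmark registrations in less_slow.cpp above, the new names appear to spell out the input type, the accumulator type, the tensor-core generation, the tile shape, and the unrolled loop length: `tf32f32` multiplies TF32 inputs into F32 accumulators, and `b1i32and` combines 1-bit inputs with AND and accumulates into signed 32-bit integers. A hypothetical host-side table mirroring those registrations could look like this:

#include <array>
#include <cstddef>
#include <string_view>

// Each entry pairs a PTX entry-point name with the MMA tile shape and loop
// count encoded in that name; the values are copied from the registrations above.
struct ptx_kernel_spec_t {
    std::string_view name;
    std::size_t m, n, k;
    std::size_t repetitions;
};

constexpr std::array<ptx_kernel_spec_t, 4> sm90a_ptx_kernels {{
    {"tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel", 16, 16, 8, 1024},
    {"tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel", 64, 16, 8, 1024},
    {"tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel", 64, 256, 8, 1024},
    {"tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel", 64, 256, 256, 1024},
}};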
