diff --git a/less_slow.cpp b/less_slow.cpp
index c2ac850..5bbc566 100644
--- a/less_slow.cpp
+++ b/less_slow.cpp
@@ -2148,12 +2148,12 @@ static void theoretic_tops_ptx( //
     ptx_file = ptx_path.string();

     CUdevice device = 0;
-    CUcontext context = 0;
-    CUmodule module_ = 0;
-    CUfunction kernel = 0;
+    CUcontext context = nullptr;
+    CUmodule module_ = nullptr;
+    CUfunction kernel = nullptr;
     CUresult result = CUDA_SUCCESS;
     auto last_error_string = [&result]() -> std::string {
-        char const *error_string;
+        char const *error_string = nullptr;
         cuGetErrorString(result, &error_string);
         return error_string;
     };
@@ -2195,7 +2195,7 @@ static void theoretic_tops_ptx( //
     }

     // Create context
-    int context_flags = CU_CTX_SCHED_SPIN | CU_CTX_LMEM_RESIZE_TO_MAX | CU_CTX_SYNC_MEMOPS;
+    int context_flags = 0; // CU_CTX_SCHED_SPIN | CU_CTX_LMEM_RESIZE_TO_MAX | CU_CTX_SYNC_MEMOPS;
     result = cuCtxCreate(&context, context_flags, device);
     if (result != CUDA_SUCCESS) {
         state.SkipWithError("Failed to create CUDA context: " + last_error_string());
@@ -2242,10 +2242,14 @@ static void theoretic_tops_ptx( //
             block_dim.x, block_dim.y, block_dim.z, //
             0, nullptr, kernel_args, nullptr);
         if (result != CUDA_SUCCESS) {
-            state.SkipWithError("Kernel launch failed: " + last_error_string());
+            state.SkipWithError("Failed to launch the kernel: " + last_error_string());
+            break;
+        }
+        result = cuCtxSynchronize();
+        if (result != CUDA_SUCCESS) {
+            state.SkipWithError("Failed while running the kernel: " + last_error_string());
             break;
         }
-        cuCtxSynchronize();
     }

     std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
@@ -2275,27 +2279,27 @@ BENCHMARK_CAPTURE( //
     8, 8, 4, 1024, 90)
     ->MinTime(10);

-BENCHMARK_CAPTURE(                                                               //
-    theoretic_tops_ptx, tf32tf32_sm90tc,                                         //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_16x16x8_1024loop_ptx_kernel",   //
+BENCHMARK_CAPTURE(                                                               //
+    theoretic_tops_ptx, tf32f32_sm90tc,                                          //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel",    //
     16, 16, 8, 1024, 90)
     ->MinTime(10);

-BENCHMARK_CAPTURE(                                                                 //
-    theoretic_tops_ptx, tf32tf32_sm90tc_wgmma_smallest,                            //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_m64n16k8_1024loop_ptx_kernel",    //
+BENCHMARK_CAPTURE(                                                                 //
+    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_smallest,                             //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel",     //
     64, 16, 8, 1024, 90)
     ->MinTime(10);

-BENCHMARK_CAPTURE(                                                                  //
-    theoretic_tops_ptx, tf32tf32_sm90tc_wgmma_largest,                              //
-    "less_slow_sm90a.ptx", "tops_tf32tf32_sm90tc_m64n256k8_1024loop_ptx_kernel",    //
+BENCHMARK_CAPTURE(                                                                  //
+    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_largest,                               //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel",     //
     64, 256, 8, 1024, 90)
     ->MinTime(10);

-BENCHMARK_CAPTURE(                                                                   //
-    theoretic_tops_ptx, b1b1and_sm90tc_wgmma,                                        //
-    "less_slow_sm90a.ptx", "tops_b1b1and_sm90tc_m64n256k256_1024loop_ptx_kernel",    //
+BENCHMARK_CAPTURE(                                                                   //
+    theoretic_tops_ptx, b1i32and_sm90tc_wgmma,                                       //
+    "less_slow_sm90a.ptx", "tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel",   //
     64, 256, 256, 1024, 90)
     ->MinTime(10);

diff --git a/less_slow_sm90a.ptx b/less_slow_sm90a.ptx
index 7577f8a..2190b27 100644
--- a/less_slow_sm90a.ptx
+++ b/less_slow_sm90a.ptx
@@ -32,9 +32,9 @@
  * Let's define some global memory buffers, visible on both device and host
  * side, to output multiplication results.
  */
-.visible .global .align 4 .s32 dummy_sink_s32[4];
-.visible .global .align 4 .f32 dummy_sink_f32[4];
-.visible .global .align 8 .f64 dummy_sink_f64[4];
+.visible .global .align 8 .f64 dummy_sink_f64[32];
+.visible .global .align 4 .s32 dummy_sink_s32[32];
+.visible .global .align 4 .f32 dummy_sink_f32[32];

 /**
  * Our previous Volta kernel should work just fine here, but we can make it
@@ -209,7 +209,7 @@ loop_exit:
  * is confusingly 19 bits wide! The synchronous variant would look familiar:
  */

- .visible .entry tops_tf32tf32_sm90tc_16x16x8_1024loop_ptx_kernel()
+ .visible .entry tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of the MMA operation
     .reg .b32 accum<8>;
@@ -325,7 +325,7 @@ loop_exit:
  * The `scale` parameters can be used to either negate the inputs, or disable
  * additive bias accumulation in the output.
  */
-.visible .entry tops_tf32tf32_sm90tc_m64n16k8_1024loop_ptx_kernel()
+.visible .entry tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of this MMA
     .reg .f32 accum<8>;
@@ -401,7 +401,7 @@ loop_exit:
  * dimension from 16 to 256? It would require 128 accumulators.
  */

-.visible .entry tops_tf32tf32_sm90tc_m64n256k8_1024loop_ptx_kernel()
+.visible .entry tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of this MMA
     .reg .f32 accum<128>;
@@ -518,7 +518,7 @@ loop_exit:
  * dimension by 2x, making the instructions no more usable for small matrices.
  */

-.visible .entry tops_b1b1and_sm90tc_m64n256k256_1024loop_ptx_kernel()
+.visible .entry tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel()
 {
     // Accumulator registers used for both input and output of the MMA operation
     .reg .s32 accum<128>;
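
For reference, the host-side hunks above amount to two things: zero-initializing the Driver API handles, and checking not only `cuLaunchKernel` but also the following `cuCtxSynchronize`, since the launch is asynchronous and faults inside the kernel only surface at synchronization. Below is a minimal, self-contained sketch of that flow, assuming the same `less_slow_sm90a.ptx` module and the `tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel` entry point from the benchmarks; the grid and block sizes are illustrative placeholders, not the benchmark's actual configuration.

// Sketch: load a PTX module with the CUDA Driver API, resolve a kernel,
// launch it, and check *both* the launch and the synchronization result.
#include <cuda.h>
#include <cstdio>
#include <string>

static std::string error_string(CUresult result) {
    char const *text = nullptr;
    cuGetErrorString(result, &text);
    return text ? text : "unknown CUDA error";
}

int main() {
    CUdevice device = 0;
    CUcontext context = nullptr;
    CUmodule module_ = nullptr;
    CUfunction kernel = nullptr;
    CUresult result = CUDA_SUCCESS;

    // Initialize the driver, pick the first device, and create a context.
    if ((result = cuInit(0)) != CUDA_SUCCESS ||                  //
        (result = cuDeviceGet(&device, 0)) != CUDA_SUCCESS ||    //
        (result = cuCtxCreate(&context, 0, device)) != CUDA_SUCCESS) {
        std::printf("CUDA setup failed: %s\n", error_string(result).c_str());
        return 1;
    }

    // Load the PTX and resolve one of the Hopper kernels by name.
    if ((result = cuModuleLoad(&module_, "less_slow_sm90a.ptx")) != CUDA_SUCCESS ||
        (result = cuModuleGetFunction(&kernel, module_,
                                      "tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel")) != CUDA_SUCCESS) {
        std::printf("PTX loading failed: %s\n", error_string(result).c_str());
        return 1;
    }

    // The kernel takes no arguments; one block of 128 threads is a placeholder shape.
    result = cuLaunchKernel(kernel, /*grid*/ 1, 1, 1, /*block*/ 128, 1, 1,
                            /*shared memory*/ 0, /*stream*/ nullptr,
                            /*params*/ nullptr, /*extra*/ nullptr);
    if (result != CUDA_SUCCESS)
        std::printf("Failed to launch the kernel: %s\n", error_string(result).c_str());
    // `cuLaunchKernel` only reports launch-time errors; runtime faults show up here.
    else if ((result = cuCtxSynchronize()) != CUDA_SUCCESS)
        std::printf("Failed while running the kernel: %s\n", error_string(result).c_str());

    cuModuleUnload(module_);
    cuCtxDestroy(context);
    return result == CUDA_SUCCESS ? 0 : 1;
}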