From eff3854a4bf3c5594eb75ff61e6acf239dc78cf3 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 29 Jan 2025 00:03:08 +0000 Subject: [PATCH] Fix: Lower PTX version for JIT --- .vscode/settings.json | 6 ++++++ CMakeLists.txt | 4 ++-- less_slow.cpp | 19 ++++++++++--------- less_slow.cu | 26 ++++++++++++++++++-------- less_slow_ptx.ptx => less_slow.ptx | 9 ++++++--- 5 files changed, 42 insertions(+), 22 deletions(-) rename less_slow_ptx.ptx => less_slow.ptx (89%) diff --git a/.vscode/settings.json b/.vscode/settings.json index facb58b..be69d44 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,6 +9,7 @@ "bfloat", "bioinformatics", "BLAS", + "bmma", "Boccara", "bootcamps", "Byrne", @@ -72,12 +73,15 @@ "Peta", "Pikus", "pmf", + "POPCOUNT", "popcountll", + "Pranjal", "prefetcher", "pthread", "PTXAS", "RDMA", "reorderable", + "Shankhdhar", "simdjson", "sinf", "SLEEF", @@ -101,7 +105,9 @@ "VNNI", "VPCLMULQDQ", "Weis", + "WGMMA", "wmma", + "Worklog", "XCOMP", "XFEATURE", "XTILE", diff --git a/CMakeLists.txt b/CMakeLists.txt index 6485dab..5a08898 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -326,10 +326,10 @@ if(USE_NVIDIA_CCCL) target_sources(less_slow PRIVATE less_slow.cu) # Copy the PTX Intermediate Representation file to the runtime directory - set_source_files_properties(less_slow_ptx.ptx PROPERTIES LANGUAGE "") + set_source_files_properties(less_slow.ptx PROPERTIES LANGUAGE "") add_custom_command( TARGET less_slow POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/less_slow_ptx.ptx ${CMAKE_CURRENT_BINARY_DIR}/less_slow_ptx.ptx + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/less_slow.ptx ${CMAKE_CURRENT_BINARY_DIR}/less_slow.ptx ) endif() diff --git a/less_slow.cpp b/less_slow.cpp index 2254590..6491783 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -1983,8 +1983,8 @@ extern __global__ void tops_tf32f32_sm80tc_16x16x8_1024unroll_cuda_kernel(); extern __global__ void tops_f64f64_sm80tc_8x8x4_1024unroll_cuda_kernel(); extern __global__ void tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel(); -BENCHMARK_CAPTURE( // - theoretic_tops_cuda, f16_sm70tc, tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel, // +BENCHMARK_CAPTURE( // + theoretic_tops_cuda, f16f16_sm70tc, tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel, // 16, 16, 16, 1024, 70) ->MinTime(10); BENCHMARK_CAPTURE( // @@ -2026,10 +2026,10 @@ static void theoretic_tops_ptx( // bm::State &state, // std::string kernel_name, // std::size_t m, std::size_t n, std::size_t k, // - int required_capability = 70) { + std::size_t repetitions, int required_capability) { // Resolve the absolute path to the PTX file - std::string ptx_file = "less_slow_ptx.ptx"; + std::string ptx_file = "less_slow.ptx"; std::filesystem::path ptx_path = std::filesystem::absolute(ptx_file); if (!std::filesystem::exists(ptx_path)) { state.SkipWithError("Failed to find PTX file."); @@ -2126,17 +2126,18 @@ static void theoretic_tops_ptx( // cuCtxSynchronize(); } - std::size_t const tops_per_cycle = m * n * k * 2; - state.counters["TOP"] = benchmark::Counter(tops_per_cycle * state.iterations(), benchmark::Counter::kIsRate); + std::size_t const tops_per_cycle = m * n * k * 2 * repetitions; + std::size_t const tops_per_gpu = tops_per_cycle * num_sms; //? Warps compute each tile product collectively! + state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate); // Clean up cuModuleUnload(module_); cuCtxDestroy(context); } // Benchmark configurations with explicit compute capability requirements -BENCHMARK_CAPTURE(theoretic_tops_ptx, f16_tc_sm70, "tops_f16_sm70tc_ptx_kernel", 16, 8, 8, 70)->MinTime(10); -BENCHMARK_CAPTURE(theoretic_tops_ptx, bf16_tc_sm80, "tops_bf16_tc_ptx_kernel_sm80", 16, 8, 8, 80)->MinTime(10); -BENCHMARK_CAPTURE(theoretic_tops_ptx, f8_tc_sm90, "tops_f8_tc_ptx_kernel_sm90", 16, 8, 8, 90)->MinTime(10); +BENCHMARK_CAPTURE(theoretic_tops_ptx, f16f32_sm70tc, "tops_f16f32_sm70tc_16x16x16_1024loop_ptx_kernel", 16, 16, 16, + 1024, 70) + ->MinTime(10); #endif diff --git a/less_slow.cu b/less_slow.cu index cc59ffa..564d252 100644 --- a/less_slow.cu +++ b/less_slow.cu @@ -6,19 +6,22 @@ * The contents of this file complement the contents of the `less_slow.cpp` * file with GPGPU kernels showcasing: * - * - How to coordinate CUDA cores within a single block or warp? + * - How to use Tensor Cores for matrix multiplications? + * What's the difference between `mma` and `wgmma` on Hopper? + * - TODO: How to coordinate CUDA cores within a single block or warp? * A.k.a. how to use shared memory, warp shuffle intrinsics, and reductions? - * - What are CUDA math intrinsics and how much faster are they? + * - TODO: What are CUDA math intrinsics and how much faster are they? * A.k.a. when to use `__sinf` over `sinf` or `__fdividef` over `a / b`? - * - What's the Physical Page Caching behavior on GPUs? - * - How to schedule advanced computational graphs on GPUs? + * - TODO: What's the Physical Page Caching behavior on GPUs? + * - TODO: How to schedule advanced computational graphs on GPUs? * A.k.a. CUDA streams vs Graph Node API vs Cooperative Groups? * * To compile this file, dump the SASS code, and check for Tensor Cores usage * on Volta SM70 GPUs, use the following commands: * - * nvcc -arch=sm_70 -Xptxas -v -lineinfo -cubin -o less_slow_from_cu.cubin less_slow.cu - * cuobjdump -sass less_slow_from_cu.cubin | grep -i mma + * $ nvcc -arch=sm_70 -Xptxas -v -lineinfo -ptx -o less_slow_from_cu.ptx less_slow.cu + * $ nvcc -arch=sm_70 -Xptxas -v -lineinfo -cubin -o less_slow_from_cu.cubin less_slow.cu + * $ cuobjdump -sass less_slow_from_cu.cubin | grep -i mma * * Keep in mind the following TC generations: * @@ -34,6 +37,9 @@ * Feature | V100 | A100 | H100 * -------------------------------------|----------|----------|---------- * Compute Capability | 7.0 | 8.0 | 9.0 + * PTX Version | 6+ | 7+ | 8+ + * CUDA Releases | 9-10 | 11+ | 12+ + * -------------------------------------|----------|----------|---------- * Threads / Warp | 32 | 32 | 32 * Max Warps / SM | 64 | 64 | 64 * Max Threads / SM | 2048 | 2048 | 2048 @@ -43,7 +49,7 @@ * Max Registers / Thread Block (CTA) | 65536 | 65536 | 65536 * Max Registers / Thread | 255 | 255 | 255 * Max Thread Block Size (# of threads) | 1024 | 1024 | 1024 - * FP32 Cores / SM | 64 | 64 | 128 + * -------------------------------------|----------|----------|---------- * Ratio of SM Registers to FP32 Cores | 1024 | 1024 | 512 * Shared Memory Size / SM | ≤ 96 KB | ≤ 164 KB | ≤ 228 KB * Tensor Core Generation | 1st | 3rd | 5th @@ -151,6 +157,8 @@ __device__ inline void tops_tc_cuda_kernel() { if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_frag, 16, wmma::mem_row_major); } +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) //? Binary Matrices require SM75 or higher + /** * To process binary matrices we can't rely on addition and multiplication. * A different set of mathematical operations is required, such as @b XOR or @@ -170,6 +178,8 @@ __device__ inline void binary_tops_tc_cuda_kernel( // if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_frag, 16, wmma::mem_row_major); } +#endif + #pragma region Volta __global__ void tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel() { @@ -281,5 +291,5 @@ __global__ void tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel() { * @see "Fast Matrix-Multiplication with WGMMA on NVIDIA Hopper GPUs" by Colfax: * https://research.colfax-intl.com/cutlass-tutorial-wgmma-hopper/ * @see "Outperforming cuBLAS on H100: a Worklog" by Pranjal Shankhdhar: - * https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog + * https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog */ \ No newline at end of file diff --git a/less_slow_ptx.ptx b/less_slow.ptx similarity index 89% rename from less_slow_ptx.ptx rename to less_slow.ptx index 49dbde7..e572f51 100644 --- a/less_slow_ptx.ptx +++ b/less_slow.ptx @@ -17,7 +17,8 @@ // You can validate this file by asking the Nvidia PTX Assembler to compile it // to `.cubin` for some target architecture: // -// $ ptxas -o less_slow_from_ptx.cubin -arch=sm_70 less_slow.ptx +// $ ptxas -o less_slow_from_ptx.cubin -arch=sm_70 less_slow.ptx +// $ cuobjdump -sass less_slow_from_ptx.cubin | grep -i mma // // ## Register File // @@ -31,7 +32,7 @@ // To read from them, simple use the `mov` instruction. // ---------------------------------------------------------------------------- -.version 8.5 // PTX version 8.5 +.version 6.5 // PTX version 6.5 is enough for Volta GPUs .target sm_70 // Target architecture (SM 7.0 - Volta GPUs) .address_size 64 // 64-bit addressing @@ -42,7 +43,9 @@ .reg .b32 %r<3>; // General-purpose registers for loop counter and packed inputs .reg .pred %p; // Predicate register for conditional branching - // Initialize the loop counter + // Initialize the loop counter; but keep in mind, that an algorithm with a + // loop will be a lot slower than PTX kernels generated by the CUDA compiler. + // Those end up unrolling thousands of iterations into a single kernel! mov.u32 %r0, 0; // %r0 = loop counter, start at 0 mov.u32 %r1, 1024; // %r1 = loop limit (1024 iterations)