Skip to content

Commit

Permalink
Fix: Lower PTX version for JIT
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Jan 29, 2025
1 parent 514db0f commit eff3854
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 22 deletions.
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"bfloat",
"bioinformatics",
"BLAS",
"bmma",
"Boccara",
"bootcamps",
"Byrne",
Expand Down Expand Up @@ -72,12 +73,15 @@
"Peta",
"Pikus",
"pmf",
"POPCOUNT",
"popcountll",
"Pranjal",
"prefetcher",
"pthread",
"PTXAS",
"RDMA",
"reorderable",
"Shankhdhar",
"simdjson",
"sinf",
"SLEEF",
Expand All @@ -101,7 +105,9 @@
"VNNI",
"VPCLMULQDQ",
"Weis",
"WGMMA",
"wmma",
"Worklog",
"XCOMP",
"XFEATURE",
"XTILE",
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -326,10 +326,10 @@ if(USE_NVIDIA_CCCL)
target_sources(less_slow PRIVATE less_slow.cu)

# Copy the PTX Intermediate Representation file to the runtime directory
set_source_files_properties(less_slow_ptx.ptx PROPERTIES LANGUAGE "")
set_source_files_properties(less_slow.ptx PROPERTIES LANGUAGE "")
add_custom_command(
TARGET less_slow POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/less_slow_ptx.ptx ${CMAKE_CURRENT_BINARY_DIR}/less_slow_ptx.ptx
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/less_slow.ptx ${CMAKE_CURRENT_BINARY_DIR}/less_slow.ptx
)
endif()

Expand Down
19 changes: 10 additions & 9 deletions less_slow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1983,8 +1983,8 @@ extern __global__ void tops_tf32f32_sm80tc_16x16x8_1024unroll_cuda_kernel();
extern __global__ void tops_f64f64_sm80tc_8x8x4_1024unroll_cuda_kernel();
extern __global__ void tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel();

BENCHMARK_CAPTURE( //
theoretic_tops_cuda, f16_sm70tc, tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel, //
BENCHMARK_CAPTURE( //
theoretic_tops_cuda, f16f16_sm70tc, tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel, //
16, 16, 16, 1024, 70)
->MinTime(10);
BENCHMARK_CAPTURE( //
Expand Down Expand Up @@ -2026,10 +2026,10 @@ static void theoretic_tops_ptx( //
bm::State &state, //
std::string kernel_name, //
std::size_t m, std::size_t n, std::size_t k, //
int required_capability = 70) {
std::size_t repetitions, int required_capability) {

// Resolve the absolute path to the PTX file
std::string ptx_file = "less_slow_ptx.ptx";
std::string ptx_file = "less_slow.ptx";
std::filesystem::path ptx_path = std::filesystem::absolute(ptx_file);
if (!std::filesystem::exists(ptx_path)) {
state.SkipWithError("Failed to find PTX file.");
Expand Down Expand Up @@ -2126,17 +2126,18 @@ static void theoretic_tops_ptx( //
cuCtxSynchronize();
}

std::size_t const tops_per_cycle = m * n * k * 2;
state.counters["TOP"] = benchmark::Counter(tops_per_cycle * state.iterations(), benchmark::Counter::kIsRate);
std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
std::size_t const tops_per_gpu = tops_per_cycle * num_sms; //? Warps compute each tile product collectively!
state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);

// Clean up
cuModuleUnload(module_);
cuCtxDestroy(context);
}
// Benchmark configurations with explicit compute capability requirements
BENCHMARK_CAPTURE(theoretic_tops_ptx, f16_tc_sm70, "tops_f16_sm70tc_ptx_kernel", 16, 8, 8, 70)->MinTime(10);
BENCHMARK_CAPTURE(theoretic_tops_ptx, bf16_tc_sm80, "tops_bf16_tc_ptx_kernel_sm80", 16, 8, 8, 80)->MinTime(10);
BENCHMARK_CAPTURE(theoretic_tops_ptx, f8_tc_sm90, "tops_f8_tc_ptx_kernel_sm90", 16, 8, 8, 90)->MinTime(10);
BENCHMARK_CAPTURE(theoretic_tops_ptx, f16f32_sm70tc, "tops_f16f32_sm70tc_16x16x16_1024loop_ptx_kernel", 16, 16, 16,
1024, 70)
->MinTime(10);

#endif

Expand Down
26 changes: 18 additions & 8 deletions less_slow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,22 @@
* The contents of this file complement the contents of the `less_slow.cpp`
* file with GPGPU kernels showcasing:
*
* - How to coordinate CUDA cores within a single block or warp?
* - How to use Tensor Cores for matrix multiplications?
* What's the difference between `mma` and `wgmma` on Hopper?
* - TODO: How to coordinate CUDA cores within a single block or warp?
* A.k.a. how to use shared memory, warp shuffle intrinsics, and reductions?
* - What are CUDA math intrinsics and how much faster are they?
* - TODO: What are CUDA math intrinsics and how much faster are they?
* A.k.a. when to use `__sinf` over `sinf` or `__fdividef` over `a / b`?
* - What's the Physical Page Caching behavior on GPUs?
* - How to schedule advanced computational graphs on GPUs?
* - TODO: What's the Physical Page Caching behavior on GPUs?
* - TODO: How to schedule advanced computational graphs on GPUs?
* A.k.a. CUDA streams vs Graph Node API vs Cooperative Groups?
*
* To compile this file, dump the SASS code, and check for Tensor Cores usage
* on Volta SM70 GPUs, use the following commands:
*
* nvcc -arch=sm_70 -Xptxas -v -lineinfo -cubin -o less_slow_from_cu.cubin less_slow.cu
* cuobjdump -sass less_slow_from_cu.cubin | grep -i mma
* $ nvcc -arch=sm_70 -Xptxas -v -lineinfo -ptx -o less_slow_from_cu.ptx less_slow.cu
* $ nvcc -arch=sm_70 -Xptxas -v -lineinfo -cubin -o less_slow_from_cu.cubin less_slow.cu
* $ cuobjdump -sass less_slow_from_cu.cubin | grep -i mma
*
* Keep in mind the following TC generations:
*
Expand All @@ -34,6 +37,9 @@
* Feature | V100 | A100 | H100
* -------------------------------------|----------|----------|----------
* Compute Capability | 7.0 | 8.0 | 9.0
* PTX Version | 6+ | 7+ | 8+
* CUDA Releases | 9-10 | 11+ | 12+
* -------------------------------------|----------|----------|----------
* Threads / Warp | 32 | 32 | 32
* Max Warps / SM | 64 | 64 | 64
* Max Threads / SM | 2048 | 2048 | 2048
Expand All @@ -43,7 +49,7 @@
* Max Registers / Thread Block (CTA) | 65536 | 65536 | 65536
* Max Registers / Thread | 255 | 255 | 255
* Max Thread Block Size (# of threads) | 1024 | 1024 | 1024
* FP32 Cores / SM | 64 | 64 | 128
* -------------------------------------|----------|----------|----------
* Ratio of SM Registers to FP32 Cores | 1024 | 1024 | 512
* Shared Memory Size / SM | ≤ 96 KB | ≤ 164 KB | ≤ 228 KB
* Tensor Core Generation | 1st | 3rd | 5th
Expand Down Expand Up @@ -151,6 +157,8 @@ __device__ inline void tops_tc_cuda_kernel() {
if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_frag, 16, wmma::mem_row_major);
}

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) //? Binary Matrices require SM75 or higher

/**
* To process binary matrices we can't rely on addition and multiplication.
* A different set of mathematical operations is required, such as @b XOR or
Expand All @@ -170,6 +178,8 @@ __device__ inline void binary_tops_tc_cuda_kernel( //
if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_frag, 16, wmma::mem_row_major);
}

#endif

#pragma region Volta

__global__ void tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel() {
Expand Down Expand Up @@ -281,5 +291,5 @@ __global__ void tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel() {
* @see "Fast Matrix-Multiplication with WGMMA on NVIDIA Hopper GPUs" by Colfax:
* https://research.colfax-intl.com/cutlass-tutorial-wgmma-hopper/
* @see "Outperforming cuBLAS on H100: a Worklog" by Pranjal Shankhdhar:
* https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog
* https://cudaforfun.substack.com/p/outperforming-cublas-on-h100-a-worklog
*/
9 changes: 6 additions & 3 deletions less_slow_ptx.ptx → less_slow.ptx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
// You can validate this file by asking the Nvidia PTX Assembler to compile it
// to `.cubin` for some target architecture:
//
// $ ptxas -o less_slow_from_ptx.cubin -arch=sm_70 less_slow.ptx
// $ ptxas -o less_slow_from_ptx.cubin -arch=sm_70 less_slow.ptx
// $ cuobjdump -sass less_slow_from_ptx.cubin | grep -i mma
//
// ## Register File
//
Expand All @@ -31,7 +32,7 @@
// To read from them, simple use the `mov` instruction.
// ----------------------------------------------------------------------------

.version 8.5 // PTX version 8.5
.version 6.5 // PTX version 6.5 is enough for Volta GPUs
.target sm_70 // Target architecture (SM 7.0 - Volta GPUs)
.address_size 64 // 64-bit addressing

Expand All @@ -42,7 +43,9 @@
.reg .b32 %r<3>; // General-purpose registers for loop counter and packed inputs
.reg .pred %p; // Predicate register for conditional branching

// Initialize the loop counter
// Initialize the loop counter; but keep in mind that an algorithm with a
// loop will be a lot slower than PTX kernels generated by the CUDA compiler.
// Those end up unrolling thousands of iterations into a single kernel!
mov.u32 %r0, 0; // %r0 = loop counter, start at 0
mov.u32 %r1, 1024; // %r1 = loop limit (1024 iterations)

Expand Down

0 comments on commit eff3854

Please sign in to comment.