cuda: add comments to array-add.cu

danbev · Feb 7, 2025 · eb7f2f8 · eb7f2f8
1 parent f4a4481
commit eb7f2f8
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 11 deletions.
diff --git a/gpu/cuda/.gitignore b/gpu/cuda/.gitignore
@@ -5,3 +5,4 @@ info
 wmma
 streams
 graphs
+array-add.ptx
diff --git a/gpu/cuda/Makefile b/gpu/cuda/Makefile
@@ -2,7 +2,7 @@ info: src/info.cu
 	nvcc -o $@ $<
 
 wmma: src/wmma.cu
-	# GeForce RTX 4080 has compute compatibility 8.6 (https://developer.nvidia.com/cuda-gpus)
+	# GeForce RTX 4070 has compute capability 8.9 (https://developer.nvidia.com/cuda-gpus)
 	nvcc -arch=sm_89 -o $@ $<
 
 hello-world-ptx: src/hello-world.cu
@@ -27,7 +27,11 @@ graphs: src/graphs.cu
 dump-array-add: array-add
 	cuobjdump $<
 
+array-add-ptx:
+	nvcc -ptx src/array-add.cu
+	@cat array-add.ptx
+
 
 .PHONY: clean
 clean:
-	${RM} hello-world threads inc hello-world.ptx minimal wmma streams graphs array-add
+	@${RM} threads inc hello-world.ptx info wmma streams graphs array-add
diff --git a/gpu/cuda/src/array-add.cu b/gpu/cuda/src/array-add.cu
@@ -1,11 +1,13 @@
 #include <stdio.h>
 
-__global__ void add_arrays(int *a, int *b, int *c, int size) {
+// Kernel function that runs on the GPU
+__global__ void add_arrays(int* a, int* b, int* c, int size) {
     printf("blockIdx.x = %d, blockDim.x = %d, threadIdx.x = %d\n", blockIdx.x, blockDim.x, threadIdx.x);
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    printf("GPU: adding i = %d\n", i);
-    if (i < size) {
-        c[i] = a[i] + b[i];
+    // Calculate the array index that this thread will process.
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        c[idx] = a[idx] + b[idx];
+        printf("[GPU] array index [%d]: adding %d + %d = %d\n", idx, a[idx], b[idx], c[idx]);
     }
 }
 
@@ -42,9 +44,9 @@ int main() {
     cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
     cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
 
-    dim3 grid(1);
-    dim3 blocks(N);
-    add_arrays<<<grid, blocks>>>(d_a, d_b, d_c, N);
+    dim3 blocks(2);  // blocks per grid
+    dim3 threads(3); // threads per block
+    add_arrays<<<blocks, threads>>>(d_a, d_b, d_c, N);
 
     cudaDeviceSynchronize();
 
@@ -54,7 +56,7 @@ int main() {
         return 1;
     }
 
-    // Copy the array that the device has incremented back to the host
+    // Copy the array that the device has computed back to the host
     cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
 
     printf("Added on GPU:\n");
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ info @@
     wmma
     streams
     graphs
+    array-add.ptx