-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubtraction.cu
131 lines (108 loc) · 3.51 KB
/
subtraction.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#include <chrono>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

#define N 5000
// Element-wise subtraction C = A - B over N x N row-major matrices.
// Expects a 2D launch whose grid covers at least N x N threads; the
// bounds guard lets threads past the matrix edge exit harmlessly.
__global__ void matrixSubtract(const float *A, const float *B, float *C)
{
    const int r = blockDim.y * blockIdx.y + threadIdx.y;
    const int c = blockDim.x * blockIdx.x + threadIdx.x;
    if (r >= N || c >= N)
        return;
    const int idx = r * N + c;
    C[idx] = A[idx] - B[idx];
}
// Print an N x N row-major matrix to stdout, but only when it is tiny
// (N <= 5); for larger N a placeholder message is printed instead so
// big benchmark runs do not flood the console.
void printMatrix(const float *matrix)
{
    if (N > 5)
    {
        std::cout << "Matrix too large." << std::endl;
        return;
    }
    for (int row = 0; row < N; ++row)
    {
        for (int col = 0; col < N; ++col)
        {
            std::cout << matrix[row * N + col] << " ";
        }
        std::cout << std::endl;
    }
}
// CPU reference implementation of C = A - B for N x N row-major
// matrices. Iterates the flat backing array once — same memory order
// as the nested row/column traversal, just written as a single loop.
void matrixSubtractCPU(const float *A, const float *B, float *C)
{
    const int total = N * N;
    for (int idx = 0; idx < total; ++idx)
    {
        C[idx] = A[idx] - B[idx];
    }
}
// Abort with a readable message when a CUDA runtime call fails.
// Kernel launches themselves return no status, so launch-config errors
// are fetched separately via cudaGetLastError() right after the launch.
static void cudaCheck(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}
// Benchmark element-wise N x N matrix subtraction on GPU vs CPU:
// initialize A and B, run the CUDA kernel (timed with CUDA events),
// then run the CPU reference (timed with std::chrono), printing both
// results via printMatrix.
int main()
{
    float *A = new float[N * N];
    float *B = new float[N * N];
    float *C = new float[N * N];
    float *d_A, *d_B, *d_C;
    const size_t bytes = static_cast<size_t>(N) * N * sizeof(float);
    // A holds 1..N*N and B holds the reversed zero-based sequence, so
    // the difference follows a simple, easy-to-eyeball pattern.
    for (int i = 0; i < N * N; ++i)
    {
        A[i] = static_cast<float>(i + 1);
    }
    for (int i = 0; i < N * N; ++i)
    {
        B[i] = static_cast<float>(N * N - 1 - i);
    }
    std::cout << "Matrix A:" << std::endl;
    printMatrix(A);
    std::cout << "Matrix B:" << std::endl;
    printMatrix(B);
    // Allocate device buffers and upload the inputs (every CUDA call
    // is checked — the original ignored all return codes).
    cudaCheck(cudaMalloc((void **)&d_A, bytes), "cudaMalloc d_A");
    cudaCheck(cudaMalloc((void **)&d_B, bytes), "cudaMalloc d_B");
    cudaCheck(cudaMalloc((void **)&d_C, bytes), "cudaMalloc d_C");
    cudaCheck(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "cudaMemcpy A");
    cudaCheck(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "cudaMemcpy B");
    // BUG FIX: the original used dim3 threadsPerBlock(1000, 1000) =
    // 1,000,000 threads per block, far above the 1024 threads/block
    // hardware limit, so the launch failed (silently, with no error
    // check) and C was never computed on the GPU. 32x32 = 1024 is the
    // largest legal square 2D block.
    dim3 threadsPerBlock(32, 32);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    // Time the kernel with CUDA events: record stop immediately after
    // the launch and synchronize on the stop event itself. (The
    // original placed a cudaDeviceSynchronize before recording stop,
    // folding host-side sync overhead into the measurement.)
    cudaEvent_t start, stop;
    cudaCheck(cudaEventCreate(&start), "cudaEventCreate start");
    cudaCheck(cudaEventCreate(&stop), "cudaEventCreate stop");
    cudaCheck(cudaEventRecord(start), "cudaEventRecord start");
    matrixSubtract<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
    cudaCheck(cudaGetLastError(), "kernel launch");
    cudaCheck(cudaEventRecord(stop), "cudaEventRecord stop");
    cudaCheck(cudaEventSynchronize(stop), "cudaEventSynchronize");
    float milliseconds = 0;
    cudaCheck(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    std::cout << "CUDA kernel execution time: " << milliseconds << " ms" << std::endl;
    // Events are no longer needed; the original leaked them.
    cudaCheck(cudaEventDestroy(start), "cudaEventDestroy start");
    cudaCheck(cudaEventDestroy(stop), "cudaEventDestroy stop");
    // Blocking copy back to the host — no extra sync required.
    cudaCheck(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy C");
    std::cout << "Result matrix C (A - B) using CUDA:" << std::endl;
    printMatrix(C);
    // Free GPU memory.
    cudaCheck(cudaFree(d_A), "cudaFree d_A");
    cudaCheck(cudaFree(d_B), "cudaFree d_B");
    cudaCheck(cudaFree(d_C), "cudaFree d_C");
    // CPU reference, timed with std::chrono (overwrites C in place).
    auto startCPU = std::chrono::high_resolution_clock::now();
    matrixSubtractCPU(A, B, C);
    auto stopCPU = std::chrono::high_resolution_clock::now();
    auto durationCPU = std::chrono::duration_cast<std::chrono::nanoseconds>(stopCPU - startCPU).count();
    std::cout << "CPU execution time: " << durationCPU * 1e-6 << " ms" << std::endl;
    std::cout << "Result matrix C (A - B) using CPU:" << std::endl;
    printMatrix(C);
    // Free heap memory.
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}