// Kernel definition
// Element-wise matrix sum: C = A + B for N x N float matrices.
// Each thread computes exactly one element, chosen by its position inside the
// thread block: threadIdx.x is the first index and threadIdx.y the second.
// No block index is used, so the kernel is meant to be launched with a single
// two-dimensional block of N x N threads.
__global__ void MatAdd(float A[N][N], float B[N][N],
                       float C[N][N])
{
    int i = threadIdx.x;
    int j = threadIdx.y;
    C[i][j] = A[i][j] + B[i][j];
}
// Host entry point (excerpt). The "…" lines stand for code that the snippet
// omits; A, B and C are declared somewhere in that omitted part.
int main()
{
…
// Kernel invocation with one block of N * N * 1 threads
int numBlocks = 1;
// Two-dimensional block: threadIdx.x and threadIdx.y each run over [0, N).
dim3 threadsPerBlock(N, N);
// NOTE(review): A, B and C are presumably device-accessible N x N float
// arrays — confirm against the omitted setup code.
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
…
}
$ cat t322.cu
#include <iostream>
// Edge length of the square matrices. MatAdd runs as a single block of
// N x N threads, so N cannot be larger than 32 (32 * 32 = 1024 threads, the
// per-block thread limit on current CUDA GPUs).
const int N = 32;
// Fill values for the two input matrices; every element of the result is
// expected to equal Aval + Bval.
const float Aval = 1.0f, Bval = 2.0f;
// Kernel definition
// Element-wise matrix sum: C = A + B for N x N float matrices.
//
// Launch layout: one two-dimensional thread block; thread (x, y) handles
// element [x][y]. No block index is used, so additional blocks would only
// repeat the same work. Threads whose indices fall outside the N x N range
// do nothing, which keeps an oversized block from touching memory outside
// the arrays.
//
// A, B, C: device pointers to N x N floats (row-major rows of N floats).
//
// NOTE(review): threadIdx.x selects the row, so consecutive threads along x
// touch addresses N floats apart. Swapping the roles of x and y would make
// those accesses contiguous, but would change which elements a partial
// launch covers, so the mapping is left as is.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = threadIdx.x;
    int j = threadIdx.y;
    // Guard against launch configurations larger than the matrix.
    if (i < N && j < N) {
        C[i][j] = A[i][j] + B[i][j];
    }
}
// Logs a failed CUDA runtime call to stderr and returns the status unchanged
// so that calls can be chained.
static cudaError_t reportCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    }
    return err;
}

// Host driver: fills two N x N matrices with Aval and Bval, adds them on the
// device with MatAdd, and verifies that every result element is Aval + Bval.
//
// Prints the CUDA status string ("no error" on success) to stdout.
// Returns 0 when every element matches, -1 on the first mismatch or on any
// CUDA failure. All host and device buffers are released on every path.
int main()
{
    // One matrix row. A d_arr* has the same type as the kernel's float[N][N]
    // parameters, so the device buffers can be passed to MatAdd directly.
    typedef float d_arr[N];
    const size_t bytes = N * N * sizeof(float);

    float *h_A = new float[N * N];
    float *h_B = new float[N * N];
    float *h_C = new float[N * N];
    for (int i = 0; i < N * N; i++) {
        h_A[i] = Aval;
        h_B[i] = Bval;
    }

    // Start as null pointers so the cleanup below is valid even when an
    // allocation fails part-way through.
    d_arr *A = 0, *B = 0, *C = 0;

    // Each step runs only if everything before it succeeded; err keeps the
    // first failure.
    cudaError_t err = reportCuda(cudaMalloc((void **)&A, bytes), "cudaMalloc(A)");
    if (err == cudaSuccess)
        err = reportCuda(cudaMalloc((void **)&B, bytes), "cudaMalloc(B)");
    if (err == cudaSuccess)
        err = reportCuda(cudaMalloc((void **)&C, bytes), "cudaMalloc(C)");
    if (err == cudaSuccess)
        err = reportCuda(cudaMemcpy(A, h_A, bytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(A <- h_A)");
    if (err == cudaSuccess)
        err = reportCuda(cudaMemcpy(B, h_B, bytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(B <- h_B)");

    if (err == cudaSuccess) {
        // Kernel invocation with one block of N * N * 1 threads
        int numBlocks = 1;
        dim3 threadsPerBlock(N, N);
        MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
        // A launch does not return a status; configuration errors show up here.
        err = reportCuda(cudaGetLastError(), "MatAdd launch");
    }
    if (err == cudaSuccess) {
        // This blocking copy waits for the kernel, so errors raised while the
        // kernel was running are reported by it as well.
        err = reportCuda(cudaMemcpy(h_C, C, bytes, cudaMemcpyDeviceToHost),
                         "cudaMemcpy(h_C <- C)");
    }
    std::cout << cudaGetErrorString(err) << std::endl;

    int status = (err == cudaSuccess) ? 0 : -1;
    if (status == 0) {
        // Exact comparison is intended: Aval + Bval is exactly representable
        // in float, so host and device must agree bit for bit.
        for (int i = 0; i < N * N; i++) {
            if (h_C[i] != (Aval+Bval)) {
                std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << (Aval+Bval) << std::endl;
                status = -1;
                break;
            }
        }
    }

    // Release everything on every path; cudaFree accepts a null pointer.
    cudaFree(C);
    cudaFree(B);
    cudaFree(A);
    delete[] h_C;
    delete[] h_B;
    delete[] h_A;
    return status;
}
$ nvcc -o t322 t322.cu
$ cuda-memcheck ./t322
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$