What would the complete code, including a full main() function, look like for the following program?

// Kernel definition: element-wise sum of two N x N matrices, C = A + B.
// Each thread computes one element, selected by its (x, y) index within
// the block. N is not declared in this fragment; it must be a compile-time
// constant visible to the kernel.
__global__ void MatAdd(float A[N][N], float B[N][N],
                       float C[N][N])
{
    int i = threadIdx.x;
    int j = threadIdx.y;
    C[i][j] = A[i][j] + B[i][j];
}

// Host entry point (fragment): shows only the kernel launch.
// NOTE: A, B, C and N are not declared anywhere in this fragment; the
// allocation, initialisation and copy-back code still has to be supplied.
int main()
{

// Kernel invocation with one block of N * N * 1 threads
int numBlocks = 1;
// 2D block: N threads along x and N threads along y
dim3 threadsPerBlock(N, N);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);

}

$ cat t322.cu
#include <iostream>

// N cannot be larger than 32: main launches MatAdd as a single block of
// N x N threads, and a CUDA thread block holds at most 1024 (= 32 * 32) threads.
const int N = 32;
// Fill values for the two input matrices; every element of the result is
// expected to equal Aval + Bval.
const float Aval = 1.0f;
const float Bval = 2.0f;

// Kernel definition: element-wise sum of two N x N matrices, C = A + B.
//
// Expected launch: one block of N x N threads. Each thread produces a single
// element, with threadIdx.x selecting the first index and threadIdx.y the
// second. blockIdx is not used, so additional blocks would only repeat the
// same work. A, B and C are dereferenced on the device and therefore must
// point to device-accessible memory holding N * N floats each.
//
// NOTE(review): because threadIdx.x picks the first (slowest-varying) index,
// neighbouring threads touch addresses N floats apart. That is harmless at
// this size; swapping the roles of i and j would make the accesses contiguous.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
  int i = threadIdx.x;
  int j = threadIdx.y;
  // Guard against a block larger than N x N so that surplus threads cannot
  // read or write past the end of the matrices.
  if (i < N && j < N)
    C[i][j] = A[i][j] + B[i][j];
}

// Host driver: fills two N x N matrices with the constants Aval and Bval,
// adds them on the GPU with MatAdd, copies the result back and verifies
// every element.
//
// Prints the CUDA status string of the first failing call ("no error" when
// everything succeeded). Returns 0 if all elements equal Aval + Bval, and -1
// on any CUDA failure or on the first mismatching element.
int main()
{
  // One matrix row; a d_arr* is exactly what a float[N][N] parameter decays to.
  typedef float d_arr[N];
  const size_t bytes = N * N * sizeof(float);

  // Flat host buffers holding N * N floats each.
  float *h_A = new float[N*N];
  float *h_B = new float[N*N];
  float *h_C = new float[N*N];
  for (int i = 0; i < N*N; i++) {
    h_A[i] = Aval;
    h_B[i] = Bval;
  }

  // Device buffers start out null so the cleanup below is safe even when an
  // allocation fails part-way through (cudaFree on a null pointer is a no-op).
  d_arr *A = 0, *B = 0, *C = 0;

  // Each step runs only if everything before it succeeded, so `err` always
  // holds the first failure.
  cudaError_t err = cudaMalloc((void **)&A, bytes);
  if (err == cudaSuccess) err = cudaMalloc((void **)&B, bytes);
  if (err == cudaSuccess) err = cudaMalloc((void **)&C, bytes);
  if (err == cudaSuccess) err = cudaMemcpy(A, h_A, bytes, cudaMemcpyHostToDevice);
  if (err == cudaSuccess) err = cudaMemcpy(B, h_B, bytes, cudaMemcpyHostToDevice);

  if (err == cudaSuccess) {
    // Kernel invocation with one block of N * N * 1 threads
    int numBlocks = 1;
    dim3 threadsPerBlock(N, N);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    // A launch does not return a status; configuration errors (for example
    // too many threads per block) are reported through cudaGetLastError().
    err = cudaGetLastError();
  }

  // The blocking copy waits for the kernel to finish and also reports any
  // error raised while the kernel was executing.
  if (err == cudaSuccess) err = cudaMemcpy(h_C, C, bytes, cudaMemcpyDeviceToHost);

  std::cout << cudaGetErrorString(err) << std::endl;

  int status = (err == cudaSuccess) ? 0 : -1;
  // Only inspect h_C when the copy-back succeeded; otherwise it is uninitialised.
  if (status == 0) {
    for (int i = 0; i < N*N; i++) {
      if (h_C[i] != (Aval+Bval)) {
        std::cout << "mismatch at: " << i << " was: "  << h_C[i] << " should be: " << (Aval+Bval) << std::endl;
        status = -1;
        break;
      }
    }
  }

  // Release device and host memory on every path, including failures.
  cudaFree(A);
  cudaFree(B);
  cudaFree(C);
  delete[] h_A;
  delete[] h_B;
  delete[] h_C;
  return status;
}
$ nvcc -o t322 t322.cu
$ cuda-memcheck ./t322
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$

Thanks!