I tried a very simple CUDA program in order to learn the function API cudaMemcpy2D();
Here below is my src code, the result shows is not correct for the computing the matrix operation for A = B + C;
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define N 4
global static void MaxAdd(int *A, int *B, int *C, int pitch)
{
int xid = blockIdx.x * blockDim.x + threadIdx.x;
int yid = blockIdx.y * blockDim.y + threadIdx.y;
if (xid<N && yid<N)
{
C[yid*pitch+xid]= A[yid*pitch+xid] + B[yid*pitch+xid];
}
}
int main()
{
int A[N][N] = {{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}};
int B[N][N] = {{2,2,2,2},{2,2,2,2},{2,2,2,2},{2,2,2,2}};
int C[N][N] = {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}};
int *gpu_A;
int *gpu_B;
int *gpu_C;
cudaMalloc((void**)&gpu_A, sizeof(int)*N*N);
cudaError_t myError = cudaMemcpy2D(gpu_A, sizeof(int)*N, A, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);
printf("%d\n", myError);
cudaMalloc((void**)&gpu_B, sizeof(int)*N*N);
cudaMemcpy2D(gpu_B, sizeof(int)*N, B, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);
cudaMalloc((void**)&gpu_C, sizeof(int)*N*N);
cudaMemcpy2D(gpu_C, sizeof(int)*N, C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);
dim3 dimBlock(4, 4);
dim3 dimGrid(1);
MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, sizeof(int)*N);
cudaMemcpy2D(C, sizeof(int)*N, gpu_C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyDeviceToHost);
cudaFree(gpu_A);
cudaFree(gpu_B);
cudaFree(gpu_C);
printf("================================\n");
for (int i=0; i<N; i++)
{
for(int j=0; j<N; j++)
{
if(j == N-1)
printf("\n");
else
printf("C[%d][%d]=%d---", i, j, C[i][j]);
}
}
return 0;
}
I would very appreciate if anyone could help me, thanks a lot in advance!