Need help with cudaMemcpy2D()

I wrote a very simple CUDA program in order to learn the cudaMemcpy2D() API.
Below is my source code; the result it produces is not correct for the matrix operation C = A + B:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 4

/*
 * Element-wise addition of two N x N int matrices: C = A + B.
 *
 * Each thread handles one element, chosen by its 2D global index
 * (xid = column, yid = row). Threads whose index falls outside the
 * N x N range do nothing.
 *
 * pitch is the row stride expressed in int elements (not bytes): it is
 * multiplied by the row index and used directly as an index into the
 * int* arrays. The same stride is applied to A, B and C.
 */
__global__ static void MaxAdd(int *A, int *B, int *C, int pitch)
{
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
    int yid = blockIdx.y * blockDim.y + threadIdx.y;

    if (xid < N && yid < N)
    {
        // Offset of element (yid, xid), counted in ints from the base pointer.
        int idx = yid * pitch + xid;
        C[idx] = A[idx] + B[idx];
    }
}

/*
 * Host driver: copies the N x N matrices A, B and C to the GPU with
 * cudaMemcpy2D, launches MaxAdd on a single 4x4 thread block, copies C
 * back to the host and prints it.
 */
int main()
{
    int A[N][N] = {{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}};
    int B[N][N] = {{2,2,2,2},{2,2,2,2},{2,2,2,2},{2,2,2,2}};
    int C[N][N] = {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}};

    int *gpu_A;
    int *gpu_B;
    int *gpu_C;

    // The device buffers come from plain cudaMalloc, so both the source and
    // destination pitch given to cudaMemcpy2D are the row width in bytes.
    cudaMalloc((void**)&gpu_A, sizeof(int)*N*N);
    cudaError_t myError = cudaMemcpy2D(gpu_A, sizeof(int)*N, A, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);
    printf("%d\n", myError);

    cudaMalloc((void**)&gpu_B, sizeof(int)*N*N);
    cudaMemcpy2D(gpu_B, sizeof(int)*N, B, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

    cudaMalloc((void**)&gpu_C, sizeof(int)*N*N);
    cudaMemcpy2D(gpu_C, sizeof(int)*N, C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

    dim3 dimBlock(4, 4);
    dim3 dimGrid(1);

    // NOTE(review): the last argument is a byte count (sizeof(int)*N), while
    // MaxAdd uses its pitch parameter as an index stride on int* pointers.
    // The call is kept exactly as posted because the replies that follow
    // quote and diagnose this line.
    MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, sizeof(int)*N);

    cudaMemcpy2D(C, sizeof(int)*N, gpu_C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyDeviceToHost);

    cudaFree(gpu_A);
    cudaFree(gpu_B);
    cudaFree(gpu_C);

    printf("================================\n");

    // Print every element of each row, then end the line. The loop used to
    // print a newline in place of the last column, so C[i][N-1] was never shown.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            printf("C[%d][%d]=%d---", i, j, C[i][j]);
        printf("\n");
    }

    return 0;
}

I would very much appreciate it if anyone could help me. Thanks a lot in advance!

MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, sizeof(int)*N);

Your pitch is in byte units, but in the kernel you’re accessing the contents of the arrays in int units.

So you should call

MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, 4);

instead.

N.

Hi Nico, thank you for your reply, now the code works fine. Thannnnnnk you so much!!! :rolleyes:

Hi Nico, thank you again. I changed my code a little bit, only to put cudaMallocPitch() into practice, but now a problem comes up: I cannot get the correct result; only the first row of the matrix C is correct. Would you please give me some ideas about what’s wrong with my code? Thanks!

My new revised code below:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 4

/*
 * Element-wise addition of two N x N int matrices: C = A + B.
 *
 * One thread per element: xid is the column and yid the row, both taken
 * from the thread's 2D global index. Threads outside the N x N range exit
 * without touching memory.
 *
 * pitch is the row stride in int elements (not bytes), since it is used
 * directly when indexing the int* arrays. A, B and C are all indexed with
 * this one stride.
 */
__global__ static void MaxAdd(int *A, int *B, int *C, int pitch)
{
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
    int yid = blockIdx.y * blockDim.y + threadIdx.y;

    if (xid < N && yid < N)
    {
        // Offset of element (yid, xid), counted in ints from the base pointer.
        int idx = yid * pitch + xid;
        C[idx] = A[idx] + B[idx];
    }
}

/*
 * Host driver (pitched-allocation variant): allocates A, B and C on the GPU
 * with cudaMallocPitch, copies the host matrices in with cudaMemcpy2D,
 * launches MaxAdd on a single 4x4 thread block, copies C back and prints it.
 */
int main()
{
    int A[N][N] = {{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}};
    int B[N][N] = {{2,2,2,2},{2,2,2,2},{2,2,2,2},{2,2,2,2}};
    int C[N][N] = {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}};

    int *gpu_A;
    int *gpu_B;
    int *gpu_C;

    // Row strides in bytes, filled in by cudaMallocPitch for each buffer.
    size_t pitch_a, pitch_b, pitch_c;

    // Host rows are tightly packed (sizeof(int)*N bytes apart); device rows
    // are pitch_x bytes apart, so each copy passes both pitches.
    cudaMallocPitch((void**)&gpu_A, &pitch_a, sizeof(int)*N, N);
    cudaError_t myError = cudaMemcpy2D(gpu_A, pitch_a, A, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);
    printf("%d\n", myError);

    cudaMallocPitch((void**)&gpu_B, &pitch_b, sizeof(int)*N, N);
    cudaMemcpy2D(gpu_B, pitch_b, B, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

    cudaMallocPitch((void**)&gpu_C, &pitch_c, sizeof(int)*N, N);
    cudaMemcpy2D(gpu_C, pitch_c, C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

    dim3 dimBlock(4, 4);
    dim3 dimGrid(1);

    // NOTE(review): the stride passed here is the literal 4 rather than a
    // value derived from the pitch returned by cudaMallocPitch. The call is
    // kept exactly as posted because the reply that follows addresses it.
    MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, 4);

    cudaMemcpy2D(C, sizeof(int)*N, gpu_C, pitch_c, sizeof(int)*N, N, cudaMemcpyDeviceToHost);

    cudaFree(gpu_A);
    cudaFree(gpu_B);
    cudaFree(gpu_C);

    printf("================================\n");

    // Print every element of each row, then end the line. The loop used to
    // print a newline in place of the last column, so C[i][N-1] was never shown.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            printf("C[%d][%d]=%d---", i, j, C[i][j]);
        printf("\n");
    }

    //getchar();

    return 0;
}

With this kernel code you have to be sure that the pitches for A, B and C are the same (which they are), but the pitch is determined by cudaMallocPitch() and is not necessarily equal to 4 elements in this case.
cudaMallocPitch() pads each row of the arrays so that rows are aligned for faster access in CUDA.
So here you should use
MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, pitch_a/sizeof(int));
with
pitch_a = pitch_b = pitch_c

N.

Hi, Nico, thank you soooooooo much, I did learn a lot!!!