need help for cudaMemcpy2D()

richarding · December 7, 2009, 11:17pm

I tried a very simple CUDA program in order to learn the function API cudaMemcpy2D();
Here below is my src code, the result shows is not correct for the computing the matrix operation for A = B + C;

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 4

global static void MaxAdd(int *A, int *B, int *C, int pitch)
{
int xid = blockIdx.x * blockDim.x + threadIdx.x;
int yid = blockIdx.y * blockDim.y + threadIdx.y;

if (xid<N && yid<N)
{
	C[yid*pitch+xid]= A[yid*pitch+xid] + B[yid*pitch+xid];
}

}

int main()
{
int A[N][N] = {{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}};
int B[N][N] = {{2,2,2,2},{2,2,2,2},{2,2,2,2},{2,2,2,2}};
int C[N][N] = {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}};

int *gpu_A;
int *gpu_B;
int *gpu_C;

cudaMalloc((void**)&gpu_A, sizeof(int)*N*N);
cudaError_t myError = cudaMemcpy2D(gpu_A, sizeof(int)*N, A, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);
printf("%d\n", myError);

cudaMalloc((void**)&gpu_B, sizeof(int)*N*N);
cudaMemcpy2D(gpu_B, sizeof(int)*N, B, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

cudaMalloc((void**)&gpu_C, sizeof(int)*N*N);
cudaMemcpy2D(gpu_C, sizeof(int)*N, C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

dim3 dimBlock(4, 4);
dim3 dimGrid(1);	

MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, sizeof(int)*N);

cudaMemcpy2D(C, sizeof(int)*N, gpu_C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyDeviceToHost);

cudaFree(gpu_A);
cudaFree(gpu_B);
cudaFree(gpu_C);

printf("================================\n");

for (int i=0; i<N; i++)
{
	for(int j=0; j<N; j++)
	{
		if(j == N-1)
			printf("\n");
		else
			printf("C[%d][%d]=%d---", i, j, C[i][j]);
	}
}

return 0;

}

I would very appreciate if anyone could help me, thanks a lot in advance!

Nico · December 8, 2009, 8:14pm

MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, sizeof(int)*N);

Your pitch is in byte units, but in the kernel you’re accessing the contents of the arrays in int units.

So you should call

MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, 4);

instead.

N.

richarding · December 8, 2009, 8:19pm

Hi Nico, thank you for your reply, now the code works fine. Thannnnnnk you so much!!! :rolleyes:

richarding · December 8, 2009, 8:48pm

Hi Nico, thank you again, I changed my code a little bit, only put the cudaMallocPitch() into practice, but problem comes, I cannot get the correct result only the first row of the matric C is correct. Would you plz give me some ideas what’s wrong with my code. Thanks!

My new revised code below:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 4

global static void MaxAdd(int *A, int *B, int *C, int pitch)
{
int xid = blockIdx.x * blockDim.x + threadIdx.x;
int yid = blockIdx.y * blockDim.y + threadIdx.y;

if (xid<N && yid<N)
{
	C[yid*pitch+xid]= A[yid*pitch+xid] + B[yid*pitch+xid];
}

}

int main()
{
int A[N][N] = {{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}};
int B[N][N] = {{2,2,2,2},{2,2,2,2},{2,2,2,2},{2,2,2,2}};
int C[N][N] = {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}};

int *gpu_A;
int *gpu_B;
int *gpu_C;

size_t pitch_a, pitch_b, pitch_c;

cudaMallocPitch((void**)&gpu_A, &pitch_a, sizeof(int)*N, N);
cudaError_t myError = cudaMemcpy2D(gpu_A, pitch_a, A, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);
printf("%d\n", myError);

cudaMallocPitch((void**)&gpu_B, &pitch_b, sizeof(int)*N, N);
cudaMemcpy2D(gpu_B, pitch_b, B, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

cudaMallocPitch((void**)&gpu_C, &pitch_c, sizeof(int)*N, N);
cudaMemcpy2D(gpu_C, pitch_c, C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

dim3 dimBlock(4, 4);
dim3 dimGrid(1);	

MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, 4);

cudaMemcpy2D(C, sizeof(int)*N, gpu_C, pitch_c, sizeof(int)*N, N, cudaMemcpyDeviceToHost);

cudaFree(gpu_A);
cudaFree(gpu_B);
cudaFree(gpu_C);

printf("================================\n");


for (int i=0; i<N; i++)
{
	for(int j=0; j<N; j++)
	{
		if(j == N-1)
			printf("\n");
		else
			printf("C[%d][%d]=%d---", i, j, C[i][j]);
	}
}

//getchar();

return 0;

}

Nico · December 8, 2009, 9:02pm

With this piece of kernel code you have to be sure that the pitches for a, b and c are the same (which they are), but the pitch is determined by the MallocPitch method and is not necessarily equal to 4 in this case.
The MallocPitch function is going to pad the arrays to get faster access in cuda.
So here you should use
MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, pitch_a/sizeof(int));
with
pitch_a = pitch_b = pitch_c

N.

richarding · December 8, 2009, 9:07pm

Hi Nico, thank you again, I changed my code a little bit, only put the cudaMallocPitch() into practice, but problem comes, I cannot get the correct result only the first row of the matric C is correct. Would you plz give me some ideas what’s wrong with my code. Thanks!

My new revised code below:

include <stdio.h>

include <stdlib.h>

include <cuda_runtime.h>

define N 4

global static void MaxAdd(int *A, int *B, int *C, int pitch)

{

int xid = blockIdx.x * blockDim.x + threadIdx.x;

int yid = blockIdx.y * blockDim.y + threadIdx.y;



if (xid<N && yid<N)

{

	C[yid*pitch+xid]= A[yid*pitch+xid] + B[yid*pitch+xid];

}

}

int main()

{

int A[N][N] = {{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}};

int B[N][N] = {{2,2,2,2},{2,2,2,2},{2,2,2,2},{2,2,2,2}};

int C[N][N] = {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}};

int *gpu_A;

int *gpu_B;

int *gpu_C;

size_t pitch_a, pitch_b, pitch_c;

cudaMallocPitch((void**)&gpu_A, &pitch_a, sizeof(int)*N, N);

cudaError_t myError = cudaMemcpy2D(gpu_A, pitch_a, A, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

printf("%d\n", myError);

cudaMallocPitch((void**)&gpu_B, &pitch_b, sizeof(int)*N, N);

cudaMemcpy2D(gpu_B, pitch_b, B, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

cudaMallocPitch((void**)&gpu_C, &pitch_c, sizeof(int)*N, N);

cudaMemcpy2D(gpu_C, pitch_c, C, sizeof(int)*N, sizeof(int)*N, N, cudaMemcpyHostToDevice);

dim3 dimBlock(4, 4);

dim3 dimGrid(1);	

MaxAdd<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C, 4);

cudaMemcpy2D(C, sizeof(int)*N, gpu_C, pitch_c, sizeof(int)*N, N, cudaMemcpyDeviceToHost);

cudaFree(gpu_A);

cudaFree(gpu_B);

cudaFree(gpu_C);

printf("================================\n");

for (int i=0; i<N; i++)

{

	for(int j=0; j<N; j++)

	{

		if(j == N-1)

			printf("\n");

		else

			printf("C[%d][%d]=%d---", i, j, C[i][j]);

	}

}

//getchar();

return 0;

}

Hi, Nico, thank you soooooooo much, I did learn a lot!!!

Topic		Replies	Views
Getting an Error Using CudaMalloc3d CUDA Programming and Performance	10	2502	December 10, 2015
cudaMemcpy2D help CUDA Programming and Performance	4	10578	July 28, 2009
problem with cudaMallocPitch and cudaMemcpy2D CUDA Programming and Performance	5	6353	April 22, 2009
Copying 2D array from host to device CUDA Programming and Performance	7	7257	July 27, 2010
Simple Malloc on host CUDA Programming and Performance	4	1256	October 22, 2009
How to correct the time units on the results CUDA Programming and Performance	2	449	May 11, 2018
Bad performance using MallocPitch and Memcpy2D CUDA Programming and Performance	9	2828	May 24, 2017
Error when attempting to use cudaMemCpy() CUDA Programming and Performance	5	57	August 9, 2024
CUDA 2D Array Problem Need help to manipulate 2D arrays in CUDA CUDA Programming and Performance	4	26443	March 17, 2011
cudaMalloc3D and friends proper use for whatever data type CUDA Programming and Performance	6	5923	July 14, 2010

need help for cudaMemcpy2D()

Related topics