cudaMemcpy2D / Grid size / MxN double matrix Problem copying a MxN double matrix from Host to Device

Hi all,

I am new in CUDA programming so I am asking some help. I am trying to copy a MxN double matrix from host to device and return some computation from device back to the host.

Following is my source code.

When dealing with a dimension of 100x20 and with a kernel of 4 blocks with 500 threads in each block “matAdd<<<(2,2), (50,10)>>>” the code returns an increment in all cell in the matrix. That’s great!

The problem occurs when I increase the matrix to 100x200. I’ve set up a kernel with 40 blocks with 500 threads in each block “matAdd<<<(4,10), (50,10)>>>”. This theoretically should supply the number of elements in the matrix but it returns a lot of zero elements in the matrix copied from the device.

I need to solve this problem because I will work with 1000x200 and 10000x200 matrices.

Could someone help me with this issue?

[codebox]

#define BEES 100

#define DIM 20

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main( int argc, char** argv)

{

//simple allocated host memory

double A_h[BEES][DIM], B_h[BEES][DIM];

//device memory

double *A_d, *B_d;

cudaError_t status = cudaSuccess;

size_t pitch_A, pitch_B;

size_t size=BEES*sizeof(double);

//allocate device memory

status = cudaMallocPitch((void **)(&A_d), &pitch_A, size, DIM);

if(status != cudaSuccess)

{

	fprintf(stderr, "%s\n", cudaGetErrorString(status));

return 0;

}

status = cudaMallocPitch((void **)(&B_d), &pitch_B, size, DIM);

if(status != cudaSuccess)

{

	fprintf(stderr, "%s\n", cudaGetErrorString(status));

return 0;

}

for (int i=0; i<BEES; i++)

for (int j=0; j<DIM; j++) 

	A_h[i][j]=j;

//copy host to device memory

status = cudaMemcpy2D(A_d, pitch_A, A_h, size, size, DIM, cudaMemcpyHostToDevice);

if(status != cudaSuccess)

{

	fprintf(stderr, "%s\n", cudaGetErrorString(status));

return 0;

}

//Kernel invocation

dim3 Block(50,10); //number of threads per block x * y. Maximun number of threads per block: 512.

dim3 Grid(2,2); // number of blocks x * y.

matAdd<<<Grid, Block>>>(A_d, B_d, pitch_A, pitch_B, size, BEES);

//copy device to host memory

status = cudaMemcpy2D(B_h, size, B_d, pitch_B, size, DIM, cudaMemcpyDeviceToHost);

if(status != cudaSuccess)

{

	fprintf(stderr, "%s\n", cudaGetErrorString(status));

return 0;

}

//print results

for (int i=0; i<BEES; i++)

{

for (int j=0; j<DIM; j++) 

	printf("%.2f ", A_h[i][j]);

printf("\n");

}

printf(“\n”);

for (int i=0; i<BEES; i++)

{

for (int j=0; j<DIM; j++) 

	printf("%.2f ", B_h[i][j]);

printf("\n");

}

//free device memory

cudaFree(B_d);

cudaFree(A_d);

cudaThreadExit();

cutilExit(argc, argv);

}

global void matAdd(double *A_d, double *B_d, size_t pitch_A, size_t pitch_B, int height, int width)

{

int idx=blockIdx.x*blockDim.x+threadIdx.x;

int idy=blockIdx.y*blockDim.y+threadIdx.y;

//add +1 to every element

if ( (idx< height) && (idy < width) )

*((double *)((char *)B_d+idy*pitch_B)+idx) = *((double *)((char *)A_d+idy*pitch_A)+idx)+1;

}

[/codebox]

There is a similar issue like this from yesterday:

[url=“http://forums.nvidia.com/index.php?showtopic=163224”]The Official NVIDIA Forums | NVIDIA

I didn’t use cudaMemcpy2D or Pitch, but maybe your issue is similar?

Matt

Tks for the reply and for the other topic link.

The topic that you sent discuss a kernel that isn’t running and the access is upon texture memory. It’s not the case here. My program run perfectly unless some dimensions that I pointed before.

I’m trying other experimentations but without success…

Any tip?

You want your image pitch to be aligned with the warp, or something like that, I’m not sure how best to explain it.

Try this:

//Round a / b to nearest higher integer value

inline int iDivUp(int a, int b)

{

	return (a % b != 0) ? (a / b + 1) : (a / b);

}

//Align a to nearest higher multiple of b

inline int iAlignUp(int a, int b)

{

	return (a % b != 0) ?  (a - a % b + b) : a;

}

//other code here memory allocation, etc

//get ready to launch the kernal

	int align = iAlignUp(BEES, 32);

	dim3 threads(16, 16, 1);

	dim3 blocks(iDivUp(BEES, threads.x), iDivUp(DIM, threads.y));

	printf("thread block size = %d x %d \n", threads.x, threads.y);

	printf(" block size = %d x %d \n", blocks.x, blocks.y);

	

	matAdd<<<blocks, threads>>>(A_d, B_d, align, DIM);

	cudaThreadSynchronize();

	 // check if kernel execution generated and error

	cutilCheckMsg("Kernel execution failed");

Obviously change the threads to whatever works best

Also your kernel is a bit obtuse. This is easier to read:

__global__ void matAdd(float *A_d, float *B_d, int width, int height)

{

  int idx=blockIdx.x*blockDim.x+threadIdx.x;

  int idy=blockIdx.y*blockDim.y+threadIdx.y; 

  int index = width * idy + idx;

//add +1 to every element

  if ( (idx< width) && (idy < height) )

	B_d[index] = A_d[index] +1;

}

I don’t know if you need doubles, but I put everything in floats… It seems to work for me.