Hi all,
I am new in CUDA programming so I am asking some help. I am trying to copy a MxN double matrix from host to device and return some computation from device back to the host.
Following is my source code.
When dealing with a dimension of 100x20 and with a kernel of 4 blocks with 500 threads in each block “matAdd<<<(2,2), (50,10)>>>” the code returns an increment in all cell in the matrix. That’s great!
The problem occurs when I increase the matrix to 100x200. I’ve set up a kernel with 40 blocks with 500 threads in each block “matAdd<<<(4,10), (50,10)>>>”. This theoretically should supply the number of elements in the matrix but it returns a lot of zero elements in the matrix copied from the device.
I need to solve this problem because I will work with 1000x200 and 10000x200 matrices.
Could someone help me with this issue?
[codebox]
#define BEES 100
#define DIM 20
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
//simple allocated host memory
double A_h[BEES][DIM], B_h[BEES][DIM];
//device memory
double *A_d, *B_d;
cudaError_t status = cudaSuccess;
size_t pitch_A, pitch_B;
size_t size=BEES*sizeof(double);
//allocate device memory
status = cudaMallocPitch((void **)(&A_d), &pitch_A, size, DIM);
if(status != cudaSuccess)
{
fprintf(stderr, "%s\n", cudaGetErrorString(status));
return 0;
}
status = cudaMallocPitch((void **)(&B_d), &pitch_B, size, DIM);
if(status != cudaSuccess)
{
fprintf(stderr, "%s\n", cudaGetErrorString(status));
return 0;
}
for (int i=0; i<BEES; i++)
for (int j=0; j<DIM; j++)
A_h[i][j]=j;
//copy host to device memory
status = cudaMemcpy2D(A_d, pitch_A, A_h, size, size, DIM, cudaMemcpyHostToDevice);
if(status != cudaSuccess)
{
fprintf(stderr, "%s\n", cudaGetErrorString(status));
return 0;
}
//Kernel invocation
dim3 Block(50,10); //number of threads per block x * y. Maximun number of threads per block: 512.
dim3 Grid(2,2); // number of blocks x * y.
matAdd<<<Grid, Block>>>(A_d, B_d, pitch_A, pitch_B, size, BEES);
//copy device to host memory
status = cudaMemcpy2D(B_h, size, B_d, pitch_B, size, DIM, cudaMemcpyDeviceToHost);
if(status != cudaSuccess)
{
fprintf(stderr, "%s\n", cudaGetErrorString(status));
return 0;
}
//print results
for (int i=0; i<BEES; i++)
{
for (int j=0; j<DIM; j++)
printf("%.2f ", A_h[i][j]);
printf("\n");
}
printf(“\n”);
for (int i=0; i<BEES; i++)
{
for (int j=0; j<DIM; j++)
printf("%.2f ", B_h[i][j]);
printf("\n");
}
//free device memory
cudaFree(B_d);
cudaFree(A_d);
cudaThreadExit();
cutilExit(argc, argv);
}
global void matAdd(double *A_d, double *B_d, size_t pitch_A, size_t pitch_B, int height, int width)
{
int idx=blockIdx.x*blockDim.x+threadIdx.x;
int idy=blockIdx.y*blockDim.y+threadIdx.y;
//add +1 to every element
if ( (idx< height) && (idy < width) )
*((double *)((char *)B_d+idy*pitch_B)+idx) = *((double *)((char *)A_d+idy*pitch_A)+idx)+1;
}
[/codebox]