Hi,
I am having some trouble in understanding the way the padding works while using the pitch memory. I have implemented a small program using cudaMallocPitch() , but I am not able to access the entire matrix allocated in the pitch in the kernel.While printing out the values it looks like the threads are accessing the padding instead of the matrix. Could someone explain where I am going wrong?
The program:
global void mat_add(int a_d,size_t pitch_a,int width,int height)
{
int i = blockIdx.x10 + threadIdx.x;
int c;
int row=(int)((char*)a_d+i*pitch_a);
for(c=0;c<width;c++){
row[c]=row[c]+1;
printf(“i:%d val:%d\n”,i,row[c]);
}
}
int main(void)
{
int **a_h,*a_d;
int i,j;
size_t pitch_a;
int width=10;
int height=30;
a_h=(int **)calloc(height,sizeof(int *));
for(i=0;i<height;i++)
a_h[i]=(int *)calloc(width,sizeof(int));
// memory allocation for device
cudaMallocPitch((void**)&a_d,&pitch_a,width*sizeof(int),height);
printf(" pitch a %d \n",pitch_a);
// assigning values to host array
for(i=0;i<height;i++)
{
for(j=0;j<width;j++)
{
a_h[i][j] =10;
}
}
// copy data from host to device
cudaMemcpy2D(a_d,pitch_a,a_h,sizeof(int)*width,sizeof(int)*w
idth,height,cudaMemcpyHostToDevice);
// call device function
mat_add<<<3,10>>>(a_d,pitch_a,width,height);
free(a_h);
cudaFree(a_d);
}
Thanks,
Sudha