Hi,
I tried to accelerate an image processing function using Pitch, but I have really bad performance.
For instance, with basic cudaMemcpy and cudaMalloc the kernel processed in: 1462 usec (good perf)
Now with memcpy2D and cudaMallocPitch, the kernel processed in: 56299 usec (really bad perf)
Something must be wrong with my code. Do you have any idea ?
Here is the host part:
//image size
int nY = fpIn.height();
int nX = fpIn.width();
//pitch
size_t pitch1;
size_t pitch2;
//image ptrs on host
float* imIn=static_cast<float*>(fpIn.data()); //pointer-> input image
unsigned short* imRef=static_cast<unsigned short*>(ucOff.data()); //pointer-> ref image
//device ptrs
float* d_imIn;
float* d_imRef;
//GPU alloc
cudaMallocPitch((void**)&d_imIn,&pitch1,nX*sizeof(float),nY);
cudaMallocPitch((void**)&d_imRef,&pitch2,nX*sizeof(unsigned short),nY);
//copyHTD
cudaMemcpy2D(d_imIn,pitch1,imIn,nX*sizeof(float),nX*sizeof(float),nY,cudaMemcpyHostToDevice);
cudaMemcpy2D(d_imRef,pitch2,imRef,nX*sizeof(unsigned short),nX*sizeof(unsigned short),nY,cudaMemcpyHostToDevice);
//launch kernel
dim3 Db(32,32), Dg(nX/32,nY/32); //image size:1024x1024
K1_with_pitch<<<Dg,Db>>>(d_imIn, d_imRef, nY, nX, pitch1>>2,pitch2>>2); //pitch in pix not in bytes
//copyDTH
cudaMemcpy2D(imIn,nX*sizeof(float),d_imIn,pitch1,nX*sizeof(float),nY,cudaMemcpyDeviceToHost);
and the device part:
__global__ void K1_with_pitch(float* imIn, unsigned short* imRef, int nY, int nX, size_t pitch1, size_t pitch2)
{
int i=blockDim.x*blockIdx.x+threadIdx.x;
int j=blockDim.y*blockIdx.y+threadIdx.y;
//if((i>=nY) || (j>=nX)) return;
if((i<nY) && (j<nX)) imIn[j+i*pitch1] = imIn[j+i*pitch1] - static_cast<float>(imRef[j+i*pitch2]); // I changed "[j+i*Nx]" to "[j+i*pitch]"
}
Thanks.