I am wondering why the kernel pad_kernel_fast runs 16x faster than pad_kernel_slow. The only difference between them is that one stores a constant (dst[idx] = 3.23424324) and the other stores the loaded value (dst[idx] = srcvalue).
// Stores the constant 3.23424324f into a padded destination image.
//
// Each thread handles one source pixel (ix, iy), computed from a 2D
// grid/block layout, and writes to dst at row (iy + 2), column (ix + 2),
// using a destination row length of (width + 5) floats.
//
// Parameters:
//   dst    - destination buffer, indexed with a row length of (width + 5).
//   src    - source buffer, indexed with a row length of width.
//   width  - row length of src, in elements.
//   height - used as the row-count bound for the guard below
//            (presumably the number of rows in src; confirm against caller).
__global__ void pad_kernel_fast(float *dst, float *src, int width, int height)
{
    const int ix = blockDim.x * blockIdx.x + threadIdx.x;
    const int iy = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads outside the image do nothing, so a grid that is rounded up
    // past the image size does not read or write out of range.
    if (ix >= width || iy >= height) {
        return;
    }

    // This value is loaded but never used. An optimizing compiler is free
    // to drop the load entirely, in which case this kernel performs only the
    // store below and no global-memory read at all.
    float srcvalue = src[iy * width + ix];
    (void)srcvalue;

    // 'f' suffix keeps the literal single precision instead of double.
    dst[(iy + 2) * (width + 5) + ix + 2] = 3.23424324f;
}
// Copies a source image into the interior of a padded destination image.
//
// Each thread handles one source pixel (ix, iy), computed from a 2D
// grid/block layout, reads src[iy * width + ix], and writes it to dst at
// row (iy + 2), column (ix + 2), using a destination row length of
// (width + 5) floats.
//
// Parameters:
//   dst    - destination buffer, indexed with a row length of (width + 5).
//   src    - source buffer, indexed with a row length of width.
//   width  - row length of src, in elements.
//   height - used as the row-count bound for the guard below
//            (presumably the number of rows in src; confirm against caller).
__global__ void pad_kernel_slow(float *dst, float *src, int width, int height)
{
    const int ix = blockDim.x * blockIdx.x + threadIdx.x;
    const int iy = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads outside the image do nothing, so a grid that is rounded up
    // past the image size does not read or write out of range.
    if (ix >= width || iy >= height) {
        return;
    }

    // Unlike a constant store, the loaded value is actually consumed here,
    // so this kernel performs both a global read and a global write.
    const float srcvalue = src[iy * width + ix];
    dst[(iy + 2) * (width + 5) + ix + 2] = srcvalue;
}
Regards,
zlf