Hi all,
I tried to implement a simple remapping function (using bi-linear interpolation algo) with texture memory (2d pitched texture with 8 bits * 4 channels) and 2d pitched array (global memory - uchar3) separately. With 4 neighbor texels fetching for interpolation calculation, the implementation of texture memory resulted ~10% slower than global memory (219ms on 2d pitched array and 248ms on tex2D).
Texture memory implementation:
__device__ uchar4 bilinear(const cudaTextureObject_t src, float x, float y) {
const int x1 = __float2int_rd(x);
const int y1 = __float2int_rd(y);
const int x2 = x1 + 1;
const int y2 = y1 + 1;
uchar4 out = VecTraits<uchar4>::all(0);
uchar4 pxData = tex2D<uchar4>(src, x1, y1);
pxData = tex2D<uchar4>(src, x2, y1);
pxData = tex2D<uchar4>(src, x1, y2);
pxData = tex2D<uchar4>(src, x2, y2);
return out;
}
__global__ void remap(
const cudaTextureObject_t __restrict__ input,
uchar3* output,
const float* __restrict__ mapX,
const float* __restrict__ mapY,
int width, int height
) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < width && y < height) {
const float xcoord = mapX[y * width + x];
const float ycoord = mapY[y * width + x];
uchar4 px = bilinear<channelType, uchar4>(input, xcoord, ycoord);
output[y * width + x] = make_uchar3(px.x, px.y, px.z);
}
}
2d pitched array implementation:
__device__ uchar3 gPP(const uchar3* src, int x, int y, size_t width) {
return *((uchar3 *)((char *)src + y * width) + x);
}
__device__ uchar3 bilinear(const T* src, float x, float y, int width) {
const int x1 = __float2int_rd(x);
const int y1 = __float2int_rd(y);
const int x2 = x1 + 1;
const int y2 = y1 + 1;
uchar3 out = VecTraits<uchar3>::all(0);
uchar3 pxData = gPP(src, x1, y1, width);
pxData = gPP(src, x2, y1, width);
pxData = gPP(src, x1, y2, width);
pxData = gPP(src, x2, y2, width);
return out;
}
__global__ void remap(
const uchar3* __restrict__ input,
uchar3* output,
const float* __restrict__ mapX,
const float* __restrict__ mapY,
int width, int height, size_t pitch
) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < width && y < height) {
const float xcoord = mapX[y * width + x];
const float ycoord = mapY[y * width + x];
output[y * width + x] = bilinear(input, xcoord, ycoord, pitch);
}
}
Basically, do fetching 1 texel per thread is faster than getting 1 element from array, but continuously fetching multi-texels per thread seems slower. Please help me to point our my mistakes. Thanks a lot!