Texture memory fetching is slower than global memory (CC 6.1). What am I doing wrong?

Hi all,

I tried to implement a simple remapping function (a bilinear interpolation algorithm) in two ways: with texture memory (a 2D pitched texture, 8 bits x 4 channels) and with a 2D pitched array in global memory (uchar3). Each output pixel fetches its 4 neighboring texels for the interpolation. The texture memory implementation came out roughly 13% slower than the global memory one (219 ms for the 2D pitched array vs. 248 ms with tex2D).
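
For context, the setup looks roughly like the sketch below (simplified; the sizes, variable names, and clamp address mode are placeholders, and error checking is omitted):

// Simplified setup sketch: placeholder sizes/names, no error checking.
int width = 1920, height = 1080;                             // placeholder image size

// Pitched uchar4 source for the texture path.
uchar4* d_src4 = nullptr;
size_t srcPitch = 0;
cudaMallocPitch(&d_src4, &srcPitch, width * sizeof(uchar4), height);

// Describe the pitched allocation as a 2D texture.
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d_src4;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar4>();  // 8 bits x 4 channels
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
resDesc.res.pitch2D.pitchInBytes = srcPitch;

cudaTextureDesc texDesc = {};
texDesc.addressMode[0] = cudaAddressModeClamp;               // placeholder address mode
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModePoint;                    // point sampling; interpolation done manually
texDesc.readMode = cudaReadModeElementType;                  // fetch raw uchar4 values
texDesc.normalizedCoords = 0;                                // integer texel coordinates

cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);

// Pitched uchar3 source for the global memory path.
uchar3* d_src3 = nullptr;
size_t srcPitch3 = 0;
cudaMallocPitch(&d_src3, &srcPitch3, width * sizeof(uchar3), height);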

Texture memory implementation:

__device__ uchar4 bilinear(const cudaTextureObject_t src, float x, float y) {
	const int x1 = __float2int_rd(x);
	const int y1 = __float2int_rd(y);
	const int x2 = x1 + 1;
	const int y2 = y1 + 1;

	// Fractional position inside the 2x2 texel neighborhood.
	const float fx = x - x1;
	const float fy = y - y1;

	// Fetch the 4 neighboring texels.
	const uchar4 p11 = tex2D<uchar4>(src, x1, y1);
	const uchar4 p21 = tex2D<uchar4>(src, x2, y1);
	const uchar4 p12 = tex2D<uchar4>(src, x1, y2);
	const uchar4 p22 = tex2D<uchar4>(src, x2, y2);

	// Bilinear weights.
	const float w11 = (1.f - fx) * (1.f - fy);
	const float w21 = fx * (1.f - fy);
	const float w12 = (1.f - fx) * fy;
	const float w22 = fx * fy;

	uchar4 out;
	out.x = (unsigned char)(w11 * p11.x + w21 * p21.x + w12 * p12.x + w22 * p22.x);
	out.y = (unsigned char)(w11 * p11.y + w21 * p21.y + w12 * p12.y + w22 * p22.y);
	out.z = (unsigned char)(w11 * p11.z + w21 * p21.z + w12 * p12.z + w22 * p22.z);
	out.w = (unsigned char)(w11 * p11.w + w21 * p21.w + w12 * p12.w + w22 * p22.w);
	return out;
}

__global__ void remap(
	const cudaTextureObject_t input,
	uchar3* output,
	const float* __restrict__ mapX,
	const float* __restrict__ mapY,
	int width, int height
) {
	unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
	unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < width && y < height) {
		const float xcoord = mapX[y * width + x];
		const float ycoord = mapY[y * width + x];

		uchar4 px = bilinear(input, xcoord, ycoord);
		output[y * width + x] = make_uchar3(px.x, px.y, px.z);
	}
}

2D pitched array implementation:

__device__ uchar3 gPP(const uchar3* src, int x, int y, size_t pitch) {
	// Pitched access: pitch is the row stride in bytes, not the width in elements.
	return *((const uchar3 *)((const char *)src + y * pitch) + x);
}

__device__ uchar3 bilinear(const uchar3* src, float x, float y, size_t pitch) {
	const int x1 = __float2int_rd(x);
	const int y1 = __float2int_rd(y);
	const int x2 = x1 + 1;
	const int y2 = y1 + 1;

	// Fractional position inside the 2x2 pixel neighborhood.
	const float fx = x - x1;
	const float fy = y - y1;

	// Load the 4 neighboring pixels from the pitched array.
	const uchar3 p11 = gPP(src, x1, y1, pitch);
	const uchar3 p21 = gPP(src, x2, y1, pitch);
	const uchar3 p12 = gPP(src, x1, y2, pitch);
	const uchar3 p22 = gPP(src, x2, y2, pitch);

	// Bilinear weights.
	const float w11 = (1.f - fx) * (1.f - fy);
	const float w21 = fx * (1.f - fy);
	const float w12 = (1.f - fx) * fy;
	const float w22 = fx * fy;

	uchar3 out;
	out.x = (unsigned char)(w11 * p11.x + w21 * p21.x + w12 * p12.x + w22 * p22.x);
	out.y = (unsigned char)(w11 * p11.y + w21 * p21.y + w12 * p12.y + w22 * p22.y);
	out.z = (unsigned char)(w11 * p11.z + w21 * p21.z + w12 * p12.z + w22 * p22.z);
	return out;
}

__global__ void remap(
	const uchar3* __restrict__ input,
	uchar3* output,
	const float* __restrict__ mapX,
	const float* __restrict__ mapY,
	int width, int height, size_t pitch
) {
	unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
	unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < width && y < height) {
		const float xcoord = mapX[y * width + x];
		const float ycoord = mapY[y * width + x];

		output[y * width + x] = bilinear(input, xcoord, ycoord, pitch);
	}
}
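
The way I time the two versions looks roughly like this sketch (the 32x8 block size, the iteration count, and the d_out/d_mapX/d_mapY buffer names are placeholders; both paths are timed with the same pattern):

// Timing sketch: placeholder block size and iteration count, no error checking.
// d_out, d_mapX, d_mapY: device buffers allocated elsewhere (placeholder names).
dim3 block(32, 8);
dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
for (int i = 0; i < 100; ++i) {
	// Texture path shown; the global memory path is timed the same way with its own remap kernel.
	remap<<<grid, block>>>(tex, d_out, d_mapX, d_mapY, width, height);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.f;
cudaEventElapsedTime(&ms, start, stop);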

Basically, fetching 1 texel per thread is faster than reading 1 element from the global array, but fetching multiple texels per thread seems to be slower (a sketch of the kind of single-fetch kernel I mean is below). Please help me point out my mistakes. Thanks a lot!
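
Sketch of the single-fetch-per-thread kernel mentioned above (simplified; the kernel name is a placeholder, not my actual code):

__global__ void copyThrough(cudaTextureObject_t input, uchar3* output, int width, int height) {
	unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
	unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < width && y < height) {
		uchar4 px = tex2D<uchar4>(input, x, y);   // single texel fetch per thread
		output[y * width + x] = make_uchar3(px.x, px.y, px.z);
	}
}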