Linear interpolation with textures using unsigned data

I am trying to apply an undistortion map (in the X and Y directions) to an image using textures. The input image is unsigned char, and I also want the output to be unsigned char.
I have a MATLAB program that does the interpolation in float and produces a good result. I can recreate the interpolation in C++/CUDA in unsigned char, but my output image has some areas that look more pixelated than the MATLAB output. This is the code I use in C++:

__global__ void undistortionKernel(uchar4* outputImage, size_t pitchOutputImage,
	cudaTextureObject_t inputImageTex, 
	const float* XundistMap, size_t XpitchUndistmap,
	const float* YundistMap, size_t YpitchUndistmap,
	int width, int height)
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x >= width || y >= height) return;

	// look up the undistorted source position for this output pixel
	const float tx = XundistMap[y * XpitchUndistmap + x];
	const float ty = YundistMap[y * YpitchUndistmap + x];

	uchar4 outputImageTemp = tex2D<uchar4>(inputImageTex, tx, ty);
	outputImage[y * pitchOutputImage + x] = outputImageTemp;
}

void undistortImage(unsigned char* outputImage, size_t pitchOutputImage, unsigned char* inputImage, size_t pitchInputImage, const float* undistMapX, size_t pitchUndistmapX, const float* undistMapY, size_t pitchUndistmapY, int width, int height, cudaStream_t stream)
{
	cudaResourceDesc resDesc = {};
	resDesc.resType = cudaResourceTypePitch2D;
	resDesc.res.pitch2D.width = width;
	resDesc.res.pitch2D.height = height;
	resDesc.res.pitch2D.devPtr = inputImage;
	resDesc.res.pitch2D.pitchInBytes = pitchInputImage;
	resDesc.res.pitch2D.desc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);

	cudaTextureDesc texDesc = {};
	texDesc.readMode = cudaReadModeElementType;
	texDesc.addressMode[0] = cudaAddressModeClamp;
	texDesc.addressMode[1] = cudaAddressModeClamp;

	cudaTextureObject_t inputImageTex;
	CUDA_CHECK(cudaCreateTextureObject(&inputImageTex, &resDesc, &texDesc, 0));

	dim3 block(32, 8);
	dim3 grid = paddedGrid(block.x, block.y, width, height);

	undistortionKernel<<<grid, block, 0, stream>>>
		(reinterpret_cast<uchar4*>(outputImage),
			pitchOutputImage / sizeof(uchar4),
			inputImageTex, 
			undistMapX,	pitchUndistmapX / sizeof(float),
			undistMapY,	pitchUndistmapY / sizeof(float),
			width, height);

	CUDA_CHECK(cudaDestroyTextureObject(inputImageTex));
}

Now I tried to adapt my code to do the interpolation in float (the changes in the kernel should be obvious; in the function calling the kernel I changed the channel format to cudaChannelFormatKindFloat), but my output image is black. My error check also reports "Cuda Error: invalid channel descriptor" from cudaCreateTextureObject. This is the code:

__device__ unsigned char float2uchar(float floatin)
{
	// round to nearest, then narrow to unsigned char
	unsigned int int_temp = __float2uint_rn(floatin);
	return (unsigned char)int_temp;
}

__global__ void undistortionKernel(uchar4* outputImage, size_t pitchOutputImage,
	cudaTextureObject_t inputImageTex, 
	const float* XundistMap, size_t XpitchUndistmap,
	const float* YundistMap, size_t YpitchUndistmap,
	int width, int height)
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x >= width || y >= height) return;

	const float tx = XundistMap[y * XpitchUndistmap + x];
	const float ty = YundistMap[y * YpitchUndistmap + x];

	float4 outputImageTempFloat4 = tex2D<float4>(inputImageTex, tx, ty);
	uchar4 outputImageTemp;

	outputImageTemp.x = float2uchar(outputImageTempFloat4.x);
	outputImageTemp.y = float2uchar(outputImageTempFloat4.y);
	outputImageTemp.z = float2uchar(outputImageTempFloat4.z);
	outputImageTemp.w = float2uchar(outputImageTempFloat4.w);

	outputImage[y * pitchOutputImage + x] = outputImageTemp;
}

void undistortImage(unsigned char* outputImage, size_t pitchOutputImage, unsigned char* inputImage, size_t pitchInputImage, const float* undistMapX, size_t pitchUndistmapX, const float* undistMapY, size_t pitchUndistmapY, int width, int height, cudaStream_t stream)
{
	cudaResourceDesc resDesc = {};
	resDesc.resType = cudaResourceTypePitch2D;
	resDesc.res.pitch2D.width = width;
	resDesc.res.pitch2D.height = height;
	resDesc.res.pitch2D.devPtr = inputImage;
	resDesc.res.pitch2D.pitchInBytes = pitchInputImage;
	resDesc.res.pitch2D.desc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindFloat);

	cudaTextureDesc texDesc = {};
	texDesc.readMode = cudaReadModeElementType;
	texDesc.addressMode[0] = cudaAddressModeClamp;
	texDesc.addressMode[1] = cudaAddressModeClamp;

	texDesc.filterMode = cudaFilterModeLinear;
	cudaTextureObject_t inputImageTex;
	CUDA_CHECK(cudaCreateTextureObject(&inputImageTex, &resDesc, &texDesc, 0));

	dim3 block(32, 8);
	dim3 grid = paddedGrid(block.x, block.y, width, height);

	undistortionKernel<<<grid, block, 0, stream>>>
		(reinterpret_cast<uchar4*>(outputImage),
			pitchOutputImage / sizeof(uchar4),
			inputImageTex, 
			undistMapX,	pitchUndistmapX / sizeof(float),
			undistMapY,	pitchUndistmapY / sizeof(float),
			width, height);

	CUDA_CHECK(cudaDestroyTextureObject(inputImageTex));
}

I am not sure how to load an unsigned char array into a float texture. Any hints and advice are very much appreciated :)

The 8-bit float channel descriptor is what cudaCreateTextureObject rejects: a cudaChannelFormatKindFloat format has to use 32 bits per channel. Instead, keep the unsigned channel format and use a uchar or uchar4 texture with cudaReadModeNormalizedFloat in combination with cudaFilterModeLinear and cudaAddressModeClamp.

The tex2D operation then returns a float4 that is normalized between 0 and 1 for uchar4, and between (approximately) -1 and 1 for char4. Any scaling to another numeric range requires a post-multiplication.

If you enable normalized texture coordinates (texDesc.normalizedCoords = 1), your texel x,y coordinates have to be within the 0-1 range; with unnormalized coordinates you keep addressing in pixel units.
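Here is a minimal sketch of that setup, reusing the variable names from your host function (it keeps unnormalized pixel coordinates so your undistortion maps can stay in pixel units; treat it as a sketch, not drop-in code):

cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = inputImage;
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
resDesc.res.pitch2D.pitchInBytes = pitchInputImage;
// keep the 8-bit unsigned channel format; the conversion to float
// happens through the read mode, not through the channel descriptor
resDesc.res.pitch2D.desc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);

cudaTextureDesc texDesc = {};
texDesc.readMode = cudaReadModeNormalizedFloat; // uchar4 texels are fetched as float4 in [0,1]
texDesc.filterMode = cudaFilterModeLinear;      // hardware bilinear interpolation
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.normalizedCoords = 0;                   // keep addressing in pixel units

cudaTextureObject_t inputImageTex;
CUDA_CHECK(cudaCreateTextureObject(&inputImageTex, &resDesc, &texDesc, 0));

// in the kernel, offset to texel centers and scale the result back to 0..255:
// float4 v = tex2D<float4>(inputImageTex, tx + 0.5f, ty + 0.5f);
// uchar4 out = make_uchar4((unsigned char)(v.x * 255.0f + 0.5f),
//                          (unsigned char)(v.y * 255.0f + 0.5f),
//                          (unsigned char)(v.z * 255.0f + 0.5f),
//                          (unsigned char)(v.w * 255.0f + 0.5f));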

Be aware that the built-in hardware texture interpolator is fairly low precision: the CUDA C Programming Guide says that the interpolation coefficients are stored in 9-bit fixed-point format with 8 bits of fractional value. Could this be the reason you are seeing some blockiness?
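To make that concrete: each fractional weight alpha is snapped to the nearest multiple of 1/256 before blending, i.e. alpha_q = round(256 * alpha) / 256. Sub-pixel positions closer together than roughly 1/512 of a texel therefore produce bit-identical results, which with 8-bit data can show up as the slightly pixelated areas you describe.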

Here is a code example for an interpolated uchar (not uchar4) texture access; extending it to uchar4 should not be hard:

https://stackoverflow.com/questions/17075617/setting-up-a-cuda-2d-unsigned-char-texture-for-linear-interpolation

For a manual bilinear interpolation with higher precision, you could use a solution like the one given in this thread, where the bilinear interpolation happens in code rather than in hardware. Note that this interpolation has to be done for all vector components of the uchar4!

https://devtalk.nvidia.com/default/topic/452590/texture-interpolation-double-precision-possible-/

Christian

Thanks a lot for the answer, Christian.

I am trying to work with the second example you provided, as there is no need for normalized texture coordinates there. Here is my implementation:

template<class T, class R>  // T: texture type, R: return type
__device__
R tex2DBilinear(const texture<T, 2, cudaReadModeNormalizedFloat> tex, float x, float y)
{
	x -= 0.5f;
	y -= 0.5f;

	float px = floorf(x);   // integer position
	float py = floorf(y);

	float fx = x - px;	  // fractional position
	float fy = y - py;

	px += 0.5f;
	py += 0.5f;

	return lerp(lerp(tex2D(tex, px, py), tex2D(tex, px + 1.0f, py), fx),
		lerp(tex2D(tex, px, py + 1.0f), tex2D(tex, px + 1.0f, py + 1.0f), fx), fy);
}

__global__ void undistortionKernel(uchar4* outputImage, size_t pitchOutputImage,
	uchar4* inputImage, size_t pitchInputImage,
	const float* XundistMap, size_t XpitchUndistmap,
	const float* YundistMap, size_t YpitchUndistmap,
	int width, int height)
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockDim.y * blockIdx.y + threadIdx.y;

	if (x >= width || y >= height) return;

	const float tx = XundistMap[y * XpitchUndistmap + x];
	const float ty = YundistMap[y * YpitchUndistmap + x];

	unsigned char tempUcharX = tex2DBilinear<unsigned char, unsigned char>(inputImage[y * pitchInputImage + x].x, tx, ty);
	unsigned char tempUcharY = tex2DBilinear<unsigned char, unsigned char>(inputImage[y * pitchInputImage + x].y, tx, ty);
	unsigned char tempUcharZ = tex2DBilinear<unsigned char, unsigned char>(inputImage[y * pitchInputImage + x].z, tx, ty);
	unsigned char tempUcharW = tex2DBilinear<unsigned char, unsigned char>(inputImage[y * pitchInputImage + x].w, tx, ty);

	outputImage[y * pitchOutputImage + x].x = tempUcharX;
	outputImage[y * pitchOutputImage + x].y = tempUcharY;
	outputImage[y * pitchOutputImage + x].z = tempUcharZ;
	outputImage[y * pitchOutputImage + x].w = tempUcharW;
}

You wrote that this interpolation is required for all vector components of the uchar4; am I doing it the right way here? The problem is that my code does not compile: Visual Studio gives me "identifier "lerp" is undefined", even though I include cmath. Do you have an idea where this error stems from?
Thanks in advance for any further help.

Check whether your CUDA SDK (and the accompanying code samples) comes with helper_math.h.

I found a version in a random GitHub repository that has overloads of the lerp() function that work with float4 vectors:

https://github.com/pathscale/nvidia_sdk_samples/blob/master/bandwidthTest/common/inc/helper_math.h
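For reference, the relevant overloads boil down to something like this (a sketch from memory, not a verbatim copy of the header):

// scalar linear interpolation
inline __host__ __device__ float lerp(float a, float b, float t)
{
	return a + t * (b - a);
}

// component-wise overload for float4 vectors
inline __host__ __device__ float4 lerp(float4 a, float4 b, float t)
{
	return make_float4(lerp(a.x, b.x, t), lerp(a.y, b.y, t),
		lerp(a.z, b.z, t), lerp(a.w, b.w, t));
}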

Thanks again Christian, I was able to solve that problem.

Do you know whether it is possible to use the manual bilinear interpolation (https://devtalk.nvidia.com/default/topic/452590/texture-interpolation-double-precision-possible-/) with texture objects instead of texture references? I am not sure how to pass a texture object as an input argument to the tex2DBilinear function. It confuses me that in the example code a texture reference appears in the input argument list of the device function, as that does not even compile for me.

I edited the interpolation slightly to match my task:

__device__
float4 tex2DBilinear(cudaTextureObject_t tex, float x, float y)
{
	// shift so that the integer part addresses the upper-left texel
	x -= 0.5f;
	y -= 0.5f;

	float px = floorf(x);   // integer position
	float py = floorf(y);

	float fx = x - px;      // fractional position
	float fy = y - py;

	// move back to texel centers
	px += 0.5f;
	py += 0.5f;

	float4 val_0 = tex2D<float4>(tex, px, py);
	float4 val_1 = tex2D<float4>(tex, px + 1.0f, py);
	float4 val_2 = tex2D<float4>(tex, px, py + 1.0f);
	float4 val_3 = tex2D<float4>(tex, px + 1.0f, py + 1.0f);

	return lerp(lerp(val_0, val_1, fx),
		lerp(val_2, val_3, fx), fy);
}

__device__ uchar4 convertTexFloatToUChar(const float4 _src)
{
	// map normalized [0,1] floats back to 0..255; the 255.9999f factor
	// keeps an input of exactly 1.0f from truncating to 256
	uchar4 _dst;
	_dst.x = (unsigned char)(_src.x * 255.9999f);
	_dst.y = (unsigned char)(_src.y * 255.9999f);
	_dst.z = (unsigned char)(_src.z * 255.9999f);
	_dst.w = (unsigned char)(_src.w * 255.9999f);

	return _dst;
}

__global__ void undistortionKernel(uchar4* outputImage, size_t pitchOutputImage,
	cudaTextureObject_t inputImageTex, 
	const float* XundistMap, size_t XpitchUndistmap,
	const float* YundistMap, size_t YpitchUndistmap,
	int width, int height)
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockDim.y * blockIdx.y + threadIdx.y;

	if (x >= width || y >= height) return;

	//const float tx =      (XundistMap[y * XpitchUndistmap + x]);
	//const float ty =      (YundistMap[y * YpitchUndistmap + x]);

	const float tx = (0.5f + XundistMap[y * XpitchUndistmap + x]);
	const float ty = (0.5f + YundistMap[y * YpitchUndistmap + x]);

	//uchar4 outputImageTemp = tex2D<uchar4>(inputImageTex, tx, ty);
	float4 outputImageTempFloat4 = tex2DBilinear(inputImageTex, tx, ty);
	//uchar4 outputImageTemp = float42char4(outputImageTempFloat4);
	uchar4 outputImageTemp = convertTexFloatToUChar(outputImageTempFloat4);
	
	outputImage[y * pitchOutputImage + x] = outputImageTemp;
}

As you’ve found out, texture references cannot be passed around as function arguments. They are globals with respect to the .cu module in which they are defined, so you reference them directly by name instead of passing them in. That also makes them less flexible to use.

If you want to use texture objects, that’s fine. They just require a different setup in host code.
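A minimal contrast of the two approaches (sketch only, the fetch helpers are hypothetical names):

// texture reference: a file-scope global in the .cu module; it cannot be a parameter
texture<uchar4, 2, cudaReadModeNormalizedFloat> inputTexRef;

__device__ float4 fetchViaReference(float x, float y)
{
	return tex2D(inputTexRef, x, y);  // referenced directly by name
}

// texture object: an ordinary value that can be passed to kernels and device functions
__device__ float4 fetchViaObject(cudaTextureObject_t tex, float x, float y)
{
	return tex2D<float4>(tex, x, y);
}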

Christian