Creating texture object from cv::Mat

I am trying to create a texture object from a RGB openCV Mat to do some simple image processing. It works well with a single channel image, but with the RGB (or rather BGR) image my CUDA error check gives me “invalid channel descriptor” when trying to create the cudaTextureObject_t with cudaCreateTextureObject. Here is a snippet of the working code for the single channel image:

using namespace cv;
	
int width = 1920;
int height = 1080;

Mat matMono(height, width, CV_8UC1);

size_t bytesPerRowMono = width * 1 * sizeof(unsigned char);
size_t bytesPerRow4ch = width * 4 * sizeof(unsigned char);
uint8_t* Mono_gpu, * Mono_filled_gpu;
size_t pitchMono, pitchMono_filled;

grabMonoImage(matMono); 

CUDA_CHECK(cudaMallocPitch(&Mono_gpu, &pitchMono, bytesPerRowMono, height));
CUDA_CHECK(cudaMallocPitch(&Mono_filled_gpu, &pitchMono_filled, bytesPerRow4ch, height));

CUDA_CHECK(cudaMemcpy2D(Mono_gpu, pitchMono, matMono.ptr(), matMono.step, bytesPerRowMono, height, cudaMemcpyHostToDevice));

fillMono(Mono_filled_gpu, pitchMono_filled, Mono_gpu, pitchMono, width, height, stream);

This is the function that creates the texture object (in line 10) and then does some image processing.

void fillMono(unsigned char* filledImage, size_t pitchFilledImage, 
unsigned char* MonoImage, size_t pitchMonoImage, int width, int height, cudaStream_t stream)
{
	cudaResourceDesc resDesc = {};
	resDesc.resType = cudaResourceTypePitch2D;
	resDesc.res.pitch2D.width = width;
	resDesc.res.pitch2D.height = height;
	resDesc.res.pitch2D.devPtr = MonoImage;
	resDesc.res.pitch2D.pitchInBytes = pitchMonoImage;
	resDesc.res.pitch2D.desc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);

	cudaTextureDesc texDesc = {};
	texDesc.readMode = cudaReadModeElementType;
	texDesc.addressMode[0] = cudaAddressModeBorder;
	texDesc.addressMode[1] = cudaAddressModeBorder;

	cudaTextureObject_t MonoImageTex;
	CUDA_CHECK(cudaCreateTextureObject(&MonoImageTex, &resDesc, &texDesc, 0));

	dim3 block(32, 8);
	dim3 grid = paddedGrid(block.x, block.y, width, height);

	fillRGBKernel << <grid, block, 0, stream >> > (reinterpret_cast<uchar4*>(filledImage),
		pitchFilledImage / sizeof(uchar4), MonoImageTex, width, height);

	CUDA_CHECK(cudaDestroyTextureObject(MonoImageTex));
}

Here is the code for the RGB image:

using namespace cv;

int width = 1920;
int height = 1080;

Mat matMono(height, width, CV_8UC3);

size_t bytesPerRowRGB = width * 3 * sizeof(unsigned char);
size_t bytesPerRow4ch = width * 4 * sizeof(unsigned char);
uint8_t* RGB_gpu, * RGB_filled_gpu;
size_t pitchRGB, pitchRGB_filled;

grabRGBImage(matRGB); 

CUDA_CHECK(cudaMallocPitch(&RGB_gpu, &pitchRGB, bytesPerRowRGB, height));
CUDA_CHECK(cudaMallocPitch(&RGB_filled_gpu, &pitchRGB_filled, bytesPerRow4ch, height));

CUDA_CHECK(cudaMemcpy2D(RGB_gpu, pitchRGB, matRGB.ptr(), matRGB.step, bytesPerRowRGB, height, cudaMemcpyHostToDevice));

fillRGB(RGB_filled_gpu, pitchRGB_filled, RGB_gpu, pitchRGB, width, height, stream);

This is where my program fails (line 10):

void fillRGB(unsigned char* filledImage, size_t pitchFilledImage, unsigned char* RGBImage, size_t pitchRGBImage,
	int width, int height, cudaStream_t stream)
{
	cudaResourceDesc resDesc = {};
	resDesc.resType = cudaResourceTypePitch2D;
	resDesc.res.pitch2D.width = width;
	resDesc.res.pitch2D.height = height;
	resDesc.res.pitch2D.devPtr = RGBImage;
	resDesc.res.pitch2D.pitchInBytes = pitchRGBImage;
	resDesc.res.pitch2D.desc = cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindUnsigned);

	cudaTextureDesc texDesc = {};
	texDesc.readMode = cudaReadModeElementType;
	texDesc.addressMode[0] = cudaAddressModeBorder;
	texDesc.addressMode[1] = cudaAddressModeBorder;

	cudaTextureObject_t RGBImageTex = 0;
	CUDA_CHECK(cudaCreateTextureObject(&RGBImageTex, &resDesc, &texDesc, 0));

	dim3 block(32, 8);
	dim3 grid = paddedGrid(block.x, block.y, width, height);

	fillRGBKernel << <grid, block, 0, stream >> > (reinterpret_cast<uchar4*>(filledImage),
		pitchFilledImage / sizeof(uchar4), RGBImageTex, width, height);

	CUDA_CHECK(cudaDestroyTextureObject(RGBImageTex));
}

The only differences are the pitches (matMono.step is 1920; pitchMono is 2048; matRGB.step is 5760; pitchRGB is 6144) and the input arguments for cudaCreateChannelDesc. I think the differences in the pitches come from openCV storing the images different than CUDA?
I checked the images that are grabbed from the cameras, they both look normal after image acquisiton and still have the right dimensions and types.

Help is very much appreciated!

You cannot bind a 3-channel interleaved RGB image to a texture object. 4-channel RGBA image are working.
See also https://devtalk.nvidia.com/default/topic/997795/bind-a-uchar3-texture/ and https://devtalk.nvidia.com/default/topic/481747/is-texture-60-float3-3-62-possible-cudaerrorinvalidchanneldescriptor/?offset=5

By the way, I suppose with the larger L1 cache of recent GPU architectures (Volta, Turing) there might be not a big advantage of texture objects. The filtering (bilinear) and border handling can be also done ‘by hand’.