How to access cv::Mat object from a CUDA kernel?

Hi,
I want to convert the cvImageToTensor() function [1] into a CUDA kernel. I have tried dozens of ways to access the cv::Mat image from inside the kernel, but none of them work.

Can anyone suggest how to convert this function into a CUDA kernel?

This is original code that runs on CPU:

// Convert an OpenCV image (interleaved HWC, 8-bit per channel) into a planar
// CHW float tensor: tensor[c][y][x] = image pixel (y, x), channel c.
// `dimensions.d` is expected to be {channels, height, width}.
void cvImageToTensor(const cv::Mat & image, float *tensor, nvinfer1::Dims dimensions)
{
  const size_t channels = dimensions.d[0];
  const size_t height = dimensions.d[1];
  const size_t width = dimensions.d[2];
  // TODO: validate dimensions match
  const size_t planeSize = height * width;        // elements in one channel plane
  const unsigned char *src = image.data;          // interleaved HWC byte buffer

  for (size_t row = 0; row < height; ++row)
  {
    for (size_t col = 0; col < width; ++col)
    {
      // Source pixel: row-major rows, channels interleaved within a pixel.
      const size_t pixelBase = (row * width + col) * channels;
      for (size_t c = 0; c < channels; ++c)
      {
        // Destination: channel plane c first, then row-major within the plane.
        tensor[c * planeSize + row * width + col] = (float) src[pixelBase + c];
      }
    }
  }
}

What I want to do on GPU is like that:
(Note : Please do not expect from this code to run. This is just pseudo-code.)

/**
 * GPU kernel: convert an interleaved HWC 8-bit image into a planar CHW
 * float tensor.
 *
 * A cv::Mat cannot be used inside a kernel: the object lives in host memory
 * and its `data` member is a host pointer, so dereferencing it on the device
 * is invalid. Pass the raw pixel buffer instead — copy `image.data` to device
 * (or managed) memory on the host and hand that pointer to the kernel.
 *
 * Expected launch: a 2D grid covering width x height, e.g.
 *   dim3 block(16, 16);
 *   dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
 */
__global__ void cvImageToTensor(const unsigned char *image, float *tensor, const int channels, const int height, const int width)
{
  // x indexes columns (width), y indexes rows (height).
  // Note: the builtin is `blockDim`, not `blockdim`.
  const int x = threadIdx.x + blockIdx.x * blockDim.x;
  const int y = threadIdx.y + blockIdx.y * blockDim.y;

  // A thread is out of range if EITHER coordinate overflows, so the guard
  // must use || (with && almost every out-of-range thread slips through).
  if (x >= width || y >= height) {
    return;
  }

  const size_t stridesCv[3] = { (size_t)width * channels, (size_t)channels, 1 };
  const size_t strides[3] = { (size_t)height * width, (size_t)width, 1 };

  // Each thread converts every channel of its pixel; `k = channels` in the
  // original read one element past the last channel and converted nothing else.
  for (int k = 0; k < channels; ++k)
  {
    // Row stride applies to y, pixel stride to x (matches the CPU version,
    // where i iterated over rows).
    const size_t offsetCv = y * stridesCv[0] + x * stridesCv[1] + k * stridesCv[2];
    const size_t offset = k * strides[0] + y * strides[1] + x * strides[2];
    tensor[offset] = (float) image[offsetCv];
  }
}

int main(){

	...

	// Read the image on the host (8-bit, interleaved BGR).
	cv::Mat image;
	image = cv::imread("./path/to/image.png",CV_LOAD_IMAGE_COLOR);

	...

	const int channels = image.channels();
	const int height = image.rows;
	const int width = image.cols;
	const size_t numPixelValues = (size_t)channels * height * width;

	// Output tensor (planar CHW floats) in managed memory so both host and
	// device can read it.
	float *tensor;
	cudaMallocManaged(&tensor, numPixelValues * sizeof(float));

	// The kernel cannot dereference cv::Mat (host object / host data pointer);
	// copy the raw pixel buffer to the device and pass that pointer instead.
	unsigned char *d_image;
	cudaMalloc(&d_image, numPixelValues * sizeof(unsigned char));
	cudaMemcpy(d_image, image.data, numPixelValues, cudaMemcpyHostToDevice);

	// Configure kernel: 2D launch matching the kernel's (x = width, y = height)
	// indexing. Launch syntax is <<<Grid, Block>>> — the grid (block count)
	// comes first and the block (threads per block) second; the original had
	// them swapped and used a 1D shape for a 2D kernel.
	dim3 Block(16, 16, 1);
	dim3 Grid((width + Block.x - 1) / Block.x,
	          (height + Block.y - 1) / Block.y, 1);

	// Run kernel
	cvImageToTensor<<<Grid,Block>>>(d_image,tensor,channels,height,width);
	cudaDeviceSynchronize();

	...
	...

}

Best regards,

  1. https://github.com/NVIDIA-AI-IOT/tf_to_trt_image_classification/blob/master/examples/classify_image/utils.h