Hi,
I want to convert cvImageToTensor() [1] function to kernel. I have tried out dozens of methods to access cv::Mat image from kernel. But I can’t.
Can anyone give me any suggestion to convert this function to a cuda kernel?
This is original code that runs on CPU:
void cvImageToTensor(const cv::Mat & image, float *tensor, nvinfer1::Dims dimensions)
{
const size_t channels = dimensions.d[0];
const size_t height = dimensions.d[1];
const size_t width = dimensions.d[2];
// TODO: validate dimensions match
const size_t stridesCv[3] = { width * channels, channels, 1 };
const size_t strides[3] = { height * width, width, 1 };
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
for (int k = 0; k < channels; k++)
{
const size_t offsetCv = i * stridesCv[0] + j * stridesCv[1] + k * stridesCv[2];
const size_t offset = k * strides[0] + i * strides[1] + j * strides[2];
tensor[offset] = (float) image.data[offsetCv];
}
}
}
}
What I want to do on GPU is like that:
(Note : Please do not expect from this code to run. This is just pseudo-code.)
/**
* This is GPU code
*/
__global__ void cvImageToTensor(const cv::Mat & image, float *tensor, const int channels,const int height,const int width)
{
const size_t stridesCv[3] = { width * channels, channels, 1 };
const size_t strides[3] = { height * width, width, 1 };
int i = threadIdx.x + blockIdx.x * blockdim.x;
int j = threadIdx.y + blockIdx.y * blockdim.y;
int k = channels;
if( i >= width && j >= height){
return;
}
const size_t offsetCv = i * stridesCv[0] + j * stridesCv[1] + k * stridesCv[2];
const size_t offset = k * strides[0] + i * strides[1] + j * strides[2];
tensor[offset] = (float) image.data[offsetCv];
}
int main(){
...
// Some code for reading image
cv::Mat image;
image = cv::imread("./path/to/image.png",CV_LOAD_IMAGE_COLOR);
...
float *tensor;
cudaMallocManaged(&tensor,...................); // Something like that. Just pseudo-code
<b>// Configure kernel</b>
int threadLimitsPerBlock = 1024;
int numberOfThreads = height*width*channels; // height, width and channel of image
int requiredNumberOfBlocks = (numberOfThreads/threadLimitsPerBlock)+1;
dim3 Block(requiredNumberOfBlocks,1,1);
dim3 Grid(threadLimitsPerBlock,1,1);
[b]// Run kernel
cvImageToTensor<<<Grid,Block>>>(image,tensor,channels,height,width);[/b]
cudaDeviceSynchronize();
...
...
}
Best regards,