I have an algorithm that requires me to compute the minimum and maximum multiple times over an image. The image is stored in a one-channel cv::cuda::GpuMat. The image is divided into a grid which in a particular case provides 64 blocks (8x8 grid). My host implementation would perform this in about 25ms to produce mins and maxs for each block. The code below is the host implementation. The function findMinMax would be called once for each block and the results stored in another array.

```
// Result of scanning one image block: the darkest and brightest pixel values
// that survive the dark-pixel threshold (single-channel, 8-bit data).
struct min_max_t
{
    uchar min; // smallest non-dark pixel value; 255 until one is found
    uchar max; // largest pixel value; 0 until one is found
};
// Scans one single-channel 8-bit block and returns the min/max intensity over
// the pixels at or above THRESH_DARK_PIXELS; darker pixels are skipped.
//
// If no pixel clears the threshold, min stays 255 and max stays 0; the final
// clamp then collapses min onto max so callers never see an inverted range.
//
// NOTE(review): the block as posted had unbalanced braces, which left the
// final clamp and the return inside the row loop (only the first row would be
// scanned). The brace structure below restores the evident intent: scan every
// row, then clamp once, then return.
//
// block: the image block to scan (const reference — a cv::Mat header copy is
//        cheap, but there is no reason to copy at all).
min_max_t findMinMax(const cv::Mat &block)
{
    min_max_t min_max;
    min_max.min = 255; // identity element for min over uchar
    min_max.max = 0;   // identity element for max over uchar
    for (int i = 0; i < block.rows; ++i) {
        const uchar *row_ptr = block.ptr<uchar>(i);
        for (int j = 0; j < block.cols; ++j) {
            const uchar value = row_ptr[j]; // read each pixel once
            if (value < THRESH_DARK_PIXELS)
                continue; // dark pixels do not participate in min/max
            if (value < min_max.min)
                min_max.min = value;
            if (value > min_max.max)
                min_max.max = value;
        }
    }
    // All-dark (or empty) block: force a consistent, non-inverted result.
    if (min_max.min >= min_max.max)
        min_max.min = min_max.max;
    return min_max;
}
```

It’s straightforward. I was hoping to use the Thrust API, but a GpuMat doesn’t translate into Thrust ranges easily. I followed an existing post on wrapping a GpuMat with Thrust-compatible iterators, and made a typedef gpu_mat_itr_t that refers to those iterators. The code is as follows:

```
// Device-side predicate: true for pixels bright enough to take part in the
// min/max reduction (i.e. at or above THRESH_DARK_PIXELS).
// operator() is marked const so the functor can be invoked through a const
// reference or from const contexts, as stateless predicates should allow.
struct isNotDark
{
    __device__
    bool operator()(const uchar p) const
    {
        return (p >= THRESH_DARK_PIXELS);
    }
};
// GPU version: min/max over the non-dark pixels of one block, using Thrust.
//
// Fixes vs. the original:
//  * the thrust::pair type used the undefined name gpu_mat_itr<uchar> where
//    gpu_mat_itr_t<uchar> was meant — this did not compile;
//  * block_thresh was constructed with "block.size(). block.type()" (a '.'
//    where a ',' belongs) — this did not compile either;
//  * when every pixel is dark, copy_if produces an empty range and
//    minmax_element returns its end iterator; dereferencing that was
//    undefined behaviour. The empty range is now detected and the block
//    falls through to the same (0, 0) result as the host version.
//
// NOTE(review): each call still allocates a scratch GpuMat, and each '*' on a
// thrust result iterator is a separate device->host transfer; for 64 blocks
// per frame, reusing a preallocated scratch buffer (or fusing the threshold
// into a single-pass reduction) should recover most of the lost time.
min_max_t findMinMax(cv::cuda::GpuMat block)
{
    min_max_t min_max;
    min_max.min = 255;
    min_max.max = 0;

    gpu_mat_itr_t<uchar> block_itr_begin = GpuMatBeginItr<uchar>(block);
    gpu_mat_itr_t<uchar> block_itr_end = GpuMatEndItr<uchar>(block);

    // Compact the non-dark pixels into a scratch buffer on the device.
    cv::cuda::GpuMat block_thresh(block.size(), block.type());
    gpu_mat_itr_t<uchar> block_thresh_itr_begin = GpuMatBeginItr<uchar>(block_thresh);
    gpu_mat_itr_t<uchar> block_thresh_itr_end =
        thrust::copy_if(block_itr_begin, block_itr_end,
                        block_thresh_itr_begin, isNotDark());

    // Only reduce (and only dereference the result) if anything survived.
    if (block_thresh_itr_begin != block_thresh_itr_end) {
        thrust::pair< gpu_mat_itr_t<uchar>, gpu_mat_itr_t<uchar> > thrust_min_max_result =
            thrust::minmax_element(block_thresh_itr_begin, block_thresh_itr_end);
        min_max.min = *(thrust_min_max_result.first);  // device->host copy
        min_max.max = *(thrust_min_max_result.second); // device->host copy
    }

    // All-dark block: collapse to a consistent result, matching the host code.
    if (min_max.min >= min_max.max)
        min_max.min = min_max.max;
    return min_max;
}
```

This code takes 130-150ms to produce min_max for all 64 blocks. I figured it might be the copy of the data to another array, so I removed the copy_if step to see whether performance would improve. I know this wouldn’t produce the results I want, but the time only improved slightly, to around 100ms. Is this because I’m using that iterator? Is the performance bad because the data isn’t explicitly stored in a thrust::device_vector? Is there any way to do this so that I can at least get something similar to the performance of my host implementation, about 25ms?

I’m currently trying to write my own kernel implementation, but I’m struggling to wrap my head around reduction strategies. I figure the best approach would be to reduce each row, then reduce all the resulting row results together to get the min/max for a block. But this has been very difficult for me to understand and implement.

I’m on the Jetson Nano if that’s important.