Extending the color range without using opencv::cuda

Hello. I need to expand the color range. The approach I have chosen is to convert from BGR to HSV and back. I used the kernel below, but its speed is not comparable to that of the opencv::cuda solution. What can be improved here?

/**
 * Scales the HSV saturation of one interleaved 8-bit BGR pixel in place.
 *
 * The pixel is converted BGR -> HSV, its saturation is multiplied by a fixed
 * gain (clamped to 1), and the result is converted back HSV -> BGR.
 *
 * @param pixel Pointer to the blue byte of the pixel; pixel[0..2] are read
 *              and overwritten as B, G, R.
 */
static __device__ __forceinline__ void BoostPixelSaturation(unsigned char* pixel) {
    const float kSaturationGain = 2.0f;

    float r = pixel[2] / 255.0f;
    float g = pixel[1] / 255.0f;
    float b = pixel[0] / 255.0f;

    // ---- BGR -> HSV ----
    const float max_c = fmaxf(r, fmaxf(g, b));
    const float min_c = fminf(r, fminf(g, b));
    const float diff = max_c - min_c;
    const float v = max_c;
    float h = 0.0f;  // stays 0 for black and grey pixels
    float s = 0.0f;  // stays 0 for black pixels
    if (v != 0.0f) {
        // Clamp after applying the gain: with s > 1 the p/q/t terms below
        // become negative and the reconstructed color is distorted.
        s = fminf(diff / v * kSaturationGain, 1.0f);
        if (diff >= 0.001f) {  // a real color, not grey
            if (max_c == r) {
                h = 60.0f * (g - b) / diff;
                if (h < 0.0f) { h += 360.0f; }
            } else if (max_c == g) {
                h = 60.0f * (2.0f + (b - r) / diff);
            } else {
                h = 60.0f * (4.0f + (r - g) / diff);
            }
        }
    }

    // ---- HSV -> BGR ----
    float f = h / 60.0f;
    const float hi = floorf(f);  // hue sector index, 0..6
    f -= hi;                     // fractional position inside the sector
    const float p = v * (1.0f - s);
    const float q = v * (1.0f - s * f);
    const float t = v * (1.0f - s * (1.0f - f));
    if (hi == 0.0f || hi == 6.0f) {  // 6 can appear when h rounds up to 360
        r = v; g = t; b = p;
    } else if (hi == 1.0f) {
        r = q; g = v; b = p;
    } else if (hi == 2.0f) {
        r = p; g = v; b = t;
    } else if (hi == 3.0f) {
        r = p; g = q; b = v;
    } else if (hi == 4.0f) {
        r = t; g = p; b = v;
    } else {
        r = v; g = p; b = q;
    }

    pixel[2] = (unsigned char) __float2uint_rn(255.0f * r);
    pixel[1] = (unsigned char) __float2uint_rn(255.0f * g);
    pixel[0] = (unsigned char) __float2uint_rn(255.0f * b);
}

/**
 * Boosts the saturation of an interleaved 8-bit BGR image in place.
 *
 * Launch layout: any grid/block shape is accepted. Threads are flattened to a
 * linear id and each thread walks the pixels with a stride equal to the total
 * number of launched threads, so the result does not depend on the launch
 * configuration. Use a block size that is a multiple of 32 (e.g. 256); one
 * thread per block leaves most lanes of every warp idle.
 * No shared memory is used.
 *
 * @param Image      Device pointer to the pixel data. Each pixel occupies
 *                   Channels bytes and the first three are B, G, R.
 * @param Channels   Bytes per pixel. Must be >= 3; otherwise the kernel does
 *                   nothing, because three bytes per pixel are accessed.
 * @param PixelCount Number of pixels to process. When negative (the default),
 *                   the number of launched threads is used, i.e. one pixel
 *                   per thread, which matches a launch with one thread per
 *                   pixel such as <<<dim3(cols, rows), 1>>>.
 */
__global__ void Inversion_CUDA(unsigned char* Image, int Channels, int PixelCount = -1) {
    if (Channels < 3) {
        return;
    }

    const size_t threads_per_block =
        (size_t)blockDim.x * blockDim.y * blockDim.z;
    const size_t thread_in_block =
        threadIdx.x + (size_t)blockDim.x * (threadIdx.y + (size_t)blockDim.y * threadIdx.z);
    const size_t block_id =
        blockIdx.x + (size_t)gridDim.x * (blockIdx.y + (size_t)gridDim.y * blockIdx.z);
    const size_t total_threads =
        threads_per_block * gridDim.x * gridDim.y * gridDim.z;

    const size_t pixel_total = (PixelCount < 0) ? total_threads : (size_t)PixelCount;

    for (size_t i = block_id * threads_per_block + thread_in_block;
         i < pixel_total;
         i += total_threads) {
        BoostPixelSaturation(Image + i * (size_t)Channels);
    }
}

// Host side: upload the image, process it in place on the device, download it.
// NOTE(review): the byte count is taken from src but the upload reads img.data;
// this assumes img and src have the same size/type and continuous storage --
// TODO confirm.
const size_t pixel_count = static_cast<size_t>(src.rows) * static_cast<size_t>(src.cols);
const size_t image_bytes = pixel_count * static_cast<size_t>(src.channels());
const unsigned int threads_per_block = 256;
const unsigned int block_count =
    static_cast<unsigned int>((pixel_count + threads_per_block - 1) / threads_per_block);

cudaError_t cuda_status = cudaSuccess;
if (pixel_count > 0) {  // a zero-block launch is an invalid configuration
    cuda_status = cudaMemcpy(Dev_Input_Image, img.data, image_bytes, cudaMemcpyHostToDevice);
    if (cuda_status == cudaSuccess) {
        Inversion_CUDA<<<block_count, threads_per_block>>>(
            Dev_Input_Image, src.channels(), static_cast<int>(pixel_count));
        cuda_status = cudaGetLastError();  // launch errors are not returned by the launch itself
    }
    if (cuda_status == cudaSuccess) {
        // The blocking copy also surfaces errors raised while the kernel ran.
        cuda_status = cudaMemcpy(src.data, Dev_Input_Image, image_bytes, cudaMemcpyDeviceToHost);
    }
}
// cuda_status now holds the first failure, if any; cudaGetErrorString(cuda_status) describes it.

It is not possible to install opencv::cuda because of the large number of devices that are already in use.

Hi,
Your solution looks optimal. You may run sudo nvpmodel -m 0 and sudo jetson_clocks. With these commands, the CPU cores and the GPU run at their maximum clocks.

Thanks for the advice. My solution is four times slower than that:

    dst.upload(img,stream);   
    cv::cuda::cvtColor(dst, dst, cv::COLOR_BGR2HSV,0,stream);
    cv::cuda::split(dst, rgbchannel_cuda,stream);
    rgbchannel_cuda[1].convertTo(rgbchannel_cuda[1], rgbchannel_cuda[1].type(), 2,stream);
    cv::cuda::merge(rgbchannel_cuda,3, dst,stream);
    cv::cuda::cvtColor(dst, dst, cv::COLOR_HSV2BGR,0,stream);
    dst.download(img,stream);

We have the nano running in 5W mode.