Hello. There is a need to expand the color range. The only option I have chosen is to convert to HSV from BGR and back. I used this kernel, but speed of this solution is not commensurable with speed of opencv::cuda solution, what can be fixed here?
__global__ void Inversion_CUDA(unsigned char* Image, int Channels){
int x = blockIdx.x;
int y = blockIdx.y;
int idx = (x + y * gridDim.x) * Channels;
//We need to get a bgr point
//convert_one_pixel_to_hsv
float r, g, b;
float h, s, v;
r = Image[idx + 2] / 255.0f;
g = Image[idx + 1] / 255.0f;
b = Image[idx] / 255.0f;
float max = fmax(r, fmax(g, b));
float min = fmin(r, fmin(g, b));
float diff = max - min;
v = max;
if(v == 0.0f) { // black
h = s = 0.0f;
} else {
s = diff / v * 2.0f; //add cof
if(diff < 0.001f) { // grey
h = 0.0f;
} else { // color
if(max == r) {
h = 60.0f * (g - b)/diff;
if(h < 0.0f) { h += 360.0f; }
} else if(max == g) {
h = 60.0f * (2 + (b - r)/diff);
} else {
h = 60.0f * (4 + (r - g)/diff);
}
}
}
//convert_one_pixel_to_bgr
float f = h/60.0f;
float hi = floorf(f);
f = f - hi;
float p = v * (1 - s);
float q = v * (1 - s * f);
float t = v * (1 - s * (1 - f));
if(hi == 0.0f || hi == 6.0f) {
r = v;
g = t;
b = p;
} else if(hi == 1.0f) {
r = q;
g = v;
b = p;
} else if(hi == 2.0f) {
r = p;
g = v;
b = t;
} else if(hi == 3.0f) {
r = p;
g = q;
b = v;
} else if(hi == 4.0f) {
r = t;
g = p;
b = v;
} else {
r = v;
g = p;
b = q;
}
Image[idx + 2] = (unsigned char) __float2uint_rn(255.0f * r);
Image[idx + 1] = (unsigned char) __float2uint_rn(255.0f * g);
Image[idx] = (unsigned char) __float2uint_rn(255.0f * b);
}
cudaMemcpy(Dev_Input_Image, img.data, src.rows * src.cols * src.channels(), cudaMemcpyHostToDevice);
Inversion_CUDA << <Grid_Image, 1 >> >(Dev_Input_Image, src.channels());
cudaMemcpy(src.data, Dev_Input_Image, src.rows * src.cols * src.channels(), cudaMemcpyDeviceToHost);
It is not possible to install opencv::cuda because of the large number of devices that are already in use.