# Advices on moving algoritm calculations to GPU

Hello, im currently working on detecting and repairing contaminaited pixels in image. Heres how it looks like on CPU:

Detection:

``````for(int i=delay;i<W-delay;++i)
for(int j=delay;j<H-delay;++j){
int counter=0;

for(int x=-delay;x<=delay;++x)
for(int y=-delay;y<=delay;++y){
double dist=0;
dist+=(R[i+j*W]-R[(i+x)+(j+y)*W])*(R[i+j*W]-R[(i+x)+(j+y)*W]);
dist+=(G[i+j*W]-G[(i+x)+(j+y)*W])*(G[i+j*W]-G[(i+x)+(j+y)*W]);
dist+=(B[i+j*W]-B[(i+x)+(j+y)*W])*(B[i+j*W]-B[(i+x)+(j+y)*W]);

if(dist<given_parameter)
counter++;
}

if(counter<3)
contaminated[i+j*W]=true;
else
contaminated[i+j*W]=false;

}
``````

Contaminated table tells us if specific pixel has been estimatied as contaminated ( true ) or not (false)

Repairation:

``````for(int i=delay;i<width-delay;++i)
{
for(int j=delay;j<height-delay;++j)
{
average_b=0;
average_g=0;
average_r=0;
index=0;

if(noise_map[i+j*width])
{
for(int x=i-delay;x<i+delay+1;++x)
for(int y=j-delay;y<j+delay+1;++y)
{
if(!noise_map[x+y*width])
{
average_b+=B[x+y*width];
average_r+=R[x+y*width];
average_g+=G[x+y*width];
++index;

}

}

if(index!=0)
{
R[i+j*width]=round(average_r/index);
G[i+j*width]=round(average_g/index);
B[i+j*width]=round(average_b/index);
}
``````

Heres my approach to move detection part to GPU:

``````extern __shared__ unsigned char tab[];
unsigned char * Rs = tab12;
unsigned char * Gs = &Rs[(BLOCK+2*delay)*window];
unsigned char * Bs = &Gs[(BLOCK+2*delay)*window];

//READING FROM GLOBAL TO SHARED MEMORY
for( int i=threadIdx.x ; i < BLOCK+2*delay ; i+=blockDim.x ){
for(int j=window-1 ; j > 0 ; j-- )
if(i+blockDim.x*blockIdx.x < W ){
Rs[i + j*( BLOCK+2*delay )]=a[i+blockDim.x*blockIdx.x+(j-1)*W];
Gs[i + j*( BLOCK+2*delay )]=a1[i+blockDim.x*blockIdx.x+(j-1)*W];
Bs[i + j*( BLOCK+2*delay )]=a2[i+blockDim.x*blockIdx.x+(j-1)*W];
}

}

unsigned short counter=0;
int dist=0;

for(int i=delay ; i < H - delay ; ++i ){

//SHIFTING SHARED MEMORY
for( int k=threadIdx.x ; k < BLOCK+2*delay ; k+=blockDim.x )
for(int j=0 ; j < window - 1 ; j++ )
if(k+blockDim.x*blockIdx.x< W ){
Rs[k + j*( BLOCK+2*delay )]=Rs[k + (j+1)*( BLOCK+2*delay )];
Gs[k + j*( BLOCK+2*delay )]=Gs[k + (j+1)*( BLOCK+2*delay )];
Bs[k + j*( BLOCK+2*delay )]=Bs[k + (j+1)*( BLOCK+2*delay )];
}

for( int k=threadIdx.x ; k < BLOCK+2*delay ; k+=blockDim.x )
if(k+blockDim.x*blockIdx.x< W ){
Rs[k + (window-1)*( BLOCK+2*delay )]=a[k+blockDim.x*blockIdx.x + (i+delay)*W];
Gs[k + (window-1)*( BLOCK+2*delay )]=a1[k+blockDim.x*blockIdx.x + (i+delay)*W];
Bs[k + (window-1)*( BLOCK+2*delay )]=a2[k+blockDim.x*blockIdx.x + (i+delay)*W];
}

//CALCULATIONS
counter=0;
dist=0;

for(int x=-delay; x<=delay ; ++x )
for(int y=0 ; y<window ; ++y){
dist =0;

if(dist<p_d)
counter++;

}

if(counter>=3)
else

}
}

}
``````

and similary for repair stage. Then I have to compare every pixel before contamination and after repair. As a determinant i use MSE. Simple CPU code:

``````for(int i=delay;i<width-delay;++i)
{
for(int j=delay;j<height-delay;++j)
{

mse+=(R[i+j*width]-r[i+j*width])*(R[i+j*width]-r[i+j*width]);

mse+=(B[i+j*width]-b[i+j*width])*(B[i+j*width]-b[i+j*width]);

mse+=(G[i+j*width]-g[i+j*width])*(G[i+j*width]-g[i+j*width]);
}
}

mse*=(1/(3.0*((width-2*delay)*(height-2*delay))));
``````

MSE on GPU:

``````__global__
void mse_function(float* mse,rgb_type* R,rgb_type* G,rgb_type* B,rgb_type* R1,rgb_type* G1,rgb_type* B1,unsigned int W,unsigned int H ){

double mse_loc=0;
for(int i=0; i < H; ++i){
mse_loc=0;