Advice on moving algorithm calculations to the GPU

Hello, I'm currently working on detecting and repairing contaminated pixels in an image. Here's how it looks on the CPU:

Detection:

for(int i=delay;i<W-delay;++i)
  for(int j=delay;j<H-delay;++j){
    int counter=0;

    // Count window neighbours whose colour distance to the centre pixel is below the threshold
    // (the centre pixel itself is included and contributes dist == 0).
    for(int x=-delay;x<=delay;++x)
      for(int y=-delay;y<=delay;++y){
        double dist=0;
        dist+=(R[i+j*W]-R[(i+x)+(j+y)*W])*(R[i+j*W]-R[(i+x)+(j+y)*W]);
        dist+=(G[i+j*W]-G[(i+x)+(j+y)*W])*(G[i+j*W]-G[(i+x)+(j+y)*W]);
        dist+=(B[i+j*W]-B[(i+x)+(j+y)*W])*(B[i+j*W]-B[(i+x)+(j+y)*W]);

        if(dist<given_parameter)
          counter++;
      }

    // Fewer than 3 similar neighbours -> the pixel is estimated as contaminated.
    if(counter<3)
      contaminated[i+j*W]=true;
    else
      contaminated[i+j*W]=false;
  }

The contaminated table tells us whether a specific pixel has been estimated as contaminated (true) or not (false).
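All buffers use the same row-major indexing, so pixel (i, j) lives at index i + j*W; roughly like this (the exact types here are just an illustration):

// Illustration of the buffer layout only; the exact types/containers in my code may differ.
#include <vector>

const int W = 640, H = 480;                         // example image size
std::vector<unsigned char> R(W*H), G(W*H), B(W*H);  // colour planes, pixel (i, j) at index i + j*W
std::vector<bool> contaminated(W*H, false);         // detection result, same indexing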

Repair:

for(int i=delay;i<width-delay;++i)
{
    for(int j=delay;j<height-delay;++j)
    {
        average_b=0;
        average_g=0;
        average_r=0;
        index=0;

        // noise_map holds the detection result (true = contaminated)
        if(noise_map[i+j*width])
        {
            // Average only the uncontaminated neighbours inside the window
            for(int x=i-delay;x<i+delay+1;++x)
                for(int y=j-delay;y<j+delay+1;++y)
                {
                    if(!noise_map[x+y*width])
                    {
                        average_b+=B[x+y*width];
                        average_r+=R[x+y*width];
                        average_g+=G[x+y*width];
                        ++index;
                    }
                }

            if(index!=0)
            {
                R[i+j*width]=round(average_r/index);
                G[i+j*width]=round(average_g/index);
                B[i+j*width]=round(average_b/index);
            }
        }
    }
}

Here's my approach to moving the detection part to the GPU:

// (kernel body; a/a1/a2 are the R/G/B input planes, help is the output contamination map,
//  window = 2*delay+1, BLOCK is the block width, p_d is the distance threshold)
extern __shared__ unsigned char tab[];
const int pitch = BLOCK + 2*delay;                 // width of one shared-memory row (tile + halo)
unsigned char * Rs = tab;
unsigned char * Gs = &Rs[pitch*window];
unsigned char * Bs = &Gs[pitch*window];

const int c = delay + threadIdx.x;                 // column of this thread inside the tile
const int g = c + blockIdx.x*blockDim.x;           // column of this thread in the image

// READING FROM GLOBAL TO SHARED MEMORY
// (tile rows 1..window-1 get image rows 0..window-2; row 0 is filled by the first shift below)
for(int i=threadIdx.x ; i < pitch ; i+=blockDim.x){
    for(int j=window-1 ; j > 0 ; j--)
        if(i+blockDim.x*blockIdx.x < W){
            Rs[i + j*pitch]=a[i+blockDim.x*blockIdx.x+(j-1)*W];
            Gs[i + j*pitch]=a1[i+blockDim.x*blockIdx.x+(j-1)*W];
            Bs[i + j*pitch]=a2[i+blockDim.x*blockIdx.x+(j-1)*W];
        }
}
__syncthreads();

unsigned short counter=0;
int dist=0;

for(int i=delay ; i < H - delay ; ++i){

    // SHIFTING SHARED MEMORY (move the window down by one image row)
    for(int k=threadIdx.x ; k < pitch ; k+=blockDim.x)
        for(int j=0 ; j < window - 1 ; j++)
            if(k+blockDim.x*blockIdx.x < W){
                Rs[k + j*pitch]=Rs[k + (j+1)*pitch];
                Gs[k + j*pitch]=Gs[k + (j+1)*pitch];
                Bs[k + j*pitch]=Bs[k + (j+1)*pitch];
            }

    __syncthreads();

    // READING ONLY ONE NEW PIXEL ROW
    for(int k=threadIdx.x ; k < pitch ; k+=blockDim.x)
        if(k+blockDim.x*blockIdx.x < W){
            Rs[k + (window-1)*pitch]=a[k+blockDim.x*blockIdx.x + (i+delay)*W];
            Gs[k + (window-1)*pitch]=a1[k+blockDim.x*blockIdx.x + (i+delay)*W];
            Bs[k + (window-1)*pitch]=a2[k+blockDim.x*blockIdx.x + (i+delay)*W];
        }

    __syncthreads();

    // CALCULATIONS
    if(delay <= g && g < W - delay){
        counter=0;
        dist=0;

        for(int x=-delay; x<=delay ; ++x)
            for(int y=0 ; y<window ; ++y){
                dist=0;
                dist+=(Rs[c + delay*pitch]-Rs[c+x + y*pitch])*(Rs[c + delay*pitch]-Rs[c+x + y*pitch]);
                dist+=(Gs[c + delay*pitch]-Gs[c+x + y*pitch])*(Gs[c + delay*pitch]-Gs[c+x + y*pitch]);
                dist+=(Bs[c + delay*pitch]-Bs[c+x + y*pitch])*(Bs[c + delay*pitch]-Bs[c+x + y*pitch]);

                if(dist<p_d)
                    counter++;
            }

        // help: 1 = contaminated, 0 = clean (same meaning as the CPU "contaminated" flag)
        if(counter>=3)
            help[g+i*W]=0;
        else
            help[g+i*W]=1;
    }
    __syncthreads();
}

}
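For completeness, such a kernel needs the dynamic shared-memory size passed at launch, which for the layout above is three tiles of (BLOCK+2*delay)*window bytes; a minimal sketch (the kernel name and the argument order are placeholders):

// Sketch of the launch configuration for the detection kernel (detect_kernel and the
// argument order are placeholders; BLOCK is the block width, window == 2*delay + 1).
const int BLOCK = 256;
dim3 block(BLOCK);
dim3 grid((W + BLOCK - 1) / BLOCK);                                    // covers every image column
size_t shmem = 3 * (BLOCK + 2*delay) * window * sizeof(unsigned char); // Rs + Gs + Bs tiles

detect_kernel<<<grid, block, shmem>>>(a, a1, a2, help, W, H, delay, window, p_d);
cudaDeviceSynchronize();                                               // or check cudaGetLastError()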

and similarly for the repair stage. Then I have to compare every pixel before contamination and after repair. As a measure I use MSE. Simple CPU code:

for(int i=delay;i<width-delay;++i)
{
    for(int j=delay;j<height-delay;++j)
    {
        mse+=(R[i+j*width]-r[i+j*width])*(R[i+j*width]-r[i+j*width]);
        mse+=(B[i+j*width]-b[i+j*width])*(B[i+j*width]-b[i+j*width]);
        mse+=(G[i+j*width]-g[i+j*width])*(G[i+j*width]-g[i+j*width]);
    }
}

mse*=(1/(3.0*((width-2*delay)*(height-2*delay))));

MSE on GPU:

__global__
void mse_function(float* mse,rgb_type* R,rgb_type* G,rgb_type* B,rgb_type* R1,rgb_type* G1,rgb_type* B1,unsigned int W,unsigned int H ){

    const int col = threadIdx.x + blockIdx.x*blockDim.x;    // one thread per image column

    if(col < W){

        // Each thread accumulates its whole column locally and does a single atomicAdd at the end.
        // Note: unlike the CPU version, this sums over the full image, not just the interior.
        double mse_loc=0;
        for(int i=0; i < H; ++i){
            mse_loc+=(R[col+i*W]-R1[col+i*W])*(R[col+i*W]-R1[col+i*W]);
            mse_loc+=(G[col+i*W]-G1[col+i*W])*(G[col+i*W]-G1[col+i*W]);
            mse_loc+=(B[col+i*W]-B1[col+i*W])*(B[col+i*W]-B1[col+i*W]);
        }

        atomicAdd(mse,(float)mse_loc);
    }
}
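On the host I then only have to zero the accumulator, launch one thread per column, and copy the single float back; roughly like this (a sketch with simplified buffer names; the normalisation uses the full image size here because the kernel sums every pixel):

// Host-side sketch for the MSE kernel (d_R, d_G, ... are simplified placeholder names).
float h_mse = 0.0f, *d_mse;
cudaMalloc(&d_mse, sizeof(float));
cudaMemset(d_mse, 0, sizeof(float));              // the accumulator must start at zero

const int BLOCK = 256;                            // threads per block
mse_function<<<(W + BLOCK - 1)/BLOCK, BLOCK>>>(d_mse, d_R, d_G, d_B, d_R1, d_G1, d_B1, W, H);

cudaMemcpy(&h_mse, d_mse, sizeof(float), cudaMemcpyDeviceToHost);
h_mse /= 3.0f * W * H;                            // normalise over all pixels and 3 channels
cudaFree(d_mse);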

So, on the GPU every stage is separate from the others. My question is: can I do this more efficiently? I care a lot about performance; I have managed to cut the execution time by a factor of 20, but this is my first approach to GPU programming and I'm open to hints on how I could do it better. What bothers me is that I'm not using shared memory in the MSE calculation. I tried a reduction algorithm, but it turned out 5 times slower than my attempt above. I also tried merging those three kernels into one, but it produced weird errors and invalid memory accesses.
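For reference, by "reduction algorithm" I mean something along these lines (a minimal sketch, not my exact code; mse_reduce is a placeholder name, rgb_type is the same typedef as above, and blockDim.x is assumed to be a power of two):

// Sketch of a block-level reduction for the MSE sum (mse_reduce is a placeholder name).
// One thread per pixel; squared differences are reduced in shared memory so that each
// block issues a single atomicAdd instead of one per thread.
__global__
void mse_reduce(float* mse, const rgb_type* R, const rgb_type* G, const rgb_type* B,
                const rgb_type* R1, const rgb_type* G1, const rgb_type* B1,
                unsigned int W, unsigned int H)
{
    extern __shared__ float partial[];                    // blockDim.x floats

    const unsigned int idx = threadIdx.x + blockIdx.x*blockDim.x;
    float v = 0.0f;
    if(idx < W*H){
        const float dr = (float)R[idx] - (float)R1[idx];
        const float dg = (float)G[idx] - (float)G1[idx];
        const float db = (float)B[idx] - (float)B1[idx];
        v = dr*dr + dg*dg + db*db;
    }
    partial[threadIdx.x] = v;
    __syncthreads();

    // Tree reduction within the block (blockDim.x must be a power of two).
    for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
        if(threadIdx.x < s)
            partial[threadIdx.x] += partial[threadIdx.x + s];
        __syncthreads();
    }

    if(threadIdx.x == 0)
        atomicAdd(mse, partial[0]);
}

It would be launched with ceil(W*H/blockDim.x) blocks and blockDim.x * sizeof(float) bytes of dynamic shared memory.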