Hello, I'm currently working on detecting and repairing contaminated pixels in an image. Here's how it looks on the CPU:
Detection:
for(int i=delay;i<W-delay;++i)
for(int j=delay;j<H-delay;++j){
int counter=0;
for(int x=-delay;x<=delay;++x)
for(int y=-delay;y<=delay;++y){
double dist=0;
dist+=(R[i+j*W]-R[(i+x)+(j+y)*W])*(R[i+j*W]-R[(i+x)+(j+y)*W]);
dist+=(G[i+j*W]-G[(i+x)+(j+y)*W])*(G[i+j*W]-G[(i+x)+(j+y)*W]);
dist+=(B[i+j*W]-B[(i+x)+(j+y)*W])*(B[i+j*W]-B[(i+x)+(j+y)*W]);
if(dist<given_parameter)
counter++;
}
if(counter<3)
contaminated[i+j*W]=true;
else
contaminated[i+j*W]=false;
}
The contaminated table tells us whether a specific pixel has been estimated as contaminated (true) or not (false).
Repair:
// Repair stage (CPU). Every pixel flagged in noise_map is replaced by the
// rounded average of the NON-contaminated pixels inside its
// (2*delay+1)^2 window; pixels with no clean neighbour are left untouched.
// NOTE(review): the repair writes back into R/G/B in place, so pixels
// repaired earlier in scan order feed later averages — confirm intended.
// NOTE(review): if average_r/g/b are integer-typed, `average_r/index` is
// integer division and round() is a no-op — verify their declarations.
// FIX: the original paste was missing the three closing braces at the end
// (for the if(noise_map...) body, the j loop, and the i loop).
for (int i = delay; i < width - delay; ++i)
{
    for (int j = delay; j < height - delay; ++j)
    {
        average_b = 0;
        average_g = 0;
        average_r = 0;
        index = 0;                              // count of clean neighbours
        if (noise_map[i + j * width])
        {
            for (int x = i - delay; x < i + delay + 1; ++x)
                for (int y = j - delay; y < j + delay + 1; ++y)
                {
                    if (!noise_map[x + y * width])
                    {
                        average_b += B[x + y * width];
                        average_r += R[x + y * width];
                        average_g += G[x + y * width];
                        ++index;
                    }
                }
            if (index != 0)                     // at least one clean neighbour
            {
                R[i + j * width] = round(average_r / index);
                G[i + j * width] = round(average_g / index);
                B[i + j * width] = round(average_b / index);
            }
        }
    }
}
Here's my approach to moving the detection part to the GPU:
// Detection kernel body (GPU). Each block handles a strip of BLOCK columns
// (plus a 2*delay column halo) and slides a band of `window` image rows
// through dynamic shared memory, so advancing one row costs a single
// global read per column instead of `window` reads.
extern __shared__ unsigned char tab[];
unsigned char * Rs = tab;   // FIX: original said `tab12`, an undeclared name — must alias `tab`
unsigned char * Gs = &Rs[(BLOCK+2*delay)*window];
unsigned char * Bs = &Gs[(BLOCK+2*delay)*window];
//READING FROM GLOBAL TO SHARED MEMORY
// Preload shared rows 1..window-1 with image rows 0..window-2; the first
// shift below discards row 0, so it is deliberately left unset here.
for( int i=threadIdx.x ; i < BLOCK+2*delay ; i+=blockDim.x ){
for(int j=window-1 ; j > 0 ; j-- )
if(i+blockDim.x*blockIdx.x < W ){
Rs[i + j*( BLOCK+2*delay )]=a[i+blockDim.x*blockIdx.x+(j-1)*W];
Gs[i + j*( BLOCK+2*delay )]=a1[i+blockDim.x*blockIdx.x+(j-1)*W];
Bs[i + j*( BLOCK+2*delay )]=a2[i+blockDim.x*blockIdx.x+(j-1)*W];
}
}
__syncthreads();
unsigned short counter=0;
int dist=0;
// Rows 0..delay-1 and H-delay..H-1 of `help` are never written by this
// loop — the caller must initialise them.
for(int i=delay ; i < H - delay ; ++i ){
//SHIFTING SHARED MEMORY
// Move every shared row up by one, making room for the incoming row.
for( int k=threadIdx.x ; k < BLOCK+2*delay ; k+=blockDim.x )
for(int j=0 ; j < window - 1 ; j++ )
if(k+blockDim.x*blockIdx.x< W ){
Rs[k + j*( BLOCK+2*delay )]=Rs[k + (j+1)*( BLOCK+2*delay )];
Gs[k + j*( BLOCK+2*delay )]=Gs[k + (j+1)*( BLOCK+2*delay )];
Bs[k + j*( BLOCK+2*delay )]=Bs[k + (j+1)*( BLOCK+2*delay )];
}
__syncthreads();
//READING ONLY ONE NEW PIXEL
// Fetch image row i+delay into the last shared row.
for( int k=threadIdx.x ; k < BLOCK+2*delay ; k+=blockDim.x )
if(k+blockDim.x*blockIdx.x< W ){
Rs[k + (window-1)*( BLOCK+2*delay )]=a[k+blockDim.x*blockIdx.x + (i+delay)*W];
Gs[k + (window-1)*( BLOCK+2*delay )]=a1[k+blockDim.x*blockIdx.x + (i+delay)*W];
Bs[k + (window-1)*( BLOCK+2*delay )]=a2[k+blockDim.x*blockIdx.x + (i+delay)*W];
}
__syncthreads();
//CALCULATIONS
// NOTE(review): the left half of this condition is always true
// (delay <= delay + non-negative index) and could be dropped.
if(delay <= delay+threadIdx.x+blockIdx.x*blockDim.x && delay+threadIdx.x+blockIdx.x*blockDim.x < W - delay ){
counter=0;
dist=0;
for(int x=-delay; x<=delay ; ++x )
for(int y=0 ; y<window ; ++y){
dist =0;
dist+=(Rs[delay+threadIdx.x+(delay)*(BLOCK+2*delay)]-Rs[delay+threadIdx.x+x+(y)*(BLOCK+2*delay)])*(Rs[delay+threadIdx.x+(delay)*(BLOCK+2*delay)]-Rs[delay+threadIdx.x+x+(y)*(BLOCK+2*delay)]);
dist+=(Gs[delay+threadIdx.x+(delay)*(BLOCK+2*delay)]-Gs[delay+threadIdx.x+x+(y)*(BLOCK+2*delay)])*(Gs[delay+threadIdx.x+(delay)*(BLOCK+2*delay)]-Gs[delay+threadIdx.x+x+(y)*(BLOCK+2*delay)]);
dist+=(Bs[delay+threadIdx.x+(delay)*(BLOCK+2*delay)]-Bs[delay+threadIdx.x+x+(y)*(BLOCK+2*delay)])*(Bs[delay+threadIdx.x+(delay)*(BLOCK+2*delay)]-Bs[delay+threadIdx.x+x+(y)*(BLOCK+2*delay)]);
if(dist<p_d)
counter++;
}
// help convention: 0 = clean, 1 = contaminated (mirrors the CPU's
// contaminated[] = counter < 3).
if(counter>=3)
help[delay+threadIdx.x+blockIdx.x*blockDim.x+i*W]=0;
else
help[delay+threadIdx.x+blockIdx.x*blockDim.x+i*W]=1;
}
__syncthreads();
}
}
and similarly for the repair stage. Then I have to compare every pixel before contamination and after repair. As a metric I use MSE. Simple CPU code:
for(int i=delay;i<width-delay;++i)
{
for(int j=delay;j<height-delay;++j)
{
mse+=(R[i+j*width]-r[i+j*width])*(R[i+j*width]-r[i+j*width]);
mse+=(B[i+j*width]-b[i+j*width])*(B[i+j*width]-b[i+j*width]);
mse+=(G[i+j*width]-g[i+j*width])*(G[i+j*width]-g[i+j*width]);
}
}
mse*=(1/(3.0*((width-2*delay)*(height-2*delay))));
MSE on GPU:
// Sums the squared per-channel differences between two images and
// atomically accumulates the total into *mse. One thread per image
// column; each thread walks its column over all H rows and issues a
// single atomicAdd (one atomic per thread, not per pixel).
//
// Fixes vs. the original:
//  * `mse_loc` was reset to 0 at the top of every row iteration, so only
//    the LAST row's contribution reached the atomicAdd — it must
//    accumulate across the whole column.
//  * `atomicAdd(mse, mse_local)` referenced the undeclared name
//    `mse_local` (typo for `mse_loc`).
//  * atomicAdd on a float* takes a float operand; the double partial sum
//    is cast explicitly.
// NOTE(review): unlike the CPU loop, this sums ALL pixels including the
// `delay`-wide border the CPU version skips — confirm that is intended,
// and remember *mse must be zeroed before launch.
__global__
void mse_function(float* mse, rgb_type* R, rgb_type* G, rgb_type* B,
                  rgb_type* R1, rgb_type* G1, rgb_type* B1,
                  unsigned int W, unsigned int H)
{
    const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
    if (col < W) {
        double mse_loc = 0;   // per-thread partial sum over the column
        for (unsigned int i = 0; i < H; ++i) {
            const unsigned int at = col + i * W;
            const double dr = R[at] - R1[at];
            const double dg = G[at] - G1[at];
            const double db = B[at] - B1[at];
            mse_loc += dr * dr + dg * dg + db * db;
        }
        atomicAdd(mse, static_cast<float>(mse_loc));
    }
}
So, on the GPU every stage is separated from the others. My question is: can I do it in a more efficient way? I care a lot about performance — I have managed to cut execution time 20 times — but this is my first approach to GPU programming and I'm open to hints on how I could do it better. What bothers me is that I'm not using shared memory in the MSE calculations. I tried a reduction algorithm, but it turned out 5 times slower than my attempt. I also tried merging those 3 kernels into one, but it produced weird errors and malfunctioning memory addresses.