Finding local mean in an image using mex-cuda

I have an image named HSIImage, of size is 565x585, in which I have find the local mean and standard deviation at every pixel. For this I am using a window W of size 9x9, if we a re finding the mean of x(i,j) we need values in the W where x(i,j) is at its center.

For working on the corner and edge pixels, I am padding the HSIImage and naming it as HSIImage2.

This is my matlab code:

[m,n,~]  =  size(HSIImage);
HSIImage2=padarray(HSIImage,[4,4],'symmetric');

mean1    = zeros(m,n);
sd       = zeros(m,n);
phi_x=zeros(m,n);

for i=5:m+4
    for j=5:n+4
        mean1(i-4,j-4) = mean( mean(HSIImage2(i-4:i+4, j-4:j+4, 3) )); %sum / (4*4);
        sd(i-4,j-4) = std( std(HSIImage2(i-4:i+4, j-4:j+4, 3), 1)); 
    end
end
[phi_x2,mean2,sd2] = getPhi(HSIImage(:,:,3)',HSIImage2(:,:,3)',m,n);

My cuda code for finding mean and sd is

__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;

if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
    for(i=Y-4;i<=Y+4;i++){
        for(j=X-4;j<=X+4;j++){
            sum= sum + d_HSIImage2[i*col+j];
        }
    }

    d_mean[(Y-4)*col+X-4] = sum/81;
    double mean = sum/81;
    sum = 0;

    for(i=Y-4;i<=Y+4;i++){
        for(j=X-4;j<=X+4;j++){
            int index = i*col+j;
            sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
        }
    }
    d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
double*     HSIImage;
double*     d_HSIImage;
double*     HSIImage2;
double*     d_HSIImage2;
double      row;
double      col;
double*     phi_x;
double*     d_phi_x;
double*     mean2;
double*     d_mean;
double*     d_std;
double*     sd2;

HSIImage    = (double*)mxGetPr(prhs[0]);
HSIImage2   = (double*)mxGetPr(prhs[1]);
row         = mxGetScalar(prhs[2]);
col         = mxGetScalar(prhs[3]);

plhs[0]     = mxCreateDoubleMatrix(row,col,mxREAL);
phi_x       = mxGetPr(plhs[0]);
plhs[1]     = mxCreateDoubleMatrix(row,col,mxREAL);
mean2       = mxGetPr(plhs[1]);
plhs[2]     = mxCreateDoubleMatrix(row,col,mxREAL);
sd2         = mxGetPr(plhs[2]);

dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1);    //TILEWIDTH is 32. row=565, col=584.
dim3 block(TILE_WIDTH,TILE_WIDTH,1);

if ( cudaMalloc(&d_HSIImage,sizeof(double)*row*col)!= cudaSuccess )  
    mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_HSIImage2,sizeof(double)*(row+8)*(col+8))!= cudaSuccess )  
    mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_phi_x,sizeof(double)*row*col)!= cudaSuccess )  
    mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_mean,sizeof(double)*row*col)!= cudaSuccess )  
    mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_std,sizeof(double)*row*col)!= cudaSuccess )  
    mexErrMsgTxt("Memory allocating failure on the GPU");

cudaMemcpy(d_HSIImage,HSIImage,sizeof(double)*row*col,cudaMemcpyHostToDevice);
cudaMemcpy(d_HSIImage2,HSIImage2,sizeof(double)*(row+8)*(col+8),cudaMemcpyHostToDevice);

phi <<< grid,block >>> (d_HSIImage,d_HSIImage2,row,col,d_phi_x,d_mean,d_std);

cudaMemcpy(phi_x,d_phi_x,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(mean2,d_mean,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(sd2,d_std,sizeof(double)*row*col,cudaMemcpyDeviceToHost);

cudaFree(d_HSIImage);
cudaFree(d_HSIImage2);
cudaFree(d_phi_x);
cudaFree(d_mean);
cudaFree(d_std);

}

its working fine when image is full of ones. but when I give regular image, there is lot of difference in serial(MATLAB) and parallel(CUDA) outputs(When mean1 and mean2 are compared). Please tell me the error.

start by adding a cudaDeviceSynchronize() after the kernel launch in your host function

It is not helping

luckily, i said: “start by”

why is row; col declared double?

double row;
double col;

and i do not think this is right:

if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}

i think it should rather be something like:

a = X + Y;
b = row * col;
c = col * 3;

if ((a > (c + 4) && (a < (b - c - 4))
{
for (i = (-4); i <= 4; i++)
{
z = i*col;

for (j = (-4); j <= 4; j++)
{
sum += d_HSIImage2[(a + z) + j];
}
}

i did this in a hurry; just check

because row,col are double in MATLAB. I am typecasting them during the function call. I am checking the other part of your answer.

What is this c = col * 3 ; I didn’t understand this. can you tell about this stmt.