I have an image named HSIImage, of size is 565x585, in which I have find the local mean and standard deviation at every pixel. For this I am using a window W of size 9x9, if we a re finding the mean of x(i,j) we need values in the W where x(i,j) is at its center.
For working on the corner and edge pixels, I am padding the HSIImage and naming it as HSIImage2.
This is my matlab code:
[m,n,~] = size(HSIImage);
HSIImage2=padarray(HSIImage,[4,4],'symmetric');
mean1 = zeros(m,n);
sd = zeros(m,n);
phi_x=zeros(m,n);
for i=5:m+4
for j=5:n+4
mean1(i-4,j-4) = mean( mean(HSIImage2(i-4:i+4, j-4:j+4, 3) )); %sum / (4*4);
sd(i-4,j-4) = std( std(HSIImage2(i-4:i+4, j-4:j+4, 3), 1));
end
end
[phi_x2,mean2,sd2] = getPhi(HSIImage(:,:,3)',HSIImage2(:,:,3)',m,n);
My cuda code for finding mean and sd is
__global__ void phi(double *d_HSIImage,double *d_HSIImage2, int row, int col, double *d_phi_x, double *d_mean, double *d_std)
{
int X = blockDim.x * blockIdx.x + threadIdx.x;
int Y = blockDim.y * blockIdx.y + threadIdx.y;
int i,j;
double sum = 0;
if(Y>3 && X>3 && Y<row+4 && X<col+4)
{
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
sum= sum + d_HSIImage2[i*col+j];
}
}
d_mean[(Y-4)*col+X-4] = sum/81;
double mean = sum/81;
sum = 0;
for(i=Y-4;i<=Y+4;i++){
for(j=X-4;j<=X+4;j++){
int index = i*col+j;
sum= sum + (d_HSIImage2[index]-mean) * (d_HSIImage2[index]-mean);
}
}
d_std[(Y-4)*col+X-4] = sqrt(sum/81);
}
void mexFunction( int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
double* HSIImage;
double* d_HSIImage;
double* HSIImage2;
double* d_HSIImage2;
double row;
double col;
double* phi_x;
double* d_phi_x;
double* mean2;
double* d_mean;
double* d_std;
double* sd2;
HSIImage = (double*)mxGetPr(prhs[0]);
HSIImage2 = (double*)mxGetPr(prhs[1]);
row = mxGetScalar(prhs[2]);
col = mxGetScalar(prhs[3]);
plhs[0] = mxCreateDoubleMatrix(row,col,mxREAL);
phi_x = mxGetPr(plhs[0]);
plhs[1] = mxCreateDoubleMatrix(row,col,mxREAL);
mean2 = mxGetPr(plhs[1]);
plhs[2] = mxCreateDoubleMatrix(row,col,mxREAL);
sd2 = mxGetPr(plhs[2]);
dim3 grid(((col+8)/TILE_WIDTH)+1,((row+8)/TILE_WIDTH)+1,1); //TILEWIDTH is 32. row=565, col=584.
dim3 block(TILE_WIDTH,TILE_WIDTH,1);
if ( cudaMalloc(&d_HSIImage,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_HSIImage2,sizeof(double)*(row+8)*(col+8))!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_phi_x,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_mean,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
if ( cudaMalloc(&d_std,sizeof(double)*row*col)!= cudaSuccess )
mexErrMsgTxt("Memory allocating failure on the GPU");
cudaMemcpy(d_HSIImage,HSIImage,sizeof(double)*row*col,cudaMemcpyHostToDevice);
cudaMemcpy(d_HSIImage2,HSIImage2,sizeof(double)*(row+8)*(col+8),cudaMemcpyHostToDevice);
phi <<< grid,block >>> (d_HSIImage,d_HSIImage2,row,col,d_phi_x,d_mean,d_std);
cudaMemcpy(phi_x,d_phi_x,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(mean2,d_mean,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaMemcpy(sd2,d_std,sizeof(double)*row*col,cudaMemcpyDeviceToHost);
cudaFree(d_HSIImage);
cudaFree(d_HSIImage2);
cudaFree(d_phi_x);
cudaFree(d_mean);
cudaFree(d_std);
}
its working fine when image is full of ones. but when I give regular image, there is lot of difference in serial(MATLAB) and parallel(CUDA) outputs(When mean1 and mean2 are compared). Please tell me the error.