do you think the following transformation from cpu into cuda is correct?

somehow i am getting a little different outcome.

the location of __syncthreads(); is not a big factor…

what did i do wrong?

any comments are welcome and thanks in advance…

=================================================================

cuda version

{

int i,j,k,ns,nc,m;

REAL temp;

nc=kmax+2;

ns=nc*(jmax+2);

for (int slide = 0; slide <= (kmax+blockDim.z-1); slide += blockDim.z)

{

i = blockDim.x*blockIdx.x+threadIdx.x;
j = blockDim.y*blockIdx.y+threadIdx.y;

k = slide + threadIdx.z;

if ( i<=imax && j<=jmax && k<=kmax )

{

m = i*ns+j*nc+k ;

temp= (PHI[m]-PHI[m-ns])/dltx+(PHI[m+ns]-PHI[m])/dltx ;

PHI[m] = PHI[m]-delt*temp;

}

__syncthreads();

}

======================================================================

cpu version

for (i=0;i<=imax;i++)

for (j=0;j<=jmax;j++)

for (k=0;k<=kmax;k++)

{

m = i*ns+j*nc+k ;

temp=(PHI[m]-PHI[m-ns])/dltx+(PHI[m+ns]-PHI[m])/dltx;

PHI[m] = PHI[m]-delt*temp;

}