Hi all,
I’m currently learning to use CUDA for a project and I ran into the following problem:
When accumulating an array to get its sum, the result is different if I use GPU when compared with CPU. The following code is for CPU:
[codebox]int i, i1, i2;
struct atom *a1, *a2;
float ei, ej, eij, ri, rj, rij;
float dx, dy, dz, d2;
float Rd6, Rd12, dven, g;
for(i=0; i<n03; i++)
{
i1=list03[2*i];
a1=&(ag->atoms[i1]);
ei=f*(a1->eps03);
ri=a1->rminh03;
i2=list03[2*i+1];
a2=&(ag->atoms[i2]);
ej=a2->eps03;
rj=a2->rminh03;
eij=ei*ej;
rij=ri+rj;
rij*=rij;
dx=ag->atoms[i1].X-ag->atoms[i2].X;
dy=ag->atoms[i1].Y-ag->atoms[i2].Y;
dz=ag->atoms[i1].Z-ag->atoms[i2].Z;
d2=dx*dx+dy*dy+dz*dz;
Rd6=rij/d2;
Rd6=Rd6*Rd6*Rd6;
Rd12=Rd6*Rd6;
(*ven)+=eij*(Rd12-2*Rd6);
dven=-eij*12*(-Rd12+Rd6)/d2;
g=dven*dx;
(a1->GX)+=g;
(a2->GX)-=g;
g=dven*dy;
(a1->GY)+=g;
(a2->GY)-=g;
g=dven*dz;
(a1->GZ)+=g;
(a2->GZ)-=g;
}[/codebox]
here the variable (*ven) is the place to store accumulated value.
The following is the GPU version of function above:
[codebox]device unsigned int count = 0;
shared bool isLastBlockDone;
global void vdweng_assign(int numPairs, float* atom_X, float* atom_Y, float* atom_Z, float* atom_GX, float* atom_GY, float* atom_GZ, float* evdw, int* firstAtomIndex_assign, int* secondAtomIndex_assign, float rc2, float* atom_eps, float* atom_rminh, float* partial_result){
const int tid = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
const int threadN = IMUL(blockDim.x, gridDim.x);
__shared__ float evdw_sh_first[NUM_THREADS_PER_BLOCK];
int i, j;
int i1, i2;
int it, kt;
float x1,y1,z1;
float dx, dy, dz;
float d2;
int ij;
int my_i1;
float ri,rj,ei,ej,rij,eij;
float Rd6, Rd12, Rr6, Rr12, dr6, dven, g;
//float evdw_total;
int blockId, threadId;
int numBlocks, numThreadsPerBlock;
blockId = blockIdx.x;
numBlocks = gridDim.x;
threadId = threadIdx.x;
numThreadsPerBlock = blockDim.x;
int shared_memory_index = threadId;
for(i=0;i<NUM_BLOCKS;i++)
evdw_nop[i]=0;
for(i = tid; i < numPairs; i += threadN) {
//printf("Thread %d getting pair %d of %d, i1=%d, i2=%d\n", tid, i, numPairs);
my_i1 = firstAtomIndex_assign[i];
//printf("thread %d getting i1 = %d, n2 = %d\n", tid, my_i1, my_i2);
evdw_sh_first[shared_memory_index] = 0.0;
if(my_i1 >= 0) {
i1 = my_i1;
i2 = secondAtomIndex_assign[i];
x1 = atom_X[i1]; //(1)
y1 = atom_Y[i1];
z1 = atom_Z[i1];
dx = x1 - atom_X[i2];
dy = y1 - atom_Y[i2];
dz = z1 - atom_Z[i2];
d2 = dx*dx+dy*dy+dz*dz;
//printf("Thread id = %d, i1 = %d, i2 = %d\n", tid, i1, i2);
if(d2<rc2){
ri=atom_rminh[i1];
rj=atom_rminh[i2];
ei=atom_eps[i1];
ej=atom_eps[i2];
eij = ei*ej;
rij = (ri+rj)*(ri+rj);
Rd6 = (rij/d2)*(rij/d2)*(rij/d2);
Rd12 = Rd6*Rd6;
Rr6=(rij/rc2)*(rij/rc2)*(rij/rc2);
Rr12=Rr6*Rr6;
dr6=(d2/rc2)*(d2/rc2)*(d2/rc2);
evdw_sh_first[shared_memory_index] = eij*(Rd12-2*Rd6+Rr6*(4.0-2*dr6)+Rr12*(2*dr6-3.0));
//printf("Thread id = %d, i1 = %d, i2 = %d, evdw = %f\n", tid, i1, i2, eij*(Rd12-2*Rd6+Rr6*(4.0-2*dr6)+Rr12*(2*dr6-3.0)));
printf("%f\n",eij*(Rd12-2*Rd6+Rr6*(4.0-2*dr6)+Rr12*(2*dr6-3.0)));
dven=-eij*12*(-Rd12+Rd6+dr6*(Rr12-Rr6))/d2;
g=dven*dx;
atom_GX[i1]+=g;
atom_GX[i2]-=g;
g=dven*dy;
atom_GY[i1]+=g;
atom_GY[i2]-=g;
g=dven*dz;
atom_GZ[i1]+=g;
atom_GZ[i2]-=g;
}
else{
//printf("Thread id = %d, i1 = %d, i2 = %d, evdw = 0\n", tid, i1, i2);
}
}
__syncthreads();
if(threadIdx.x == 0){//@hxdai: first thread in each block is responsible for accumulating all the pairs in this block
partial_result[blockIdx.x] = 0;
for(i=0;i<NUM_THREADS_PER_BLOCK;i++){
partial_result[blockIdx.x]+=evdw_sh_first[i];
}
//printf("block %d's total is %f\n", blockIdx.x, partial_result[blockIdx.x]);
__threadfence();
printf("partial result of block %d is %f\n", blockIdx.x, partial_result[blockIdx.x]);
unsigned int value = atomicInc(&count, gridDim.x);
isLastBlockDone = (value == gridDim.x-1);
}
__syncthreads();
if(isLastBlockDone){
if(threadIdx.x == 0){
for(i=0;i<numBlocks;i++)
*evdw+=partial_result[i];
count = 0;
}
}
//__syncthreads();
} // end of for loop
}[/codebox]
Here *evdw is the final destination of accumulation.
I used the synchronization method mentioned in the programming guide to do the synchronization of accumulation.
I then run the code in EmuRelease mode, and the result from GPU is larger than that from CPU. They are using the same data inputs so there shouldn’t be a difference.
FYI I’m using a dell workstation with a Xeon dual-core 3.06GHz CPU and VS2008.
Any ideas? Thanks!