Hi Folks:
Following is my code to do: adding all art over j to certain i. The outside loop generates Ns number of art. I want all these art to be added into ith d1 or d2. I did some search, and it seems I should use atomicdoubleAdd() to avoid race condition since I have so many art waiting to be added to ith d1 or d2 simultaneously. But I still got error in this code. I am using MATLAB + CUDA, so I cannot step into .cu file to debug. I checked the code, and it looks right to me. Anyone give any pointer on why the code is wrong, or how to step in and debug .cu file? Thanks a lot. Any thought is appreciated.
__device__ double atomicdoubleAdd(double* address, double val)
{
unsigned long long int* address_as_ull =(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do
{
assumed = old;
old = atomicCAS(address_as_ull, assumed,__double_as_longlong(val +__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
__global__
void afc_example
(
double *d1,*d2;
double *afr,
double *afl,
const double *x,
const double *g,
const int Ns,
const int Nt
)
{ /* j: inside loop; i: outside loop */
__shared__ double s_d1; //for per block accumulation
__shared__ double s_d2;
int j = blockIdx.x*blockDim.x + threadIdx.x;
int i;
double art1,art2;
double phs;
double zr, zi, l_d1, l_d2; //for local accumulation (over a fixed i)
if (threadIdx.x == 0) {s_d1 = 0; s_d2 = 0;}
__syncthreads();
while(j < Ns)
{ /* initialize */
l_d = 0;
for(i = Nt-1;i >= 0;i--)
{
phs = x[j]*g[i];
zr = cos(phs/2.0);
zi = sin(phs/2.0);
art1 = afr[j]*zr + afl[j]*zi;
art2 = afr[j]*zi - afl[j]*zr;
l_d1 += art1; // local accumulation of art
l_d2 += art2;
} /* i */
// local reduction of l_d into the shared variable
atomicdoubleAdd(&s_d1 l_d1);
atomicdoubleAdd(&s_d2 l_d2);
__syncthreads();
// global accumulation of the shared variables into d[i]
if (threadIdx.x == 0)
{
atomicdoubleAdd(&d1[i], s_d1); // only one thread per block do the update
atomicdoubleAdd(&d2[i], s_d2);
s_d1 = 0; s_d2 = 0;
}
__syncthreads();
j += blockDim.x*gridDim.x;
} /* j */
}