Hi all,
I want to perform atomicAdd on a cuDoubleComplex variable. In order to do this I used the atomicAdd version for double found here:
to do attomic adds independently on the real and imaginary parts of the cuDoubleComplex variable. The simple kernel I wrote is:
__device__ void atAddComplex(cuDoubleComplex* a, cuDoubleComplex b){
//transform the addresses of real and imag. parts to double pointers
double *x = (double*)a;
double *y = x+1;
//use atomicAdd for double variables
atomicAdd(x, cuCreal(b));
atomicAdd(y, cuCimag(b));
}
Then I test it using a kernel that adds the elements of an array of complex numbers (array) and stores the result in a dynamically allocated variable (sum):
__global__ void sum_kernel(cuDoubleComplex *array, cuDoubleComplex *sum){
int n = threadIdx.x + blockDim.x * blockIdx.x;
if(n < AR_SIZE){
atAddComplex(&sum[0], array[n]);
}
}
Finally I use the addition kernel:
int main(){
cuDoubleComplex z = make_cuDoubleComplex(1.5, 2.);
cuDoubleComplex *array;
array = new cuDoubleComplex[AR_SIZE];
for(int i = 0; i < AR_SIZE; ++i)
array[i] = make_cuDoubleComplex(cuCreal(z), cuCimag(z));
//Allocate and copy memory to device
cuDoubleComplex *d_array;
cudaMalloc((void**)&d_array, sizeof(cuDoubleComplex)*(AR_SIZE));
cudaMemcpy(d_array, array, sizeof(cuDoubleComplex)*(AR_SIZE), cudaMemcpyHostToDevice);
//Define grid and block size
int blocks = AR_SIZE / BLOCK_WIDTH;
if(AR_SIZE % BLOCK_WIDTH) blocks++;
//Allocate memory for output
cuDoubleComplex *d_sum;
cudaMalloc((void**)&d_sum, sizeof(cuDoubleComplex));
// Kernel invocation code
dim3 dimGrid(blocks, 1, 1);
dim3 dimBlock(BLOCK_WIDTH, 1, 1);
sum_kernel<<<dimGrid,dimBlock>>>(d_array, d_sum);
//copy output to host
cuDoubleComplex *sum;
sum = new cuDoubleComplex[1];
sum[0] = make_cuDoubleComplex(0., 0.);
cudaMemcpy(sum, d_sum, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
//print result
cout<<"sum="<<cuCreal(sum[0])<<" + i*"<<cuCimag(sum[0])<<endl;
//free memmory
cudaFree(d_array);
cudaFree(d_sum);
delete[] array;
delete[] sum;
return 0;
}
The code seems to add correctly except for one thing: unless I set d_sum to 0 using
cudaMemset(d_sum, 0, sizeof(cuDoubleComplex));
the result stacks every time I run the code, so for example if AR_SIZE=100, the first time you run it you get 150 + i 200, If you run it again you get 300 + i 400 , then 450 + i 600 and so. Why is this happening if I do cudaFree(d_sum)?
Thank you for any help you can give.