#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
// double precision atomic add function
// Software emulation of double-precision atomicAdd via a compare-and-swap
// loop, needed on devices of compute capability < 6.0 (e.g. the K20C, SM35),
// which lack a native 64-bit floating-point atomic add.
//
// Guarded so that on SM60+ builds this definition does not collide with the
// hardware-backed atomicAdd(double*) that CUDA provides natively there.
//
// address: device pointer to the double to update (global or shared memory)
// val:     value to add
// returns: the previous value stored at *address (standard atomic semantics)
//
// NOTE: under heavy contention every colliding thread retries the CAS, so
// this serializes far worse than the native single-precision atomicAdd —
// prefer a per-warp/per-block reduction feeding one atomic where possible.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // Reinterpret the bits as double, add, and attempt to swap the
        // result back in; retry if another thread updated *address first.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        // Comparing the bit patterns (not the doubles) also avoids looping
        // forever on NaN, since NaN != NaN as a floating-point compare.
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif
// kernel function
// Kernel: each block sums the per-thread values 0..31 into a shared
// accumulator seeded with a constant, then thread 31 publishes the block's
// total to *dev_add.
// Expected launch configuration: 1-D blocks of exactly 32 threads.
// NOTE(review): every block writes the same *dev_add; all blocks compute the
// identical value so the final result is well-defined, but it is still a
// cross-block race by the letter of the memory model — confirm intent.
__global__ void addKernel(double *dev_add)
{
    int i = threadIdx.x;
    __shared__ double add_result;

    // BUG FIX: originally every thread stored the seed concurrently — a data
    // race on shared memory. Let exactly one thread initialize it.
    if (i == 0)
        add_result = 0.1245274925749275269245279673295762;

    double previously_calculated_value = double(i);

    // Barrier so the seed store is visible before any thread accumulates.
    __syncthreads();

    atomicAdd(&add_result, previously_calculated_value); // double precision atomic add

    // BUG FIX: without this barrier, thread 31 could read add_result before
    // the other threads' atomicAdds have landed and publish a partial sum.
    __syncthreads();

    if (i == 31)
    {
        *dev_add = add_result; // 0.1245... + (0+1+...+31) = 496.1245...
    }
}
// Host driver: launches addKernel on 1,000,000 blocks of 32 threads, times
// launch + copy-back with clock(), and prints the (single-double) result.
int main()
{
    int size = 1;
    double *dev_add = NULL;
    double c = 0.0;

    // BUG FIX: every CUDA call's status was ignored; a failed allocation or
    // an invalid launch configuration would have passed silently.
    cudaError_t err = cudaMalloc((void**)&dev_add, size * sizeof(double));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    clock_t t1, t2;
    t1 = clock();

    // Launch a kernel on the GPU.
    // NOTE(review): all 10^6 blocks redundantly recompute and store the same
    // value; a single block suffices for the result itself — presumably the
    // large grid exists to benchmark atomic contention.
    addKernel <<< 1000000, 32 >>> (dev_add);

    // Kernel launches return no status directly; fetch launch errors here.
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(dev_add);
        return 1;
    }

    // Copy the result back to the host; cudaMemcpy blocks until the kernel
    // has finished, so t2 below covers the full kernel execution.
    err = cudaMemcpy(&c, dev_add, size * sizeof(double), cudaMemcpyDeviceToHost);
    t2 = clock();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        cudaFree(dev_add);
        return 1;
    }

    printf("%.3f ms\n", (double)(t2 - t1) / CLOCKS_PER_SEC * 1000);
    printf("add result: %.20f", c);

    cudaFree(dev_add);
    getchar();
    // BUG FIX: returned 1 (conventionally failure) on the success path.
    return 0;
}
However, using the self-defined double-precision atomicAdd() slows the code down by more than a factor of 70. Even compared with the single-precision atomicAdd() provided by NVIDIA, it is still about 20 times slower.
So I am wondering whether there is a better approach than the one used in my code — for example, reducing partial sums per warp or per block before issuing a single atomic — to minimize the penalty of the double-precision atomicAdd(). I am using VS2010 and CUDA 6.0, and running the code on a K20C.