Hello there. I’m learning to use CUDA and made a very simple MD program, using Lennard-Jones potencial. I made a kernel do calculate the accelerations, but there’s a possible data race when the new acceleration is summed to the old one. I do understand that AtomicAdd would solve, but the floating-point version it doesn’t work with my board, that’s a 1.1 compute capable board. Any ideas how to do that? The kernel itself is as follows, and how I call it. How to fix that?
P.S.: Any ideas to improve the code would be appreciated aswell.
__global__ void forca (float2 *pos, float2 *acc, float box, int N, float sigma, float eps){
float2 del;
float r;
int i= threadIdx.x+blockIdx.x;
int j= threadIdx.y+blockIdx.x;
if ((i != j) && (i < N) && (j<N)){
del.x=pos[i].x-pos[j].x;
del.y=pos[i].y-pos[j].y;
if (del.x > box/2) {
del.x -= box;
}
else if (del.x < -box/2) {
del.x += box;
}
if (del.y > box/2) {
del.y -= box;
}
else if (del.y < -box/2) {
del.y += box;
}
r=del.x*del.x+del.y*del.y;
acc[i].x+=24*eps*(pow((sigma/r),14)-2*pow((sigma/r),8))*del.x;
acc[i].y+=24*eps*(pow((sigma/r),14)-2*pow((sigma/r),8))*del.y;
}
...
float2 *pos, *vel, *acc;
float2 *pos_d, *vel_d, *acc_d;
dim3 dimBlockforca(16, 16);
int nblocksforca=(N/16)*(N/16)+1;
int nblocksmove=(N+256-1)/256;
...
size_t memSize=N*sizeof(float2);
pos=(float2 *) malloc (memSize);
vel=(float2 *) malloc (memSize);
acc=(float2 *) malloc (memSize);
...
cudaMalloc( (void **) &pos_d, memSize );
cudaMalloc( (void **) &vel_d, memSize );
cudaMalloc( (void **) &acc_d, memSize );
cudaMemcpy( pos_d, pos, memSize, cudaMemcpyHostToDevice );
checkCUDAError("Memcpy pos");
cudaMemcpy( vel_d, vel, memSize, cudaMemcpyHostToDevice );
checkCUDAError("Memcpy vel");
cudaMemcpy( acc_d, acc, memSize, cudaMemcpyHostToDevice );
checkCUDAError("Memcpy acc");
...
forca <<< nblocksforca, dimBlockforca >>> (pos_d, acc_d, box, N, sigma, eps);
checkCUDAError("forca");
...
I’m running on
Win7 Professional x64
Microsoft Visual Studio 2008
GeForce 9300M (DevDriver 260.99)
CUDA Toolkit 3.2
EDIT: Just noticed the mistake on the title. Should be Possible Data Race on a 1.1 Device (GeForce 9300M). No visible way to fix that :(