Hello everyone!
I’m starting to study a bit of CUDA and came across this interesting code on GitHub (about “molecular dynamics”): https://github.com/FoleyLab/MolecularDynamics/blob/master/MD.cpp
When I ran it, I noticed it was very slow, so I optimized the hotspot function. It is still the slowest function, and it now looks like this:
void computeAccelerationsOPT() {
    int i, j;
    double f, rSqd;
    double rij[3];
    double rSqd3, rSqd7, ai0, ai1, ai2, rijf[3]; // acceleration temporaries
    double sigma6, term1, term2, Pot;            // potential temporaries

    sigma6 = sigma * sigma * sigma * sigma * sigma * sigma;
    Pot = 0;

    for (i = 0; i < N; i++) {
        // x/y/z loop unrolled by hand
        a[i][0] = 0;
        a[i][1] = 0;
        a[i][2] = 0;
    }

    for (i = 0; i < N - 1; i++) {
        // Accumulate a[i] in ai0/ai1/ai2 and write it back once, after the
        // j loop, to reduce the number of accesses to the matrix a.
        ai0 = 0; ai1 = 0; ai2 = 0;
        for (j = i + 1; j < N; j++) {
            rij[0] = r[i][0] - r[j][0];
            rij[1] = r[i][1] - r[j][1];
            rij[2] = r[i][2] - r[j][2];
            rSqd = rij[0] * rij[0] +
                   rij[1] * rij[1] +
                   rij[2] * rij[2];

            // Mathematical simplification of the force: only
            // multiplications and one division, no pow() calls.
            rSqd3 = rSqd * rSqd * rSqd;
            rSqd7 = rSqd3 * rSqd3 * rSqd;
            f = (48 - 24 * rSqd3) / rSqd7;

            // Potential, simplified the same way.
            term2 = sigma6 / rSqd3;
            term1 = term2 * term2;
            Pot += term1 - term2;

            // Compute rij * f once and reuse it, avoiding duplicated
            // multiplications.
            rijf[0] = rij[0] * f; rijf[1] = rij[1] * f; rijf[2] = rij[2] * f;
            ai0 += rijf[0];
            ai1 += rijf[1];
            ai2 += rijf[2];
            a[j][0] -= rijf[0];
            a[j][1] -= rijf[1];
            a[j][2] -= rijf[2];
        }
        // Write into the matrix only outside of the j loop.
        a[i][0] += ai0;
        a[i][1] += ai1;
        a[i][2] += ai2;
    }

    Pot *= 8 * epsilon;
    P = Pot;
}
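To explain the "mathematical simplification" comments: the original code calls pow() for every pair, and I rewrote everything in terms of s = rSqd = r^2 so that only multiplications remain. The force factor of the original, f = 24(2 s^{-7} - s^{-4}), becomes

    f = (48 - 24 s^3) / s^7,

and the potential contribution per pair is term1 - term2 with term2 = sigma^6 / s^3 and term1 = term2^2. The final factor of 8*epsilon (instead of the textbook 4*epsilon of the Lennard-Jones potential) matches the original Potential() function, which visits every pair twice.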
Could someone help me improve the performance with CUDA? I've already tried a few things, but it still takes quite a while, considering how much faster GPUs are supposed to be.
At the moment, the CUDA code is as follows:
// Double-precision atomicAdd implemented with a compare-and-swap loop.
__device__ double atomicAdd_double(double *address, double val){
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
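(A side note I picked up while writing this post: on GPUs of compute capability 6.0 and newer, CUDA provides a native double-precision atomicAdd, so the CAS loop should only be needed as a fallback on older cards. My understanding, as an untested sketch; myAtomicAdd is just a name I made up:)

// Sketch: prefer the hardware FP64 atomicAdd on sm_60+ and keep the
// CAS loop only as a fallback for older GPUs.
__device__ double myAtomicAdd(double *address, double val){
#if __CUDA_ARCH__ >= 600
    return atomicAdd(address, val); // native double atomicAdd on sm_60+
#else
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
#endif
}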
__global__ void computeAccelerationsKernel(double *r_, double *a_, double *Pot, int N_){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Matches the CPU pair loop: the last particle has no partners j > i.
    if (i < N_ - 1){
        double f, rSqd;
        double rij[3];
        double rSqd3, rSqd7, rijf[3], term1, term2;
        double sigma6 = 1.0; // sigma = 1.0 in this simulation
        double local_a[] = {0.0, 0.0, 0.0};
        double ri[] = {r_[i * 3], r_[i * 3 + 1], r_[i * 3 + 2]};
        Pot[i] = 0.0;
        for (int j = i + 1; j < N_; j++){
            rij[0] = ri[0] - r_[j * 3];
            rij[1] = ri[1] - r_[j * 3 + 1];
            rij[2] = ri[2] - r_[j * 3 + 2];
            rSqd = rij[0] * rij[0] +
                   rij[1] * rij[1] +
                   rij[2] * rij[2];
            rSqd3 = rSqd * rSqd * rSqd;
            rSqd7 = rSqd3 * rSqd3 * rSqd;
            f = (48 - 24 * rSqd3) / rSqd7;
            term2 = sigma6 / rSqd3;
            term1 = term2 * term2;
            Pot[i] += term1 - term2;
            rijf[0] = rij[0] * f;
            rijf[1] = rij[1] * f;
            rijf[2] = rij[2] * f;
            // The force on particle i is accumulated locally...
            local_a[0] += rijf[0];
            local_a[1] += rijf[1];
            local_a[2] += rijf[2];
            // ...but the reaction on particle j needs atomics, because
            // other threads may be updating a_[j] at the same time.
            atomicAdd_double(a_ + j * 3,     -rijf[0]);
            atomicAdd_double(a_ + j * 3 + 1, -rijf[1]);
            atomicAdd_double(a_ + j * 3 + 2, -rijf[2]);
        }
        atomicAdd_double(a_ + i * 3,     local_a[0]);
        atomicAdd_double(a_ + i * 3 + 1, local_a[1]);
        atomicAdd_double(a_ + i * 3 + 2, local_a[2]);
    }
}
void computeAccelerationsCUDA(){
    for (int i = 0; i < N; i++){
        a[i][0] = a[i][1] = a[i][2] = 0;
    }
    double *d_r, *d_a, *d_P;
    double h_P[N]; // N is a runtime value, so this relies on the VLA compiler extension
    cudaMalloc((void **)&d_a, N * 3 * sizeof(double));
    checkCUDAError("cudaMalloc 1");
    cudaMalloc((void **)&d_r, N * 3 * sizeof(double));
    checkCUDAError("cudaMalloc 2");
    cudaMalloc((void **)&d_P, N * sizeof(double));
    checkCUDAError("cudaMalloc 3");
    cudaMemcpy(d_a, a, N * 3 * sizeof(double), cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy 1");
    cudaMemcpy(d_r, r, N * 3 * sizeof(double), cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy 2");
    int threadsPerBlock = 512;
    int nBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    computeAccelerationsKernel<<<nBlocks, threadsPerBlock>>>(d_r, d_a, d_P, N);
    checkCUDAError("kernel launch");
    cudaMemcpy(a, d_a, N * 3 * sizeof(double), cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy 3");
    cudaMemcpy(h_P, d_P, N * sizeof(double), cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy 4");
    P = 0.0;
    for (int i = 0; i < N - 1; i++){ // the kernel only writes Pot[0..N-2]
        P += h_P[i];
    }
    P *= 8 * epsilon;
    cudaFree(d_a);
    cudaFree(d_r);
    cudaFree(d_P);
}
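One idea I've read about in N-body examples (e.g. the GPU Gems 3 n-body chapter) but haven't managed to benchmark yet: drop the atomics entirely by giving each thread its own particle and letting it loop over all N particles. Every pair gets computed twice, but each thread only ever writes to its own a_[i], and the potential can be accumulated in a register instead of read-modify-writing Pot[i] in global memory on every iteration. Untested sketch; since every pair is now counted twice, the host would scale by 4 * epsilon instead of 8 * epsilon, the host sum would run over all N entries of h_P, and the zeroing loop plus the host-to-device copy of a would become unnecessary because the kernel overwrites a_ completely:

__global__ void computeAccelerationsNoAtomics(const double *r_, double *a_,
                                              double *Pot, int N_){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N_){
        double ai[3] = {0.0, 0.0, 0.0};
        double pot = 0.0; // register accumulator, written to Pot[i] once
        double ri[3] = {r_[i * 3], r_[i * 3 + 1], r_[i * 3 + 2]};
        for (int j = 0; j < N_; j++){
            if (j == i) continue; // skip self-interaction
            double rij0 = ri[0] - r_[j * 3];
            double rij1 = ri[1] - r_[j * 3 + 1];
            double rij2 = ri[2] - r_[j * 3 + 2];
            double rSqd = rij0 * rij0 + rij1 * rij1 + rij2 * rij2;
            double rSqd3 = rSqd * rSqd * rSqd;
            double rSqd7 = rSqd3 * rSqd3 * rSqd;
            double f = (48 - 24 * rSqd3) / rSqd7;
            double term2 = 1.0 / rSqd3; // sigma6 = 1.0
            pot += term2 * term2 - term2;
            ai[0] += rij0 * f;
            ai[1] += rij1 * f;
            ai[2] += rij2 * f;
        }
        // This thread is the only writer of a_[i], so plain stores suffice.
        a_[i * 3]     = ai[0];
        a_[i * 3 + 1] = ai[1];
        a_[i * 3 + 2] = ai[2];
        Pot[i] = pot;
    }
}

I'd also allocate d_r, d_a and d_P once at startup and reuse them across timesteps, since this function is called every step and the cudaMalloc/cudaFree pairs are pure overhead. But I haven't measured any of this, so I don't know how much each change is worth.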
Can someone help me make this code faster?
Thank you very much!!