Using CUDA to improve performance of a Molecular Dynamics program

Hello everyone!

I’m starting to study a bit of CUDA and came across this interesting code on GitHub (about “molecular dynamics”): https://github.com/FoleyLab/MolecularDynamics/blob/master/MD.cpp

Upon running it, I noticed that it was very slow, so I optimized the “hotspot” function, and now the slowest function looks like this:

void computeAccelerationsOPT() {
    int i, j;
    double f, rSqd;
    double rij[3];
    
    double rSqd3, rSqd7, ai0, ai1, ai2, rijf[3]; // computeAccelerations variables

    double sigma6, term1, term2, r2, Pot; // Potential variables
    sigma6 = sigma*sigma*sigma*sigma*sigma*sigma;
    Pot = 0;
    for (i = 0; i < N; i++) {
        // loop unrolling
        a[i][0] = 0;
        a[i][1] = 0;
        a[i][2] = 0;
    }

    for (i = 0; i < N-1; i++) {
        ai0 = 0; ai1 = 0; ai2 = 0; // Reduces the number of accesses to a[i][0], a[i][1] and a[i][2] by storing the sum's value and writing it into the matrix outside of the loop j.
        for (j = i+1; j < N; j++) {
            // loop unrolling
            rij[0] = r[i][0] - r[j][0];
            rij[1] = r[i][1] - r[j][1];
            rij[2] = r[i][2] - r[j][2];
            rSqd = r2 = rij[0] * rij[0] +
                        rij[1] * rij[1] +
                        rij[2] * rij[2];

            //  mathematical simplification
            rSqd3 = rSqd*rSqd*rSqd;
            rSqd7 = rSqd3*rSqd3*rSqd;
            f = (48-24*rSqd3) / rSqd7;

            // BEGIN OF POTENTIAL OPERATIONS
            // mathematical simplification of Potential calculations
            term2 = sigma6/(r2*r2*r2);
            term1 = term2*term2;
            Pot += term1 - term2;
            // END OF POTENTIAL OPERATIONS

            // loop unrolling using the vars ai0, ai1 and ai2 that reduce the number of accesses to the matrix a
            rijf[0] = rij[0]*f; rijf[1] = rij[1]*f; rijf[2] = rij[2]*f; // avoids duplicated multiplications
            ai0     += rijf[0];//rij[0] * f;
            ai1     += rijf[1];//rij[1] * f;
            ai2     += rijf[2];//rij[2] * f;
            a[j][0] -= rijf[0];//rij[0] * f;
            a[j][1] -= rijf[1];//rij[1] * f;
            a[j][2] -= rijf[2];//rij[2] * f;
        }
        // We only write the value into the matrix when we're outside of loop j in order to minimize the number of accesses to the matrix a.
        a[i][0] += ai0;
        a[i][1] += ai1;
        a[i][2] += ai2;
    }
    // BEGIN OF POTENTIAL OPERATIONS
    Pot *= 8*epsilon;
    P = Pot;
    // END OF POTENTIAL OPERATIONS
}

Could someone help me improve performance with CUDA? I’ve already tried to do some things, but it still takes quite a while considering that GPUs are much faster.
At the moment, the CUDA code is as follows:

__device__ double atomicAdd_double(double *address, double val){
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;

    do
    {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);

    return __longlong_as_double(old);
}

__global__ void computeAccelerationsKernel(double *r_, double *a_, double *Pot, int N_){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N_ - 1){
        double f, rSqd;
        double rij[3];
        double rSqd3, rSqd7, rijf[3], sigma6 = 1.0, term1, term2;
        double local_a[] = {0.0,0.0,0.0}, ri[] = {r_[i*3],r_[i*3+1],r_[i*3+2]};

        Pot[i] = 0.0;
        for(int j = i+1; j < N_; j++){
            rij[0] = ri[0] - r_[j*3];
            rij[1] = ri[1] - r_[j*3+1];
            rij[2] = ri[2] - r_[j*3+2];
            rSqd = rij[0] * rij[0] +
                   rij[1] * rij[1] +
                   rij[2] * rij[2];

            rSqd3 = rSqd * rSqd * rSqd;
            rSqd7 = rSqd3 * rSqd3 * rSqd;
            f = (48 - 24 * rSqd3) / rSqd7;

            term2 = sigma6 / (rSqd * rSqd * rSqd);
            term1 = term2 * term2;
            Pot[i] += term1 - term2;

            rijf[0] = rij[0] * f;
            rijf[1] = rij[1] * f;
            rijf[2] = rij[2] * f;
            
            local_a[0] += rijf[0];
            local_a[1] += rijf[1];
            local_a[2] += rijf[2];
            
            atomicAdd_double(a_+j*3  , -rijf[0]);
            atomicAdd_double(a_+j*3+1, -rijf[1]);
            atomicAdd_double(a_+j*3+2, -rijf[2]);
        }
        atomicAdd_double(a_+i*3, local_a[0]);
        atomicAdd_double(a_+i*3+1, local_a[1]);
        atomicAdd_double(a_+i*3+2, local_a[2]);
    }
}

void computeAccelerationsCUDA(){
    for (int i = 0; i < N; i++){
        a[i][0] = a[i][1] = a[i][2] = 0;
    }

    double *d_r, *d_a, *d_P; 
    double h_P[N];

    cudaMalloc((void **)&d_a, N * 3 * sizeof(double));
    checkCUDAError("cudaMalloc 1");
    cudaMalloc((void **)&d_r, N * 3 * sizeof(double));
    checkCUDAError("cudaMalloc 2");
    cudaMalloc((void **)&d_P, N * sizeof(double));
    checkCUDAError("cudaMalloc 3");

    cudaMemcpy(d_a, a, N * 3 * sizeof(double), cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy 2");
    cudaMemcpy(d_r, r, N * 3 * sizeof(double), cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy 2");

    int threadsPerBlock = 512;
    int nBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    computeAccelerationsKernel<<<nBlocks, threadsPerBlock>>>(d_r, d_a, d_P, N);

    cudaMemcpy(a, d_a, N * 3 * sizeof(double), cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpu 1");
    cudaMemcpy(h_P, d_P, N * sizeof(double), cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpu 2");

    P = 0.0;
    for (int i = 0; i < N - 1; i++){
        P += h_P[i];
    }
    P *= 8 * epsilon;

    cudaFree(d_a);
    cudaFree(d_r);
    cudaFree(d_P);
}

Can someone help me to make this code faster?
Thank you very much!!

I can’t comment on the code directly, but just want to note that double floating point performance is generally slow on consumer grade GPUs, compared to 32 bit floats. You can get an idea of the throughput here.

Hello, thank you very much for your response.
I do not intend to change from doubles to floats, since doubles are used in the original code.
Is there any other way I can improve performance?
Thank you very much!

What GPU are you using?

Matrox Electronics Systems Ltd. MGA G200eW WPCM450

That card is not capable of running Cuda code. Cuda is intended for use on Nvidia cards.

I’m testing the code using SBATCH through ssh in a remote machine. I did "sbatch --wrap="lspci | grep VGA
“” and it returned that value.

Try “lspci | grep NVIDIA”

It returned nothing

If you are able to run Cuda, then you are using one of their cards. Hopefully someone can offer advice on your code.

To my knowledge, all of the major molecular dynamics packages (examples: LAMMPS, AMBER, GROMACS, NAMD) added CUDA support a decade ago. They typically use single precision for the bulk of the computations and double or fixed-point arithmetic (stored as long long int) only for some key quantities that require the higher precision. You may want to take a look at something like OpenMM (used by Folding@Home), which is open source.