Using CUDA to improve performance of a Molecular Dynamics program

anon66429139 · December 24, 2023, 11:39am

Hello everyone!

I’m starting to study a bit of CUDA and came across this interesting code on GitHub (about “molecular dynamics”): https://github.com/FoleyLab/MolecularDynamics/blob/master/MD.cpp

Upon running it, I noticed that it was very slow, so I optimized the “hotspot” function, and now the slowest function looks like this:

void computeAccelerationsOPT() {
    int i, j;
    double f, rSqd;
    double rij[3];
    
    double rSqd3, rSqd7, ai0, ai1, ai2, rijf[3]; // computeAccelerations variables

    double sigma6, term1, term2, r2, Pot; // Potential variables
    sigma6 = sigma*sigma*sigma*sigma*sigma*sigma;
    Pot = 0;
    for (i = 0; i < N; i++) {
        // loop unrolling
        a[i][0] = 0;
        a[i][1] = 0;
        a[i][2] = 0;
    }

    for (i = 0; i < N-1; i++) {
        ai0 = 0; ai1 = 0; ai2 = 0; // Reduces the number of accesses to a[i][0], a[i][1] and a[i][2] by storing the sum's value and writing it into the matrix outside of the loop j.
        for (j = i+1; j < N; j++) {
            // loop unrolling
            rij[0] = r[i][0] - r[j][0];
            rij[1] = r[i][1] - r[j][1];
            rij[2] = r[i][2] - r[j][2];
            rSqd = r2 = rij[0] * rij[0] +
                        rij[1] * rij[1] +
                        rij[2] * rij[2];

            //  mathematical simplification
            rSqd3 = rSqd*rSqd*rSqd;
            rSqd7 = rSqd3*rSqd3*rSqd;
            f = (48-24*rSqd3) / rSqd7;

            // BEGIN OF POTENTIAL OPERATIONS
            // mathematical simplification of Potential calculations
            term2 = sigma6/(r2*r2*r2);
            term1 = term2*term2;
            Pot += term1 - term2;
            // END OF POTENTIAL OPERATIONS

            // loop unrolling using the vars ai0, ai1 and ai2 that reduce the number of accesses to the matrix a
            rijf[0] = rij[0]*f; rijf[1] = rij[1]*f; rijf[2] = rij[2]*f; // avoids duplicated multiplications
            ai0     += rijf[0];//rij[0] * f;
            ai1     += rijf[1];//rij[1] * f;
            ai2     += rijf[2];//rij[2] * f;
            a[j][0] -= rijf[0];//rij[0] * f;
            a[j][1] -= rijf[1];//rij[1] * f;
            a[j][2] -= rijf[2];//rij[2] * f;
        }
        // We only write the value into the matrix when we're outside of loop j in order to minimize the number of accesses to the matrix a.
        a[i][0] += ai0;
        a[i][1] += ai1;
        a[i][2] += ai2;
    }
    // BEGIN OF POTENTIAL OPERATIONS
    Pot *= 8*epsilon;
    P = Pot;
    // END OF POTENTIAL OPERATIONS
}

Could someone help me improve performance with CUDA? I’ve already tried to do some things, but it still takes quite a while considering that GPUs are much faster.
At the moment, the CUDA code is as follows:

__device__ double atomicAdd_double(double *address, double val){
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;

    do
    {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);

    return __longlong_as_double(old);
}

__global__ void computeAccelerationsKernel(double *r_, double *a_, double *Pot, int N_){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N_ - 1){
        double f, rSqd;
        double rij[3];
        double rSqd3, rSqd7, rijf[3], sigma6 = 1.0, term1, term2;
        double local_a[] = {0.0,0.0,0.0}, ri[] = {r_[i*3],r_[i*3+1],r_[i*3+2]};

        Pot[i] = 0.0;
        for(int j = i+1; j < N_; j++){
            rij[0] = ri[0] - r_[j*3];
            rij[1] = ri[1] - r_[j*3+1];
            rij[2] = ri[2] - r_[j*3+2];
            rSqd = rij[0] * rij[0] +
                   rij[1] * rij[1] +
                   rij[2] * rij[2];

            rSqd3 = rSqd * rSqd * rSqd;
            rSqd7 = rSqd3 * rSqd3 * rSqd;
            f = (48 - 24 * rSqd3) / rSqd7;

            term2 = sigma6 / (rSqd * rSqd * rSqd);
            term1 = term2 * term2;
            Pot[i] += term1 - term2;

            rijf[0] = rij[0] * f;
            rijf[1] = rij[1] * f;
            rijf[2] = rij[2] * f;
            
            local_a[0] += rijf[0];
            local_a[1] += rijf[1];
            local_a[2] += rijf[2];
            
            atomicAdd_double(a_+j*3  , -rijf[0]);
            atomicAdd_double(a_+j*3+1, -rijf[1]);
            atomicAdd_double(a_+j*3+2, -rijf[2]);
        }
        atomicAdd_double(a_+i*3, local_a[0]);
        atomicAdd_double(a_+i*3+1, local_a[1]);
        atomicAdd_double(a_+i*3+2, local_a[2]);
    }
}

void computeAccelerationsCUDA(){
    for (int i = 0; i < N; i++){
        a[i][0] = a[i][1] = a[i][2] = 0;
    }

    double *d_r, *d_a, *d_P; 
    double h_P[N];

    cudaMalloc((void **)&d_a, N * 3 * sizeof(double));
    checkCUDAError("cudaMalloc 1");
    cudaMalloc((void **)&d_r, N * 3 * sizeof(double));
    checkCUDAError("cudaMalloc 2");
    cudaMalloc((void **)&d_P, N * sizeof(double));
    checkCUDAError("cudaMalloc 3");

    cudaMemcpy(d_a, a, N * 3 * sizeof(double), cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy 2");
    cudaMemcpy(d_r, r, N * 3 * sizeof(double), cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy 2");

    int threadsPerBlock = 512;
    int nBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    computeAccelerationsKernel<<<nBlocks, threadsPerBlock>>>(d_r, d_a, d_P, N);

    cudaMemcpy(a, d_a, N * 3 * sizeof(double), cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpu 1");
    cudaMemcpy(h_P, d_P, N * sizeof(double), cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpu 2");

    P = 0.0;
    for (int i = 0; i < N - 1; i++){
        P += h_P[i];
    }
    P *= 8 * epsilon;

    cudaFree(d_a);
    cudaFree(d_r);
    cudaFree(d_P);
}

Can someone help me to make this code faster?
Thank you very much!!

rs277 · December 24, 2023, 6:24pm

I can’t comment on the code directly, but just want to note that double floating point performance is generally slow on consumer grade GPUs, compared to 32 bit floats. You can get an idea of the throughput here.

anon66429139 · December 24, 2023, 7:03pm

Hello, thank you very much for your response.
I do not intend to change from doubles to floats, since doubles are used in the original code.
Is there any other way I can improve performance?
Thank you very much!

rs277 · December 24, 2023, 7:10pm

What GPU are you using?

anon66429139 · December 24, 2023, 7:23pm

Matrox Electronics Systems Ltd. MGA G200eW WPCM450

rs277 · December 24, 2023, 7:29pm

That card is not capable of running Cuda code. Cuda is intended for use on Nvidia cards.

anon66429139 · December 24, 2023, 7:37pm

I’m testing the code using SBATCH through ssh in a remote machine. I did "sbatch --wrap="lspci | grep VGA
“” and it returned that value.

rs277 · December 24, 2023, 7:40pm

Try “lspci | grep NVIDIA”

anon66429139 · December 24, 2023, 7:45pm

It returned nothing

rs277 · December 24, 2023, 7:48pm

If you are able to run Cuda, then you are using one of their cards. Hopefully someone can offer advice on your code.

njuffa · December 24, 2023, 10:13pm

To my knowledge, all of the major molecular dynamics packages (examples: LAMMPS, AMBER, GROMACS, NAMD) added CUDA support a decade ago. They typically use single precision for the bulk of the computations and double or fixed-point arithmetic (stored as long long int) only for some key quantities that require the higher precision. You may want to take a look at something like OpenMM (used by Folding@Home), which is open source.

Topic		Replies	Views
how to implement double for loops in CUDA CUDA Programming and Performance	23	15813	January 30, 2012
1080 does not support doubles? CUDA Programming and Performance	5	560	October 30, 2018
Optimization of Molecular Dynamics Code. (S CUDA Programming and Performance	14	7535	October 16, 2009
CUDA very slow performance CUDA Programming and Performance	21	16813	March 6, 2020
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	10017	January 18, 2012
Cuda Latency problems Slow Cuda CUDA Programming and Performance	15	13942	September 5, 2008
Cuda code performance CUDA Programming and Performance	14	3177	December 16, 2014
performance difference for cuda between experiments and the documentation for float/double data type... CUDA Programming and Performance	8	1916	October 28, 2016
Is CUDA really that fast? CUDA Programming and Performance	17	11740	September 21, 2009
CUDA slower than MATLAB... again I can't get the simplest examples to show any speed-up using GP CUDA Programming and Performance	5	2545	February 18, 2011

Using CUDA to improve performance of a Molecular Dynamics program

Related topics