Hi everyone,
I need to analyze some data, and I want to use GPU to speed up the process, which is otherwise quite long.
I need to compute an intermediate scattering function using atoms trajectories from molecular dynamics simulation.
For this, I have a 3D array ‘atomPos’ (dim0 → atoms, dim1 → trajectory frame, dim2 → xyz coordinates), a 3D array of scattering q vectors ‘qVecs’ (dim0 → vector amplitude, dim1 → random vectors of given amplitude, dim2 → xyz coordinates).
Finally everything is stored in a 2D array ‘out’ (dim0 → number of q vectors amplitude used, dim1 → number of time interval computed (number of trajectory frames) ).
All arrays are flattened.
The code below is compiled with nvcc using ‘nvcc -lib -o fileName.lib fileName.cu’, then I use .cpp file as a wrapper together with .pyx file, such that everything is cythonize and can be called from a Python software.
The idea is to use one thread per atom, and run all the loops for each of them independently. Then the result is added in the corresponding position (q vector amplitude and trajectory frame) in the ‘out’ array using atomic operations.
When I try to run this, I can see it starts, because I can print the state of execution from the kernel using printf. However, it quickly crashes, with an unspecified kernel launch failure.
Looking at memory allocation, I can see that not all my first array was loaded, because it should use 720 MB, but only 200 MB is used on GPU during the execution as seen using the windows resources monitor.
Is there some limitation on the amount of data that can be loaded on GPU at once?
I’m using a GeForce 940 MX on a Asus Laptop.
Thanks in advance for your help.
The compIntScatFunc.cu file:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 512
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__
void compIntScatFunc(float *atomPos, int atomPos_dim0, int atomPos_dim1, float *out,
float *qVecs, int qVecs_dim0, int qVecs_dim1,
int maxFrames, int nbrTimeOri)
{
int atomId = blockDim.x * blockIdx.x + threadIdx.x;
if( atomId < atomPos_dim0 )
{
extern __shared__ float s_qVecs[];
for(int i=0; i < 3*qVecs_dim0*qVecs_dim1; ++i)
s_qVecs[i] = qVecs[i];
__syncthreads();
// Starts computation of intermediate scattering function
for(int qValId=0; qValId < qVecs_dim0; ++qValId)
{
for(int dt=0; dt < maxFrames; ++dt)
{
int timeIncr = (int)( atomPos_dim1 - dt) / nbrTimeOri;
for(int t0=0; t0 < nbrTimeOri; ++t0)
{
for(int qVecId=0; qVecId < qVecs_dim1; ++qVecId)
{
// Gets indices
int atom_tf_idx = 3 * (atomId*atomPos_dim1 + t0*timeIncr + dt);
int atom_t0_idx = 3 * (atomId*atomPos_dim1 + t0*timeIncr);
int qVec_idx = 3 * (qValId * qVecs_dim1 + qVecId);
// Computes distances for given timestep and atom
float dist_0 = atomPos[atom_tf_idx] - atomPos[atom_t0_idx];
float dist_1 = atomPos[atom_tf_idx+1] - atomPos[atom_t0_idx+1];
float dist_2 = atomPos[atom_tf_idx+2] - atomPos[atom_t0_idx+2];
float re = cos( s_qVecs[qVec_idx] * dist_0
+ s_qVecs[qVec_idx+1] * dist_1
+ s_qVecs[qVec_idx+2] * dist_2 );
float im = sin( s_qVecs[qVec_idx] * dist_0
+ s_qVecs[qVec_idx+1] * dist_1
+ s_qVecs[qVec_idx+2] * dist_2 );
atomicAdd( &out[2*(qValId*maxFrames + dt)], re / (nbrTimeOri*qVecs_dim1) );
atomicAdd( &out[2*(qValId*maxFrames + dt) + 1], im / (nbrTimeOri*qVecs_dim1) );
} // q vectors loop
} // time origins loop
} // time increments loop
} // qVals loop
} // condition on atom index
}
void cu_compIntScatFunc_wrapper(float *atomPos, int atomPos_dim0, int atomPos_dim1, int atomPos_dim2,
float *qVecs, int qVecs_dim0, int qVecs_dim1, int qVecs_dim2,
float *out, int maxFrames, int nbrTimeOri)
{
// Copying atomPos matrix on GPU memory
float *cu_atomPos;
size_t size_atomPos = atomPos_dim0 * atomPos_dim1 * atomPos_dim2 * sizeof(float);
cudaMalloc(&cu_atomPos, size_atomPos);
cudaMemcpy(cu_atomPos, atomPos, size_atomPos, cudaMemcpyHostToDevice);
// Copying qVecs matrix on GPU memory
float *cu_qVecs;
size_t size_qVecs = qVecs_dim0 * qVecs_dim1 * qVecs_dim2 * sizeof(float);
cudaMalloc(&cu_qVecs, size_qVecs);
cudaMemcpy(cu_qVecs, qVecs, size_qVecs, cudaMemcpyHostToDevice);
// Copying out matrix on GPU memory
float *cu_out;
size_t size_out = 2 * qVecs_dim0 * maxFrames * sizeof(float);
cudaMalloc(&cu_out, size_out);
cudaMemcpy(cu_out, out, size_out, cudaMemcpyHostToDevice);
int nbrBlocks = ceil(atomPos_dim0 / BLOCK_SIZE);
int sharedMemSize = sizeof(float) * 3 * (qVecs_dim0*qVecs_dim1);
compIntScatFunc<<<nbrBlocks, BLOCK_SIZE, sharedMemSize>>>(cu_atomPos, atomPos_dim0, atomPos_dim1, cu_out,
cu_qVecs, qVecs_dim0, qVecs_dim1, maxFrames, nbrTimeOri);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
// Averages with the number of atoms
for(int i=0; i < 2*qVecs_dim0*maxFrames; ++i)
{
cu_out[i] /= atomPos_dim0;
}
cudaMemcpy(out, cu_out, size_out, cudaMemcpyDeviceToHost);
cudaFree(cu_atomPos);
cudaFree(cu_qVecs);
cudaFree(cu_out);
}
And the cuda_setup.py file:
from Cython.Build import cythonize
import numpy as np
import os
from setuptools import setup
from setuptools import Extension
from setuptools import Command
from setuptools.command.build_ext import build_ext
import distutils
with open('../README.md', 'r') as f:
description = f.read()
pyxPath = "NAMDAnalyzer/lib/cython_pyx/"
srcPath = "NAMDAnalyzer/lib/openmp/src/"
cudaSrcPath = "NAMDAnalyzer/lib/cuda/src/"
try:
cudaPath = os.environ['CUDA_PATH'] #_Using the default installation key in Path variable
if os.environ['OS'] == 'Windows_NT':
cudaInclude = cudaPath + "\include"
cudaLib = cudaPath + "\lib\x64"
else:
cudaInclude = cudaPath + "/include"
cudaLib = cudaPath + "/lib64"
except KeyError:
print("\n\nError: Couldn't locate CUDA path, please intall it or add it to PATH variable\n\n")
#_The following is used to compile with openmp with both mingGW and msvc
copt = {'msvc' : ['/openmp', '/Ox', '/fp:fast'],
'mingw32' : ['-fopenmp','-O3','-ffast-math','-march=native'] }
lopt = {'mingw32' : ['-fopenmp']}
def preprocessNVCC(path):
#_Used to process .cu file and create static libraries for GPU part of the program
for f in os.listdir(path):
if f[-3:] == '.cu':
os.system("nvcc -o %s.lib -lib %s" % ('NAMDAnalyzer/lib/cuda/' + f[:-3], path + f))
#_Used by setup function to define compile and link extra arguments
class build_ext_subclass( build_ext ):
def build_extensions(self):
c = self.compiler.compiler_type
if c in copt.keys():
for e in self.extensions:
e.extra_compile_args = copt[ c ]
if c in lopt.keys():
for e in self.extensions:
e.extra_link_args = lopt[ c ]
#_Simply uses execute function to make sure .cu files are processed first with nvcc
self.execute(preprocessNVCC, [cudaSrcPath])
build_ext.build_extensions(self)
packagesList = [ 'NAMDAnalyzer.dataManipulation',
'NAMDAnalyzer.dataParsers',
'NAMDAnalyzer.dataAnalysis',
'NAMDAnalyzer.lib',
'NAMDAnalyzer.helpersFunctions' ]
#_Defines extensions
pycompIntScatFunc_ext = Extension( "NAMDAnalyzer.lib.pycompIntScatFunc",
[cudaSrcPath + "compIntScatFunc.cpp",
pyxPath + "pycompIntScatFunc.pyx"],
library_dirs=["NAMDAnalyzer/lib/cuda", cudaLib],
libraries=['compIntScatFunc', 'cuda', 'cudart'],
language='c++',
include_dirs=[cudaSrcPath, np.get_include(), cudaInclude])
pygetDistances_ext = Extension( "NAMDAnalyzer.lib.pygetDistances",
[pyxPath + "pygetDistances.pyx",
srcPath + "getDistances.cpp"],
include_dirs=[srcPath, np.get_include()],
language='c++')
pygetWithin_ext = Extension( "NAMDAnalyzer.lib.pygetWithin",
[pyxPath + "pygetWithin.pyx", srcPath + "getWithin.cpp"],
include_dirs=[srcPath, np.get_include()],
language='c++')
pygetCOM_ext = Extension( "NAMDAnalyzer.lib.pygetCenterOfMass",
[pyxPath + "pygetCenterOfMass.pyx"],
include_dirs=[np.get_include()] )
pysetCOMAligned_ext = Extension( "NAMDAnalyzer.lib.pysetCenterOfMassAligned",
[pyxPath + "pysetCenterOfMassAligned.pyx"],
include_dirs=[np.get_include()] )
setup( name='NAMDAnalyzer',
version='alpha',
description=description,
author='Kevin Pounot',
author_email='kpounot@hotmail.fr',
url='github.com/kpounot/NAMDAnalyzer',
py_modules=['NAMDAnalyzer.Dataset'],
packages=packagesList,
ext_modules=cythonize( [pygetWithin_ext,
pygetDistances_ext,
pygetCOM_ext,
pysetCOMAligned_ext,
pycompIntScatFunc_ext]),
cmdclass={'build_ext': build_ext_subclass})