Dear Support team!
With the latest NVHPC 22.7 release, we are seeing failures in OpenACC offload kernels in our NEURON simulator code. Previous releases (up to 22.5) have worked fine, and we have checked our tests with cuda-memcheck, so we wonder if there is a regression in 22.7.
As reported in other tickets, these kernels are auto-generated from a DSL, so it is not always easy to provide a small reproducer. We will therefore share our preliminary analysis so far. If this looks like a known issue or has already been reported, please let us know.
We have DSL-generated code with Eigen and OpenACC offload, as follows:
void nrn_state_hhkin(NrnThread* nt, Memb_list* ml, int type) {
    _Pragma("acc data present(nt, ml) if(nt->compute_gpu)")
    {
        int nodecount = ml->nodecount;
        int pnodecount = ml->_nodecount_padded;
        const int* __restrict__ node_index = ml->nodeindices;
        double* __restrict__ data = ml->data;
        const double* __restrict__ voltage = nt->_actual_v;
        Datum* __restrict__ indexes = ml->pdata;
        ThreadDatum* __restrict__ thread = ml->_thread;
        auto* const __restrict__ inst = static_cast<hhkin_Instance*>(ml->instance);
        int start = 0;
        int end = nodecount;
        _Pragma("acc parallel loop present(inst, node_index, data, voltage, indexes, thread) async(nt->stream_id) if(nt->compute_gpu)")
        for (int id = start; id < end; id++) {
            int node_id = node_index[id];
            double v = voltage[node_id];
            inst->ena[id] = inst->ion_ena[indexes[0 * pnodecount + id]];
            inst->ek[id] = inst->ion_ek[indexes[3 * pnodecount + id]];
            Eigen::Matrix<double, 6, 1> nmodl_eigen_xm;
            double* t_nmodl_eigen_x = nmodl_eigen_xm.data();
            t_nmodl_eigen_x[0] = inst->m[id];
            t_nmodl_eigen_x[1] = inst->h[id];
            t_nmodl_eigen_x[2] = inst->n[id];
            t_nmodl_eigen_x[3] = inst->mc[id];
            t_nmodl_eigen_x[4] = inst->hc[id];
            t_nmodl_eigen_x[5] = inst->nc[id];
            struct functor {
                NrnThread* nt;
                hhkin_Instance* inst;
                int id, pnodecount;
                double v;
                Datum* indexes;
                double* data;
                ThreadDatum* thread;
                double old_m, old_h, old_n, old_mc, old_hc, old_nc;

                void initialize() {
                    rates_hhkin(id, pnodecount, inst, data, indexes, thread, nt, v, v);
                    old_m = inst->m[id];
                    old_h = inst->h[id];
                    old_n = inst->n[id];
                    old_mc = inst->mc[id];
                    old_hc = inst->hc[id];
                    old_nc = inst->nc[id];
                }

                functor(NrnThread* nt, hhkin_Instance* inst, int id, int pnodecount, double v,
                        Datum* indexes, double* data, ThreadDatum* thread)
                    : nt{nt}, inst{inst}, id{id}, pnodecount{pnodecount}, v{v},
                      indexes{indexes}, data{data}, thread{thread} {}

                void operator()(const Eigen::Matrix<double, 6, 1>& nmodl_eigen_xm,
                                Eigen::Matrix<double, 6, 1>& nmodl_eigen_fm,
                                Eigen::Matrix<double, 6, 6>& nmodl_eigen_jm) const {
                    const double* nmodl_eigen_x = nmodl_eigen_xm.data();
                    double* nmodl_eigen_j = nmodl_eigen_jm.data();
                    double* nmodl_eigen_f = nmodl_eigen_fm.data();
                    nmodl_eigen_f[0] = -nmodl_eigen_x[0] * inst->bm[id] * nt->_dt - nmodl_eigen_x[0] + nmodl_eigen_x[3] * inst->am[id] * nt->_dt + old_m;
                    nmodl_eigen_j[0] = -inst->bm[id] * nt->_dt - 1.0;
                    nmodl_eigen_j[6] = 0.0;
                    nmodl_eigen_j[12] = 0.0;
                    nmodl_eigen_j[18] = inst->am[id] * nt->_dt;
                    nmodl_eigen_j[24] = 0.0;
                    nmodl_eigen_j[30] = 0.0;
                    nmodl_eigen_f[1] = -nmodl_eigen_x[1] * inst->bh[id] * nt->_dt - nmodl_eigen_x[1] + nmodl_eigen_x[4] * inst->ah[id] * nt->_dt + old_h;
                    nmodl_eigen_j[1] = 0.0;
                    nmodl_eigen_j[7] = -inst->bh[id] * nt->_dt - 1.0;
                    nmodl_eigen_j[13] = 0.0;
                    nmodl_eigen_j[19] = 0.0;
                    nmodl_eigen_j[25] = inst->ah[id] * nt->_dt;
                    nmodl_eigen_j[31] = 0.0;
                    nmodl_eigen_f[2] = -nmodl_eigen_x[2] * inst->bn[id] * nt->_dt - nmodl_eigen_x[2] + nmodl_eigen_x[5] * inst->an[id] * nt->_dt + old_n;
                    nmodl_eigen_j[2] = 0.0;
                    nmodl_eigen_j[8] = 0.0;
                    nmodl_eigen_j[14] = -inst->bn[id] * nt->_dt - 1.0;
                    nmodl_eigen_j[20] = 0.0;
                    nmodl_eigen_j[26] = 0.0;
                    nmodl_eigen_j[32] = inst->an[id] * nt->_dt;
                    nmodl_eigen_f[3] = nmodl_eigen_x[0] * inst->bm[id] * nt->_dt - nmodl_eigen_x[3] * inst->am[id] * nt->_dt - nmodl_eigen_x[3] + old_mc;
                    nmodl_eigen_j[3] = inst->bm[id] * nt->_dt;
                    nmodl_eigen_j[9] = 0.0;
                    nmodl_eigen_j[15] = 0.0;
                    nmodl_eigen_j[21] = -inst->am[id] * nt->_dt - 1.0;
                    nmodl_eigen_j[27] = 0.0;
                    nmodl_eigen_j[33] = 0.0;
                    nmodl_eigen_f[4] = nmodl_eigen_x[1] * inst->bh[id] * nt->_dt - nmodl_eigen_x[4] * inst->ah[id] * nt->_dt - nmodl_eigen_x[4] + old_hc;
                    nmodl_eigen_j[4] = 0.0;
                    nmodl_eigen_j[10] = inst->bh[id] * nt->_dt;
                    nmodl_eigen_j[16] = 0.0;
                    nmodl_eigen_j[22] = 0.0;
                    nmodl_eigen_j[28] = -inst->ah[id] * nt->_dt - 1.0;
                    nmodl_eigen_j[34] = 0.0;
                    nmodl_eigen_f[5] = nmodl_eigen_x[2] * inst->bn[id] * nt->_dt - nmodl_eigen_x[5] * inst->an[id] * nt->_dt - nmodl_eigen_x[5] + old_nc;
                    nmodl_eigen_j[5] = 0.0;
                    nmodl_eigen_j[11] = 0.0;
                    nmodl_eigen_j[17] = inst->bn[id] * nt->_dt;
                    nmodl_eigen_j[23] = 0.0;
                    nmodl_eigen_j[29] = 0.0;
                    nmodl_eigen_j[35] = -inst->an[id] * nt->_dt - 1.0;
                }

                void finalize() {
                }
            };
            functor newton_functor(nt, inst, id, pnodecount, v, indexes, data, thread);
            newton_functor.initialize();
            int newton_iterations = nmodl::newton::newton_solver(nmodl_eigen_xm, newton_functor);
            inst->m[id] = t_nmodl_eigen_x[0];
            inst->h[id] = t_nmodl_eigen_x[1];
            inst->n[id] = t_nmodl_eigen_x[2];
            inst->mc[id] = t_nmodl_eigen_x[3];
            inst->hc[id] = t_nmodl_eigen_x[4];
            inst->nc[id] = t_nmodl_eigen_x[5];
            newton_functor.finalize();
        }
    }
}
When we execute our test simulation, we get:
Failing in Thread:1
call to cuStreamSynchronize(after launch) returned error 700: Illegal address during kernel execution
If I look at this through cuda-gdb, I see:
$ cuda-gdb ./x86_64/special-core
(cuda-gdb) set cuda memcheck on
(cuda-gdb) r -e 0.05 -d coredat/ --gpu
…
Illegal access to address (@local)0xdff5c0 detected.
Thread 1 "special-core" received signal CUDA_EXCEPTION_1, Lane Illegal Address.
[Switching focus to CUDA kernel 0, grid 42, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 1, lane 0]
0x00000000017c8400 in partialPivLu6(Eigen::Matrix<double, 6, 6, 0, 6, 6> const&, Eigen::Matrix<double, 6, 1, 0, 6, 1> const&) ()
(cuda-gdb) bt
#0 0x00000000017c8400 in partialPivLu6(Eigen::Matrix<double, 6, 6, 0, 6, 6> const&, Eigen::Matrix<double, 6, 1, 0, 6, 1> const&) ()
#1 0x00000000017c8400 in partialPivLu6(Eigen::Matrix<double, 6, 6, 0, 6, 6> const&, Eigen::Matrix<double, 6, 1, 0, 6, 1> const&) ()
#2 0x0000000001715e30 in nmodl::newton::newton_solver<6, coreneuron::nrn_state_hhkin(coreneuron::NrnThread*, coreneuron::Memb_list*, int)::functor>(Eigen::Matrix<double, 6, 1, (0)|((((6)==(1))&&((1)!=(1)))?((Eigen::StorageOptions)1) : ((((1)==(1))&&((6)!=(1)))?((Eigen::StorageOptions)0) : ((Eigen::StorageOptions)0))), 6, 1>&, coreneuron::nrn_state_hhkin(coreneuron::NrnThread*, coreneuron::Memb_list*, int)::functor, double, int) ()
at /gpfs/bbp.cscs.ch/home/kumbhar/workarena/systems/bbpv/repos/bbp/nrn/build_gpu_nmodl/install/include/newton/newton.hpp:79
#3 0x000000000175b1c0 in coreneuron::nrn_state_hhkin_610_gpu<<<(1,1,1),(128,1,1)>>> (nt=0x7fffe3fff620, pnodecount=755552426) at x86_64/corenrn/mod2c/hhkin.cpp:735
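For context, the newton_solver in frames #1 and #2 is, at its core, a dense Newton iteration over the 6x6 system assembled by the functor, and frame #0's partialPivLu6 is the LU solve inside that iteration. The following is only a simplified sketch of that loop (the real newton.hpp code uses a hand-written partialPivLu6 and its own convergence controls; the tolerance and iteration limit here are placeholders), included just to show where the crashing solve sits:

#include <Eigen/Dense>

// Simplified sketch of the Newton loop behind frames #1/#2; NOT the exact
// newton.hpp implementation.
template <typename Functor>
int newton_solver_sketch(Eigen::Matrix<double, 6, 1>& x, Functor& functor) {
    Eigen::Matrix<double, 6, 1> f;  // residual F(x)
    Eigen::Matrix<double, 6, 6> j;  // Jacobian dF/dx
    for (int iter = 0; iter < 50; ++iter) {
        functor(x, f, j);                // fill residual and Jacobian
        if (f.norm() < 1e-12)
            return iter;                 // converged
        x -= j.partialPivLu().solve(f);  // the LU solve that crashes in frame #0
    }
    return -1;                           // did not converge
}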
Note the (@local)0xdff5c0 mentioned in the error message. Is this related to local variables/memory in the offloaded kernel? It is true that the Eigen::Matrix variables are local in the kernel; you can see the relevant code here.
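To help isolate the pattern, here is a hypothetical stripped-down kernel that exercises the same idiom: a fixed-size Eigen matrix declared locally inside an OpenACC offloaded loop and accessed through a raw pointer obtained from .data(). This is only a sketch; we have not confirmed that this minimal form alone triggers the crash:

#include <Eigen/Dense>

// Hypothetical minimal kernel using the same idiom as nrn_state_hhkin: a
// loop-local Eigen matrix aliased via .data(). Sketch only, not a verified
// reproducer of the 22.7 failure.
void toy_kernel(double* out, int n) {
    #pragma acc parallel loop copyout(out[0:6 * n])
    for (int id = 0; id < n; id++) {
        Eigen::Matrix<double, 6, 1> xm;  // local matrix in the kernel (@local memory)
        double* x = xm.data();           // raw pointer alias, like t_nmodl_eigen_x
        for (int i = 0; i < 6; i++)
            x[i] = static_cast<double>(id + i);
        for (int i = 0; i < 6; i++)
            out[id * 6 + i] = x[i];
    }
}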
I have attached the preprocessed file of the relevant kernel (nrn_state_hhkin()) that causes this error. If I compile with NVHPC 22.5 and 22.7 and diff the output of the -Minfo=accel messages, I see one additional message in 22.7:
613, Generating implicit private(loop_nmodl_eigen_x)
double* loop_nmodl_eigen_x is just a local pointer variable declared inside the loop, so we are not sure whether the above message is relevant here.
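One experiment we are considering (a sketch only, not yet verified to avoid the crash) is to remove the raw-pointer alias entirely and index the local Eigen matrix directly, so that no loop-local double* is left for the compiler to implicitly privatize:

// Hypothetical variant of the loop body: index the local Eigen matrix
// directly instead of through the raw pointer, leaving no loop-local
// double* for 22.7 to implicitly privatize. Not verified.
Eigen::Matrix<double, 6, 1> nmodl_eigen_xm;
nmodl_eigen_xm(0) = inst->m[id];
nmodl_eigen_xm(1) = inst->h[id];
nmodl_eigen_xm(2) = inst->n[id];
nmodl_eigen_xm(3) = inst->mc[id];
nmodl_eigen_xm(4) = inst->hc[id];
nmodl_eigen_xm(5) = inst->nc[id];
// ... same functor construction and newton_solver call as before ...
inst->m[id] = nmodl_eigen_xm(0);
inst->h[id] = nmodl_eigen_xm(1);
inst->n[id] = nmodl_eigen_xm(2);
inst->mc[id] = nmodl_eigen_xm(3);
inst->hc[id] = nmodl_eigen_xm(4);
inst->nc[id] = nmodl_eigen_xm(5);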
The attached pre-processed file is typically compiled as:
$HOME/22.5/compilers/bin/nvc++ -g -O0 -Minfo=accel --c++17 -acc -Mautoinline -DCORENEURON_CUDA_PROFILING -DCORENEURON_ENABLE_GPU -DCORENEURON_BUILD -DHAVE_MALLOC_H -DEIGEN_DONT_PARALLELIZE -DEIGEN_DONT_VECTORIZE=1 -DLAYOUT=0 -c hhkin.prep.cpp --diag_suppress=111,550 --diag_suppress=161,177 -c &> 22.5.accel.txt
$HOME/22.7/compilers/bin/nvc++ -g -O0 -Minfo=accel --c++17 -acc -Mautoinline -DCORENEURON_CUDA_PROFILING -DCORENEURON_ENABLE_GPU -DCORENEURON_BUILD -DHAVE_MALLOC_H -DEIGEN_DONT_PARALLELIZE -DEIGEN_DONT_VECTORIZE=1 -DLAYOUT=0 -c hhkin.prep.cpp --diag_suppress=111,550 --diag_suppress=161,177 -c &> 22.7.accel.txt
Could you take a look and see if anything obvious explains this erroneous behaviour with the 22.7 release? That would be a great help!