Hello,
In the course of reorganising some of our codebase, I came across an issue where a certain device kernel hangs at runtime when compiled at -O
or above (-O
, -O2
, etc.). With -O1
, it works as expected.
The basic structure of for loops, break and return statements comes directly from code that has been working in production for some time. The changes that triggered the bug were mainly organisational, moving code from .cpp
to .hpp
and so on, rather than changing the structure of the kernels.
The real code in question is unfortunately quite convoluted, but I have done my best to produce a stand alone example that still demonstrates the issue. The device kernel that causes problems is:
struct Elm {
unsigned row{};
unsigned col{};
Elm *r_down{};
Elm *c_left{}; // only on host
Elm *c_right{};
};
struct SparseObj {
Elm **rowst{}; // only on host
Elm **diag{};
unsigned neqn{};
};
inline void subrow(SparseObj *so, Elm *pivot, Elm *rowsub) {
for (auto el = pivot->c_right; el; el = el->c_right) {
for (rowsub = rowsub->c_right; rowsub->col != el->col;
rowsub = rowsub->c_right) {
}
}
}
inline int matsol(SparseObj *so) {
for (unsigned i = 0; i < so->neqn; i++) {
Elm *pivot{so->diag[i]};
for (auto el = pivot->r_down; el; el = el->r_down) {
subrow(so, pivot, el);
}
}
return 0;
}
void device_kernel(SparseObj *so, int n) {
for (int i = 0; i < n;) {
int ierr = matsol(so);
if (ierr)
return;
for (int j = 1; j <= n; j++)
return;
}
for (unsigned i = 0; i < so->neqn; ++i) {
}
}
int main() {
auto *so = host_init(3);
#pragma acc parallel loop present(so)
for (int i = 0; i < 1; ++i) {
device_kernel(so, 1);
}
}
unfortunately setting up the data structure (host_init
) is a little convoluted. The code for that is given below; rest assured the original was more complicated.
If I compile this with nvc++ -V22.3 -acc -gpu=managed -Mautoinline -O1
then it executes without any problem.
If I change -O1
to -O
then it hangs. CUDA-GDB shows
(cuda-gdb) info cuda threads
BlockIdx ThreadIdx To BlockIdx ThreadIdx Count Virtual PC
Kernel 0
* (0,0,0) (0,0,0) (0,0,0) (0,0,0) 1 0x0000000000a79670
(0,0,0) (1,0,0) (0,0,0) (31,0,0) 31 0x0000000000a79700
(0,0,0) (32,0,0) (0,0,0) (127,0,0) 96 0x0000000000a79730
(cuda-gdb) x/1i $pc
=> 0xa79670 <main_136_gpu+496>: WARPSYNC 0xffffffff
(cuda-gdb) cuda thread 1
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (1,0,0), device 0, sm 0, warp 0, lane 1]
0x0000000000a79700 139 device_kernel(so, 1);
(cuda-gdb) x/1i $pc
=> 0xa79700 <main_136_gpu+640>: WARPSYNC 0xffffffff
(cuda-gdb) cuda thread 32
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (32,0,0), device 0, sm 0, warp 1, lane 0]
0x0000000000a79730 139 device_kernel(so, 1);
(cuda-gdb) x/1i $pc
=> 0xa79730 <main_136_gpu+688>: LDS.U R6, [RZ]
Can this be fixed in an upcoming release? Note we have historically used -Mautoinline
to work around other issues with non-trivial offloaded kernels.
#include <cassert>
inline Elm *getelm(SparseObj *so, unsigned row, unsigned col) {
Elm *el{}, *elnext{};
if (row == col) {
el = so->diag[row];
assert(el->row == row);
assert(el->col == col);
return so->diag[row];
}
assert(row > col);
for (el = so->diag[col];; el = elnext) {
elnext = el->r_down;
if (!elnext) {
break;
}
if (elnext->row == row) {
assert(elnext->col == col);
return elnext;
}
if (elnext->row > row) {
break;
}
}
auto *new_elem = new Elm{};
new_elem->col = col;
new_elem->row = row;
el->r_down = new_elem;
for (el = so->diag[row];; el = elnext) {
elnext = el->c_left;
if (!elnext) {
break;
}
if (elnext->col < col) {
break;
}
}
new_elem->c_left = el->c_left;
el->c_left = new_elem;
new_elem->c_right = el;
if (new_elem->c_left) {
new_elem->c_left->c_right = new_elem;
} else {
so->rowst[row] = new_elem;
}
return new_elem;
}
SparseObj *host_init(int maxeqn) {
auto *so = new SparseObj{};
so->neqn = maxeqn;
so->rowst = new Elm *[maxeqn];
so->diag = new Elm *[maxeqn];
for (unsigned i = 0; i < maxeqn; i++) {
auto *el = new Elm{};
el->row = el->col = i;
so->diag[i] = so->rowst[i] = el;
}
for (int i = 0; i < maxeqn; ++i) {
for (int j = i; j < maxeqn; ++j) {
getelm(so, j, i);
}
}
// Sanity check
for (int col = 0; col < maxeqn; ++col) {
for (int row = col; row < maxeqn; ++row) {
auto *el = getelm(so, row, col);
assert(el->row == row);
assert(el->col == col);
if (row < maxeqn - 1) {
assert(el->r_down);
assert(el->r_down->row == row + 1);
} else {
assert(!el->r_down);
}
if (col) {
assert(el->c_left);
assert(el->c_left->col == col - 1);
} else {
assert(!el->c_left);
}
// only the lower left triangle + diagonal are populated
if (col < row) {
assert(el->c_right);
assert(el->c_right->col == col + 1);
} else {
assert(!el->c_right);
}
}
}
return so;
}