The OpenACC quick reference guide says the following about async on the kernels construct:
"The kernels region executes asynchronously with the host."
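To make sure we are talking about the same thing, this is how I read that sentence, as a minimal self-contained sketch (saxpy_async, the loop body and the placement of the explicit wait are only for illustration and are not part of the code below):

/* Sketch of my understanding of the async clause: the kernels region is
 * launched asynchronously, the host continues immediately, and an explicit
 * wait is needed before the device results are used on the host. */
void saxpy_async(int n, float a, float *restrict x, float *restrict y)
{
    #pragma acc data copyin(x[0:n]) copy(y[0:n])
    {
        /* Launch the kernels region asynchronously; the host thread returns
           from the construct without waiting for the device. */
        #pragma acc kernels loop independent async
        for (int i = 0; i < n; i++)
            y[i] = a * x[i] + y[i];

        /* ... unrelated host work could overlap with the device here ... */

        /* Block until the asynchronous region has finished, so that the data
           region can copy y back to the host safely. */
        #pragma acc wait
    }
}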
Asynchronous execution seems to work with PGI 13.6 (-ta=nvidia,cuda5.0,sm35), and I can measure the execution time of the kernels region. But it does not work with 13.9 or 14.1 using the -ta=nvidia,cuda5.5,sm35 options:
inline void adi_acc(float lambda, float *u, float *du, float *ax, float *bx, float *cx, int nx, int ny, int nz, ...) {
    int i, j, k;
    int n = nx*ny*nz;

    #pragma acc data present(u[n], du[n], ax[n], bx[n], cx[n])
    {
        #pragma acc kernels loop collapse(3) independent async
        for (k = 0; k < NZ; k++) {
            for (j = 0; j < NY; j++) {
                for (i = 0; i < NX; i++) {
                    ...
                }
            }
        }

        cuda_library_wrapper(u, du, ax, bx, cx);

        #pragma acc kernels loop collapse(2) independent private(aa, bb, cc, dd, base, ind, c2, d2) async
        for (k = 0; k < nz; k++) {
            for (i = 0; i < NX; i++) {
                ...
            }
        }
    }
}
int main(int argc, char* argv[]) {
    ...
    float *h_u, *h_du, *h_ax, *h_bx, *h_cx;
    int n = NX*NY*NZ;

    h_u  = (float *)malloc(sizeof(float)*n);
    h_du = (float *)malloc(sizeof(float)*n);
    h_ax = (float *)malloc(sizeof(float)*n);
    h_bx = (float *)malloc(sizeof(float)*n);
    h_cx = (float *)malloc(sizeof(float)*n);
    ...
    acc_init(acc_device_nvidia);
    acc_set_device_num(0, acc_device_nvidia);
    ...
    #pragma acc data pcopy(h_u[n]) create(h_du[n], h_ax[n], h_bx[n], h_cx[n])
    {
        elapsed_time(&timer2);
        for (it = 0; it < iter; it++) {
            adi_acc(lambda, h_u, h_du, h_ax, h_bx, h_cx, nx, ny, nz, ...);
            elapsed_total = elapsed_time(&timer2);
        }

        free(h_u);
        free(h_du);
        free(h_ax);
        free(h_bx);
        free(h_cx);

        acc_shutdown(acc_device_nvidia);
        exit(0);
    }
}
The error I get is:
Segmentation fault (core dumped)
Running the binary under cuda-memcheck gives:
========= CUDA-MEMCHECK
Grid dimensions: 256 x 256 x 256
========= Program hit error 201 on CUDA API call to cuCtxAttach
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/libcuda.so (cuCtxAttach + 0x182) [0xd9af2]
========= Host Frame:./adi_acc_libtrid [0x1a615]
========= Error: process didn't terminate successfully
========= Internal error (20)
========= No CUDA-MEMCHECK results found
Configuration: Red Hat 6, NVIDIA K20c.
The question is:
Why might an async "kernel call" result in a segmentation fault when the successive kernels actually run in order in the same CUDA stream?
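In case it matters, this is roughly the kind of host-side check I have in mind for the queue-to-stream mapping (a sketch only; acc_get_cuda_stream is from the OpenACC/CUDA interoperability API as declared in openacc.h, and the queue number 1 is just an example, not necessarily the queue the compiler picks for a bare async clause):

#include <stdio.h>
#include <openacc.h>

/* Print the CUDA stream handles behind two OpenACC activity queues so they
 * can be compared with what the external CUDA library uses. */
void print_acc_streams(void)
{
    void *q0_stream = acc_get_cuda_stream(0);   /* async queue 0 */
    void *q1_stream = acc_get_cuda_stream(1);   /* example async queue 1 */

    printf("queue 0 -> CUDA stream %p\n", q0_stream);
    printf("queue 1 -> CUDA stream %p\n", q1_stream);
}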
Any help is much appreciated!