Hi, this is a new version of an issue that has been around for a while. Classically, applications would fail at runtime unless every acc region inside an omp parallel region was surrounded by an acc data region. Now with 13.1 I can't seem to get any acc regions to work inside omp parallel regions at all. For example, look at the following code (apologies for the length, but this is the simplest way to demonstrate it).
#include <stdio.h>
#include <stdlib.h>  /* exit() — was used without a declaration */
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>
#include <omp.h>
#include <openacc.h>
/* #include <cuda.h> */
#define SIZE 1000
/* Exercises each NVIDIA device from a single host thread, one device
 * at a time: bind the device, open an acc data region that copies one
 * row back out, fill that row in an acc kernels region, then verify
 * the row's contents on the host.  Exits with status 1 on mismatch. */
void works(){
    int data[15][SIZE] = {0};

    for (int dev = 0; dev < 4; dev++) {
        int *row = data[dev];

        /* Skip indices beyond the number of devices actually present. */
        if (dev >= acc_get_num_devices(acc_device_nvidia))
            continue;

        acc_set_device_num(dev, acc_device_nvidia);
        fprintf(stderr, "prelaunch: in thread %d, testing device %d\n", dev, acc_get_device_num(acc_device_nvidia));

        int idx;
        #pragma acc data copyout(row[0:SIZE])
        {
            fprintf(stderr, "data environment initialized: in thread %d, testing device %d\n", dev, acc_get_device_num(acc_device_nvidia));
            #pragma acc kernels
            for (idx = 0; idx < SIZE; idx++) {
                row[idx] = 1;
            }
            fprintf(stderr, "ACC region complete: in thread %d, testing device %d\n", dev, acc_get_device_num(acc_device_nvidia));
        }

        printf("device %d done, checking results\n", dev);

        /* Host-side verification of the copied-out row. */
        for (idx = 0; idx < SIZE; idx++) {
            if (row[idx] != 1) {
                printf("fail after: %d\n", idx);
                exit(1);
            }
        }
    }
}
/* Same per-device test as works(), but the devices are driven
 * concurrently — one OpenMP thread per device, each thread binding
 * its own device inside the parallel region.  Exits with status 1
 * on a verification mismatch. */
void dies(){
    int data[15][SIZE] = {0};

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        int *row = data[tid];

        /* Only threads that map to a real device participate. */
        if (tid < acc_get_num_devices(acc_device_nvidia)) {
            acc_set_device_num(tid, acc_device_nvidia);
            fprintf(stderr, "prelaunch: in thread %d, testing device %d\n", tid, acc_get_device_num(acc_device_nvidia));

            int idx;
            #pragma acc data copyout(row[0:SIZE])
            {
                fprintf(stderr, "data environment initialized: in thread %d, testing device %d\n", tid, acc_get_device_num(acc_device_nvidia));
                #pragma acc kernels
                for (idx = 0; idx < SIZE; idx++) {
                    row[idx] = 1;
                }
                fprintf(stderr, "ACC region complete: in thread %d, testing device %d\n", tid, acc_get_device_num(acc_device_nvidia));
            }

            printf("device %d done, checking results\n", tid);

            /* Host-side verification of this thread's row. */
            for (idx = 0; idx < SIZE; idx++) {
                if (row[idx] != 1) {
                    printf("fail after: %d\n", idx);
                    exit(1);
                }
            }
        }
    }
}
/* Entry point.  Any command-line argument selects the concurrent
 * (OpenMP) variant dies(); no argument runs the sequential variant
 * works().  Returns 0 on success; the test functions themselves
 * exit(1) on failure. */
int main(int argc, char * argv[])
{
    (void)argv;  /* only the argument count is inspected */

    if (argc > 1) {
        dies();   /* one OpenMP thread per device */
    } else {
        works();  /* one device at a time, single host thread */
    }
    return 0;
}
If this is compiled with the following command, both branches work (testable with ./test and ./test 1).
/opt/pgi/linux86-64/2012/bin/pgcc -Minfo=accel,mp -mp=allcores -O3 -g -ta=nvidia:cuda4.1,keepgpu,keepptx -acc
Output looks like this.
Works branch:
prelaunch: in thread 0, testing device 0
data environment initialized: in thread 0, testing device 0
ACC region complete: in thread 0, testing device 0
device 0 done, checking results
prelaunch: in thread 1, testing device 1
data environment initialized: in thread 1, testing device 1
ACC region complete: in thread 1, testing device 1
device 1 done, checking results
prelaunch: in thread 2, testing device 2
data environment initialized: in thread 2, testing device 2
ACC region complete: in thread 2, testing device 2
device 2 done, checking results
prelaunch: in thread 3, testing device 3
data environment initialized: in thread 3, testing device 3
ACC region complete: in thread 3, testing device 3
device 3 done, checking results
Dies branch:
prelaunch: in thread 3, testing device 3
prelaunch: in thread 0, testing device 0
prelaunch: in thread 2, testing device 2
prelaunch: in thread 1, testing device 1
data environment initialized: in thread 0, testing device 0
ACC region complete: in thread 0, testing device 0
device 0 done, checking results
data environment initialized: in thread 1, testing device 1
data environment initialized: in thread 3, testing device 3
ACC region complete: in thread 1, testing device 1
ACC region complete: in thread 3, testing device 3
data environment initialized: in thread 2, testing device 2
ACC region complete: in thread 2, testing device 2
device 1 done, checking results
device 3 done, checking results
device 2 done, checking results
On the other hand compiled with 13.1 as with the following line, it dies on the “dies” branch.
pgcc -Minfo=accel,mp -mp=allcores -O3 -g -ta=nvidia:cuda5.0,keepgpu,keepptx -acc
Works branch:
prelaunch: in thread 0, testing device 0
data environment initialized: in thread 0, testing device 0
ACC region complete: in thread 0, testing device 0
device 0 done, checking results
prelaunch: in thread 1, testing device 1
data environment initialized: in thread 1, testing device 1
ACC region complete: in thread 1, testing device 1
device 1 done, checking results
prelaunch: in thread 2, testing device 2
data environment initialized: in thread 2, testing device 2
ACC region complete: in thread 2, testing device 2
device 2 done, checking results
prelaunch: in thread 3, testing device 3
data environment initialized: in thread 3, testing device 3
ACC region complete: in thread 3, testing device 3
device 3 done, checking results
Dies branch:
prelaunch: in thread 0, testing device 0
prelaunch: in thread 3, testing device 3
prelaunch: in thread 2, testing device 2
prelaunch: in thread 1, testing device 1
data environment initialized: in thread 1, testing device 1
data environment initialized: in thread 0, testing device 0
data environment initialized: in thread 2, testing device 2
data environment initialized: in thread 3, testing device 3
call to cuMemcpyDtoHAsync returned error 1: Invalid value
call to cuStreamSynchronize returned error 4: Deinitialized
Has anyone else run into this? I’ve been trying to fix it for a few days now with no success, and have run out of ideas to try. Since the compile lines do specify cuda versions, I did test this with different cuda versions with the same result.