NVHPC OpenACC compiler (nvc++ V22.11 and V23.7 are tested) seems to fail on handling atomic fetch constructs for atomic-fetch-and-shift operations, such as the following example:

I also attached the full example, where simple atomic-shift operations work using atomic update construct, but atomic-fetch-shift operations using atomic fetch construct fail: atomic_shift.cpp.txt (4.0 KB)

I took a look and while there is a compiler issue here, it looks to be due to the capture variable not being used. If you update the code to use â€śtempâ€ť, then it will get correct answers. Iâ€™ve filed a problem report, TPR #34550, but would suggest updating the example since itâ€™s more likely that a code would use the capture value in some form.

For example:

atomic_shift.fixed.cpp

#include <stdio.h>
#include <math.h>
#include <openacc.h>
#define N 8
#ifndef DTYPE
#define DTYPE 2
#endif
#if DTYPE == 1
typedef int Type;
#elif DTYPE == 2
typedef unsigned int Type;
#elif DTYPE == 3
typedef short Type;
#elif DTYPE == 4
typedef long Type;
#elif DTYPE == 5
typedef unsigned long Type;
#elif DTYPE == 6
typedef long long Type;
#elif DTYPE == 7
typedef unsigned long long Type;
#elif DTYPE == 8
typedef float Type;
#elif DTYPE == 9
typedef double Type;
#endif
int main () {
int i;
Type A[N];
Type B[N];
Type valOne = 1;
Type Expected = 0;
#if DTYPE == 1
printf("==> Test atomic operations using int type variables\n");
#elif DTYPE == 2
printf("==> Test atomic operations using unsigned int type variables\n");
#elif DTYPE == 3
printf("==> Test atomic operations using short type variables\n");
#elif DTYPE == 4
printf("==> Test atomic operations using long type variables\n");
#elif DTYPE == 5
printf("==> Test atomic operations using unsigned long type variables\n");
#elif DTYPE == 6
printf("==> Test atomic operations using long long type variables\n");
#elif DTYPE == 7
printf("==> Test atomic operations using unsigned long long type variables\n");
#elif DTYPE == 8
printf("==> Test atomic operations using float type variables\n");
#endif
for(i=0; i<N; i++) {
A[i] = 0;
}
Expected = 1;
for(i=0; i<N; i++) {
A[i] = i;
Expected = Expected<<valOne;
}
A[0] = 1;
for(i=0; i<N; i++) {
Type temp=0;
{
temp = A[0];
A[0] = A[0] << valOne;
}
}
printf("[TEST0: non_atomic_fetch_lshift() on the host] A[0] = %d (expected: %d)\n", A[0], Expected);
Expected = 1;
for(i=0; i<N; i++) {
A[i] = i;
Expected = Expected<<valOne;
}
A[0] = 1;
#pragma acc data copy(B[:N])
{
for(i=0; i<N; i++) {
Type temp=0;
#pragma acc atomic capture
{
temp = A[0];
A[0] = A[0] << valOne;
}
B[i] = temp;
}
printf("[TEST1: atomic_fetch_lshift() on the host] A[0] = %d (expected: %d)\n", A[0], Expected);
A[0] = 1;
#pragma acc parallel loop gang vector copy(A[:N]) //copyin(valOne)
for(i=0; i<N; i++) {
Type temp=0;
#pragma acc atomic capture
{
temp = A[0];
A[0] = A[0] << valOne;
}
B[i] = temp;
}
printf("[TEST2: atomic_fetch_lshift() on the device] A[0] = %d (expected: %d)\n", A[0], Expected);
A[0] = 1;
#pragma acc parallel loop gang vector copy(A) copyin(valOne)
for(i=0; i<N; i++) {
Type temp=0;
#pragma acc atomic update
A[0] = A[0] << valOne;
}
printf("[TEST3: atomic_lshift() on the device] A[0] = %d (expected: %d)\n", A[0], Expected);
Expected = 1;
for(i=0; i<N; i++) {
Expected = Expected<<valOne;
}
A[0] = Expected;
for(i=0; i<N; i++) {
Expected = Expected >> valOne;
}
for(i=0; i<N; i++) {
Type temp=0;
#pragma acc atomic capture
{
temp = A[0];
A[0] = A[0] >> valOne;
}
B[i] = temp;
}
printf("[TEST4: atomic_fetch_rshift() on the host] A[0] = %d (expected: %d)\n", A[0], Expected);
Expected = 1;
for(i=0; i<N; i++) {
Expected = Expected<<valOne;
}
A[0] = Expected;
for(i=0; i<N; i++) {
Expected = Expected >> valOne;
}
#pragma acc parallel loop gang vector copy(A) copyin(valOne)
for(i=0; i<N; i++) {
Type temp=0;
#pragma acc atomic capture
{
temp = A[0];
A[0] = A[0] >> valOne;
}
B[i] = temp;
}
printf("[TEST5: atomic_fetch_rshift() on the device] A[0] = %d (expected: %d)\n", A[0], Expected);
Expected = 1;
for(i=0; i<N; i++) {
Expected = Expected<<valOne;
}
A[0] = Expected;
for(i=0; i<N; i++) {
Expected = Expected >> valOne;
}
#pragma acc parallel loop gang vector copy(A) copyin(valOne)
for(i=0; i<N; i++) {
Type temp=0;
#pragma acc atomic update
A[0] = A[0] >> valOne;
}
printf("[TEST6: atomic_rshift() on the device] A[0] = %d (expected: %d)\n", A[0], Expected);
Expected = 1;
for(i=0; i<N; i++) {
Expected = Expected<<valOne;
}
A[0] = Expected;
for(i=0; i<N; i++) {
Expected = Expected >> valOne;
}
#pragma acc parallel loop gang vector copy(A) copyin(valOne)
for(i=0; i<N; i++) {
Type temp=0;
#pragma acc atomic capture
{
A[0] = A[0] >> valOne;
temp = A[0];
}
B[i] = temp;
}
}
printf("[TEST7: atomic_rshift_fetch() on the device] A[0] = %d (expected: %d)\n", A[0], Expected);
return 0;
}

% nvc++ -acc -w atomic_shift.fixed.cpp -DDTYPE=2 -V23.11; a.out
==> Test atomic operations using unsigned int type variables
[TEST0: non_atomic_fetch_lshift() on the host] A[0] = 256 (expected: 256)
[TEST1: atomic_fetch_lshift() on the host] A[0] = 256 (expected: 256)
[TEST2: atomic_fetch_lshift() on the device] A[0] = 256 (expected: 256)
[TEST3: atomic_lshift() on the device] A[0] = 256 (expected: 256)
[TEST4: atomic_fetch_rshift() on the host] A[0] = 1 (expected: 1)
[TEST5: atomic_fetch_rshift() on the device] A[0] = 1 (expected: 1)
[TEST6: atomic_rshift() on the device] A[0] = 1 (expected: 1)
[TEST7: atomic_rshift_fetch() on the device] A[0] = 256 (expected: 1)

Thank you for this update.
As shown in the new result, however, TEST7 (atomic_rshift_fetch()) still fails even with the change to use the fetched variable; is this also a compiler bug?

I reported these in TPR#34556. Iâ€™m not sure if these are supported types (though I would assume â€śintâ€ť is), but even if so, we should emit an error not an ICE.

Thanks a lot!
By the way, the C99 standard says that the operands of the bitwise shift operators shall have integer type, even though the behavior when the value of the right operatnd is negative is undefined.