zero output when OpenACC is used

I use PGI Community Edition 17.10 to compile and run the following code. Why is the output wrong when I add the OpenACC directives?
Could you please help me understand why this happens?
Thanks in advance,
sajad
The code:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#include <cuda_runtime_api.h>

#define NX 2
#define NY 2
#define NZ 2


/*
 * Original (buggy) program from the question, kept as posted so that the
 * reply below still applies; only the forum-mangled curly quotes in the
 * two string literals have been repaired (cf. the corrected repost).
 *
 * Known issues, discussed in the reply that follows:
 *  - each loop runs to <= N, writing one element past the end of each array
 *  - C[i]/D[j]/E[k] are written from multiple iterations of the other loop
 *    indices, so the nest is not parallelizable as written
 *  - static loop indices live in global storage and create a false dependency
 *  - the scalar array initializers ("= 2", "= 10.") are a PGI extension,
 *    not standard C
 */
int main(void)
{
static int i, j, k;

static double A[NX][NY][NZ]=2 ,B[NX][NY][NZ]=10.,C[NX]=10.,D[NY]=10.,E[NZ]=10.;
FILE *file;
file = fopen("BB-and-A.csv", "w");
#pragma acc data copy( A ,B,C,D,E,i, j, k)
{
#pragma acc kernels loop private(i, j, k)

for (i = 0; i <= NX; i++) {
for (j =0; j <= NY ; j++) {
for (k =0; k <= NZ ; k++) {
C[i]=i;
D[j]=j;
E[k]=k;

}
}
}
}
for (i = 0; i <= NX; i++) {
for (j =0; j <= NY ; j++) {
for (k =0; k <= NZ ; k++) {
fprintf(file, "%e, %e, %e \n", C[i], D[j],E[k] );
}
}
}

fclose(file);
}

Hi mohammadi sajad,

You have a number of issues with this code.

  1. Your array bounds are incorrect. Since the loops run from 0 to N inclusive (N+1 iterations) but the arrays only have N elements, you’re writing off the end of each array.

  2. Your loop isn’t parallelizable since you’re writing to each element from multiple loop iterations. To fix, I’d make these three separate loops.

  3. The loop index variables shouldn’t be made static. This puts them in global storage and thus causes a dependency. While you can fix this by putting them in private clause, it’s better to remove the static and let the compiler implicitly privatize them.

  4. No need to copy the loop index variables.

Try something like the following:

% cat test2.c
#include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <openacc.h>
 #include<time.h>
 #include <string.h>
 #include <malloc.h>
// #include <cuda_runtime_api.h>

 #define NX 2
 #define NY 2
 #define NZ 2


 /*
  * Corrected version of the example: arrays are sized N+1 so the inclusive
  * loops stay in bounds, the loop indices are no longer static (letting the
  * compiler privatize them), and the three independent assignments are
  * split into three separately parallelizable loops.
  *
  * The scalar array initializers from the original post ("= 2") are a PGI
  * extension; standard C requires braces.  With braces only the first
  * element receives the value and the rest are zero -- which is fine here,
  * because the kernels overwrite every element of C, D, and E before the
  * host loop reads them.
  */
 int main(void)
 {
 int i, j, k;

 static double A[NX+1][NY+1][NZ+1] = {{{2}}}, B[NX+1][NY+1][NZ+1] = {{{10.}}},
               C[NX+1] = {10.}, D[NY+1] = {10.}, E[NZ+1] = {10.};
 FILE *file;
 file = fopen("BB-and-A.csv", "w");
 if (file == NULL) {
 /* Fail loudly instead of crashing later inside fprintf. */
 perror("fopen");
 return EXIT_FAILURE;
 }
 #pragma acc data copy(A,B,C,D,E)
 {
 #pragma acc kernels
 {
 for (i = 0; i <= NX; i++) C[i]=i;
 for (j =0; j <= NY ; j++) D[j]=j;
 for (k =0; k <= NZ ; k++) E[k]=k;
 }}
 for (i = 0; i <= NX; i++) {
 for (j =0; j <= NY ; j++) {
 for (k =0; k <= NZ ; k++) {
   fprintf(file,"%e, %e, %e \n", C[i], D[j],E[k] );
 }
 }
 }

 fclose(file);
 return 0;
 }
% pgcc test2.c -ta=tesla:cc60 -Minfo=accel
main:
     23, Generating copy(A[:][:][:],B[:][:][:],C[:],E[:],D[:])
     27, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     28, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     29, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         29, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
% a.out
% cat BB-and-A.csv
0.000000e+00, 0.000000e+00, 0.000000e+00
0.000000e+00, 0.000000e+00, 1.000000e+00
0.000000e+00, 0.000000e+00, 2.000000e+00
0.000000e+00, 1.000000e+00, 0.000000e+00
0.000000e+00, 1.000000e+00, 1.000000e+00
0.000000e+00, 1.000000e+00, 2.000000e+00
0.000000e+00, 2.000000e+00, 0.000000e+00
0.000000e+00, 2.000000e+00, 1.000000e+00
0.000000e+00, 2.000000e+00, 2.000000e+00
1.000000e+00, 0.000000e+00, 0.000000e+00
1.000000e+00, 0.000000e+00, 1.000000e+00
1.000000e+00, 0.000000e+00, 2.000000e+00
1.000000e+00, 1.000000e+00, 0.000000e+00
1.000000e+00, 1.000000e+00, 1.000000e+00
1.000000e+00, 1.000000e+00, 2.000000e+00
1.000000e+00, 2.000000e+00, 0.000000e+00
1.000000e+00, 2.000000e+00, 1.000000e+00
1.000000e+00, 2.000000e+00, 2.000000e+00
2.000000e+00, 0.000000e+00, 0.000000e+00
2.000000e+00, 0.000000e+00, 1.000000e+00
2.000000e+00, 0.000000e+00, 2.000000e+00
2.000000e+00, 1.000000e+00, 0.000000e+00
2.000000e+00, 1.000000e+00, 1.000000e+00
2.000000e+00, 1.000000e+00, 2.000000e+00
2.000000e+00, 2.000000e+00, 0.000000e+00
2.000000e+00, 2.000000e+00, 1.000000e+00
2.000000e+00, 2.000000e+00, 2.000000e+00

Hope this helps,
Mat

Hello mat
Thank You so much for your useful helps.
I’m trying to optimize this code with OpenACC, but the computed outputs are all zero. I would very much appreciate your help and guidance in solving this problem.

I am looking forward to hearing from you soon.

Kind regards,
Sajjad Mohammadi


#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#include <cuda_runtime_api.h>
#define NX 201
#define NY 101
#define NZ 201
/*
 * NOTE(review): this posted listing was mangled by the forum formatter
 * (quotes, '*' characters, and '+'/'-' operators were turned into bullets
 * or dropped: "FILE file", "AA_[j][k]", "(double)(m)5", etc.).  It is
 * reconstructed here from the corrected repost later in the thread, while
 * deliberately preserving the two bugs the reply identifies:
 *   1. the arrays are never initialized -- the host statics happen to be
 *      zero, but the device copies start with garbage;
 *   2. the data region spans the whole l-loop body, so the host-side code
 *      below reads B/BB *before* the copyout at the region's end and
 *      therefore sees only stale (zero) host values.
 */
int main(void)
{
int i, j, k, l, m;
static double vr = 1.5e8, tr, w1, w2;
static double dt = 9.5e-9, t, comp;
static double cu[NZ];
static double AA[NX][NY][NZ] , CC[NX][NY][NZ] , BB[NX][NY][NZ] ;
static double A[NX][NY][NZ] , B[NX][NY][NZ] , C[NX][NY][NZ] ;
FILE *file;
file = fopen("BB-and-A.csv", "w");
t = 0.;
for (l = 0; l < 737; l++) {
/* BUG (see reply): per-iteration copyin/copyout re-allocates and re-copies
   every array on each time step, and copyout only happens at region end. */
#pragma acc data copyin( vr, tr, w1, w2,dt, t, comp,AA , CC , A , C ),copyout(t,cu,BB,B)
{
#pragma acc kernels loop private(i, j, k)
for (i = 0; i < NX - 1; i++) {
for (j = 1; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
AA[i][j][k] = 1. * AA[i][j][k]
+ 214. * (C[i][j][k] - C[i][j - 1][k])
- 214. * (B[i][j][k] - B[i][j][k - 1]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 1; i < NX - 1; i++) {
for (j = 0; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
CC[i][j][k] = 1. * CC[i][j][k]
+ 214. * (A[i][j][k] - A[i][j][k - 1])
- 214. * (C[i][j][k] - C[i - 1][j][k]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 1; i < NX - 1; i++) { /* BB */
for (j = 1; j < NY - 1; j++) {
for (k = 0; k < NZ - 1; k++) {
BB[i][j][k] = 1. * BB[i][j][k]
+ 214. * (B[i][j][k] - B[i - 1][j][k])
- 214. * (A[i][j][k] - A[i][j - 1][k]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 1; i < NX - 1; i++) {
for (j = 0; j < NY - 1; j++) {
for (k = 0; k < NZ - 1; k++) {
A[i][j][k] = A[i][j][k]
- 0.0015 * (BB[i][j + 1][k] - BB[i][j][k])
+ 0.0015 * (CC[i][j][k + 1] - CC[i][j][k]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 0; i < NX - 1; i++) { /* C */
for (j = 1; j < NY - 1; j++) {
for (k = 0; k < NZ - 1; k++) {
B[i][j][k] = B[i][j][k]
- 0.0015 * (AA[i][j][k + 1] - AA[i][j][k])
+ 0.0015 * (BB[i + 1][j][k] - BB[i][j][k]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 0; i < NX - 1; i++) {
for (j = 0; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
C[i][j][k] = C[i][j][k]
- 0.0015 * (CC[i + 1][j][k] - CC[i][j][k])
+ 0.0015 * (AA[i][j + 1][k] - AA[i][j][k]);
}
}
}
/* Host-side source injection -- but B/BB below still hold their stale
   host values, because the copyout has not happened yet. */
for (m = 10; m < NZ - 1; m++) {
tr = t - (double)(m)*5 / vr;
if (tr <= 0.)
cu[m] = 0.;
else {
w1 = (tr / 0.25e-6)*(tr / 0.25e-6);
w2 = (tr / 2.e-6)*(tr / 2.e-6);
cu[m] = 10700. / 0.639*w1 / (w1 + 1.)*exp(-tr / 2.5e-6) + 6500. / 0.867*w2 / (w2 + 1.)*exp(-tr / 230.e-6);
cu[m] = (exp((double)(m)*-5. / 2000.))*cu[m];
}
A[10][60][m] = -cu[m] / (4.*5.);
A[10][59][m] = cu[m] / (4.*5.);
B[10][60][m] = cu[m] / (4.*5.);
B[9][60][m] = -cu[m] / (4.*5.);
}
comp = ((double)(l + 1) / 737)*100.;
printf("Computation: %4.3f %% completed \r", comp);
fprintf(file, "%e, %e, %e, %e \n", t*1e6, cu[30] / 1000., -BB[30][60][10] / 1000., -(B[109][60][10] + B[110][60][10]) / 2.);
t = t + dt;
} /* end acc data region */
} /* end l loop */
fclose(file);
}

Hi mohammadi sajad,

I see two main issues.

  1. You’re not initializing your arrays. Since they are static, this happens to have the side effect of zero initialization on the host, but on the device the values are not initialized. You can set the environment variable “PGI_ACC_FILL=1” to have the compiler initialize the device data to zero, but I would recommend you explicitly initialize the data in the program.

  2. Your data region ends after you use the arrays on the host. Hence, the values you are printing are whatever they were before the device compute regions (in this case zero). While you can fix this by moving the end of the data region before the “m” loop, I’d recommend you move the data region above the “l” loop, initialize the arrays on the device, and then use the OpenACC update directives to synchronize the data. This will remove unneeded extra copies and allocation/deallocation of device data.

Mat


#include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <openacc.h>
 #include<time.h>
 #include <string.h>
 #include <malloc.h>
 #include <cuda_runtime_api.h>
 #define NX 201
 #define NY 101
 #define NZ 201
 /*
  * Final working version (per the advice above): the device arrays are
  * created once outside the time loop, initialized on the device, and kept
  * resident for all 737 time steps; host/device data is synchronized with
  * "update" directives instead of per-iteration copyin/copyout.
  *
  * Improvement over the posted version: the fopen() result is checked so a
  * failed open no longer leads to fprintf on a NULL stream.
  */
 int main(void)
 {
 int i, j, k, l, m;
 static double vr = 1.5e8, tr, w1, w2;
 static double dt = 9.5e-9, t, comp;
 static double cu[NZ];
 static double AA[NX][NY][NZ] , CC[NX][NY][NZ] , BB[NX][NY][NZ] ;
 static double A[NX][NY][NZ] , B[NX][NY][NZ] , C[NX][NY][NZ] ;
 FILE *file;
 file = fopen("BB-and-A.csv", "w");
 if (file == NULL) {
 /* Fail loudly instead of crashing later inside fprintf. */
 perror("fopen");
 return EXIT_FAILURE;
 }
 t = 0.;
 /* Allocate the device copies once; contents are synchronized explicitly. */
 #pragma acc data create(AA,CC,C,A,BB,B)
 {
 #pragma acc kernels loop
 for (i = 0; i < NX; i++) {
 for (j = 0; j < NY; j++) {
 for (k = 0; k < NZ; k++) {
 // Initialize the arrays on the device: unlike host statics, device
 // memory is NOT zero-initialized.
    CC[i][j][k]= 0.0; 
    AA[i][j][k] = 0.0;
    BB[i][j][k] = 0.0; 
    A[i][j][k] = 0.0;
    B[i][j][k] = 0.0;
    C[i][j][k] = 0.0;
 }}}

 for (l = 0; l < 737; l++) {  /* time-stepping loop */
 #pragma acc kernels loop
 for (i = 0; i < NX - 1; i++) {
 for (j = 1; j < NY - 1; j++) {
 for (k = 1; k < NZ - 1; k++) {
 AA[i][j][k] = 1.* AA[i][j][k]
 + 214. * (C[i][j][k] - C[i][j - 1][k])
 - 214. * (B[i][j][k] - B[i][j][k - 1]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 1; i < NX - 1; i++) {
 for (j = 0; j < NY - 1; j++) {
 for (k = 1; k < NZ - 1; k++) {
 CC[i][j][k] = 1. * CC[i][j][k]
 + 214. * (A[i][j][k] - A[i][j][k - 1])
 - 214. * (C[i][j][k] - C[i - 1][j][k]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 1; i < NX - 1; i++) { /* BB */
 for (j = 1; j < NY - 1; j++) {
 for (k = 0; k < NZ - 1; k++) {
 BB[i][j][k] = 1. * BB[i][j][k]
 + 214. * (B[i][j][k] - B[i - 1][j][k])
 - 214. * (A[i][j][k] - A[i][j - 1][k]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 1; i < NX - 1; i++) {
 for (j = 0; j < NY - 1; j++) {
 for (k = 0; k < NZ - 1; k++) {
 A[i][j][k] = A[i][j][k]
 - 0.0015 * (BB[i][j + 1][k] - BB[i][j][k])
 + 0.0015 * (CC[i][j][k + 1] - CC[i][j][k]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 0; i < NX - 1; i++) { /* C */
 for (j = 1; j < NY - 1; j++) {
 for (k = 0; k < NZ - 1; k++) {
 B[i][j][k] = B[i][j][k]
 - 0.0015 * (AA[i][j][k + 1] - AA[i][j][k])
 + 0.0015 * (BB[i + 1][j][k] - BB[i][j][k]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 0; i < NX - 1; i++) {
 for (j = 0; j < NY - 1; j++) {
 for (k = 1; k < NZ - 1; k++) {
 C[i][j][k] = C[i][j][k]
 - 0.0015 * (CC[i + 1][j][k] - CC[i][j][k])
 + 0.0015 * (AA[i][j + 1][k] - AA[i][j][k]);
 }
 }
 }
/* Bring the arrays the host reads/writes below back from the device. */
#pragma acc update self(A,B,BB)
/* Host-side analytic source current and injection into A and B. */
for (m = 10; m < NZ - 1; m++) {
 tr = t - (double)(m)*5 / vr;
 if (tr <= 0.)
 cu[m] = 0.;
 else {
 w1 = (tr / 0.25e-6)*(tr / 0.25e-6);
 w2 = (tr / 2.e-6)*(tr / 2.e-6);
 cu[m] = 10700. / 0.639*w1 / (w1 + 1.)*exp(-tr / 2.5e-6) + 6500. / 0.867*w2 / (w2 + 1.)*exp(-tr / 230.e-6);
 cu[m] = (exp((double)(m)*-5. / 2000.))*cu[m];
 }
 A[10][60][m] = -cu[m] / (4.*5.);
 A[10][59][m] = cu[m] / (4.*5.);
 B[10][60][m] = cu[m] / (4.*5.);
 B[9][60][m] = -cu[m] / (4.*5.);
 }
 comp = ((double)(l + 1) / 737)*100.;
 printf("Computation: %4.3f %% completed \r", comp);
 fprintf(file, "%e, %e, %e, %e \n", t*1e6, cu[30] / 1000., -BB[30][60][10] / 1000., -(B[109][60][10] + B[110][60][10]) / 2.);
 t = t + dt;
/* Push the host-side source injection back into the device copies. */
#pragma acc update device(A,B)
 } // End l loop
 } // END OPENACC DATA REGION
 fclose(file);
 return 0;
 }