zero output when OpenACC is used

I use PGI Community Edition 17.10 to compile and run the following code. Why is the output wrong when I add the OpenACC directives?
Could you please help me understand why this happens?
Thanks in advance,
sajad
The code:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#include <cuda_runtime_api.h>

#define NX 2
#define NY 2
#define NZ 2


/*
 * Original (buggy) program from the question, kept as posted so that the
 * reply below still applies; only the forum-mangled curly quotes in the
 * two string literals have been repaired (cf. the corrected repost).
 *
 * Known issues, discussed in the reply that follows:
 *  - each loop runs to <= N, writing one element past the end of each array
 *  - C[i]/D[j]/E[k] are written from multiple iterations of the other loop
 *    indices, so the nest is not parallelizable as written
 *  - static loop indices live in global storage and create a false dependency
 *  - the scalar array initializers ("= 2", "= 10.") are a PGI extension,
 *    not standard C
 */
int main(void)
{
static int i, j, k;

static double A[NX][NY][NZ]=2 ,B[NX][NY][NZ]=10.,C[NX]=10.,D[NY]=10.,E[NZ]=10.;
FILE *file;
file = fopen("BB-and-A.csv", "w");
#pragma acc data copy( A ,B,C,D,E,i, j, k)
{
#pragma acc kernels loop private(i, j, k)

for (i = 0; i <= NX; i++) {
for (j =0; j <= NY ; j++) {
for (k =0; k <= NZ ; k++) {
C[i]=i;
D[j]=j;
E[k]=k;

}
}
}
}
for (i = 0; i <= NX; i++) {
for (j =0; j <= NY ; j++) {
for (k =0; k <= NZ ; k++) {
fprintf(file, "%e, %e, %e \n", C[i], D[j],E[k] );
}
}
}

fclose(file);
}

Hi mohammadi sajad,

You have a number of issues with this code.

  1. Your array bounds are incorrect. Since the loops run from 0 to N inclusive (N+1 iterations) but the arrays only have N elements, you’re writing off the end of each array.

  2. Your loop isn’t parallelizable since you’re writing to each element from multiple loop iterations. To fix, I’d make these three separate loops.

  3. The loop index variables shouldn’t be made static. This puts them in global storage and thus causes a dependency. While you can fix this by putting them in private clause, it’s better to remove the static and let the compiler implicitly privatize them.

  4. No need to copy the loop index variables.

Try something like the following:

% cat test2.c
#include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <openacc.h>
 #include<time.h>
 #include <string.h>
 #include <malloc.h>
// #include <cuda_runtime_api.h>

 #define NX 2
 #define NY 2
 #define NZ 2


 /*
  * Corrected version of the example: arrays are sized N+1 so the inclusive
  * loops stay in bounds, the loop indices are no longer static (letting the
  * compiler privatize them), and the three independent assignments are
  * split into three separately parallelizable loops.
  *
  * The scalar array initializers from the original post ("= 2") are a PGI
  * extension; standard C requires braces.  With braces only the first
  * element receives the value and the rest are zero -- which is fine here,
  * because the kernels overwrite every element of C, D, and E before the
  * host loop reads them.
  */
 int main(void)
 {
 int i, j, k;

 static double A[NX+1][NY+1][NZ+1] = {{{2}}}, B[NX+1][NY+1][NZ+1] = {{{10.}}},
               C[NX+1] = {10.}, D[NY+1] = {10.}, E[NZ+1] = {10.};
 FILE *file;
 file = fopen("BB-and-A.csv", "w");
 if (file == NULL) {
 /* Fail loudly instead of crashing later inside fprintf. */
 perror("fopen");
 return EXIT_FAILURE;
 }
 #pragma acc data copy(A,B,C,D,E)
 {
 #pragma acc kernels
 {
 for (i = 0; i <= NX; i++) C[i]=i;
 for (j =0; j <= NY ; j++) D[j]=j;
 for (k =0; k <= NZ ; k++) E[k]=k;
 }}
 for (i = 0; i <= NX; i++) {
 for (j =0; j <= NY ; j++) {
 for (k =0; k <= NZ ; k++) {
   fprintf(file,"%e, %e, %e \n", C[i], D[j],E[k] );
 }
 }
 }

 fclose(file);
 return 0;
 }
% pgcc test2.c -ta=tesla:cc60 -Minfo=accel
main:
     23, Generating copy(A[:][:][:],B[:][:][:],C[:],E[:],D[:])
     27, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     28, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     29, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         29, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
% a.out
% cat BB-and-A.csv
0.000000e+00, 0.000000e+00, 0.000000e+00
0.000000e+00, 0.000000e+00, 1.000000e+00
0.000000e+00, 0.000000e+00, 2.000000e+00
0.000000e+00, 1.000000e+00, 0.000000e+00
0.000000e+00, 1.000000e+00, 1.000000e+00
0.000000e+00, 1.000000e+00, 2.000000e+00
0.000000e+00, 2.000000e+00, 0.000000e+00
0.000000e+00, 2.000000e+00, 1.000000e+00
0.000000e+00, 2.000000e+00, 2.000000e+00
1.000000e+00, 0.000000e+00, 0.000000e+00
1.000000e+00, 0.000000e+00, 1.000000e+00
1.000000e+00, 0.000000e+00, 2.000000e+00
1.000000e+00, 1.000000e+00, 0.000000e+00
1.000000e+00, 1.000000e+00, 1.000000e+00
1.000000e+00, 1.000000e+00, 2.000000e+00
1.000000e+00, 2.000000e+00, 0.000000e+00
1.000000e+00, 2.000000e+00, 1.000000e+00
1.000000e+00, 2.000000e+00, 2.000000e+00
2.000000e+00, 0.000000e+00, 0.000000e+00
2.000000e+00, 0.000000e+00, 1.000000e+00
2.000000e+00, 0.000000e+00, 2.000000e+00
2.000000e+00, 1.000000e+00, 0.000000e+00
2.000000e+00, 1.000000e+00, 1.000000e+00
2.000000e+00, 1.000000e+00, 2.000000e+00
2.000000e+00, 2.000000e+00, 0.000000e+00
2.000000e+00, 2.000000e+00, 1.000000e+00
2.000000e+00, 2.000000e+00, 2.000000e+00

Hope this helps,
Mat

Hello mat
Thank You so much for your useful helps.
I’m trying to optimize this code with OpenACC, but the computed outputs are all zero. I would very much appreciate your help and guidance in solving this problem.

I am looking forward to hearing from you soon.

Kind regards,
Sajjad Mohammadi


#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#include <cuda_runtime_api.h>
#define NX 201
#define NY 101
#define NZ 201
/*
 * NOTE(review): this posted listing was mangled by the forum formatter
 * (quotes, '*' characters, and '+'/'-' operators were turned into bullets
 * or dropped: "FILE file", "AA_[j][k]", "(double)(m)5", etc.).  It is
 * reconstructed here from the corrected repost later in the thread, while
 * deliberately preserving the two bugs the reply identifies:
 *   1. the arrays are never initialized -- the host statics happen to be
 *      zero, but the device copies start with garbage;
 *   2. the data region spans the whole l-loop body, so the host-side code
 *      below reads B/BB *before* the copyout at the region's end and
 *      therefore sees only stale (zero) host values.
 */
int main(void)
{
int i, j, k, l, m;
static double vr = 1.5e8, tr, w1, w2;
static double dt = 9.5e-9, t, comp;
static double cu[NZ];
static double AA[NX][NY][NZ] , CC[NX][NY][NZ] , BB[NX][NY][NZ] ;
static double A[NX][NY][NZ] , B[NX][NY][NZ] , C[NX][NY][NZ] ;
FILE *file;
file = fopen("BB-and-A.csv", "w");
t = 0.;
for (l = 0; l < 737; l++) {
/* BUG (see reply): per-iteration copyin/copyout re-allocates and re-copies
   every array on each time step, and copyout only happens at region end. */
#pragma acc data copyin( vr, tr, w1, w2,dt, t, comp,AA , CC , A , C ),copyout(t,cu,BB,B)
{
#pragma acc kernels loop private(i, j, k)
for (i = 0; i < NX - 1; i++) {
for (j = 1; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
AA[i][j][k] = 1. * AA[i][j][k]
+ 214. * (C[i][j][k] - C[i][j - 1][k])
- 214. * (B[i][j][k] - B[i][j][k - 1]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 1; i < NX - 1; i++) {
for (j = 0; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
CC[i][j][k] = 1. * CC[i][j][k]
+ 214. * (A[i][j][k] - A[i][j][k - 1])
- 214. * (C[i][j][k] - C[i - 1][j][k]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 1; i < NX - 1; i++) { /* BB */
for (j = 1; j < NY - 1; j++) {
for (k = 0; k < NZ - 1; k++) {
BB[i][j][k] = 1. * BB[i][j][k]
+ 214. * (B[i][j][k] - B[i - 1][j][k])
- 214. * (A[i][j][k] - A[i][j - 1][k]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 1; i < NX - 1; i++) {
for (j = 0; j < NY - 1; j++) {
for (k = 0; k < NZ - 1; k++) {
A[i][j][k] = A[i][j][k]
- 0.0015 * (BB[i][j + 1][k] - BB[i][j][k])
+ 0.0015 * (CC[i][j][k + 1] - CC[i][j][k]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 0; i < NX - 1; i++) { /* C */
for (j = 1; j < NY - 1; j++) {
for (k = 0; k < NZ - 1; k++) {
B[i][j][k] = B[i][j][k]
- 0.0015 * (AA[i][j][k + 1] - AA[i][j][k])
+ 0.0015 * (BB[i + 1][j][k] - BB[i][j][k]);
}
}
}
#pragma acc kernels loop private(i,j,k)
for (i = 0; i < NX - 1; i++) {
for (j = 0; j < NY - 1; j++) {
for (k = 1; k < NZ - 1; k++) {
C[i][j][k] = C[i][j][k]
- 0.0015 * (CC[i + 1][j][k] - CC[i][j][k])
+ 0.0015 * (AA[i][j + 1][k] - AA[i][j][k]);
}
}
}
/* Host-side source injection -- but B/BB below still hold their stale
   host values, because the copyout has not happened yet. */
for (m = 10; m < NZ - 1; m++) {
tr = t - (double)(m)*5 / vr;
if (tr <= 0.)
cu[m] = 0.;
else {
w1 = (tr / 0.25e-6)*(tr / 0.25e-6);
w2 = (tr / 2.e-6)*(tr / 2.e-6);
cu[m] = 10700. / 0.639*w1 / (w1 + 1.)*exp(-tr / 2.5e-6) + 6500. / 0.867*w2 / (w2 + 1.)*exp(-tr / 230.e-6);
cu[m] = (exp((double)(m)*-5. / 2000.))*cu[m];
}
A[10][60][m] = -cu[m] / (4.*5.);
A[10][59][m] = cu[m] / (4.*5.);
B[10][60][m] = cu[m] / (4.*5.);
B[9][60][m] = -cu[m] / (4.*5.);
}
comp = ((double)(l + 1) / 737)*100.;
printf("Computation: %4.3f %% completed \r", comp);
fprintf(file, "%e, %e, %e, %e \n", t*1e6, cu[30] / 1000., -BB[30][60][10] / 1000., -(B[109][60][10] + B[110][60][10]) / 2.);
t = t + dt;
} /* end acc data region */
} /* end l loop */
fclose(file);
}

Hi mohammadi sajad,

I see two main issues.

  1. You’re not initializing your arrays. Since they are static, this happens to have the side effect of zero initialization on the host, but on the device the values are not initialized. You can set the environment variable “PGI_ACC_FILL=1” to have the compiler initialize the device data to zero, but I would recommend you explicitly initialize the data in the program.

  2. Your data region ends after you use the arrays on the host. Hence, the values you are printing are whatever they were before the device compute regions (in this case zero). While you can fix this by moving the end of the data region before the “m” loop, I’d recommend you move the data region above the “l” loop, initialize the arrays on the device, and then use the OpenACC update directives to synchronize the data. This will remove unneeded extra copies and allocation/deallocation of device data.

Mat


#include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <openacc.h>
 #include<time.h>
 #include <string.h>
 #include <malloc.h>
 #include <cuda_runtime_api.h>
 #define NX 201
 #define NY 101
 #define NZ 201
 /*
  * Final working version (per the advice above): the device arrays are
  * created once outside the time loop, initialized on the device, and kept
  * resident for all 737 time steps; host/device data is synchronized with
  * "update" directives instead of per-iteration copyin/copyout.
  *
  * Improvement over the posted version: the fopen() result is checked so a
  * failed open no longer leads to fprintf on a NULL stream.
  */
 int main(void)
 {
 int i, j, k, l, m;
 static double vr = 1.5e8, tr, w1, w2;
 static double dt = 9.5e-9, t, comp;
 static double cu[NZ];
 static double AA[NX][NY][NZ] , CC[NX][NY][NZ] , BB[NX][NY][NZ] ;
 static double A[NX][NY][NZ] , B[NX][NY][NZ] , C[NX][NY][NZ] ;
 FILE *file;
 file = fopen("BB-and-A.csv", "w");
 if (file == NULL) {
 /* Fail loudly instead of crashing later inside fprintf. */
 perror("fopen");
 return EXIT_FAILURE;
 }
 t = 0.;
 /* Allocate the device copies once; contents are synchronized explicitly. */
 #pragma acc data create(AA,CC,C,A,BB,B)
 {
 #pragma acc kernels loop
 for (i = 0; i < NX; i++) {
 for (j = 0; j < NY; j++) {
 for (k = 0; k < NZ; k++) {
 // Initialize the arrays on the device: unlike host statics, device
 // memory is NOT zero-initialized.
    CC[i][j][k]= 0.0; 
    AA[i][j][k] = 0.0;
    BB[i][j][k] = 0.0; 
    A[i][j][k] = 0.0;
    B[i][j][k] = 0.0;
    C[i][j][k] = 0.0;
 }}}

 for (l = 0; l < 737; l++) {  /* time-stepping loop */
 #pragma acc kernels loop
 for (i = 0; i < NX - 1; i++) {
 for (j = 1; j < NY - 1; j++) {
 for (k = 1; k < NZ - 1; k++) {
 AA[i][j][k] = 1.* AA[i][j][k]
 + 214. * (C[i][j][k] - C[i][j - 1][k])
 - 214. * (B[i][j][k] - B[i][j][k - 1]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 1; i < NX - 1; i++) {
 for (j = 0; j < NY - 1; j++) {
 for (k = 1; k < NZ - 1; k++) {
 CC[i][j][k] = 1. * CC[i][j][k]
 + 214. * (A[i][j][k] - A[i][j][k - 1])
 - 214. * (C[i][j][k] - C[i - 1][j][k]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 1; i < NX - 1; i++) { /* BB */
 for (j = 1; j < NY - 1; j++) {
 for (k = 0; k < NZ - 1; k++) {
 BB[i][j][k] = 1. * BB[i][j][k]
 + 214. * (B[i][j][k] - B[i - 1][j][k])
 - 214. * (A[i][j][k] - A[i][j - 1][k]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 1; i < NX - 1; i++) {
 for (j = 0; j < NY - 1; j++) {
 for (k = 0; k < NZ - 1; k++) {
 A[i][j][k] = A[i][j][k]
 - 0.0015 * (BB[i][j + 1][k] - BB[i][j][k])
 + 0.0015 * (CC[i][j][k + 1] - CC[i][j][k]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 0; i < NX - 1; i++) { /* C */
 for (j = 1; j < NY - 1; j++) {
 for (k = 0; k < NZ - 1; k++) {
 B[i][j][k] = B[i][j][k]
 - 0.0015 * (AA[i][j][k + 1] - AA[i][j][k])
 + 0.0015 * (BB[i + 1][j][k] - BB[i][j][k]);
 }
 }
 }
 #pragma acc kernels loop
 for (i = 0; i < NX - 1; i++) {
 for (j = 0; j < NY - 1; j++) {
 for (k = 1; k < NZ - 1; k++) {
 C[i][j][k] = C[i][j][k]
 - 0.0015 * (CC[i + 1][j][k] - CC[i][j][k])
 + 0.0015 * (AA[i][j + 1][k] - AA[i][j][k]);
 }
 }
 }
/* Bring the arrays the host reads/writes below back from the device. */
#pragma acc update self(A,B,BB)
/* Host-side analytic source current and injection into A and B. */
for (m = 10; m < NZ - 1; m++) {
 tr = t - (double)(m)*5 / vr;
 if (tr <= 0.)
 cu[m] = 0.;
 else {
 w1 = (tr / 0.25e-6)*(tr / 0.25e-6);
 w2 = (tr / 2.e-6)*(tr / 2.e-6);
 cu[m] = 10700. / 0.639*w1 / (w1 + 1.)*exp(-tr / 2.5e-6) + 6500. / 0.867*w2 / (w2 + 1.)*exp(-tr / 230.e-6);
 cu[m] = (exp((double)(m)*-5. / 2000.))*cu[m];
 }
 A[10][60][m] = -cu[m] / (4.*5.);
 A[10][59][m] = cu[m] / (4.*5.);
 B[10][60][m] = cu[m] / (4.*5.);
 B[9][60][m] = -cu[m] / (4.*5.);
 }
 comp = ((double)(l + 1) / 737)*100.;
 printf("Computation: %4.3f %% completed \r", comp);
 fprintf(file, "%e, %e, %e, %e \n", t*1e6, cu[30] / 1000., -BB[30][60][10] / 1000., -(B[109][60][10] + B[110][60][10]) / 2.);
 t = t + dt;
/* Push the host-side source injection back into the device copies. */
#pragma acc update device(A,B)
 } // End l loop
 } // END OPENACC DATA REGION
 fclose(file);
 return 0;
 }