output is wrong when i use OpenACC(Data Clauses)

I wanna optimize this code by OpenACC, but the output computations are zero.I would be thankful the opportunity to help me in this way and use your guidances to achieve success and solve my trouble.

King regards,
Mohammadi

#include <stdio.h>
  #include <math.h>
  #include <stdlib.h>
  #include <assert.h>
  #include <openacc.h>
  #include<time.h>
  #include <string.h>
  #include <malloc.h>

  #define NX 4
  #define NY 4
  #define NZ 4
  int main(void)
  {
  int i, j,p, k,m;
  static double A[NX][NY][NZ]={0.} ,B[NX][NY][NZ]={0.},C[NX][NY][NZ]={0.},D[NX][NY][NZ]={0.};
  FILE *file;
  file = fopen("B-and-A.csv", "w");
#pragma acc data copyin(C,D),copy(A,B)
 {
for (p = 0; p <=5; p++) {
    #pragma acc kernels
    for ( i = 1; i < NX - 1; i++ ) {
            for ( j = 0; j < NY - 1; j++ ) {
                for ( k = 0; k < NZ - 1; k++ ) {
                 A[i][j][k] = A[i][j][k] + 1.*( B[i][j+1][k] + D[i][j][k] );
                        }
                    }
                }
#pragma acc kernels
 for ( i = 1; i < NX - 1; i++ ) {
        for ( j = 0; j < NY - 1; j++ ) {
            for ( k = 0; k < NZ - 1; k++ ) {
             B[i][j][k] = B[i][j][k]+ 1.*( A[i][j+1][k] + D[i][j][k] );
            }
        }
    }

    for (m = 0; m < NZ - 1; m++) {
        A[0][m][m] = -25. ;
        A[2][m][m] = 52. ;
        B[0][m][m] = 15. ;
        B[2][m][m] = -55. ;
                                }
#pragma acc update self(B)
fprintf(file,"%e\n",B[2][2][2]);
printf("%e\n",B[2][2][2]);
}
}
  fclose(file);
}

When you perform an “update self(B)”, you are copying the device values of B to the host copy of B. Any changes you have made to the host copy before this point will be lost.

For this code, you will need to update B before the loop executed on the host, and then update the device copy after the host loop. Alternatively, you can offload the loop to the GPU so all computation is done on the device.

Option #1:

// Update the host copies before executing on the host
#pragma acc update self(A,B)
                        // This loop is executed on the host
                        for (m = 0; m < NZ - 1; m++) {
                                A[0][m][m] = -25. ;
                                A[2][m][m] = 52. ;
                                B[0][m][m] = 15. ;
                                B[2][m][m] = -55. ;
                        }
// To keep the device and host copies coherent, update the device copies
#pragma acc update device(A,B)

Option #2:

// Or offload the loop to the device
#pragma acc kernels
                        for (m = 0; m < NZ - 1; m++) {
                                A[0][m][m] = -25. ;
                                A[2][m][m] = 52. ;
                                B[0][m][m] = 15. ;
                                B[2][m][m] = -55. ;
                        }
#pragma acc update self(B)

Thank you kindly