optimize runtime

Hello.
The attached code is miniature and simplified part of a complicated code. What strategies, solutions and instructions do you recommend in order to reduce the computational time by open ACC?
Considering that a slight decrease in accuracy of output computations is acceptable.
I am looking forward to hear you soon.
King regards,
Sajjad Mohammadi

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>

#define NX 201
#define NY 101
#define NZ 201
int main(void)
{
clock_t start, end;
double cpu_time_used;
start = clock();
int  i, j, k, l, m;
static double   tr, w;
static double  dt = 9.5e-9, t;
static double cu[NZ];
static double AA[NX][NY][NZ] , CC[NX][NY][NZ] , BB[NX][NY][NZ] ;
static double A[NX][NY][NZ] , B[NX][NY][NZ] , C[NX][NY][NZ] ;
FILE *file;
file = fopen("BB-and-A.csv", "w");
t = 0.;
#pragma acc  data  copy(B ,A , C,AA , CC,BB,cu )
{
for (l = 1; l < 255; l++) {
#pragma acc kernels 
for (i = 1; i < NX - 1; i++) {
         for (j = 0; j < NY - 1; j++) {
            for (k = 1; k < NZ - 1; k++) {
               A[i][j][k] = A[i][j][k]
               + 1. * (B[i][j][k] - AA[i][j][k - 1]);

                }
        }
}
#pragma acc kernels 
for (i = 1; i < NX - 1; i++) { /* BB */
      for (j = 1; j < NY - 1; j++) {
         for (k = 0; k < NZ - 1; k++) {
            B[i][j][k] =  B[i][j][k]
            + 1.* (BB[i][j][k] - A[i - 1][j][k]);

               }
            }
         }
#pragma acc kernels
 for (m = 1; m < NZ - 1; m++) {
tr = t - (double)(m)*5 / 1.5e8;
if (tr <= 0.)
cu[m] = 0.;
else {
w = (tr / 0.25e-6)*(tr / 0.25e-6);
cu[m] =1666*w / (w + 1.)*exp(-tr / 2.5e-6) ;
cu[m] = 2*cu[m];
}
A[10][60][m] = -cu[m];
}
#pragma acc update self(B)
fprintf(file, "%e, %e \n", t*1e6,  -B[22][60][10] );
t = t + dt;
}
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("\n cpu time %5.5f sec ", cpu_time_used);
fclose(file);
}

Hi mohammadi,

Looking at the output from the command line profiler, you can see that most of your time is being spend copying “B” back to the host.

If your larger application does not need these intermediary values, then it’s best to delay copying the data back until after the “l” loop.

% setenv PGI_ACC_TIME 1
% a.out

 cpu time 2.67000 sec

Accelerator Kernel Timing data
test.c
  main  NVIDIA  devicenum=0
    time(us): 805,729
    27: data region reached 2 times
        27: data copyin transfers: 13
             device time(us): total=15,969 max=1,382 min=7 avg=1,228
        68: data copyout transfers: 13
             device time(us): total=15,458 max=1,509 min=22 avg=1,189
    30: compute region reached 254 times
        33: kernel launched 254 times
            grid: [7x199x25]  block: [32x4]
             device time(us): total=62,497 max=253 min=243 avg=246
            elapsed time(us): total=84,423 max=3,395 min=267 avg=332
    40: compute region reached 254 times
        43: kernel launched 254 times
            grid: [7x199x25]  block: [32x4]
             device time(us): total=62,736 max=250 min=244 avg=246
            elapsed time(us): total=77,372 max=3,435 min=261 avg=304
    50: compute region reached 254 times
        51: kernel launched 254 times
            grid: [2]  block: [128]
             device time(us): total=764 max=4 min=2 avg=3
            elapsed time(us): total=6,038 max=90 min=18 avg=23
    63: update directive reached 254 times
        63: data copyout transfers: 508
             device time(us): total=648,305 max=3,145 min=1,209 avg=1,276