Optimize runtime

hello
I want to optimize this code by OpenACC, but i encounter with an error.
how can i change num_workers(),vector_length() to Optimize runtime?
I use PGI community edition 17.10 to compile and run .( GeForce GTX 1050 ti hardware)
I would be thankful to help me in this way and use your guidances to achieve success and solve my troubles.
King regards,
Mohammadi

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>

#define NX 201
#define NY 101
#define NZ 201
int main(void)
{
clock_t start, end;
double cpu_time_used;
start = clock();
int  i, j, k, l, m;
static double   tr, w;
static double  dt = 9.5e-9, t;
static double cu[NZ];
static double AA[NX][NY][NZ] , CC[NX][NY][NZ] , BB[NX][NY][NZ] ;
static double A[NX][NY][NZ] , B[NX][NY][NZ] , C[NX][NY][NZ] ;
FILE *file;
file = fopen("BB-and-A.csv", "w");
t = 0.;
#pragma acc  data  copy(B ,A , C,AA , CC,BB,cu )
{
for (l = 1; l < 255; l++) {
#pragma acc kernels,num_workers(32),vector_length(256)
for (i = 1; i < NX - 1; i++) {
         for (j = 0; j < NY - 1; j++) {
            for (k = 1; k < NZ - 1; k++) {
               A[i][j][k] = A[i][j][k]
               + 1. * (B[i][j][k] - AA[i][j][k - 1]);

                }
        }
}
#pragma acc kernels,num_workers(32),vector_length(256)
for (i = 1; i < NX - 1; i++) { /* BB */
      for (j = 1; j < NY - 1; j++) {
         for (k = 0; k < NZ - 1; k++) {
            B[i][j][k] =  B[i][j][k]
            + 1.* (BB[i][j][k] - A[i - 1][j][k]);

               }
            }
         }
#pragma acc kernels,num_workers(32),vector_length(256)
 for (m = 1; m < NZ - 1; m++) {
tr = t - (double)(m)*5 / 1.5e8;
if (tr <= 0.)
cu[m] = 0.;
else {
w = (tr / 0.25e-6)*(tr / 0.25e-6);
cu[m] =1666*w / (w + 1.)*exp(-tr / 2.5e-6) ;
cu[m] = 2*cu[m];
}
A[10][60][m] = -cu[m];
}
#pragma acc update self(B)
fprintf(file, "%e, %e \n", t*1e6,  -B[22][60][10] );
t = t + dt;
}
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("\n cpu time %5.5f sec ", cpu_time_used);
fclose(file);
}

What is the error you are getting?

PGI$ pgcc -acc -ta=tesla:cc60 -Minfo=accel ee.c

ptxas warning : Too many threads per SM specified for entry main_43_gpu, will be
ignored
ptxas warning : Too many threads per SM specified for entry main_51_gpu, will be
ignored
ptxas warning : Too many threads per SM specified for entry main_33_gpu, will be
ignored
PGI$ ee
call to cuLaunchKernel returned error 1: Invalid value
PGI$ ee
call to cuLaunchKernel returned error 1: Invalid value
PGI$

Hi mohammadi sajad,

A CUDA block (which translates to a “gang” in OpenACC) can have up to 1024 threads. Hence in OpenACC, the product of the number of workers and vector length can not be more than 1024. You’re trying to create 8192 threads (32 x 256) and hence the error.

-Mat