# Optimize Data Locality (output is wrong when I use OpenACC)

I’m trying to accelerate this code with OpenACC, but the computed output is all zeros. I would appreciate any guidance on what is wrong and how to fix it.

I look forward to hearing from you soon.
Kind regards,

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
// #include <cuda_runtime_api.h>

#define NX 4
#define NY 4
#define NZ 4

/*
 * Question's original (non-working) program, restored from a listing whose
 * "[i]" subscripts, leading "+" operators and straight quotes had been
 * mangled by the forum markup.
 *
 * Each of the three passes adds 2 plus four neighbouring B/C values to the
 * interior of A inside an OpenACC kernels region, then writes A[2][2][2]
 * to "B-and-A.csv".
 *
 * The OpenACC behaviour the question asks about is intentionally preserved
 * here; see the NOTE(review) comments for why the printed values are wrong.
 */
int main(void)
{
    int i, j, p, k;

    /* "{10.}" sets only element [0][0][0]; every other element is zero. */
    /* NOTE(review): with extents of NX-1 the accesses B[i][j+1][k] and
     * C[i][j][k+1] below reach index NY-1 / NZ-1, one past the end. */
    static double A[NX-1][NY-1][NZ-1] = {10.},
                  B[NX-1][NY-1][NZ-1] = {10.},
                  C[NX-1][NY-1][NZ-1] = {10.};
    FILE *file;
    file = fopen("B-and-A.csv", "w");

    /* NOTE(review): copyout(A) does not initialise A on the device, yet A
     * is read on the right-hand side of the update below. */
    #pragma acc data copyin(B,C),copyout(A)
    {
        for (p = 0; p <= 2; p++) {
            #pragma acc kernels loop private(i,j,k)
            for (i = 1; i < NX - 1; i++) {
                for (j = 0; j < NY - 1; j++) {
                    for (k = 0; k < NZ - 1; k++) {
                        A[i][j][k] = A[i][j][k] + 2.
                                   + 1.*( B[i][j+1][k] + C[i][j][k] )
                                   + 1.*( C[i][j][k+1] + B[i][j][k] );
                    }
                }
            }

            /* NOTE(review): the data region is still open here, so the host
             * copy of A has not been refreshed from the device. */
            fprintf(file,"%e\n",A[2][2][2]);
        }
    }
    fclose(file);
}

There are two problems with your OpenACC directives.

First, since “A” is in a “copyout” clause, it is not initialized on the device. But you’re using “A” on both sides of the equation, so you need to put “A” in a “copy” clause instead.

Second, your data region spans the “p” loop, so “A” isn’t copied back to the host until after the loop ends. However, you’re printing “A” on each iteration of “p”. Hence, you’re not printing the updated value from the device. To fix this, add a “#pragma acc update self(A)” before you print the value.

``````
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
// #include <cuda_runtime_api.h>

#define NX 4
#define NY 4
#define NZ 4

/*
 * Corrected OpenACC version of the stencil-style update.
 *
 * Runs three passes; each pass adds 2 plus four neighbouring B/C values to
 * every A[i][j][k] with i in [1, NX-2], j in [0, NY-2], k in [0, NZ-2] on
 * the accelerator, then prints A[2][2][2] to stdout and to "B-and-A.csv".
 *
 * Returns EXIT_FAILURE if the output file cannot be opened, 0 otherwise.
 */
int main(void)
{
    int i, j, p, k;

    /* "{10.}" sets only element [0][0][0]; every other element is zero. */
    static double A[NX][NY][NZ] = {10.},
                  B[NX][NY][NZ] = {10.},
                  C[NX][NY][NZ] = {10.};

    FILE *file = fopen("B-and-A.csv", "w");
    if (file == NULL) {
        /* Without this check a failed open would pass NULL to fprintf. */
        perror("B-and-A.csv");
        return EXIT_FAILURE;
    }

    /* A is both read and written on the device, so it needs copy (not
     * copyout) to be initialised from the host values on entry. */
    #pragma acc data copyin(B,C),copy(A)
    {
        for (p = 0; p <= 2; p++) {
            #pragma acc kernels  loop
            for (i = 1; i < NX - 1; i++) {
                for (j = 0; j < NY - 1; j++) {
                    for (k = 0; k < NZ - 1; k++) {
                        A[i][j][k] = A[i][j][k] + 2.
                                   + 1.*( B[i][j+1][k] + C[i][j][k] )
                                   + 1.*( C[i][j][k+1] + B[i][j][k] );
                    }
                }
            }

            /* The data region is still open, so refresh the host copy of A
             * explicitly before reading it on the host. */
            #pragma acc update self(A)
            fprintf(file,"%e\n",A[2][2][2]);
            printf("%e\n",A[2][2][2]);
        }
    }

    fclose(file);
    return 0;
}

% pgcc test.c -ta=tesla:cc60 -Minfo=accel
main:
25, Generating copy(A[:][:][:])
Generating copyin(C[:][:][:],B[:][:][:])
29, Loop is parallelizable
30, Loop is parallelizable
31, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
29, #pragma acc loop gang /* blockIdx.y */
30, #pragma acc loop gang, vector(4) /* blockIdx.z threadIdx.y */
31, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
39, Generating update self(A[:][:][:])
% a.out
2.000000e+00
4.000000e+00
6.000000e+00
``````

Hope this helps,
Mat