Hi!
I’m testing the OpenACC directives with the following function:
/*
 * update - recompute the macroscopic fields of the lattice from the
 * distribution values in fp.
 *
 * For every cell (x, y, z) with solid[x][y][z] == 0:
 *   rho[x][y][z] is set to the sum of fp[x][y][z][0..18];
 *   ux, uy, uz   are set to signed sums of selected fp entries divided by
 *                rho[x][y][z].
 * Cells with a non-zero solid flag are left untouched.
 *
 * Parameters:
 *   solid       per-cell flag, indexed [x][y][z]; non-zero cells are skipped
 *   fp          distribution values, indexed [x][y][z][0..18]
 *   rho         output: per-cell sum of the 19 distribution values
 *   ux, uy, uz  output: per-cell velocity components
 *   nx, ny, nz  loop extents in x, y and z
 *   mass        accumulator, reset to 0 at the start of every pass
 *   time_total  the pass is repeated for t = 0 .. time_total inclusive
 *
 * NOTE(review): mass is passed by value, so the total accumulated here is
 * not visible to the caller after the function returns.
 *
 * NOTE(review): rho is used as a divisor without a zero check.
 */
void update(int ***restrict solid, float ****restrict fp, float ***restrict rho,
            float ***restrict ux, float ***restrict uy, float ***restrict uz,
            int nx, int ny, int nz, float mass, int time_total)
{
    int x, y, z, t;

    for (t = 0; t <= time_total; t++)
    {
        mass = 0.0;

        /*
         * NOTE(review): OpenACC C subarrays are written [start:length]. The
         * ranges below name nx-1, ny-1, nz-1 and 18 elements, while the loops
         * index nx, ny, nz cells and fp[...][0..18]. Widening them here
         * requires the enclosing data region to hold at least the same ranges.
         */
        #pragma acc kernels loop present(solid[0:nx-1][0:ny-1][0:nz-1], fp[0:nx-1][0:ny-1][0:nz-1][0:18], rho[0:nx-1][0:ny-1][0:nz-1], ux[0:nx-1][0:ny-1][0:nz-1], uy[0:nx-1][0:ny-1][0:nz-1], uz[0:nx-1][0:ny-1][0:nz-1])
        for (x = 0; x < nx; x++)
        {
            for (y = 0; y < ny; y++)
            {
                for (z = 0; z < nz; z++)
                {
                    if (!solid[x][y][z])
                    {
                        /* Density: sum of all 19 distribution values of the cell. */
                        rho[x][y][z] = fp[x][y][z][0]+fp[x][y][z][1]+fp[x][y][z][2]+fp[x][y][z][3]+fp[x][y][z][4]+fp[x][y][z][5]+fp[x][y][z][6]+fp[x][y][z][7]+fp[x][y][z][8]+fp[x][y][z][9]+fp[x][y][z][10]+fp[x][y][z][11]+fp[x][y][z][12]+fp[x][y][z][13]+fp[x][y][z][14]+fp[x][y][z][15]+fp[x][y][z][16]+fp[x][y][z][17]+fp[x][y][z][18];

                        /* Velocity components: signed sums of distribution values over the density. */
                        ux[x][y][z] = (fp[x][y][z][1]+fp[x][y][z][2]+fp[x][y][z][8]-fp[x][y][z][4]-fp[x][y][z][5]-fp[x][y][z][6]+fp[x][y][z][15]+fp[x][y][z][18]-fp[x][y][z][16]-fp[x][y][z][17])/rho[x][y][z];
                        uy[x][y][z] = (fp[x][y][z][2]+fp[x][y][z][3]+fp[x][y][z][4]-fp[x][y][z][6]-fp[x][y][z][7]-fp[x][y][z][8]+fp[x][y][z][9]+fp[x][y][z][14]-fp[x][y][z][11]-fp[x][y][z][12])/rho[x][y][z];
                        uz[x][y][z] = (fp[x][y][z][9]+fp[x][y][z][10]+fp[x][y][z][11]-fp[x][y][z][12]-fp[x][y][z][13]-fp[x][y][z][14]+fp[x][y][z][15]+fp[x][y][z][16]-fp[x][y][z][17]-fp[x][y][z][18])/rho[x][y][z];

                        /* Accumulated across all non-solid cells of this pass. */
                        mass += rho[x][y][z];
                    }
                }
            }
        }
    }
}
The function is called from main as follows:
/*
 * Keep all lattice arrays on the device for the duration of the update()
 * call. OpenACC C subarrays are written [start:length], so each dimension
 * lists its full extent: nx, ny, nz cells, and 19 distribution values per
 * cell for f (update() reads indices 0..18).
 */
#pragma acc data copy(s[0:nx][0:ny][0:nz], f[0:nx][0:ny][0:nz][0:19], r[0:nx][0:ny][0:nz], uxx[0:nx][0:ny][0:nz], uyy[0:nx][0:ny][0:nz], uzz[0:nx][0:ny][0:nz])
{
update(s, f, r, uxx, uyy, uzz, nx, ny, nz, mass, time_total);
}
The info messages and error I got are the following:
PGI$ pgcc -acc -Minfo=accel -fast lbm3Dacc4.c
NOTE: your trial license will expire in 10 days, 6.38 hours.
update:
268, Generating present(uz[0:nx-1][0:ny-1][0:nz-1])
Generating present(uy[0:nx-1][0:ny-1][0:nz-1])
Generating present(ux[0:nx-1][0:ny-1][0:nz-1])
Generating present(rho[0:nx-1][0:ny-1][0:nz-1])
Generating present(fp[0:nx-1][0:ny-1][0:nz-1][0:18])
Generating present(solid[0:nx-1][0:ny-1][0:nz-1])
Generating compute capability 1.0 binary
Generating compute capability 2.0 binary
269, Loop is parallelizable
271, Loop is parallelizable
273, Loop is parallelizable
Accelerator kernel generated
269, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.z */
271, #pragma acc loop gang, vector(4) /* blockIdx.x threadIdx.y */
273, #pragma acc loop vector(16) /* threadIdx.x */
CC 1.0 : 24 registers; 1136 shared, 72 constant, 0 local memory bytes; 33% occupancy
CC 2.0 : 32 registers; 1032 shared, 136 constant, 0 local memory bytes; 66% occupancy
282, Sum reduction generated for mass
main:
368, Generating copy(uzz[0:nx-1][0:ny-1][0:nz-1])
Generating copy(uyy[0:nx-1][0:ny-1][0:nz-1])
Generating copy(uxx[0:nx-1][0:ny-1][0:nz-1])
Generating copy(r[0:nx-1][0:ny-1][0:nz-1])
Generating copy(f[0:nx-1][0:ny-1][0:nz-1][0:18])
Generating copy(s[0:nx-1][0:ny-1][0:nz-1])
PGI$ lbm3Dacc4.exe
Calculating…
call to cuMemcpyDtoH returned error 999: Unknown
CUDA driver version: 4000
I tried to follow the program structure of the OpenACC example file “acc_c3a.c” strictly. Note that I allocated the arrays with calloc rather than malloc.
I would appreciate any advice about this error. Regards!