OpenACC declare and update directives

Hello, I found a problem, connected with OpenACC declare and update directives. I use delare create to allocate memory on GPU (3 arrays of n float elements) and update device to update two of them, but not updating the third, which is used only for result storing. Then perform calculations and use update host for the third array, it shows wrong results. But if I updated it before, with two input arrays, it works fine. I can send you code to test it but it’s proprietary.

Hi i_alex2004,

Without using your proprietary code, can you write-up a small example which shows how you are using the “declare” and “update” directives?

Thanks,
Mat

Hello, here is a bit cut code, I was talking about:

// (c) 2012 Dmitry Mikushin, University of Lugano
// (c) 2012 Alexey Ivakhnenko, Applied Parallel Computing LLC

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

// Memory alignment, for vectorization.
// 4096 should be best for memory transfers over PCI-E.
#define MEMALIGN 4096

int wave13pt(int nx, int ny, int ns,
	const real c0, const real c1, const real c2,
	real* restrict w0, real* restrict w1, real* restrict w2)
{
#if defined(_OPENACC)
	size_t szarray = (size_t)nx * ny * ns;
	#pragma acc kernels loop independent,  present(w0[0:szarray], w1[0:szarray], w2[0:szarray])
#endif
	for (int k = 2; k < ns - 2; k++)
	{
#if defined(_OPENACC)
		#pragma acc loop independent
#endif
		for (int j = 2; j < ny - 2; j++)
		{
#if defined(_OPENACC)
			#pragma acc loop independent
#endif
			for (int i = 2; i < nx - 2; i++)
			{
				int idx=i+ nx * j + nx * ny * k;
				w2[idx] = 1;
			}
		}
	}

	return 0;
}

#define parse_arg(name, arg) \
	int name = atoi(arg); \
	if (name < 0) \
	{ \
		printf("Value for " #name " is invalid: %d\n", name); \
	}

#define real_rand() ((real)(rand() / (double)RAND_MAX))

int main(int argc, char* argv[])
{
	if (argc != 5)
	{
		printf("Usage: %s <nx> <ny> <ns> <nt>\n", argv[0]);
		//return 0;
	}

	parse_arg(nx, argv[1]);
	parse_arg(ny, argv[2]);
	parse_arg(ns, argv[3]);
	parse_arg(nt, argv[4]);

	real c0 = real_rand();
	real c1 = real_rand();
	real c2 = real_rand();

	printf("c0 = %f, c1 = %f, c2 = %f\n", c0, c1, c2);

	size_t szarray = (size_t)nx * ny * ns;
	size_t szarrayb = szarray * sizeof(real);

	real* w0 = memalign(MEMALIGN, szarrayb);
	real* w1 = memalign(MEMALIGN, szarrayb);
	real* w2 = memalign(MEMALIGN, szarrayb);

	real mean = 0.0f;
	for (int i = 0; i < szarray; i++)
	{
		w0[i] = real_rand();
		w1[i] = real_rand();
		mean += w0[i] + w1[i];
	}
	printf("Initial mean = %f\n", mean / szarray / 3);

	if (!w0 || !w1 || !w2)
	{
		printf("Error allocating memory for arrays: %p, %p, %p\n", w0, w1, w2);
		//return 0;
	}

	
#if defined(_OPENACC)	
	#pragma acc declare create (w0[szarray],w1[szarray])
	#pragma acc declare create (w2[szarray])
#endif
	// Transfer data from host to device and leave it there,
	// i.e. do not allocate deivce memory buffers.
#if defined(_OPENACC)
	#pragma acc update device(w0[0:szarray], w1[0:szarray])
#endif

	// Perform data processing iterations, keeping all data
	// on device.
	{			
		for (int it = 0; it < nt; it++)
		{
			wave13pt(nx, ny, ns, c0, c1, c2, w0, w1, w2);
		}
	}

	// Transfer output data back from device to host.
#if defined(_OPENACC)
	#pragma acc update host (w2[0:szarray])
#endif

	// Deallocate device data buffers.
	mean = 0.0f;
	for (int i = 0; i < szarray; i++)
	{
		mean += w2[i];
	}
	printf("Final mean = %f\n", mean / szarray / 3);

	free(w0);
	free(w1);
	free(w2);

	return 0;
}

Here is the compiler log and results of OpenACC version and CPU version:

[aivahnenko@tesla-apc wave13pt]$ make -f makefile.acc
pgcc -Dreal=float -c99 -acc -Minfo -Minline -ta=nvidia wave13pt.c -o wave13pt.acc -lrt
wave13pt:
     18, Generating present(w2[0:szarray])
         Generating present(w1[0:szarray])
         Generating present(w0[0:szarray])
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
     20, Loop is parallelizable
     25, Loop is parallelizable
     30, Loop is parallelizable
         Accelerator kernel generated
         25, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         30, #pragma acc loop gang, vector(64) /* blockIdx.x threadIdx.x */
             CC 1.0 : 13 registers; 52 shared, 8 constant, 0 local memory bytes
             CC 2.0 : 24 registers; 0 shared, 68 constant, 0 local memory bytes
main:
     94, Generating create(w1[0:szarray])
         Generating create(w0[0:szarray])
     99, Generating create(w2[0:szarray])
    105, Generating update device(w1[0:szarray])
         Generating update device(w0[0:szarray])
    117, Generating update host(w2[0:szarray])
[aivahnenko@tesla-apc wave13pt]$ ./wave13pt.acc 512 512 512 1
c0 = 0.840188, c1 = 0.394383, c2 = 0.783099
Initial mean = 0.083333
Final mean = 78.922485


[aivahnenko@tesla-apc wave13pt]$ make -f makefile.gcc
gcc -Dreal=float -std=c99 wave13pt.c -o wave13pt  -lrt
[aivahnenko@tesla-apc wave13pt]$ ./wave13pt 512 512 512 1
c0 = 0.840188, c1 = 0.394383, c2 = 0.783099
Initial mean = 0.083333
Final mean = 0.041667

As I told before, if I replace: #pragma acc update device(w0[0:szarray], w1[0:szarray])
with:
#pragma acc update device(w0[0:szarray], w1[0:szarray], w2[0:szarray])
it works fine.

Hi i_alex2004,

When data is created on the device, it is not automatically initialized. Since your code does not set w2’s halo on the GPU, when it’s copied back, you’re copying back uninitialised values. The problem is the same on the CPU, but it just happens that w2’s data is zero. Though, this is not guaranteed.

To fix, initialize w2.

....
#if defined(_OPENACC)
   #pragma acc update device(w0[0:szarray], w1[0:szarray])
#endif

#pragma acc kernels loop
   for (int i = 0; i < szarray; ++i) {
        w2[i] = 0.0f;
   }
....
% pgcc -Dreal=float -c99 -acc -Minfo -Minline -ta=nvidia wave13pt.c -o wave13pt.acc -lrt -V12.10
wave13pt:
     19, Generating present(w2[0:szarray])
         Generating present(w1[0:szarray])
         Generating present(w0[0:szarray])
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
     21, Loop is parallelizable
     26, Loop is parallelizable
     31, Loop is parallelizable
         Accelerator kernel generated
         26, #pragma acc loop gang /* blockIdx.y */
         31, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             CC 1.0 : 13 registers; 52 shared, 4 constant, 0 local memory bytes
             CC 2.0 : 23 registers; 0 shared, 68 constant, 0 local memory bytes
main:
     95, Generating create(w1[0:szarray])
         Generating create(w0[0:szarray])
    100, Generating create(w2[0:szarray])
    103, Generating update device(w1[0:szarray])
         Generating update device(w0[0:szarray])
         Generating present_or_create(w2[0:szarray])
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
    104, Loop is parallelizable
         Accelerator kernel generated
        104, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             CC 1.0 : 7 registers; 36 shared, 4 constant, 0 local memory bytes
             CC 2.0 : 10 registers; 0 shared, 52 constant, 0 local memory bytes
    123, Generating update host(w2[0:szarray])
% wave13pt.acc 512 512 512 1
c0 = 0.840188, c1 = 0.394383, c2 = 0.783099
Initial mean = 0.083333
Final mean = 0.041667

Hope this helps,
Mat

Hi, Mat
w2 is initialized in int wave13pt function as one can see:

for (int i = 2; i < nx - 2; i++) 
         { 
            int idx=i+ nx * j + nx * ny * k; 
            w2[idx] = 1; 
         }

But it doesn’t help.

w2 is initialized in int wave13pt function as one can see

No, you’re only initializing the interior of the array since your starting index is “2” and ending index is nx-2. The halo region (index 0,1,nx-1) will contain garbage values. Hence when the full array is copied back, and you sum all the elements, the garbage values give you incorrect results.

  • Mat

Oh, now I see. Thanks a lot, Mat!