C array of pointers in OpenACC

Hi :)

I want to use array of pointers in OpenACC region.
So I trying to using OpenACC directives like below code.

int func(int *restrict in[], int *restrict out[], int size)
{
     int i;
     int j;
     
     #pragma acc data present[in[0:size], out[0:size])
     for(i=0; i<10; i++)
          for(j=0; j<size; j++)
               out[i][j] = in[i][j];
}

int main()
{
     int *in[10], *out[10];
     int i;

     /* malloc memories for in[], out[] elements, initial random values, blah blah...*/
  
    #pragma acc parallel loop private(in, out)
    {
     for(i=0; i<100; i++)
         func(in, out, 10);
     }

    /* blah blah... */
}

Is it right using openacc directives like that?

If not, What do I have to do using point of arrays in openacc region?



Always thanks for your help, PGI :)

Is it right using openacc directives like that?

Well…no.

Let me turn this around on you, what are you trying to accomplish and do you have a host version which represents what you’re trying to do?

There’s a few syntactical errors here. First you need to add a routine directive to “func” so the compiler know to create a device version of the code. Also, data directive can only be used from host code so it doesn’t make sense to use it inside of “func”.

I assume you know that the outer for loop in main is not parallelizable given that every thread would be accessing the same elements of “in” and “out”. This is why you put these variables in a private clause. Ignoring the fact that this use case doesn’t make sense (I know it’s just a test), private variables are not initialized so you’d be dereferencing garbage and cause your program to segv. The “firstprivate” clause will initialize the data, but the pointers would be host address so again when you access it data, the program would segv.

Again, please post a full host example of what you’re trying to do and we try and make it work. Below I wrote an example of using basic 2-D arrays which may or may not be what you’re looking for.

  • Mat
% cat test.c
#include<stdio.h>
#include<stdlib.h>
#ifdef _OPENACC
#include<openacc.h>
#endif
#define N 10
#define SIZE 100

#pragma acc routine(func) seq
int func(int *restrict in[], int *restrict out[], int i, int j)
{
      out[i][j] = in[i][j];
}

 int main()
 {
      int **in, **out;
      int i,j;

      in = (int**) malloc(sizeof(int*)*SIZE);
      out = (int**) malloc(sizeof(int*)*SIZE);
      for(i=0; i<SIZE; i++) {
         in[i] = (int*) malloc(sizeof(int)*N);
         out[i] = (int*) malloc(sizeof(int)*N);
         for (j=0; j<N;++j) {
           in[i][j] = i*N+j;
         }
      }

     #pragma acc parallel loop copyin(in[0:SIZE][0:N]), copyout(out[0:SIZE][0:N])
     for(i=0; i<SIZE; i++) {
        for(j=0; j < N; j++)  {
          func(in, out, i, j);
        }
      }
     printf("%d %d %d\n",out[1][3],out[34][5],out[99][9]);
 }

% pgcc -acc -Minfo=accel test.c; a.out
func:
     11, Generating acc routine seq
main:
     30, Generating copyin(in[:100][:10])
         Generating copyout(out[:100][:10])
         Accelerator kernel generated
         Generating Tesla code
         31, #pragma acc loop gang /* blockIdx.x */
         32, #pragma acc loop vector(128) /* threadIdx.x */
             Interchanging generated strip mine loop outwards
             Interchanging generated vector loop outwards
     32, Loop is parallelizable
13 345 999

Thanks for your reply and apology for my parsimonious explanation.
I’m not very good in English, so my explanation may occur some confusion. I really sorry for that. ;_;

What I really want to do is modify values in 1-D array ‘A_arr’ values with
pointers ‘Aarr_ptr’ which is directing A_arr’s some points.

This code is what I exactly want to do.

#include <stdio.h>
#include <stdlib.h>

#define N 8
#define SIZE 1024

int func(int *restrict inptr[], int *restrict outptr[], int i, int size)
{
	int j;

	for(j=0; j<size; j++)
	{
		outptr[i][j] = inptr[i][j];
	}	
}

int main(void)
{
	int *inptr[N], *outptr[N];
	int *in_buffer, *out_buffer;
	int i, j, cnt;

	cnt = 0;
	in_buffer  = (int *)malloc(sizeof(int)*SIZE);
	out_buffer = (int *)malloc(sizeof(int)*SIZE);
	memset(in_buffer, 10, sizeof(int)*SIZE);
	memset(out_buffer, 20, sizeof(int)*SIZE);

	for(i=0; i<N; i++)
	{
		inptr[i]  =  in_buffer + (sizeof(int)*SIZE/N)*i;
		outptr[i] = out_buffer + (sizeof(int)*SIZE/N)*i;
	}

	for(i=0; i<N; i++)
	{
		func(inptr, outptr, i, sizeof(int)*SIZE/N);
	}

	for(i=0; i<SIZE; i++)
	{
		if(in_buffer[i]!=out_buffer[i])
		{
			cnt++;			
		}
	}
	printf("err_cnt=%d\n", cnt);
}

And, I modified above code like below code to OpenACC style.
But this code makes ‘cuStreamSynchronize error 700’

#include <stdio.h>
#include <stdlib.h>
#if defined (_OPENACC)
#include <openacc.h>
#endif

#define N 8
#define SIZE 1024

#pragma acc routine(func)
int func(int *restrict inptr[], int *restrict outptr[], int i, int size)
{
	int j;

	for(j=0; j<size; j++)
	{
		outptr[i][j] = inptr[i][j];
	}	
}

int main(void)
{
	int *inptr[N], *outptr[N];
	int *in_buffer, *out_buffer;
	int i, j, cnt;

	cnt = 0;
	in_buffer  = (int *)malloc(sizeof(int)*SIZE);
	out_buffer = (int *)malloc(sizeof(int)*SIZE);
	memset(in_buffer, 10, sizeof(int)*SIZE);
	memset(out_buffer, 20, sizeof(int)*SIZE);

	for(i=0; i<N; i++)
	{
		inptr[i]  =  in_buffer + (sizeof(int)*SIZE/N)*i;
		outptr[i] = out_buffer + (sizeof(int)*SIZE/N)*i;
	}

	#pragma acc parallel loop copy(inptr[0:N], outptr[0:N])
	for(i=0; i<N; i++)
	{
		func(inptr, outptr, i, sizeof(int)*SIZE/N);
	}

	for(i=0; i<SIZE; i++)
	{
		if(in_buffer[i]!=out_buffer[i])
		{
			cnt++;			
		}
	}

	printf("err_cnt=%d\n", cnt);
}

$ pgcc -acc -Minfo=all -Minline =ta=tesla:cc50 test.c
func:
     12, Generating acc routine seq
         Generating Tesla code
     15, Loop is parallelizable
         Memory copy idiom, loop replaced by call to __c_mcopy4
main:
     39, Generating copy(inptr[:][:8],outptr[:][:8])
         Accelerator kernel generated
         Generating Tesla code
         40, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     42, func inlined, size=6, file test3.c (12)
          15, Complex loop carried dependence of ->,->-> prevents parallelization
$ ./a.out
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

In my think, event though ‘func’ function declared as a acc routine,
‘func’ function CANNOT access to ‘in_buffer’ and ‘out_buffer’.
Because there isn’t any directives or arguments to tell ‘func’ or OpenACC region ‘in_buffer’ and ‘out_buffer’ is exist.

I know I can use acc_malloc, deviceptr directive and host_data directive or somethings in this case which is malloc on GPU memory space.
But, is there any way to handling HOST memory space handling way like that?


Always I really thanks for your answer.

I’m not very good in English, so my explanation may occur some confusion.

No worries. As an English-only speaker, I’ve always admired those who are multi-lingual no matter their skill level.

Ok, this wasn’t too difficult to offload. The trick is that you need to set the device pointers the same way that you set the host pointers. I also changed the pointer arrays to be “size_t” instead of “int *”. When “int * [10]” is used, the compiler will treat the variable as a 2-D int array rather than a 1-D array of pointers. 2-D arrays require special support which we apply to all cases. Hopefully in the future we can find a way to determine contextually that you really just want an array of pointers and don’t need the extra support. Until then, we’ll need to work around it in these cases.

Here’s the example:

% cat testptr.c
#include <stdio.h>
#include <stdlib.h>
#if defined (_OPENACC)
#include <openacc.h>
#endif

#define N 8
#define SIZE 1024

#pragma acc routine(func)
int func(int *restrict inptr[], int *restrict outptr[], int i, int size)
{
   int j;

   for(j=0; j<size; j++)
   {
      outptr[i][j] = inptr[i][j];
   }
}

int main(void)
{
   size_t inptr[N], outptr[N];
   int *in_buffer, *out_buffer;
   int i, j, cnt;

   cnt = 0;
   in_buffer  = (int *)malloc(sizeof(int)*SIZE);
   out_buffer = (int *)malloc(sizeof(int)*SIZE);
   memset(in_buffer, 10, sizeof(int)*SIZE);
   memset(out_buffer, 20, sizeof(int)*SIZE);

// Create the buffers on the device as well as the pointer arrays.
#pragma acc enter data copyin(in_buffer[0:SIZE]) create(out_buffer[0:SIZE])
#pragma acc enter data create(inptr[0:N], outptr[0:N])

   for(i=0; i<N; i++)
   {
      inptr[i]  = (size_t) in_buffer + (sizeof(int)*SIZE/N)*i;
      outptr[i] = (size_t) out_buffer + (sizeof(int)*SIZE/N)*i;
   }
#if defined(_OPENACC)
// Like the host, the pointer arrays need to be populated with the device pointers
   size_t inDev = (size_t) acc_deviceptr(in_buffer);
   size_t outDev = (size_t) acc_deviceptr(out_buffer);
#pragma acc parallel loop present(inptr,outptr)
   for(i=0; i<N; i++)
   {
      inptr[i]  =  (size_t) inDev + (sizeof(int)*SIZE/N)*i;
      outptr[i] =  (size_t) outDev + (sizeof(int)*SIZE/N)*i;
   }
#endif


#pragma acc parallel loop present(inptr,outptr)
   for(i=0; i<N; i++)
   {
      func(inptr, outptr, i, sizeof(int)*SIZE/N);
   }

// Copy back the out_buffer and delete the device data.
#pragma acc exit data delete(in_buffer[0:SIZE]) copyout(out_buffer[0:SIZE])
#pragma acc exit data delete(inptr[0:N], outptr[0:N])

   for(i=0; i<SIZE; i++)
   {
      if(in_buffer[i]!=out_buffer[i])
      {
         cnt++;
      }
   }

   printf("err_cnt=%d\n", cnt);
}

% pgcc -acc -Minfo=accel testptr.c; a.out
PGC-W-0095-Type cast required for this conversion (testptr.c: 58)
PGC-W-0095-Type cast required for this conversion (testptr.c: 58)
func:
     12, Generating acc routine seq
         Generating Tesla code
     15, Loop is parallelizable
main:
     34, Generating enter data create(out_buffer[:1024],in_buffer[:1024])
     35, Generating enter data create(outptr[:],inptr[:])
     46, Generating present(inptr[:],outptr[:])
         Accelerator kernel generated
         Generating Tesla code
         47, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     55, Generating present(inptr[:],outptr[:])
         Accelerator kernel generated
         Generating Tesla code
         56, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     62, Generating exit data copyout(out_buffer[:1024],in_buffer[:1024])
     63, Generating exit data delete(outptr[:],inptr[:])
PGC/x86-64 Linux 15.7-0: compilation completed with warnings
err_cnt=0

Hope this helps,
Mat

Thank you very much for your reply, again.

This reply so useful for me. I wanted to know how to use GPU memory space’s pointer. And I got some hints from your sample code.
I’ll try with this method.

Again, Thank you very much Mat. :)