Dynamically allocated jagged 2D array field of a C structure

Purpose: Testing deep copy of a user-defined data structure with a dynamically allocated jagged 2D array field

Compiler: PGI Community Edition 18.10
Platform: Windows 10 64-bit
GPU: Quadro P5000 (Mobile)
Command: pgcc -acc -Minfo acc_deepcopy_jagged.c -o a.exe

The source code of acc_deepcopy_jagged.c is:

#include <stdlib.h> 
#include <stdio.h> 
#include <openacc.h>
#include <math.h>

int ROW = 3;

struct A 
{ 
	int n;	        //number of elements in x 
	int **data2D;	
	int *data1D; 
}; 

double sum1D(const struct A *z, const int n, int usegpu) 
{ 
	double sum1D = 0.0;
	/* The disjoint data structure is expected to be present */ 
	#pragma acc parallel loop reduction(+:sum1D) present(z[0:n]) if(usegpu) 
	for (int i = 0; i < n; i++) 
	{ 
		for (int r = 0; r < ROW; r++) 
		{ 
			sum1D += z[i].data1D[r]; 
		} 
	} 
	return sum1D; 
} 

double sum2D(const struct A *z, const int n, int usegpu) 
{ 
	double sum2D = 0.0;
	/* The disjoint data structure is expected to be present */ 
	#pragma acc parallel loop reduction(+:sum2D) present(z[0:n]) if(usegpu) 
	for (int i = 0; i < n; i++) 
	{ 
		for (int r = 0; r < ROW; r++) 
		{ 
			if (r > 0)
			{
				for (int c = 0; c < r; c++) 
				{ 
					sum2D += z[i].data2D[r][c]; 
				} 
			}
		} 
	} 
	return sum2D; 
} 

int main(int argc, char* argv[]) 
{ 
	int n = 2, cnt1D, cnt2D; // cnt1D and cnt2D are counters 

	acc_set_device_num(1, acc_device_nvidia); // use the second GPU

	//Allocate and initialize a disjoint data structure 
	struct A *z; 
	//The top level is an array of struct A objects 
	z = (struct A*) malloc(n * sizeof(struct A)); 

	#pragma acc enter data create(z[0:n]) 
	for (int i = 0; i < n; i++) 
	{ 
		z[i].n = n; 
		#pragma acc update device(z[i].n) 

		z[i].data1D = (int*)malloc(ROW * sizeof(int)); 
		#pragma acc enter data create(z[i].data1D[0:ROW]) 

		z[i].data2D = (int**)malloc(ROW * sizeof(int*)); 
		#pragma acc enter data create(z[i].data2D[0:ROW][0:1])
		for (int r = 0; r < ROW; r++) 
		{ 
			if (r > 0)
			{
				z[i].data2D[r] = (int*)malloc(r * sizeof(int));
			    #pragma acc enter data create(z[i].data2D[r:1][0:r])
			}
		} 

		cnt1D = 0;
		cnt2D = 0; 
		for (int r = 0; r < ROW; r++) 
		{ 
			z[i].data1D[r] = ++cnt1D; 

			if (r > 0)
			{
				for (int c = 0; c < r; c++) 
				{ 
					z[i].data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D 
				} 
				#pragma acc update device(z[i].data2D[r:1][0:r])
			}
		} 
		#pragma acc update device(z[i].data1D[0:ROW]) 
	} 

	//Compute and print host sum 
	printf("host_sum1D=%g\n", sum1D(z, n, 0)); 
	printf("host_sum2D=%g\n", sum2D(z, n, 0)); 

	//compute and print accelerator sum 
	printf("acc_sum1D=%g\n", sum1D(z, n, 1)); 
	printf("acc_sum2D=%g\n", sum2D(z, n, 1)); 

	return 0; 
}

The compiler output is:

sum1D:
     19, Generating present(z[:n])
         Generating implicit copy(sum1D)
         Accelerator kernel generated
         Generating Tesla code
         20, #pragma acc loop gang /* blockIdx.x */
             Generating reduction(+:sum1D)
         22, #pragma acc loop vector(128) /* threadIdx.x */
     22, Loop is parallelizable
sum2D:
     34, Generating present(z[:n])
         Generating implicit copy(sum2D)
         Accelerator kernel generated
         Generating Tesla code
         35, #pragma acc loop gang /* blockIdx.x */
             Generating reduction(+:sum2D)
         37, #pragma acc loop seq
         41, #pragma acc loop vector(128) /* threadIdx.x */
     37, Loop is parallelizable
     41, Loop is parallelizable
main:
     62, Generating enter data create(z[:n])
     68, Generating update device(z->n)
     69, Generating enter data create(z->data1D[:ROW])
     72, Generating enter data create(z->data2D[:ROW][:1])
     78, Generating enter data create(z->data2D[r][:r])
     96, Generating update device(z->data2D[r][:r])
     98, Generating update device(z->data1D[:ROW])

When the executable is run, the output is:

host_sum1D=12
host_sum2D=12
acc_sum1D=12
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

call to cuMemFreeHost returned error 700: Illegal address during kernel execution

It can be seen that the GPU result for the 1D field is correct. For the 2D field, however, the kernel fails with an illegal address error.

How can this be solved?

It looks like the compiler is having a bit of an issue with implicitly attaching the second dimension of the array when the array is accessed within a struct. Adding an explicit attach works around the issue.
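That is, the enter data directive for each jagged row gains an attach clause so that the device copy of the row pointer z[i].data2D[r] is filled in with the device address of the block just created (this line is excerpted from the full listing below):

   #pragma acc enter data create(z[i].data2D[r:1][0:r]) attach(z[i].data2D[r:1])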

Also, the compiler is implicitly vectorizing the inner “c” loop in sum2D, so you’ll instead want to put a “loop vector” on the “ROW” loop.
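In sum2D the loop directives then become (again excerpted from the full listing below):

   #pragma acc parallel loop gang reduction(+:sum2D) present(z[0:n]) if(usegpu)
   for (int i = 0; i < n; i++)
   {
      #pragma acc loop vector reduction(+:sum2D)
      for (int r = 0; r < ROW; r++)
      ...

The complete modified source and the resulting compiler feedback follow.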

Hope this helps,
Mat

% cat test1.c
#include <stdlib.h>
#include <stdio.h>
#include <openacc.h>
#include <math.h>

int ROW = 3;

struct A
{
   int n;           //number of elements in x
   int **data2D;
   int *data1D;
};

double sum1D(const struct A *z, const int n, int usegpu)
{
   double sum1D = 0.0;
   /* The disjoint data structure is expected to be present */
   #pragma acc parallel loop reduction(+:sum1D) present(z[0:n]) if(usegpu)
   for (int i = 0; i < n; i++)
   {
      for (int r = 0; r < ROW; r++)
      {
         sum1D += z[i].data1D[r];
      }
   }
   return sum1D;
}

double sum2D(const struct A *z, const int n, int usegpu)
{
   double sum2D = 0.0;
   /* The disjoint data structure is expected to be present */
   #pragma acc parallel loop gang reduction(+:sum2D) present(z[0:n]) if(usegpu)
   for (int i = 0; i < n; i++)
   {
   #pragma acc loop vector reduction(+:sum2D)
      for (int r = 0; r < ROW; r++)
      {
         if (r > 0)
         {
            for (int c = 0; c < r; c++)
            {
               sum2D += z[i].data2D[r][c];
            }
         }
      }
   }
   return sum2D;
}

int main(int argc, char* argv[])
{
   int n = 2, cnt1D, cnt2D; // cnt1D and cnt2D are counters

   acc_set_device_num(1, acc_device_nvidia); // use the second GPU

   //Allocate and initialize a disjoint data structure
   struct A *z;
   //The top level is an array of struct A objects
   z = (struct A*) malloc(n * sizeof(struct A));

   #pragma acc enter data create(z[0:n])
   for (int i = 0; i < n; i++)
   {
      z[i].n = n;
      #pragma acc update device(z[i].n)

      z[i].data1D = (int*)malloc(ROW * sizeof(int));
      #pragma acc enter data create(z[i].data1D[0:ROW])

      z[i].data2D = (int**)malloc(ROW * sizeof(int*));
      #pragma acc enter data create(z[i].data2D[0:ROW][0:1])
      for (int r = 0; r < ROW; r++)
      {
         if (r > 0)
         {
            z[i].data2D[r] = (int*)malloc(r * sizeof(int));
            #pragma acc enter data create(z[i].data2D[r:1][0:r]) attach(z[i].data2D[r:1])
         }
      }

      cnt1D = 0;
      cnt2D = 0;
      for (int r = 0; r < ROW; r++)
      {
         z[i].data1D[r] = ++cnt1D;

         if (r > 0)
         {
            for (int c = 0; c < r; c++)
            {
               z[i].data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D
            }
            #pragma acc update device(z[i].data2D[r:1][0:r])
         }
      }
      #pragma acc update device(z[i].data1D[0:ROW])
   }

   //Compute and print host sum
   printf("host_sum1D=%g\n", sum1D(z, n, 0));
   printf("host_sum2D=%g\n", sum2D(z, n, 0));

   //compute and print accelerator sum
   printf("acc_sum1D=%g\n", sum1D(z, n, 1));
   printf("acc_sum2D=%g\n", sum2D(z, n, 1));

   return 0;
}
% pgcc -ta=tesla test1.c -Minfo=accel -V18.10 ; a.out
sum1D:
     19, Generating present(z[:n])
         Generating implicit copy(sum1D)
         Accelerator kernel generated
         Generating Tesla code
         20, #pragma acc loop gang /* blockIdx.x */
             Generating reduction(+:sum1D)
         22, #pragma acc loop vector(128) /* threadIdx.x */
     22, Loop is parallelizable
sum2D:
     34, Generating present(z[:n])
         Generating implicit copy(sum2D)
         Accelerator kernel generated
         Generating Tesla code
         35, #pragma acc loop gang /* blockIdx.x */
             Generating reduction(+:sum2D)
         38, #pragma acc loop vector(128) /* threadIdx.x */
             Generating reduction(+:sum2D)
         42, #pragma acc loop seq
     38, Loop is parallelizable
     42, Loop is parallelizable
main:
     63, Generating enter data create(z[:n])
     69, Generating update device(z->n)
     70, Generating enter data create(z->data1D[:ROW])
     73, Generating enter data create(z->data2D[:ROW][:1])
     79, Generating enter data create(z->data2D[r][:r])
         Generating enter data attach(z->data2D)
     97, Generating update device(z->data2D[r][:r])
     99, Generating update device(z->data1D[:ROW])
host_sum1D=12
host_sum2D=12
acc_sum1D=12
acc_sum2D=12

It works!
Very nice advice!
Thank you very much!