copy(movement) of user defined objects to the gpu in OpenACC

hyuzuguzel · February 25, 2015, 6:29pm

Hi,

I have the following code which gives “call to cuEventSynchronize returned error 700: Illegal address during kernel execution” when I run. I have a struct A which contains 1D (data1D) and 2D (data2D) dynamically allocated memories. I can succesfuly move data1D to the device but I cannot do for data2D. Does anyone have any idea how to fix this issue?

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#if defined(_OPENACC)
#include <openacc.h>
#else
#endif

int ROW = 3, COLUMN = 4;

struct A
{
int n; //number of elements in x
int** data2D; //2D doesn’t work
int* data1D;
};

struct A* struct_A_copyin(struct A* z, int n)
{
//copyin z[0:n]
struct A* acc_z = (struct A*) acc_copyin(z, n*sizeof(struct A));

for (int i = 0; i < n; i++)
{
//copyin z_.data2D[0:ROW][0:COLUMN]
int** acc_data2D = (int**)acc_copyin(z.data2D, ROWCOLUMNsizeof(int));
//fix acc pointer acc_z.data2D
acc_memcpy_to_device(&acc_z.data2D, &acc_data2D, sizeof(int**));

//copyin z.data1D[0:ROW]
int* acc_data1D = (int*)acc_copyin(z.data1D, ROWsizeof(int));
//fix acc pointer acc_z.data1D
acc_memcpy_to_device(&acc_z.data1D, &acc_data1D, sizeof(int));
}
return acc_z;
}

double acc_sum1D(const struct Az, const int n) {
double sum1D=0.0;
/ The disjoint data structure is expected to be present /
#pragma acc parallel loop reduction(+:sum1D) present(z[0:n])
for (int i = 0; i < n; i++)
{
for (int r = 0; r < ROW; r++)
{
sum1D += z.data1D[r];
}
}
return sum1D;
}

double acc_sum2D(const struct Az, const int n) {
double sum2D = 0.0;
/* The disjoint data structure is expected to be present /
#pragma acc parallel loop reduction(+:sum2D) present(z[0:n])
for (int i = 0; i < n; i++)
{
for (int r = 0; r < ROW; r++)
{
for (int c = 0; c < COLUMN; c++)
{
sum2D += z.data2D[r][c];
}
}
}
return sum2D;
}

double host_sum1D(const struct Az, const int n) {
double sum1D = 0.0;
for (int i = 0; i < n; i++)
{
for (int r = 0; r < ROW; r++)
{
sum1D += z[i].data1D[r];
}
}
return sum1D;
}

double host_sum2D(const struct Az, const int n) {
double sum2D = 0.0;
for (int i = 0; i < n; i++)
{
for (int r = 0; r < ROW; r++)
{
for (int c = 0; c < COLUMN; c++)
{
sum2D += z[i].data2D[r][c];
}
}
}
return sum2D;
}

int main(int argc, char argv)
{
int n = 4, cnt1D, cnt2D; // cnt1D and cnt2D are counters
//Allocate and initalize a disjoint data structure
struct A* z;
//The top level is an array of struct A objects
z = (struct A*) calloc(n, sizeof(struct A));
for (int i = 0; i < n; i++)
{
z[i].n = n;
z[i].data1D = (int*)malloc(ROWsizeof(int));

z[i].data2D = (int**)malloc(ROWsizeof(int*));
for (int r = 0; r < ROW; r++)
{
z[i].data2D[r] = (int*)malloc(COLUMN*sizeof(int));
}
cnt1D = 0, cnt2D=0;
for (int r = 0; r < ROW; r++)
{
z[i].data1D[r] = ++cnt1D;
for (int c = 0; c < COLUMN; c++)
{
z[i].data2D[r][c] = ++cnt2D; //OR ((z[i].data2D+r)+c) = ++cnt2D
}
}
}

//Compute and print host sum
printf(“host_sum1D=%g\n”, host_sum1D(z, n));
printf(“host_sum2D=%g\n”, host_sum2D(z, n));

//Deep-copy the disjoint data structure to the accelerator
printf(“deep copy from host to acc\n”);
struct_A_copyin(z, n);

//#pragma acc enter data copyin(z[0:n]) //Doesn’t work
//compute and print accelerator sum
printf(“acc sum1D=%g\n”, acc_sum1D(z, n));
printf(“acc sum2D=%g\n”, acc_sum2D(z, n));

system(“PAUSE”);
return 0;
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Best Regards,

P.S. I ompile with the following options:
pgcpp -acc -ta=nvidia,time -Minfo -Minline=levels:3_

MatColgrove · February 27, 2015, 12:09am

Hi hyuzuguzel,

“acc_copyin” is expecting a contiguous block of memory hence, you need to loop through each row and copy over each column. You’ll need to first create to COLUMNs over on the device, capture the device pointers, then update the 2d array with the created pointers.

I’m thinking something like the following code. This then gets an error when you try an access in to the compute region so it’s not correct yet. But I’m out of time for today so hoping you can take it further.

Note that OpenACC doesn’t support complex data structures. The committee is working on it, but it’s not yet defined in the standard. You might consider trying out our Beta Unified Memory package (grab it from the Downloads page, it’s the last item in the product pull down). UM is still very experimental and has many caveats, but eliminates the need to manage dynamic data.

Mat

struct A* struct_A_copyin(struct A* z, int n)
{
        //copyin z[0:n]
        struct A* acc_z = (struct A*) acc_copyin(z, n*sizeof(struct A));

        for (int i = 0; i < n; i++)
        {
        //copyin z[i].data2D[0:ROW][0:COLUMN]
        int** acc_data2D = (int**)malloc(ROW*sizeof(int*));
        printf("i=%d ad2d=%p th2d=%p \n",i,acc_data2D,acc_data2D);
        for (int j=0; j<ROW;++j) {
            acc_data2D[j] = (int*)acc_copyin((void*) z[i].data2D[j], COLUMN*sizeof(int));
            printf("i=%d j=%d acc_data2D[j]=%p\n",i,j,acc_data2D[j]);
        }
        printf("memcpy to device\n");
        //fix acc pointer acc_z[i].data2D
        acc_memcpy_to_device(&(acc_z[i].data2D), &acc_data2D, ROW*sizeof(int*));
        printf("start 1D\n");

        //copyin z[i].data1D[0:ROW]
        int* acc_data1D = (int*)acc_copyin(z[i].data1D, ROW*sizeof(int));
        //fix acc pointer acc_z[i].data1D
        acc_memcpy_to_device(&acc_z[i].data1D, &acc_data1D, sizeof(int*));
        }
        return acc_z;
}

hyuzuguzel · February 27, 2015, 4:03pm

Hi,

Thanks for the reply. Actually I was trying something similar in that loop through each row,etc. However, it doesn’t seem straightforward for me from this point on. I would appreciate if you can make it fully work .

Best Regards

MatColgrove · February 27, 2015, 9:15pm

Hi hyuzuguzel,

I thought about it and decided instead of figuring out what’s wrong with your program to rewrite it how I would approach the problem. As I said, OpenACC doesn’t yet have a way to define complex data structures (by “complex” I mean a structure with dynamic members). This means you can’t just use “copy(z)” and expect the compiler to know how to copy “z” since it doesn’t know it shape which dynamically changes at runtime. However, you can put these types of structures on the device if you’re willing to do the underlying data management yourself.

What I would do is use unstructured data regions to build the data structure on the device as shown in the following example. In each case, the contiguous block of memory needs to be created on the device and then attached to the parent structure. The one exception is a 2-D rectangular “” structure which is a special case.

% cat testNotManaged.c
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

 int ROW = 3, COLUMN = 4;

 struct A
 {
 int n; //number of elements in x
 int** data2D; //2D doesn't work
 int* data1D;
 };

double sum1D(const struct A*z, const int n, int usegpu) {
 double sum1D=0.0;
 /* The disjoint data structure is expected to be present */
 #pragma acc parallel loop reduction(+:sum1D) present(z[0:n]) if(usegpu)
 for (int i = 0; i < n; i++)
 {
 for (int r = 0; r < ROW; r++)
 {
 sum1D += z[i].data1D[r];
 }
 }
 return sum1D;
 }

 double sum2D(const struct A*z, const int n, int usegpu) {
 double sum2D = 0.0;
 /* The disjoint data structure is expected to be present */
 #pragma acc parallel loop reduction(+:sum2D) present(z[0:n]) if(usegpu)
 for (int i = 0; i < n; i++)
 {
 for (int r = 0; r < ROW; r++)
 {
 for (int c = 0; c < COLUMN; c++)
 {
 sum2D += z[i].data2D[r][c];
 }
 }
 }
 return sum2D;
 }


 int main(int argc, char* argv[])
 {
 int n = 4, cnt1D, cnt2D; // cnt1D and cnt2D are counters
 //Allocate and initalize a disjoint data structure
 struct A* z;
 //The top level is an array of struct A objects
 z = (struct A*) malloc(n*sizeof(struct A));
#pragma acc enter data create(z[0:n])
 for (int i = 0; i < n; i++)
 {
 z[i].n = n;
#pragma acc update device(z[i].n)
 z[i].data1D = (int*)malloc(ROW*sizeof(int));
#pragma acc enter data create(z[i].data1D[0:ROW])

 z[i].data2D = (int**)malloc(ROW*sizeof(int*));
 for (int r = 0; r < ROW; r++)
 {
 z[i].data2D[r] = (int*)malloc(COLUMN*sizeof(int));
 }
#pragma acc enter data create(z[i].data2D[0:ROW][0:COLUMN])
 cnt1D = 0, cnt2D=0;
 for (int r = 0; r < ROW; r++)
 {
 z[i].data1D[r] = ++cnt1D;
 for (int c = 0; c < COLUMN; c++)
 {
 z[i].data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D
 }
 }
#pragma acc update device(z[i].data1D[0:ROW])
#pragma acc update device(z[i].data2D[0:ROW][0:COLUMN])
 }

 //Compute and print host sum
 printf("host_sum1D=%g\n", sum1D(z, n,0));
 printf("host_sum2D=%g\n", sum2D(z, n,0));

 //compute and print accelerator sum
 printf("acc sum1D=%g\n", sum1D(z, n, 1));
 printf("acc sum2D=%g\n", sum2D(z, n, 1));

 return 0;
 }

% pgcc testNotManaged.c -acc -ta=tesla:cc35 -Minfo=accel -V15.1
sum1D:
     17, Generating present(z[:n])
         Accelerator kernel generated
         17, Sum reduction generated for sum1D
         18, #pragma acc loop gang /* blockIdx.x */
         20, #pragma acc loop vector(256) /* threadIdx.x */
     17, Generating Tesla code
     20, Loop is parallelizable
sum2D:
     31, Generating present(z[:n])
         Accelerator kernel generated
         31, Sum reduction generated for sum2D
         32, #pragma acc loop gang /* blockIdx.x */
         36, #pragma acc loop vector(256) /* threadIdx.x */
     31, Generating Tesla code
     34, Loop is parallelizable
     36, Loop is parallelizable
main:
     53, Generating enter data create(z[:n])
     58, Generating update device(z->n)
     59, Generating enter data create(z->data1D[:ROW])
     66, Generating enter data create(z->data2D[:ROW][:COLUMN])
     77, Generating update device(z->data1D[:ROW])
     78, Generating update device(z->data2D[:ROW][:COLUMN])
% a.out
host_sum1D=24
host_sum2D=312
acc sum1D=24
acc sum2D=312

Here’s the same example, but this time using Unified Memory. In this case, I can remove all the data regions and updates since the CUDA driver handles the dynamic data. Unified Memory is still experimental and has many caveat’s but is very promising.

% cat testManaged.c
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

 int ROW = 3, COLUMN = 4;

 struct A
 {
 int n; //number of elements in x
 int** data2D; //2D doesn't work
 int* data1D;
 };

double sum1D(const struct A*z, const int n, int usegpu) {
 double sum1D=0.0;
 /* The disjoint data structure is expected to be present */
 #pragma acc parallel loop reduction(+:sum1D) present(z[0:n]) if(usegpu)
 for (int i = 0; i < n; i++)
 {
 for (int r = 0; r < ROW; r++)
 {
 sum1D += z[i].data1D[r];
 }
 }
 return sum1D;
 }

 double sum2D(const struct A*z, const int n, int usegpu) {
 double sum2D = 0.0;
 /* The disjoint data structure is expected to be present */
 #pragma acc parallel loop reduction(+:sum2D) present(z[0:n]) if(usegpu)
 for (int i = 0; i < n; i++)
 {
 for (int r = 0; r < ROW; r++)
 {
 for (int c = 0; c < COLUMN; c++)
 {
 sum2D += z[i].data2D[r][c];
 }
 }
 }
 return sum2D;
 }


 int main(int argc, char* argv[])
 {
 int n = 4, cnt1D, cnt2D; // cnt1D and cnt2D are counters
 //Allocate and initalize a disjoint data structure
 struct A* z;
 //The top level is an array of struct A objects
 z = (struct A*) malloc(n*sizeof(struct A));

 for (int i = 0; i < n; i++)
 {
 z[i].n = n;
 z[i].data1D = (int*)malloc(ROW*sizeof(int));
 z[i].data2D = (int**)malloc(ROW*sizeof(int*));
 for (int r = 0; r < ROW; r++)
 {
 z[i].data2D[r] = (int*)malloc(COLUMN*sizeof(int));
 }
 cnt1D = 0, cnt2D=0;
 for (int r = 0; r < ROW; r++)
 {
 z[i].data1D[r] = ++cnt1D;
 for (int c = 0; c < COLUMN; c++)
 {
 z[i].data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D
 }
 }
 }

 //Compute and print host sum
 printf("host_sum1D=%g\n", sum1D(z, n,0));
 printf("host_sum2D=%g\n", sum2D(z, n,0));

 //compute and print accelerator sum
 printf("acc sum1D=%g\n", sum1D(z, n, 1));
 printf("acc sum2D=%g\n", sum2D(z, n, 1));

 return 0;
 }

% pgcc testManaged.c -acc -ta=tesla:cc35,managed -Minfo=accel -V15.1
sum1D:
     17, Generating present(z[:n])
         Accelerator kernel generated
         17, Sum reduction generated for sum1D
         18, #pragma acc loop gang /* blockIdx.x */
         20, #pragma acc loop vector(256) /* threadIdx.x */
     17, Generating Tesla code
     20, Loop is parallelizable
sum2D:
     31, Generating present(z[:n])
         Accelerator kernel generated
         31, Sum reduction generated for sum2D
         32, #pragma acc loop gang /* blockIdx.x */
         36, #pragma acc loop vector(256) /* threadIdx.x */
     31, Generating Tesla code
     34, Loop is parallelizable
     36, Loop is parallelizable
% a.out
host_sum1D=24
host_sum2D=312
acc sum1D=24
acc sum2D=312

Hope this helps,
Mat

hyuzuguzel · February 28, 2015, 2:17pm

Hi,

Thanks for the solutions. I have a few more questions:

The first one compiles and runs. When I run the .exe, it prints the results on the command window but the program doesn’t terminate. (as if it is waiting for an input). Even Ctrl+C doesn’t work. I have to close the pgi command window manuelly. Did you have the same problem? I feel that as if gpu is busy and doesn’t response.
The second issue is that when I run the testNotManaged with pgcpp as c++ code (>>pgcpp testNotManaged.cpp -acc -ta=nvidia,time -Minfo=accel -V15.1), I get the following compile error:

“testNotManaged.cpp”, line 60: internal error:
coalesce_and_lookup_generalized_identifier: not identifier
#pragma acc update device(z_.n)
^

1 catastrophic error detected in the compilation of “testNotManaged.cpp”.
Compilation aborted.

3) When I run testManaged.c (>>pgcc testManaged.c -acc -ta=nvidia,time, managed -Minfo=accel -V15.1 ), I get the following linker error:

testManaged.c:
sum1D:
19, Generating present(z[:n])
Accelerator kernel generated
19, Sum reduction generated for sum1D
20, #pragma acc loop gang /* blockIdx.x /
22, #pragma acc loop vector(256) / threadIdx.x /
19, Generating Tesla code
22, Loop is parallelizable
sum2D:
33, Generating present(z[:n])
Accelerator kernel generated
33, Sum reduction generated for sum2D
34, #pragma acc loop gang / blockIdx.x /
38, #pragma acc loop vector(256) / threadIdx.x */
33, Generating Tesla code
36, Loop is parallelizable
38, Loop is parallelizable
LINK : fatal error LNK1181: cannot open input file ‘managed.obj’
./testManaged.exf: error STP001: cannot open file

In fact, I don’t see any compiler flag “managed” for -ta when I typed >>pgcc -ta - help on the pgi command window.

Best Regards_

MatColgrove · March 2, 2015, 6:10pm

Hi hyuzuguzel,

For #1, no I don’t see this issue. I wrote and tested this code on Linux, but just tried on my Windows system as well. Not sure why you’re seeing this issue.

For #2, this looks like a bug to me. It doesn’t look like we added support for an array of structs in the C++ compiler. I added a report (TPR#21386) and sent it to engineering.

For #3, The Unified Memory Beta Package is currently only available on 64-bit Linux.

Mat

hyuzuguzel · March 2, 2015, 6:45pm

Hi,

Thanks for the answer. I have a trivial question (actually I’m asking it just to be sure) about #pragma acc enter data create(z_.data2D[0:ROW][0:COLUMN]) and #pragma acc update device(z.data2D[0:ROW][0:COLUMN])'s. My question is how the compiler knows that it updates the correct memory block on the gpu. In fact, we give the same input argument (“z.data2D[0:ROW][0:COLUMN]” in my example) to both create and update device pragmas. Does the compiler knows because of sending the same input arguments. I’m asking this question because in my real code, which is quite complex regarding the data structures, I used the same approach, i.e. create and update device. But I get “FATAL ERROR: data in update device clause was not found on device 1: name=(null)” for the update device line, which makes me thought about that issue.

Best Regards,_

MatColgrove · March 2, 2015, 9:30pm

Does the compiler knows because of sending the same input arguments.

We use a “present” table which contains a list of host addresses and size, which map to a device address. The error means that the runtime couldn’t find the host address in the table.

Exactly why, I’m not sure. Though I’d first check if the data create directive was encountered before the update directive.

Mat

MatColgrove · March 2, 2015, 9:30pm

Does the compiler knows because of sending the same input arguments.

We use a “present” table which contains a list of host addresses and size, which map to a device address. The error means that the runtime couldn’t find the host address in the table.

Exactly why, I’m not sure. Though I’d first check if the data create directive was encountered before the update directive.

Mat

hyuzuguzel · March 4, 2015, 4:01pm

Hi Mat,

I see. I created a similar test code as my previous code which isn’t compiling. It outputs “LINK : fatal error LNK1104: cannot open file’./bin_openac/ComplexClass_OpenACC.
exe’./bin_openac/ComplexClass_OpenACC.exf: error STP001: cannot open file”. The code is similar to the one given in the PGI Accelerator Compilers OpenACC Getting Started Guide regarding the structure. The code is :

// pgcpp -acc -ta=nvidia,time -Minfo -Minline=levels:3  -o./ComplexClass_OpenACC.exe ./ComplexClass_OpenACC.cpp 
#include <stdlib.h> 
#include <stdio.h> 
#include <assert.h> 

#if defined(_OPENACC)
#include <openacc.h>
#else
#endif

int ROW = 3, COLUMN = 4;

class A
{
public:
	A()					//constructor
	{
		data1D = (int*)malloc(ROW*sizeof(int));
		data2D = (int**)malloc(ROW*sizeof(int*));
		for (int r = 0; r < ROW; r++)
		{
			data2D[r] = (int*)malloc(COLUMN*sizeof(int));
		}
#pragma acc enter data copyin(this[0:1]) create(data1D[0:ROW], data2D[0:ROW][0:COLUMN]) 
		init();
	}

	~A(){}

	void init()
	{
		int cnt1D = 0, cnt2D = 0;	//counters
		for (int r = 0; r < ROW; r++)
		{
			data1D[r] = ++cnt1D;
			for (int c = 0; c < COLUMN; c++)
			{
				data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D
			}
		}
#pragma acc update device(data1D[0:ROW], data2D[0:ROW][0:COLUMN]) 
	}

public:
	int** data2D;
	int* data1D;
};

double sum1D(const class A**z, const int n, int usegpu) {
	double sum1D = 0.0;
#pragma acc parallel loop reduction(+:sum1D) present(z[0:n][0:1]) if(usegpu) 
	for (int i = 0; i < n; i++)
	{
		for (int r = 0; r < ROW; r++)
		{
			sum1D += z[i][0].data1D[r];
		}
	}
	return sum1D;
}

double sum2D(const class A**z, const int n, int usegpu) {
	double sum2D = 0.0;
#pragma acc parallel loop reduction(+:sum2D) present(z[0:n][0:1]) if(usegpu) 
	for (int i = 0; i < n; i++)
	{
		for (int r = 0; r < ROW; r++)
		{
			for (int c = 0; c < COLUMN; c++)
			{
				sum2D += z[i][0].data2D[r][c];
			}
		}
	}
	return sum2D;
}


int main(int argc, char* argv[])
{
	int n = 4; 
	A** z = (A**) malloc(n*sizeof(A*));	
#pragma acc enter data create(z[0:n][0:1]) 
	for (int i = 0; i < n; i++)
	{		
		z[i] = new A();
	}

	//Compute and print host sum 
	printf("host_sum1D=%g\n", sum1D(z, n, 0));
	printf("host_sum2D=%g\n", sum2D(z, n, 0));

	//compute and print accelerator sum 
	printf("acc sum1D=%g\n", sum1D(z, n, 1));
	printf("acc sum2D=%g\n", sum2D(z, n, 1));

	return 0;
}

Best Regards,

hyuzuguzel · March 4, 2015, 5:48pm

Hi,

I realized that the linker error was due to Windows crash. The code compiles but it gives the following run time error → “FATAL ERROR: variable in data clause is partially present on the device: name=_22530_29_z
file:D:\Projects\Cray_OpenACC_examples\complexclass_trs_help.\ComplexClass_OpenACC.cpp sum1D__FPPC1AiT2 line:50
_22530_29_z lives at 00000000003A89A0 size 64 partially present
Present table dump for device[1]: NVIDIA Tesla GPU 1, compute capability 3.0”

The data movement to the GPU isnot successful.

Best,

MatColgrove · March 4, 2015, 9:50pm

Two issues. First, just use “present(z)” and second, you need to populate the z array with the device pointers.

% cat test_030415.cpp
// pgcpp -acc -ta=nvidia,time -Minfo -Minline=levels:3  -o./ComplexClass_OpenACC.exe ./ComplexClass_OpenACC.cpp
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>

 #if defined(_OPENACC)
 #include <openacc.h>
 #else
 #endif

 int ROW = 3, COLUMN = 4;

 class A
 {
 public:
    A()               //constructor
    {
       data1D = (int*)malloc(ROW*sizeof(int));
       data2D = (int**)malloc(ROW*sizeof(int*));
       for (int r = 0; r < ROW; r++)
       {
          data2D[r] = (int*)malloc(COLUMN*sizeof(int));
       }
 #pragma acc enter data copyin(this[0:1]) create(data1D[0:ROW], data2D[0:ROW][0:COLUMN])
       init();
    }

    ~A(){}

    void init()
    {
       int cnt1D = 0, cnt2D = 0;   //counters
       for (int r = 0; r < ROW; r++)
       {
          data1D[r] = ++cnt1D;
          for (int c = 0; c < COLUMN; c++)
          {
             data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D
          }
       }
 #pragma acc update device(data1D[0:ROW], data2D[0:ROW][0:COLUMN])
    }

 public:
    int** data2D;
    int* data1D;
 };

 double sum1D(class A**z, const int n, int usegpu) {
    double sum1D = 0.0;
 #pragma acc parallel loop reduction(+:sum1D) present(z) if(usegpu)
    for (int i = 0; i < n; i++)
    {
       for (int r = 0; r < ROW; r++)
       {
          sum1D += z[i][0].data1D[r];
       }
    }
    return sum1D;
 }

 double sum2D(class A**z, const int n, int usegpu) {
    double sum2D = 0.0;
 #pragma acc parallel loop reduction(+:sum2D) present(z) if(usegpu)
    for (int i = 0; i < n; i++)
    {
       for (int r = 0; r < ROW; r++)
       {
          for (int c = 0; c < COLUMN; c++)
          {
             sum2D += z[i][0].data2D[r][c];
          }
       }
    }
    return sum2D;
 }


 int main(int argc, char* argv[])
 {
    int n = 8;
    A** z = (A**) malloc(n*sizeof(A*));
    size_t * zptr = (size_t*) malloc(n*sizeof(size_t));
 #pragma acc enter data create(z[0:n][0:1])
    for (int i = 0; i < n; i++)
    {
       z[i] = new A();
       zptr[i] = (size_t) acc_deviceptr(z[i]);
    }

 #pragma acc kernels copyin(zptr[0:n]) present(z)
    for (int i = 0; i < n; ++i) {
        z[i] = (A*) zptr[i];
    }


    //Compute and print host sum
    printf("host_sum1D=%g\n", sum1D(z, n, 0));
    printf("host_sum2D=%g\n", sum2D(z, n, 0));

    //compute and print accelerator sum
    printf("acc sum1D=%g\n", sum1D(z, n, 1));
    printf("acc sum2D=%g\n", sum2D(z, n, 1));

    return 0;
 }
% pgc++ -ta=tesla -acc -Minfo=accel test_030415.cpp; a.out
sum1D(A **, int, int):
     50, Generating present(z[:][:])
         Accelerator kernel generated
         50, Sum reduction generated for sum1D
         52, #pragma acc loop gang /* blockIdx.x */
         54, #pragma acc loop vector(256) /* threadIdx.x */
     50, Generating present(z[:])
         Generating Tesla code
     54, Loop is parallelizable
sum2D(A **, int, int):
     63, Generating present(z[:][:])
         Accelerator kernel generated
         63, Sum reduction generated for sum2D
         65, #pragma acc loop gang /* blockIdx.x */
         69, #pragma acc loop vector(256) /* threadIdx.x */
     63, Generating present(z[:])
         Generating Tesla code
     67, Loop is parallelizable
     69, Loop is parallelizable
main:
     83, Generating enter data create(z[:n][:1])
     89, Generating copyin(zptr[:n])
         Generating present(z[:][:])
         Generating Tesla code
     92, Loop is parallelizable
         Accelerator kernel generated
         92, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
A::A():
     25, Generating enter data create(data2D[:ROW][:COLUMN],data1D[:ROW],this[:1])
A::init():
     42, Generating update device(data2D[:ROW][:COLUMN],data1D[:ROW])
host_sum1D=48
host_sum2D=624
acc sum1D=48
acc sum2D=624

hyuzuguzel · March 6, 2015, 4:19pm

Hi Mat,

Thanks for the solution. It works on pgi 15.1 but when I tried to compile it with pgi 14.10 it gives the following output:

"test_030415.cpp", line 107: warning: last line of file ends without a newline
   }
     ^

sum1D(A **, int, int):
     51, Generating present(z[:][:])
         Accelerator kernel generated
         53, #pragma acc loop gang /* blockIdx.x */
         55, #pragma acc loop vector(256) /* threadIdx.x */
         57, Sum reduction generated for sum1D
     51, Generating present(z[:])
         Generating Tesla code
     55, Loop is parallelizable
sum2D(A **, int, int):
     64, Generating present(z[:][:])
         Accelerator kernel generated
         66, #pragma acc loop gang /* blockIdx.x */
         70, #pragma acc loop vector(256) /* threadIdx.x */
         72, Sum reduction generated for sum2D
     64, Generating present(z[:])
         Generating Tesla code
     68, Loop is parallelizable
     70, Loop is parallelizable
main:
     84, Generating enter data create(z[:n][:1])
     90, Generating copyin(zptr[:n])
         Generating present(z[:][:])
         Generating Tesla code
     93, Complex loop carried dependence of 'zptr->' prevents parallelization
         Loop carried dependence of 'z->' prevents parallelization
         Loop carried backward dependence of 'z->' prevents vectorization
         Accelerator scalar kernel generated
A::A():
     26, Generating enter data create(data2D[:ROW][:COLUMN])
         Generating enter data create(data1D[:ROW])
         Generating enter data copyin(this[:1])
A::init():
     43, Generating update device(data2D[:ROW][:COLUMN])
         Generating update device(data1D[:ROW])
C:\temp\pgacc2a4mxcOUqr4lYD.gpu(347): error: a value of type "long long" cannot be assigned to an entity of type "signed
 char *"

1 error detected in the compilation of "C:\temp\pgnvd2awWIccv4M8jJ7.nv0".
PGCC-F-0155-Compiler failed to translate accelerator region (see -Minfo messages): Device compiler exited with error sta
tus code (test_030415.cpp: 1)
PGCC/x86-64 Windows 14.10-0: compilation aborted

How can we fix this problem so that it works on both versions ?

Best Regards,

MatColgrove · March 6, 2015, 11:22pm

The compiler error in 14.10 is due to the cast of zptr to an A*. The workaround would be to use another size_t * variable (zz) to point to z, then use this variable in the loop.

 int main(int argc, char* argv[])
 {
    int n = 8;
    A** z = (A**) malloc(n*sizeof(A*));
    size_t * zz  = (size_t*) z;
    size_t * zptr = (size_t*) malloc(n*sizeof(size_t));
 #pragma acc enter data create(z[0:n][0:1])
    for (int i = 0; i < n; i++)
    {
       z[i] = new A();
       zptr[i] = (size_t) acc_deviceptr(z[i]);
    }

 #pragma acc kernels copyin(zptr[0:n]) present(zz)
    for (int i = 0; i < n; ++i) {
        zz[i] = zptr[i];
   }

Mat

hyuzuguzel · March 11, 2015, 4:58pm

Hi Mat,

Thanks for the answer. Now, I made the code a little bit more complicated in that I wrapped the class by another class (nested classes) and changing the sum1D and sum2D functions into member functions. The code doesn’t compile with pgcpp compiler. Here is the code:

#include <stdlib.h> 
#include <stdio.h> 
#include <assert.h> 

#if defined(_OPENACC)
#include <openacc.h>
#else
#endif


class A
{
public:
	A()					//constructor
	{
#pragma acc enter data copyin(this[0:1]) 
		init();
	}

	~A()				//destructor
	{
		free(data1D);
		data1D = NULL;
		for (int r = 0; r < ROW; r++)
		{
			free(data2D[r]);
			data2D[r] = NULL;
		}
		free(data2D);
#pragma acc exit data delete(data1D[0:ROW], data2D[0:ROW][0:COLUMN], this[0:1])		
	}

	void init()
	{
		ROW = 3, COLUMN = 4;
		int cnt1D = 0, cnt2D = 0;	//counters

		data1D = (int*)malloc(ROW*sizeof(int));
		data2D = (int**)malloc(ROW*sizeof(int*));
		for (int r = 0; r < ROW; r++)
		{
			data2D[r] = (int*)malloc(COLUMN*sizeof(int));
		}
#pragma acc enter data create(data1D[0:ROW], data2D[0:ROW][0:COLUMN]) 

		for (int r = 0; r < ROW; r++)
		{
			data1D[r] = ++cnt1D;
			for (int c = 0; c < COLUMN; c++)
			{
				data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D
			}
		}
#pragma acc update device(data1D[0:ROW], data2D[0:ROW][0:COLUMN]) 
	}

#pragma acc routine seq
	double sum1D(int usegpu) {
		double sum1D = 0.0;
#pragma acc parallel loop reduction(+:sum1D) present(data1D[0:ROW]) if(usegpu) 
		for (int r = 0; r < ROW; r++)
		{
			sum1D += data1D[r];
		}
		return sum1D;
	}

#pragma acc routine seq
	double sum2D(int usegpu) {
		double sum2D = 0.0;
#pragma acc parallel loop reduction(+:sum2D) present(data2D[0:ROW][0:COLUMN]) if(usegpu) 
		for (int r = 0; r < ROW; r++)
		{
			for (int c = 0; c < COLUMN; c++)
			{
				sum2D += data2D[r][c];
			}
		}
		return sum2D;
	}

public:
	int** data2D;
	int* data1D;
	int ROW, COLUMN;
};

class B
{
public:
#pragma acc routine seq
	B()					//constructor
	{
		num = 2;
		a = (A**)malloc(num*sizeof(A*));
		aptr = (size_t*)malloc(num*sizeof(size_t));
#pragma acc enter data create(a[0:num][0:1]) 
		for (int i = 0; i < num; i++)
		{
			a[i] = new A();
#ifdef _OPENACC
			aptr[i] = (size_t)acc_deviceptr(a[i]);
#endif
		}

#pragma acc kernels copyin(aptr[0:num]) present(a) 
		for (int i = 0; i < num; ++i) { 
			a[i] = (A*) aptr[i]; 
		} 
	}

	~B()				//destructor
	{
		free(a);
#pragma acc exit data delete(a[0:1], this[0:1])		
	}

public:

	A** a;
	int num;
	size_t* aptr;
};



int main(int argc, char* argv[])
{
	int n = 4;
	//hsum1D: host sum 1D,   dsum1D: device sum1D ,... etc
	double hsum1D = 0., hsum2D = 0., dsum1D = 0., dsum2D = 0.;
	B** z = (B**)malloc(n*sizeof(B*));
	size_t* zz = (size_t*)z;
	size_t* zptr = (size_t*)malloc(n*sizeof(size_t));

#pragma acc enter data create(z[0:n][0:1]) 
	for (int i = 0; i < n; i++)
	{
		z[i] = new B();
		#ifdef _OPENACC
		zptr[i] = (size_t)acc_deviceptr(z[i]);
		#endif
	}



	#ifdef _OPENACC
#pragma acc kernels copyin(zptr[0:n]) present(z)
	for (int i = 0; i < n; i++)
	{
		zz[i] = zptr[i];
	}
	#endif 


	//Compute host sum and device sum
#pragma acc parallel loop reduction(+:hsum1D, hsum2D, dsum1D, dsum2D)
	for (int i = 0; i < n; i++)
	{
		for (int j = 0; j < z[i][0].num; j++){
			hsum1D += z[i][0].a[j][0].sum1D(0);
			hsum2D += z[i][0].a[j][0].sum2D(0);
			dsum1D += z[i][0].a[j][0].sum1D(1);
			dsum2D += z[i][0].a[j][0].sum2D(1);
		}

	}

	// print host sum 
	printf("host_sum1D=%g\n", hsum1D);
	printf("host_sum2D=%g\n", hsum2D);

	//print accelerator sum 
	printf("acc sum1D=%g\n", dsum1D);
	printf("acc sum2D=%g\n", dsum2D);

	//free memory
	free(z);
	z = NULL;
#pragma acc exit data delete(z[0:n][0:1])

	system("PAUSE");
	return 0;
}

I get the following compiler output message (>>pgcpp -acc -ta=nvidia -Minfo -Minline=levels:3 ComplexNestedClass_OpenACC.cpp ) :

"ComplexNestedClass_OpenACC.cpp", line 191: warning: last line of file ends
          without a newline
  }
   ^

main:
    141, Generating enter data create(z[:n][:1])
    150, Generating copyin(zptr[:n])
         Generating present(z[:][:])
         Generating copyout(zz[:4])
         Generating Tesla code
    156, Complex loop carried dependence of zptr-> prevents parallelization
         Loop carried dependence of zz-> prevents parallelization
         Loop carried backward dependence of zz-> prevents vectorization
         Accelerator scalar kernel generated
    159, Accelerator kernel generated
        165, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */
        168, Sum reduction generated for hsum1D
        169, Sum reduction generated for hsum2D
        170, Sum reduction generated for dsum1D
        171, Sum reduction generated for dsum2D
    159, Generating copyin(z[:1],z[:4][:1])
         Generating Tesla code
    189, Generating exit data delete(z[:n][:1])
A::A():
     24, Generating enter data copyin(this[:1])
A::init():
     50, Generating enter data create(data2D[:ROW][:COLUMN],data1D[:ROW])
     62, Generating update device(data2D[:ROW][:COLUMN],data1D[:ROW])
PGCC-S-0155-Kernel region ignored; see -Minfo messages  (ComplexNestedClass_Open
ACC.cpp)
A::sum1D(int):
     66, Accelerator restriction: unsupported statement type
         Accelerator restriction: loop contains unsupported statement type
PGCC/x86-64 Windows 15.3-0: compilation completed with severe errors

I have to note that, I made these two changes separately to the previous code. For example, in one attempt I moved the functions into the class and tried. In the other case, I just added a second class which wraps the first one. These two cases didn’t work seperately.

Do you have any idea?

Best Regards,

MatColgrove · March 12, 2015, 10:38pm

Hi hyuzuguzel,

You have a couple of errors in the code. You can’t have a compute region with a OpenACC device routine (at least not until nested parallelism is supported) and you don’t want to have “routine seq” around “B”'s constructor.

I’m still working through the example and trying to get it to work as expected. I’ve modified it so that all the data gets created on the device, but the results aren’t coming back correctly. The example works if I use CUDA Unified Memory. This might be a case where the structure has gotten too complex to manually manage. I’ll keep working on it though.

However, I’ve run out of time since I need to focus on my GTC 2015 presentations for next week. I’ll revisit it when I can.

Mat

hyuzuguzel · March 12, 2015, 10:56pm

Hi Mat,

Thanks for your effort. Can you post the current version which has still problems for now? You can post the final version when it’s ready. I’ll try to check UM too.

Best Regards,

MatColgrove · March 13, 2015, 10:55pm

Ok, here’s what I have so far. It works, for the most part, with Cuda Unified Memory. There is one problem. For some reason in the 2D case, I need to hardcode the number of columns. When “COLUMNS” is used, the 2D sum is zero.

Now I’m wondering if I had it wired up correctly before since this was the same error I was getting without UM. When I get a chance, I’ll revisit the example.

#include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>

 #if defined(_OPENACC)
 #include <openacc.h>
 #else
 #endif


 class A
 {
 public:
    A()               //constructor
    {
       init();
    }

    ~A()            //destructor
    {
       free(data1D);
       data1D = NULL;
       for (int r = 0; r < ROW; r++)
       {
          free(data2D[r]);
          data2D[r] = NULL;
       }
       free(data2D);
    }

    void init()
    {
       ROW = 3, COLUMN = 4;
       int cnt1D = 0, cnt2D = 0;   //counters

       data1D = (double*)malloc(ROW*sizeof(double));
       data2D = (double**)malloc(ROW*sizeof(double*));
       for (int r = 0; r < ROW; r++)
       {
          data2D[r] = (double*)malloc(COLUMN*sizeof(double));
       }

       for (int r = 0; r < ROW; r++)
       {
          data1D[r] = ++cnt1D;
          for (int c = 0; c < COLUMN; c++)
          {
             data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D
          }
       }
       #pragma acc enter data copyin(this)
    }

    double sum1D(int usegpu) {
       double sum1D = 0.0;
       for (int r = 0; r < ROW; r++)
       {
          sum1D += data1D[r];
       }
       return sum1D;
    }

    double sum2D(int usegpu) {
       double sum2D = 0.0;
       int c, r;
       for (r = 0; r < ROW; r++)
       {
          //for (c = 0; c < COLUMN; c++)
          for (c = 0; c < 4; c++)
          {
             sum2D += data2D[r][c];
          }
       }
       return sum2D;
    }

 public:
    double** data2D;
    double* data1D;
    int ROW, COLUMN;
 };

 class B
 {
 public:
    B()               //constructor
    {
       num = 2;
       a = (A**)malloc(num*sizeof(A*));
       for (int i = 0; i < num; i++)
       {
          a[i] = new A();
       }
       #pragma acc enter data copyin(this)
    }

    ~B()            //destructor
    {
       free(a);
    }

 public:

    A** a;
    int num;
 };



 int main(int argc, char* argv[])
 {
    int n = 4;
    //hsum1D: host sum 1D,   dsum1D: device sum1D ,... etc
    double hsum1D = 0., hsum2D = 0., dsum1D = 0., dsum2D = 0.;
    B** z = (B**)malloc(n*sizeof(B*));
    for (int i = 0; i < n; i++)
    {
       z[i] = new B();
    }

    //Compute host sum and device sum
    for (int i = 0; i < n; i++)
    {
       for (int j = 0; j < z[i][0].num; j++){
          hsum1D += z[i][0].a[j][0].sum1D(0);
          hsum2D += z[i][0].a[j][0].sum2D(0);
       }
    }
#pragma acc parallel loop gang vector reduction(+:dsum2D, dsum1D) present(z)
    for (int i = 0; i < n; i++)
    {
       for (int j = 0; j < z[i][0].num; j++){
          dsum2D += z[i][0].a[j][0].sum2D(1);
          dsum1D += z[i][0].a[j][0].sum1D(1);
       }
    }

    // print host sum
    printf("host_sum1D=%g\n", hsum1D);
    printf("host_sum2D=%g\n", hsum2D);

    //print accelerator sum
    printf("acc sum1D=%g\n", dsum1D);
    printf("acc sum2D=%g\n", dsum2D);

    //free memory
    free(z);
    z = NULL;

//    system("PAUSE");
    return 0;
 }

% pgc++ test_031115.cpp -Minfo=accel -ta=tesla:managed; a.out
main:
    128, Generating present(z[:][:])
         Accelerator kernel generated
        130, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
        133, Sum reduction generated for dsum2D
        134, Sum reduction generated for dsum1D
    128, Generating Tesla code
A::init():
     52, Generating enter data copyin(this[:1])
A::sum1D(int):
     54, Generating implicit acc routine seq
         Generating Tesla code
A::sum2D(int):
     63, Generating implicit acc routine seq
         Generating Tesla code
B::B():
     95, Generating enter data copyin(this[:1])
host_sum1D=48
host_sum2D=624
acc sum1D=48
acc sum2D=624

Mat

hyuzuguzel · March 26, 2015, 9:53pm

Hi Mat,

Did you have a chance to revisit the problem? Also it would be nice if you can make it work the without UM.

Best Regards,

MatColgrove · March 27, 2015, 5:50pm

Hi hyuzuguzel,

Sorry, no I haven’t been able to get back to this yet. I’m still trying to catch-up after GTC.

Mat