Purpose: Testing of deep copy of user-defined data structure with dynamically allocated jagged 2d array field
Compiler: PGI Community Edition 18.10
Platform: Windows 10 64-bit
GPU: Quadro P5000 (Mobile)
Command: pgcc -acc -Minfo acc_deepcopy_jagged.c -o a.exe
The source code of acc_deepcopy_jagged.c is:
#include <stdlib.h>
#include <stdio.h>
#include <openacc.h>
#include <math.h>
int ROW = 3;
struct A
{
int n; //number of elements in x
int **data2D;
int *data1D;
};
double sum1D(const struct A *z, const int n, int usegpu)
{
double sum1D = 0.0;
/* The disjoint data structure is expected to be present */
#pragma acc parallel loop reduction(+:sum1D) present(z[0:n]) if(usegpu)
for (int i = 0; i < n; i++)
{
for (int r = 0; r < ROW; r++)
{
sum1D += z[i].data1D[r];
}
}
return sum1D;
}
double sum2D(const struct A *z, const int n, int usegpu)
{
double sum2D = 0.0;
/* The disjoint data structure is expected to be present */
#pragma acc parallel loop reduction(+:sum2D) present(z[0:n]) if(usegpu)
for (int i = 0; i < n; i++)
{
for (int r = 0; r < ROW; r++)
{
if (r > 0)
{
for (int c = 0; c < r; c++)
{
sum2D += z[i].data2D[r][c];
}
}
}
}
return sum2D;
}
int main(int argc, char* argv[])
{
int n = 2, cnt1D, cnt2D; // cnt1D and cnt2D are counters
acc_set_device_num(1, acc_device_nvidia); // use the second GPU
//Allocate and initalize a disjoint data structure
struct A *z;
//The top level is an array of struct A objects
z = (struct A*) malloc(n * sizeof(struct A));
#pragma acc enter data create(z[0:n])
for (int i = 0; i < n; i++)
{
z[i].n = n;
#pragma acc update device(z[i].n)
z[i].data1D = (int*)malloc(ROW * sizeof(int));
#pragma acc enter data create(z[i].data1D[0:ROW])
z[i].data2D = (int**)malloc(ROW * sizeof(int*));
#pragma acc enter data create(z[i].data2D[0:ROW][0:1])
for (int r = 0; r < ROW; r++)
{
if (r > 0)
{
z[i].data2D[r] = (int*)malloc(r * sizeof(int));
#pragma acc enter data create(z[i].data2D[r:1][0:r])
}
}
cnt1D = 0;
cnt2D = 0;
for (int r = 0; r < ROW; r++)
{
z[i].data1D[r] = ++cnt1D;
if (r > 0)
{
for (int c = 0; c < r; c++)
{
z[i].data2D[r][c] = ++cnt2D; //OR *(*(z[i].data2D+r)+c) = ++cnt2D
}
#pragma acc update device(z[i].data2D[r:1][0:r])
}
}
#pragma acc update device(z[i].data1D[0:ROW])
}
//Compute and print host sum
printf("host_sum1D=%g\n", sum1D(z, n, 0));
printf("host_sum2D=%g\n", sum2D(z, n, 0));
//compute and print accelerator sum
printf("acc_sum1D=%g\n", sum1D(z, n, 1));
printf("acc_sum2D=%g\n", sum2D(z, n, 1));
return 0;
}
The compiler response is:
sum1D:
19, Generating present(z[:n])
Generating implicit copy(sum1D)
Accelerator kernel generated
Generating Tesla code
20, #pragma acc loop gang /* blockIdx.x */
Generating reduction(+:sum1D)
22, #pragma acc loop vector(128) /* threadIdx.x */
22, Loop is parallelizable
sum2D:
34, Generating present(z[:n])
Generating implicit copy(sum2D)
Accelerator kernel generated
Generating Tesla code
35, #pragma acc loop gang /* blockIdx.x */
Generating reduction(+:sum2D)
37, #pragma acc loop seq
41, #pragma acc loop vector(128) /* threadIdx.x */
37, Loop is parallelizable
41, Loop is parallelizable
main:
62, Generating enter data create(z[:n])
68, Generating update device(z->n)
69, Generating enter data create(z->data1D[:ROW])
72, Generating enter data create(z->data2D[:ROW][:1])
78, Generating enter data create(z->data2D[r][:r])
96, Generating update device(z->data2D[r][:r])
98, Generating update device(z->data1D[:ROW])
When run the exe, error information is:
host_sum1D=12
host_sum2D=12
acc_sum1D=12
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
call to cuMemFreeHost returned error 700: Illegal address during kernel execution
It can be seen that for 1d field the result is correct. However, for 2d field, the GPU result is incorrect.
How to solve this?