Here’s a modified version of jagged_array.c where I’m doing the data management through the API calls. It’s a bit more manual, but it does allow for a device-to-device copy when doing the realloc.
#include <stdlib.h>
#include <stdio.h>
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifndef N
#define N 32
#endif
/* Allocate a jagged array with size1 rows; row i holds sizes[i] doubles.
   When built with OpenACC, a matching device copy is set up as well. */
double ** allocData(size_t size1, int * sizes);
/* Resize every row of A from oldsizes[i] to newsizes[i] elements, on the
   host and (when built with OpenACC) on the device. */
void reallocData(double ** A, size_t size1, int * oldsizes, int * newsizes);
/* Release every row of A and the pointer array itself (host and device). */
int deleteData(double ** A, size_t size1, int * sizes);
/* Set every element of A to val and push each row to the device. */
int initData(double **A, size_t size1, int * sizes, double val);
/* Print the first and last element of the first five and last five rows. */
int printData(double **A, size_t size1, int * sizes);
/*
 * Driver: builds two jagged arrays A and B with N rows, computes
 * A[j][i] = B[j][i] + (j*rowlen + i) on the device, prints a sample of the
 * result, then grows every row by 32 elements and repeats the computation.
 */
int main(void) {
    double **A, **B;
    int *sizes, *newsizes;
    size_t size1 = N;

    /* create an array of sizes for each jagged array */
    sizes = malloc(sizeof(int)*size1);
    newsizes = malloc(sizeof(int)*size1);
    if (sizes == NULL || newsizes == NULL) {
        fprintf(stderr, "main: out of host memory\n");
        free(sizes);
        free(newsizes);
        return EXIT_FAILURE;
    }
    for (size_t i = 0; i < size1; ++i) {
        sizes[i] = (int) i + 10;
        newsizes[i] = sizes[i] + 32;
    }
#pragma acc enter data copyin(sizes[:size1],newsizes[:size1])

    /* Allocate the jagged arrays */
    A = allocData(size1, sizes);
    B = allocData(size1, sizes);

    /* set the initial values of B */
    initData(B, size1, sizes, 2.5);

    /* Perform the computation on the device */
#pragma acc parallel loop gang present(A,B,sizes)
    for (size_t j = 0; j < size1; ++j) {
        int size2 = sizes[j];
#pragma acc loop vector
        for (size_t i = 0; i < size2; ++i) {
            A[j][i] = B[j][i] + (double) ((j*size2)+i);
        }
    }
#ifdef _OPENACC
    /* Copy back the results, one row at a time */
    for (size_t j = 0; j < size1; ++j) {
        acc_update_self(A[j], sizes[j]*sizeof(double));
    }
#endif
    printData(A, size1, sizes);

    /* reallocate A and B's jagged arrays using new sizes */
    reallocData(A, size1, sizes, newsizes);
    reallocData(B, size1, sizes, newsizes);
    initData(B, size1, newsizes, 3.0);

#pragma acc parallel loop gang present(A,B,newsizes)
    for (size_t j = 0; j < size1; ++j) {
        int size2 = newsizes[j];
#pragma acc loop vector
        for (size_t i = 0; i < size2; ++i) {
            A[j][i] = B[j][i] + (double) ((j*size2)+i);
        }
    }
#ifdef _OPENACC
    /* copy back the results, one row at a time */
    for (size_t j = 0; j < size1; ++j) {
        acc_update_self(A[j], newsizes[j]*sizeof(double));
    }
#endif
    printData(A, size1, newsizes);

    /* The rows currently have newsizes[] elements, so delete with those */
    deleteData(A, size1, newsizes);
    deleteData(B, size1, newsizes);

    /* Same shape as the enter data directive above */
#pragma acc exit data delete(sizes[:size1],newsizes[:size1])
    free(sizes);
    free(newsizes);
    return 0;
}
/*
 * Allocate a jagged array of doubles.
 *
 * size1  number of rows
 * sizes  sizes[i] is the element count of row i
 *
 * Returns the host array of row pointers. When built with OpenACC, the
 * pointer array is also created on the device, each row gets its own device
 * vector mapped to the host row, and the device pointer is attached into
 * the device copy of the pointer array.
 *
 * On allocation failure a message is written to stderr and the program
 * exits; the function never returns a partially built array.
 */
double ** allocData(size_t size1, int * sizes) {
    /* Create an array of pointers */
    double **tmp = malloc(size1 * sizeof *tmp);
    if (tmp == NULL && size1 != 0) {
        fprintf(stderr, "allocData: out of host memory\n");
        exit(EXIT_FAILURE);
    }
#ifdef _OPENACC
    /* Create the pointer array on the device */
    acc_create(tmp, sizeof(double*)*size1);
#endif
    for (size_t i = 0; i < size1; ++i) {
        size_t bytes = (size_t) sizes[i] * sizeof(double);

        /* Create the vector */
        tmp[i] = malloc(bytes);
        if (tmp[i] == NULL && bytes != 0) {
            fprintf(stderr, "allocData: out of host memory\n");
            exit(EXIT_FAILURE);
        }
#ifdef _OPENACC
        /* allocate the vector on the device */
        double *dptr = (double *) acc_malloc(bytes);
        if (dptr == NULL && bytes != 0) {
            fprintf(stderr, "allocData: out of device memory\n");
            exit(EXIT_FAILURE);
        }
        /* map the host vector to the device vector */
        acc_map_data(tmp[i], dptr, bytes);
        /* attach the vector (i.e. fill-in the device pointer value in the
           device jagged array at element i) */
        acc_attach((void**)&tmp[i]);
#endif
    }
    return tmp;
}
/*
 * Resize every row of the jagged array A.
 *
 * A         array of size1 row pointers (updated in place)
 * size1     number of rows
 * oldsizes  current element count of each row
 * newsizes  requested element count of each row
 *
 * The host rows are resized with realloc. When built with OpenACC, each row
 * also gets a new device vector; the overlapping part of the old device
 * vector is copied device-to-device before the old one is freed.
 *
 * On host allocation failure a message is written to stderr and the program
 * exits.
 */
void reallocData(double ** A, size_t size1, int * oldsizes, int * newsizes) {
    for (size_t i = 0; i < size1; ++i) {
        size_t newbytes = (size_t) newsizes[i] * sizeof(double);
#ifdef _OPENACC
        size_t oldbytes = (size_t) oldsizes[i] * sizeof(double);
        /* store the device pointer for this vector and unmap A so it's no
           longer associated with this device array */
        double *olddptr = (double *) acc_deviceptr(A[i]);
        acc_unmap_data(A[i]);
#endif
        /* Keep the old pointer until realloc is known to have succeeded */
        double *resized = realloc(A[i], newbytes);
        if (resized == NULL && newbytes != 0) {
            fprintf(stderr, "reallocData: out of host memory\n");
            exit(EXIT_FAILURE);
        }
        A[i] = resized;
#ifdef _OPENACC
        /* malloc a new device vector with the new size and map A[i] to the
           new device vector */
        double *newdptr = (double *) acc_malloc(newbytes);
        acc_map_data(A[i], newdptr, newbytes);
        acc_attach((void**)&A[i]);
        /* device to device copy from old to new device vector; the length
           argument is in BYTES, so copy min(old, new) elements * sizeof */
        acc_memcpy_device(newdptr, olddptr,
                          newbytes > oldbytes ? oldbytes : newbytes);
        /* delete the old device vector */
        acc_free(olddptr);
#endif
    }
}
/*
 * Release a jagged array.
 *
 * A      array of size1 row pointers
 * size1  number of rows
 * sizes  per-row element counts (not needed for the release; kept so the
 *        signature matches the other helpers)
 *
 * Frees every host row and the pointer array. When built with OpenACC, each
 * row is first unmapped and its device vector freed, and the device copy of
 * the pointer array is deleted.
 *
 * Returns 0.
 */
int deleteData(double ** A, size_t size1, int * sizes) {
    (void) sizes;
    for (size_t i = 0; i < size1; ++i) {
#ifdef _OPENACC
        /* fetch the device pointer before unmapping, then free it */
        double *dptr = (double *) acc_deviceptr(A[i]);
        acc_unmap_data(A[i]);
        acc_free(dptr);
#endif
        free(A[i]);
    }
#ifdef _OPENACC
    acc_delete(A, sizeof(double*)*size1);
#endif
    free(A);
    return 0;
}
/*
 * Fill every element of the jagged array A with val.
 *
 * A      array of size1 row pointers
 * size1  number of rows
 * sizes  sizes[j] is the element count of row j
 * val    value stored in every element
 *
 * When built with OpenACC, each row is pushed to the device right after it
 * has been filled on the host.
 *
 * Returns 0.
 */
int initData(double **A, size_t size1, int * sizes, double val) {
    for (size_t j = 0; j < size1; ++j) {
        int size2 = sizes[j];
        /* Nothing to fill or transfer; also keeps a negative size from
           being converted to a huge unsigned count */
        if (size2 <= 0) {
            continue;
        }
        for (int i = 0; i < size2; ++i) {
            A[j][i] = val;
        }
        /* Update the device with the initial values */
#ifdef _OPENACC
        acc_update_device(A[j], sizeof(double)*size2);
#endif
    }
    return 0;
}
/*
 * Print a sample of the jagged array A: the first and last element of the
 * first five rows, a "...." separator, then the same for the last five rows.
 *
 * A      array of size1 row pointers
 * size1  number of rows; when there are fewer than five rows, every row is
 *        printed in both sections instead of reading out of bounds
 * sizes  sizes[i] is the element count of row i
 *
 * Returns 0.
 */
int printData(double **A, size_t size1, int * sizes) {
    const size_t edge = 5;
    /* Clamp both windows so size1 < 5 neither over-reads nor underflows */
    size_t head_end = size1 < edge ? size1 : edge;
    size_t tail_begin = size1 > edge ? size1 - edge : 0;

    printf("Values:\n");
    for (size_t i = 0; i < head_end; ++i) {
        int last = sizes[i]-1;
        printf("A[%zu][0]=%f A[%zu][%d]=%f\n", i, A[i][0], i, last, A[i][last]);
    }
    printf("....\n");
    for (size_t i = tail_begin; i < size1; ++i) {
        int last = sizes[i]-1;
        printf("A[%zu][0]=%f A[%zu][%d]=%f\n", i, A[i][0], i, last, A[i][last]);
    }
    return 0;
}
% nvc -fast -w jagged.c -acc=gpu -Minfo=accel ; a.out
main:
34, Generating enter data copyin(newsizes[:size1],sizes[:size1])
42, Generating present(A[:1],sizes[:1],B[:1])
Generating Tesla code
42, #pragma acc loop gang /* blockIdx.x */
45, #pragma acc loop vector(128) /* threadIdx.x */
45, Loop is parallelizable
64, Generating present(A[:1],newsizes[:1],B[:1])
Generating Tesla code
64, #pragma acc loop gang /* blockIdx.x */
67, #pragma acc loop vector(128) /* threadIdx.x */
67, Loop is parallelizable
82, Generating exit data delete(sizes[:1],newsizes[:1])
Values:
A[0][0]=2.500000 A[0][9]=11.500000
A[1][0]=13.500000 A[1][10]=23.500000
A[2][0]=26.500000 A[2][11]=37.500000
A[3][0]=41.500000 A[3][12]=53.500000
A[4][0]=58.500000 A[4][13]=71.500000
....
A[27][0]=1001.500000 A[27][36]=1037.500000
A[28][0]=1066.500000 A[28][37]=1103.500000
A[29][0]=1133.500000 A[29][38]=1171.500000
A[30][0]=1202.500000 A[30][39]=1241.500000
A[31][0]=1273.500000 A[31][40]=1313.500000
Values:
A[0][0]=3.000000 A[0][41]=44.000000
A[1][0]=46.000000 A[1][42]=88.000000
A[2][0]=91.000000 A[2][43]=134.000000
A[3][0]=138.000000 A[3][44]=182.000000
A[4][0]=187.000000 A[4][45]=232.000000
....
A[27][0]=1866.000000 A[27][68]=1934.000000
A[28][0]=1963.000000 A[28][69]=2032.000000
A[29][0]=2062.000000 A[29][70]=2132.000000
A[30][0]=2163.000000 A[30][71]=2234.000000
A[31][0]=2266.000000 A[31][72]=2338.000000