Hi Alexey,
The problem here is that rgb is a pointer into a larger memory area, buffer. What you need to do is create the buffer on the GPU and then use the present clause to map rgb, lab, and homo into this buffer (see below). Note that your program has an error where the buffer size is too small: a short is 2 bytes, so you need to adjust the buffer size accordingly.
For example:
% cat test.c
#include <stdio.h>   /* printf */
#include <stdlib.h>  /* malloc */

/* Extent of each of the two [TS][TS] dimensions of the rgb/lab/homo views. */
#define TS 256

/*
 * Smaller of a and b. The whole expansion is parenthesized so the macro can
 * be used inside a larger expression (e.g. 2 * MIN(x, y)). Each argument may
 * be evaluated twice, so do not pass expressions with side effects.
 */
#define MIN(a,b) ((a)<(b)?(a):(b))

/* Upper bounds used by the top/left loops in main. */
int width=800;
int height=600;
/*
 * Minimal OpenACC example: rgb, lab and homo are typed views into a single
 * malloc'd buffer. The data region creates the whole buffer on the device,
 * and the kernels region declares the three views "present" so they are
 * resolved inside that device copy instead of being transferred separately.
 *
 * Returns 0 on success, EXIT_FAILURE if the host buffer cannot be allocated.
 */
int main(void){
    int i, j, k, l, top, left, bufsize;
    /* cbrt, xyz_cam and dir are only referenced by the copyin clause below. */
    float cbrt[0x10000], xyz_cam[3][4];
    static const int dir[4] = { -1, 1, -TS, TS };
    unsigned short (*rgb)[TS][TS][3];
    short (*lab)[TS][TS][3];
    char (*homo)[TS][TS], *buffer;

    /* rgb[4] and lab[4] hold 12*TS*TS two-byte elements each; homo[2] holds
       2*TS*TS chars. */
    bufsize = (24*TS*TS*sizeof(short)) + (2*TS*TS*sizeof(char));
    buffer = malloc (bufsize);
    if (buffer == NULL) {
        return EXIT_FAILURE;
    }

    /*
     * buffer is a char *, so the offsets must be expressed in bytes. Using
     * the element counts 12*TS*TS and 24*TS*TS unscaled would place lab on
     * top of rgb[2..3] and homo on top of lab, and the kernels below would
     * then write different values to the same bytes. With the views kept
     * disjoint, lab[0][1][2][0] reads back as j+k = 3.
     */
    rgb = (unsigned short (*)[TS][TS][3]) buffer;
    lab = (short (*)[TS][TS][3]) (buffer + 12*TS*TS*sizeof(short));
    homo = (char (*)[TS][TS]) (buffer + 24*TS*TS*sizeof(short));

    #pragma acc data create(buffer[0:bufsize]), copyin(cbrt, xyz_cam, dir)
    {
        #pragma acc kernels present(rgb[0:4][0:TS][0:TS][0:3], lab[0:4][0:TS][0:TS][0:3], homo[0:2][0:TS][0:TS])
        {
            for (i=0; i < 4; ++i) {
                for (j=0; j < TS; ++j) {
                    for (k=0; k < TS; ++k) {
                        for (l=0; l < 3; ++l) {
                            rgb[i][j][k][l] = i+l;
                            lab[i][j][k][l] = j+k;
                        }
                    }
                }
            }
            for (i=0; i < 2; ++i) {
                for (j=0; j < TS; ++j) {
                    for (k=0; k < TS; ++k) {
                        homo[i][j][k] = 'a';
                    }
                }
            }
        }

        /* buffer was only created on the device, so bring the results back
           explicitly before the host reads them. */
        #pragma acc update host(rgb[0:4][0:TS][0:TS][0:3])
        #pragma acc update host(lab[0:4][0:TS][0:TS][0:3])
        #pragma acc update host(homo[0:2][0:TS][0:TS])

        /* Host-only loop: prints the clamped lower/right bound of each step. */
        for (top=2; top < height-5; top += TS-6) {
            for (left=2; left < width-5; left += TS-6) {
                int lr, lc;
                lr = MIN(top+TS, height-2);
                lc = MIN(left+TS, width-2);
                printf("%d%d ", lr, lc);
            }
        }
        printf("\n");

        /* Spot-check the first and last regions of each view. */
        printf("%d %d\n", rgb[0][1][2][0], rgb[3][TS-1][TS-1][2]);
        printf("%d %d\n", lab[0][1][2][0], lab[3][TS-1][TS-1][2]);
        printf("%c %c\n", homo[0][1][2], homo[1][TS-1][TS-1]);
    }

    free(buffer);
    return 0;
}
% pgcc test.c -Minfo=accel -Msafeptr -acc
main:
21, Generating local(buffer[0:bufsize])
Generating copyin(dir[0:])
Generating copyin(xyz_cam[0:][0:])
Generating copyin(cbrt[0:])
23, Generating present(homo[0:2][0:][0:])
Generating present(lab[0:4][0:][0:][0:])
Generating present(rgb[0:4][0:][0:][0:])
Generating compute capability 1.0 binary
Generating compute capability 2.0 binary
25, Loop is parallelizable
26, Loop is parallelizable
27, Loop is parallelizable
28, Loop is parallelizable
Accelerator kernel generated
27, #pragma acc loop gang /* blockIdx.y */
28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
CC 1.0 : 20 registers; 32 shared, 16 constant, 0 local memory bytes
CC 2.0 : 34 registers; 0 shared, 48 constant, 0 local memory bytes
33, Loop is parallelizable
34, Loop is parallelizable
35, Loop is parallelizable
Accelerator kernel generated
34, #pragma acc loop gang /* blockIdx.y */
35, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
CC 1.0 : 10 registers; 24 shared, 4 constant, 0 local memory bytes
CC 2.0 : 13 registers; 0 shared, 40 constant, 0 local memory bytes
40, Accelerator clause: upper bound for dimension 0 of array 'rgb' is unknown
Generating update host(rgb[0:4][0:][0:][0:])
41, Accelerator clause: upper bound for dimension 0 of array 'lab' is unknown
Generating update host(lab[0:4][0:][0:][0:])
43, Accelerator clause: upper bound for dimension 0 of array 'homo' is unknown
Generating update host(homo[0:2][0:][0:])
% a.out
258258 258508 258758 258798 508258 508508 508758 508798 598258 598508 598758 598798
0 5
2 510
a a