variable data clause is partially present on the GPU

Hi all!

What does this error mean?

FATAL ERROR: variable data clause is partially present on the GPU

I know my array bounds, and I am not going to read or write outside them.
Here is the test code:

#include <stdlib.h>

/* Extent of the TS x TS working arrays; the tiling loops advance by TS-6. */
#define TS 256
/*
 * Smaller of a and b.
 * The whole expansion is parenthesized so the macro can be embedded in a
 * larger expression (e.g. 2 * MIN(x, y)) without the surrounding operators
 * binding into the conditional. Each argument may be evaluated twice, so do
 * not pass expressions with side effects.
 */
#define MIN(a,b) ((a)<(b)?(a):(b))

/* Upper limits used by the tiling loops in main. */
int width=800;
int height=600;

/*
 * Test program from the question: carves three typed views (rgb, lab, homo)
 * out of one malloc'd buffer, names the views in an OpenACC data directive,
 * and then prints one pair of values per tile of the width x height area.
 */
void main(void){
  int i, j, k, top, left, row, col, tr, tc;
  float r, cbrt[0x10000], xyz0, xyz1, xyz2, xyz_cam[3][4];
  static const int dir[4] = { -1, 1, -TS, TS };
  ushort (*rgb)[TS][TS][3];
  short (*lab)[TS][TS][3], (*lix)[3];
   char (*homo)[TS][TS], *buffer;

  /* One allocation of 26*TS*TS bytes backs all three views. */
  buffer = (char *) malloc (26*TS*TS);      
  /* rgb starts at byte 0, lab at byte 12*TS*TS, homo at byte 24*TS*TS. */
  rgb  = (ushort(*)[TS][TS][3]) buffer;
  lab  = (short (*)[TS][TS][3])(buffer + 12*TS*TS);
  homo = (char  (*)[TS][TS])   (buffer + 24*TS*TS);
  /*
   * NOTE(review): the clauses below name 4 planes each of rgb and lab. If
   * ushort/short are 2 bytes, each of those sections is 24*TS*TS bytes, which
   * is more than the 12*TS*TS-byte spacing between the views and, taken
   * together, more than the 26*TS*TS bytes allocated -- confirm the sizes.
   * NOTE(review): no braces follow the directive, so it presumably covers
   * only the for statement that comes next -- confirm.
   */
#pragma acc data create(rgb[0:4][0:TS][0:TS][0:3],lab[4][TS][TS][3], homo[2][TS][TS]) \
                 copyin(cbrt, xyz_cam, dir)
  /* Walk the area in steps of TS-6, starting at 2 in both directions. */
  for (top=2; top < height-5; top += TS-6)
    for (left=2; left < width-5; left += TS-6) {
      int lr, lc;

      /* Far edge of the tile, limited to height-2 / width-2. */
      lr = MIN(top+TS, height-2);
      lc = MIN(left+TS, width-2);
          /* lr and lc are printed back to back with no separator between them. */
          printf("%d%d ", lr, lc);
        }

  printf("\n");

}

Here is the output:

arom@cuda:~/test_pgi$ pgcc -acc -Minfo=all ./test1.c
main:
     24, Generating local(homo[0:2][0:][0:])
         Generating local(lab[0:4][0:][0:][0:])
         Generating local(rgb[0:4][0:][0:][0:])
         Generating copyin(dir[0:])
         Generating copyin(xyz_cam[0:][0:])
         Generating copyin(cbrt[0:])
arom@cuda:~/test_pgi$ ./a.out
FATAL ERROR: variable data clause is partially present on the GPU: name=rgb
 file:/home/arom/cuda/dcraw/./test1.c main line:24
  for (top=2; top < height-5; top += TS-6) 
    for (left=2; left < width-5; left += TS-6) {

Should there be a brace after that first for-statement (and an ending one)?

Matt

Hi Alexey,

The problem here is that rgb is a pointer into a larger memory area, buffer. What you need to do is create the buffer on the GPU and then use the present clause to map rgb, lab, and homo into this buffer (see below). Note that your program also has an error: the buffer is too small. A short is 2 bytes, so the buffer size, and the byte offsets of lab and homo within the buffer, need to be adjusted accordingly.

For Example:

% cat test.c
#include <stdio.h>
#include <stdlib.h>

/* Extent of the TS x TS working arrays; the tiling loops advance by TS-6. */
#define TS 256
/*
 * Smaller of a and b.
 * The whole expansion is parenthesized so the macro can be embedded in a
 * larger expression (e.g. 2 * MIN(x, y)) without the surrounding operators
 * binding into the conditional. Each argument may be evaluated twice, so do
 * not pass expressions with side effects.
 */
#define MIN(a,b) ((a)<(b)?(a):(b))

/* Upper limits used by the tiling loops in main. */
int width=800;
int height=600;

/*
 * Example: expose three typed views (rgb, lab, homo) of one malloc'd host
 * buffer to the accelerator.
 *
 * Only `buffer` itself is created in the data region; the kernels region
 * then refers to the views with present(), and update host() copies each
 * view back before the values are printed.
 *
 * The views are laid out back to back and their byte offsets are derived
 * from the size of each view. With fixed offsets of 12*TS*TS and 24*TS*TS
 * bytes, the 2-byte rgb and lab elements would overlap each other and homo,
 * so writes through one view would clobber another.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void){
  /* Number of TS x TS planes in each view. */
  enum { RGB_PLANES = 4, LAB_PLANES = 4, HOMO_PLANES = 2 };
  int i, j, k, l, top, left;
  /* Not used on the host; kept so the copyin clause matches the test case. */
  float cbrt[0x10000], xyz_cam[3][4];
  static const int dir[4] = { -1, 1, -TS, TS };
  unsigned short (*rgb)[TS][TS][3];
  short (*lab)[TS][TS][3];
  char (*homo)[TS][TS], *buffer;
  /* sizeof on the pointee type does not evaluate the (still unset) pointer. */
  const size_t rgb_bytes  = RGB_PLANES  * sizeof *rgb;
  const size_t lab_bytes  = LAB_PLANES  * sizeof *lab;
  const size_t homo_bytes = HOMO_PLANES * sizeof *homo;
  const size_t bufsize = rgb_bytes + lab_bytes + homo_bytes;

  buffer = malloc(bufsize);
  if (buffer == NULL) {
    fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long) bufsize);
    return EXIT_FAILURE;
  }
  /* Consecutive, non-overlapping views: rgb, then lab, then homo. */
  rgb  = (unsigned short (*)[TS][TS][3]) buffer;
  lab  = (short (*)[TS][TS][3]) (buffer + rgb_bytes);
  homo = (char (*)[TS][TS]) (buffer + rgb_bytes + lab_bytes);

#pragma acc data create(buffer[0:bufsize]), copyin(cbrt, xyz_cam, dir)
  {
#pragma acc kernels present(rgb[0:RGB_PLANES][0:TS][0:TS][0:3], lab[0:LAB_PLANES][0:TS][0:TS][0:3], homo[0:HOMO_PLANES][0:TS][0:TS])
    {
      /* rgb and lab have the same shape, so one loop nest fills both. */
      for (i=0; i < RGB_PLANES; ++i) {
        for (j=0; j < TS; ++j) {
          for (k=0; k < TS; ++k) {
            for (l=0; l < 3; ++l) {
              rgb[i][j][k][l] = i+l;
              lab[i][j][k][l] = j+k;
            }
          }
        }
      }

      for (i=0; i < HOMO_PLANES; ++i) {
        for (j=0; j < TS; ++j) {
          for (k=0; k < TS; ++k) {
            homo[i][j][k] = 'a';
          }
        }
      }
    }
    /* Copy each view back while the data region is still open. */
#pragma acc update host(rgb[0:RGB_PLANES][0:TS][0:TS][0:3])
#pragma acc update host(lab[0:LAB_PLANES][0:TS][0:TS][0:3])
#pragma acc update host(homo[0:HOMO_PLANES][0:TS][0:TS])
  }

  /* Host-only tiling loop: far edge of each tile, limited to the area. */
  for (top=2; top < height-5; top += TS-6) {
    for (left=2; left < width-5; left += TS-6) {
      int lr, lc;

      lr = MIN(top+TS, height-2);
      lc = MIN(left+TS, width-2);
      printf("%d%d ", lr, lc);
    }
  }

  printf("\n");
  /* Spot-check the first and last planes of every view. */
  printf("%d %d\n", rgb[0][1][2][0], rgb[3][TS-1][TS-1][2]);
  printf("%d %d\n", lab[0][1][2][0], lab[3][TS-1][TS-1][2]);
  printf("%c %c\n", homo[0][1][2], homo[1][TS-1][TS-1]);

  free(buffer);
  return 0;
}
% pgcc test.c -Minfo=accel -Msafeptr -acc 
main:
     21, Generating local(buffer[0:bufsize])
         Generating copyin(dir[0:])
         Generating copyin(xyz_cam[0:][0:])
         Generating copyin(cbrt[0:])
     23, Generating present(homo[0:2][0:][0:])
         Generating present(lab[0:4][0:][0:][0:])
         Generating present(rgb[0:4][0:][0:][0:])
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
     25, Loop is parallelizable
     26, Loop is parallelizable
     27, Loop is parallelizable
     28, Loop is parallelizable
         Accelerator kernel generated
         27, #pragma acc loop gang /* blockIdx.y */
         28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
             CC 1.0 : 20 registers; 32 shared, 16 constant, 0 local memory bytes
             CC 2.0 : 34 registers; 0 shared, 48 constant, 0 local memory bytes
     33, Loop is parallelizable
     34, Loop is parallelizable
     35, Loop is parallelizable
         Accelerator kernel generated
         34, #pragma acc loop gang /* blockIdx.y */
         35, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             CC 1.0 : 10 registers; 24 shared, 4 constant, 0 local memory bytes
             CC 2.0 : 13 registers; 0 shared, 40 constant, 0 local memory bytes
     40, Accelerator clause: upper bound for dimension 0 of array 'rgb' is unknown
         Generating update host(rgb[0:4][0:][0:][0:])
     41, Accelerator clause: upper bound for dimension 0 of array 'lab' is unknown
         Generating update host(lab[0:4][0:][0:][0:])
     43, Accelerator clause: upper bound for dimension 0 of array 'homo' is unknown
         Generating update host(homo[0:2][0:][0:])
% a.out
258258 258508 258758 258798 508258 508508 508758 508798 598258 598508 598758 598798 
0 5
2 510
a a

Thank you, guys!