Problem with global variables

When running the attached piece of code I get the following error:

call to cuModuleGetGlobal returned error 500: Not found

The problem occurs any time the directive #pragma acc update device
is encountered, which suggests that the two global variables are not visible/present on the device… and I can’t understand why!

Compiled with pgi 15.1:

pgcc -acc -o test main.c


#include <stdio.h>
#include <stdlib.h>

#define VOL 256

/* Host pointer to the work buffer; main() allocates VOL doubles for it. */
double *v;
/* OpenACC declare directive naming the global pointer v in a create clause. */
#pragma acc declare create(v)
/* Global scalar incremented on the host and read inside the parallel loop. */
double ii;
/* OpenACC declare directive naming the global scalar ii in a create clause. */
#pragma acc declare create(ii)

/*
 * Reproducer: initialise v on the host, copy it to the device, then overwrite
 * it 100 times from an OpenACC parallel loop that reads the global scalar ii.
 * Finally copy v back and print v[1].
 *
 * Returns 0 on success, EXIT_FAILURE if the host buffer cannot be allocated.
 */
int main(int argc, char **argv) {

  int i,j;
  void *buf = NULL;

  ii = 0.;
  #pragma acc update device(ii)

  /* posix_memalign expects a void ** and reports failure through its return
     value, so allocate through a void * and check the result before use. */
  if (posix_memalign(&buf, 4096, VOL * sizeof(double)) != 0) {
    fprintf(stderr, "posix_memalign failed\n");
    return EXIT_FAILURE;
  }
  v = buf;
  #pragma acc enter data create(v[0:VOL])

  /* No ';' after the for header: with it the loop body was empty and the
     assignment ran once with i == VOL, writing past the end of v. */
  for(i = 0; i < VOL; i++){
    v[i] = 1.;
  }

  #pragma acc update device(v[0:VOL])

  for(i = 0; i < 100; i++){
    ii = ii + 1.;
    /* Refresh the device copy of ii before each kernel launch. */
    #pragma acc update device(ii)

    #pragma acc parallel loop 
    for (j = 0; j < VOL; j++){
      v[j] = ii + 1.0;
    }

  }

  /* Bring the final device contents of v back to the host for printing. */
  #pragma acc update host(v[0:VOL])

  fprintf(stdout,"V(1)=%f\n",v[1]);

  free(v);

  return 0;
}

Hi Alga,

We don’t support the “declare” directive on global variables quite yet. However, for this code I don’t think the directives are necessary, since the intent of “declare create” is for use with static arrays. Also, scalars are firstprivate by default, so you don’t need to manage “ii”.

% cat test.c
#include <stdio.h>
#include <stdlib.h>

#define VOL 256

/* Host pointer to the work buffer; main() allocates VOL doubles for it. */
double *v;
 /* Global scalar incremented on the host and read inside the parallel loop. */
 double ii;

 /*
  * Initialise v on the host, copy it to the device, then overwrite it 100
  * times from an OpenACC parallel loop that reads the global scalar ii.
  * Finally copy v back and print v[1].
  *
  * Returns 0 on success, EXIT_FAILURE if the host buffer cannot be allocated.
  */
 int main(int argc, char **argv) {

   int i,j;
   void *buf = NULL;

   ii = 0.;
   /* posix_memalign expects a void ** and reports failure through its return
      value, so allocate through a void * and check the result before use. */
   if (posix_memalign(&buf, 4096, VOL * sizeof(double)) != 0) {
     fprintf(stderr, "posix_memalign failed\n");
     return EXIT_FAILURE;
   }
   v = buf;
   #pragma acc enter data create(v[0:VOL])

   /* No ';' after the for header: with it the loop body was empty and the
      assignment ran once with i == VOL, writing past the end of v. */
   for(i = 0; i < VOL; i++){
     v[i] = 1.;
   }

   #pragma acc update device(v[0:VOL])

   for(i = 0; i < 100; i++){
     ii = ii + 1.;
     /* ii is read in the loop below without any explicit data directive. */
     #pragma acc parallel loop
     for (j = 0; j < VOL; j++){
       v[j] = ii + 1.0;
     }
   }
   /* Bring the final device contents of v back to the host for printing. */
   #pragma acc update host(v[0:VOL])
   fprintf(stdout,"V(1)=%f\n",v[1]);
   free(v);
   return 0;
 }

% pgcc -acc -Minfo=accel test.c; a.out
main:
     15, Generating enter data create(v[:256])
     22, Generating update device(v[:256])
     24, Accelerator kernel generated
         25, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     24, Generating copyout(v[:256])
         Generating Tesla code
     30, Generating update host(v[:256])
V(1)=101.000000

Thanks,
Mat