Random numbers on device generation

Hey all, I am trying to integrate thermal fluctuations to my code.
For that I need random numbers, lots of them. I do not want to compute them beforehand because it would be a gigantic array.

So the problem is, I need the numbers generated within an #pragma acc kernels region. As I learned there is no straightforward method for doing so… So I took the: $PGI/linux86-64/2017/examples/CUDA-Libraries/cuRAND/test_rand_oacc_ftn/trand8.cpp" as a starting point.

What I do is something like:

#pragma acc routine vector
void please_give_me_rands(float *restrict a, int n);
...
int main(int argc, char** argv)
{...
curandState_t state;
 
  /* Create arrays on the GPU */
  #pragma acc data copy(...)\
		   copyin(..., state)\
 		   create(..., yahoo[0:6])
 		   
  {...
#pragma acc kernels device_type(nvidia)
      {
      #pragma acc loop independent \
      device_type(nvidia)
      for(int k=0; k < 9; k++)
      {
        #pragma acc loop independent \
        device_type(nvidia)
        for(int i=0; i < Lx; i++)
        {
          #pragma acc loop independent \
          device_type(nvidia)
          for(int j=0; j < Ly; j++)
          {  
            please_give_me_rands(yahoo, 5);         
            #pragma acc loop \
            reduction(+:stress_ghost) device_type(nvidia)
            for(int m=3; m < 9; m++)
            {
              stress_ghost += w[k] * cc[m*9 + k] / sqrt(ww[m]) * yahoo[m-3];
            }
            zeta[ind(k,i,j)] = thermal_factor * sqrtf(rho[idx(i,j)]) * stress_ghost; 
          }
        }
      }
      }

and the subroutine is given by trand8,

void please_give_me_rands(float *restrict a, int n) /*, float *restrict b*/
{
  unsigned long long seed;
  unsigned long long seq;
  unsigned long long offset;
  curandState_t state;
  #pragma acc parallel num_gangs(1) pcopy(a[0:n]) private(state)
  {
    seed = 12345ULL; //4294967296ULL^time(NULL)
    seq = 0ULL;
    offset = 0ULL;
    curand_init(seed, seq, offset, &state);
    #pragma acc loop seq
    for (int i = 0; i < n; i++) {
     a[i] = curand_uniform(&state);
      //b[i] = curand_normal(&state);
    }
  }
}

and compile this with something like

...
CXXFLAGS = -fast -ta=tesla,nollvm -Mcuda=cuda8.0 -Minfo=accel -Minline 
CC = pgc++

WHERE = test

WHAT = Readme.dat

.PHONY: all
all: Swalbe.bin 

Swalbe.bin: Swalbe.cpp initial.cpp fields.cpp
	$(CC) $(CXXFLAGS) -o file.bin file.cpp 

.PHONY: run srun clean

otherun: file.bin
	ACC_DEVICE_NUM=1 ./file.bin $(WHAT) $(WHERE)
srun: file.bin
	srun ./file.bin
clean:
	rm -f file.bin *.o

Due to Minfo everything seems to be parallelized nicely however, there is one thing I do not understand,

PGCC-S-0155-Unsupported nested compute construct in compute construct or acc routine  (file.cpp: 887)
PGCC-S-0155-Accelerator region ignored; see -Minfo messages  (file.cpp)
please_give_me_rands(float *, int):
      0, Accelerator region ignored
    887, Accelerator restriction: invalid loop
PGCC/x86 Linux 17.5-0: compilation completed with severe errors
Makefile:23: recipe for target 'file.bin' failed

Sadly Minfo does not offer more information than this.
Is there a workaround for generating random numbers inside openacc kernels?

Best and thanks for the help,
Stefan

Hi Stefan,

PGCC-S-0155-Unsupported nested compute construct in compute construct or acc routine (file.cpp: 887)

This error means that you’re trying to put an OpenACC compute region inside of a device routine.

The problem being that you have a “parallel” region inside of “please_give_me_rands” which is a device routine. Removing the “parallel” region will work around this error. However, there are other issues as well. Things like “yahoo” being shared across threads so you’ll have a race condition, you copy in “state” but it’s a different “state” than what’s used in the routine, etc.

To help, I wrote the sample code below to show one possible way to use the device side random generator. I prefer creating a set of random seeds (one seed per gang, themselves randomly generated on the host) in order to help give more “randomness”. Each vector will have it’s own state and an initial seed based on the gang’s seed. I have not tested this for the random distribution or if the same random numbers can be regenerated.


% cat drand.c
#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>
#include "openacc_curand.h"

#ifndef VL
#define VL 128
#endif

int main(int argc, char** argv)
{
  int sizex, sizey, size;
  int ng, vl, i, j;
  float **val;

  long long seq = 0ULL;
  long long offset = 0ULL;
  long long *seed;
  if (argc > 1) {
        sizex = atoi(argv[1]);
        sizey = atoi(argv[2]);
  } else {
        printf("Useage: a.out <sizex> <sizey> [vectorlength] [numgangs]\n");
        exit(-1);
  }

  vl=VL;
  ng=(sizex+vl-1)/vl;

  seed = (long long*) malloc(ng*sizeof(long long));
  srand(12345);
  for (i=0; i < ng; ++i) {
     seed[i] = rand();
#ifdef ERR_CHECK
     printf("Seed[%d]=%d\n",i,seed[i]);
#endif
  }

  val = (float**) malloc(sizeof(float*)*sizex);
  for (i=0; i < sizex; ++i) {
    val[i] = (float*) malloc(sizeof(float)*sizey);
  }
#pragma acc enter data create(val[:sizex][:sizey]) copyin(seed[:ng])

#pragma acc parallel loop gang vector num_gangs(ng) vector_length(vl) present(val,seed)
  for (i=0; i < sizex; ++i) {
    curandState_t state;
    int gangidx = __pgi_gangidx();
    curand_init(seed[gangidx]+i, seq, offset, &state);
#pragma acc loop seq
    for (j=0; j < sizey; ++j) {
         val[i][j] = curand_uniform(&state);
    }
  }
#pragma acc exit data copyout(val[:sizex][:sizey]) delete(seed)

#ifdef ERR_CHECK
  int errcnt=0;
  for (i=0; i < sizex; ++i) {
    for (j=0; j < sizey; ++j) {
       float v = val[i][j];
       for (int ii=0; ii < sizex; ++ii) {
         for (int jj=0; jj < sizey; ++jj) {
            if (i!=ii && j!=jj && v == val[ii][jj]) {
              ++errcnt;
              printf ("Error Dup %d:%d %d:%d %f\n", i,j,ii,jj,v);
            }
            if (errcnt>100) exit(-1);
         }
      }
    }
  }
#endif

  for (i=0; i < 10; ++i) {
    for (j=0; j < 10; ++j) {
      printf("%d:%d %f\n", i,j,val[i][j]);
  }}

  for (i=0; i < sizex; ++i) {
    free(val[i]);
  }
  free(val);

  exit(0);
}

% pgcc -ta=tesla:cc70,nollvm -Mcudalib=curand drand.c
% a.out 8192 1024
0:0 0.651953
0:1 0.821982
0:2 0.551691
0:3 0.614951
0:4 0.596030
0:5 0.947674
0:6 0.340820
0:7 0.723492
0:8 0.373916
0:9 0.261695
1:0 0.599253
1:1 0.325901
1:2 0.825219
1:3 0.870987
1:4 0.015748
1:5 0.898698
1:6 0.587319
1:7 0.729264
1:8 0.965143
1:9 0.501904
2:0 0.541846
2:1 0.340815
2:2 0.244950
2:3 0.593337
2:4 0.049954
2:5 0.294792
2:6 0.718530
2:7 0.011183
2:8 0.908156
2:9 0.832653
3:0 0.895517
3:1 0.610951
3:2 0.490819
3:3 0.482031
3:4 0.583828
3:5 0.954938
3:6 0.502656
3:7 0.188714
3:8 0.520537
3:9 0.486876
4:0 0.888914
4:1 0.084742
..... cut ...

Hope this helps,
Mat

Hi Mat,

Many thanks to you it does compile and run now :)

Thanks again,
Stefan