nvlink error: Undefined reference to '__pgicudalib_curandUniformXORWOW'

I am trying to develop code for Monte Carlo simulations and want to use the cuRAND library in my Fortran code. I wrote a test program based on the example device code at: https://www.pgroup.com/doc/pgi17cudaint.pdf just to get a feel for how it interfaces with CUDA’s C++ libraries.

From my understanding, the error that the compiler has been giving me is that it is having trouble linking to the library:

nvlink error   : Undefined reference to '__pgicudalib_curandUniformXORWOW' in 'test.o'
nvlink error   : Undefined reference to '__pgicudalib_curandInitXORWOW' in 'test.o'

I would like to know how I can properly link the curand library or edit my code to make this work. I’m sure I am making some stupid mistake.

The interface looks like this:

! Hand-written Fortran bindings for part of the cuRAND API.
!
! The first group of interfaces binds host-side generator routines by their C
! names; the second group declares device-side routines.
!
! NOTE(review): every host binding is declared as a subroutine, so a status
! value returned by the underlying C routine is not visible to the caller --
! confirm this is intended.
module curan_m
    implicit none

    ! Generator type selectors (the rng_type argument of curandCreateGenerator).
    ! Declared as named constants so they cannot be modified at run time.
    integer, parameter, public :: CURAND_RNG_PSEUDO_DEFAULT = 100
    integer, parameter, public :: CURAND_RNG_PSEUDO_XORWOW = 101
    integer, parameter, public :: CURAND_RNG_PSEUDO_MTGP32 = 141
    integer, parameter, public :: CURAND_RNG_QUASI_DEFAULT = 200
    integer, parameter, public :: CURAND_RNG_QUASI_SOBOL32 = 201
    integer, parameter, public :: CURAND_RNG_QUASI_SOBOL64 = 203

    ! Ordering selectors (the order argument of curandSetGeneratorOrdering).
    integer, parameter, public :: CURAND_ORDERING_PSEUDO_BEST = 100
    integer, parameter, public :: CURAND_ORDERING_PSEUDO_SEEDED = 102
    integer, parameter, public :: CURAND_ORDERING_QUASI_DEFAULT = 201

    ! ------------------------------------------------------------------
    ! Host-side generator API
    ! ------------------------------------------------------------------

    ! Create a generator of kind rng_type.  generator is passed by reference
    ! so the C routine can store the new handle in it.
    interface curandCreateGenerator
        subroutine curandCreateGenerator ( &
                generator, rng_type) &
                bind(C,name='curandCreateGenerator')
            use iso_c_binding
            integer(c_size_t) :: generator
            integer(c_int), value :: rng_type
        end subroutine curandCreateGenerator
    end interface curandCreateGenerator

    ! Select the ordering used by an existing generator.  The handle is an
    ! input here, so it is passed by value like in the other routines that
    ! consume a handle; passing it by reference would hand the C routine the
    ! address of the Fortran variable instead of the handle itself.
    interface curandSetGeneratorOrdering
        subroutine curandSetGeneratorOrdering ( &
                generator, order) &
                bind(C,name='curandSetGeneratorOrdering')
            use iso_c_binding
            integer(c_size_t), value :: generator
            integer(c_int), value :: order
        end subroutine curandSetGeneratorOrdering
    end interface curandSetGeneratorOrdering

    ! Set the seed of a pseudo-random generator.
    interface curandSetPseudoRandomGeneratorSeed
        subroutine curandSetPseudoRandomGeneratorSeed ( &
                generator, seed) &
                bind(C,name='curandSetPseudoRandomGeneratorSeed')
            use iso_c_binding
            integer(c_size_t), value :: generator
            integer(c_long_long), value :: seed
        end subroutine curandSetPseudoRandomGeneratorSeed
    end interface curandSetPseudoRandomGeneratorSeed

    ! Generic over single and double precision: fill the device array odata
    ! with numele uniform values.  The ignore_tr directive relaxes type/rank
    ! checking on odata.
    interface curandGenerateUniform

        subroutine curandGenerateUniform ( &
                generator, odata, numele) &
                bind(C,name='curandGenerateUniform')
            use iso_c_binding
            integer(c_size_t), value :: generator
            !pgi$ ignore_tr odata
            real(c_float), device:: odata(*)
            integer(c_size_t), value :: numele
        end subroutine curandGenerateUniform

        subroutine curandGenerateUniformDouble ( &
                generator, odata, numele) &
                bind(C,name='curandGenerateUniformDouble')
            use iso_c_binding
            integer(c_size_t), value :: generator
            !pgi$ ignore_tr odata
            real(c_double), device:: odata(*)
            integer(c_size_t), value :: numele
        end subroutine curandGenerateUniformDouble
    end interface curandGenerateUniform

    ! Release a generator created by curandCreateGenerator.
    interface curandDestroyGenerator
        subroutine curandDestroyGenerator (generator) &
                bind(C,name='curandDestroyGenerator')
            use iso_c_binding
            integer(c_size_t), value :: generator
        end subroutine curandDestroyGenerator

    end interface curandDestroyGenerator

    ! ------------------------------------------------------------------
    ! Device-side API
    !
    ! NOTE(review): these declare device routines bound to the C names
    ! curand_init / curand / curand_uniform / curand_normal with the state
    ! typed as a real array.  Whether linkable device symbols with these
    ! names exist has not been verified here -- confirm before relying on
    ! them.
    ! ------------------------------------------------------------------

    ! Initialise a generator state from (seed, sequence, offset).
    interface curand_init

        attributes(device) subroutine curand_init(seed,sequence,offset,state) &
                bind(C,name='curand_init')
            use iso_c_binding
            integer(c_long_long),value :: seed
            integer(c_long_long),value :: sequence
            integer(c_long_long),value :: offset
            !pgi$ ignore_tr state
            real(c_float), device :: state(*)
        end subroutine curand_init

        ! NOTE(review): curandStateXORWOW is not declared in this module and
        ! this interface body has no use/import statement bringing it into
        ! scope; unlike the other specifics it also has no bind(C) name.
        ! Verify that this compiles and links as intended.
        attributes(device) subroutine curandInitXORWOW(seed, sequence, offset, state)

            integer(8) :: seed
            integer(8) :: sequence
            integer(8) :: offset
            TYPE(curandStateXORWOW) :: state
        end subroutine curandInitXORWOW

    end interface curand_init

    ! Device routine bound to the C name 'curand'; takes only the state.
    interface curand

        attributes(device) subroutine curand(state) &
                bind(C,name='curand')
            use iso_c_binding
            !pgi$ ignore_tr state
            real(c_float),device :: state(*)
        end subroutine curand

    end interface curand

    ! Generic over the single and double precision uniform device routines.
    ! NOTE(review): both specifics are subroutines with no result argument,
    ! so they cannot be used as `x = curand_uniform(state)` -- confirm.
    interface curand_uniform

        attributes(device) subroutine curand_uniform(state) &
                bind(C,name='curand_uniform')
            use iso_c_binding
            !pgi$ ignore_tr state
            real(c_float),device :: state(*)
        end subroutine curand_uniform

        attributes(device) subroutine curand_uniform_double(state) &
                bind(C,name='curand_uniform_double')
            use iso_c_binding
            !pgi$ ignore_tr state
            real(c_double),device :: state(*)
        end subroutine curand_uniform_double

    end interface curand_uniform

    ! Generic over the single and double precision normal device routines.
    interface curand_normal

        attributes(device) subroutine curand_normal(state) &
                bind(C,name='curand_normal')
            use iso_c_binding
            !pgi$ ignore_tr state
            real(c_float),device :: state(*)
        end subroutine curand_normal

        attributes(device) subroutine curand_normal_double(state) &
                bind(C,name='curand_normal_double')
            use iso_c_binding
            !pgi$ ignore_tr state
            real(c_double),device :: state(*)
        end subroutine curand_normal_double

    end interface curand_normal

end module curan_m

My program is just this:

! Device code for the cuRAND test: a single kernel that fills an array with
! uniform random numbers, one element per thread.
module monte
    use curand_device
    implicit none
    contains

    ! Kernel: thread with global 1-based index id stores one uniform random
    ! number in dX(id).
    !
    !   dX - device array of length n that receives the results
    !   n  - number of elements to fill (passed by value)
    !
    ! Threads whose index lies beyond n do nothing, so the launch may contain
    ! more threads than elements.
    attributes(global) subroutine Carlo(dX,n)

        use cudafor

        implicit none

        integer, value :: n
        real, device :: dX(n)
        integer :: seed, seq, offset, id
        type(curandStateXORWOW) :: h

        id = (blockidx%x-1)*blockdim%x + threadidx%x

        ! id is 1-based, so the last valid element is id == n.
        if (id <= n) then

            ! Same seed for every thread, but a distinct subsequence per
            ! thread; with a shared subsequence all threads would draw the
            ! same number.  Thread 1 keeps subsequence 0.
            seed = 1111
            seq = id - 1
            offset = 0

            ! Argument order is (seed, sequence, offset, state).
            call curand_init(seed,seq,offset,h)

            dX(id) = curand_uniform(h)

        end if

    end subroutine Carlo

end module monte

! Host driver: zero a device array, launch the Carlo kernel over it, copy the
! result back and print the first element.
program rnd

    use cudafor
    use curand_device
    ! NOTE(review): nothing in this program references a name from curan_m;
    ! the use statement is kept only because the build links curan_m.o.
    use curan_m
    ! Carlo is a module procedure of monte; the kernel launch needs its
    ! explicit interface.
    use monte

    implicit none

    integer, parameter :: n = 4096
    integer, parameter :: threads_per_block = 128
    ! Round up so every one of the n elements is covered by a thread.
    integer, parameter :: blocks = (n + threads_per_block - 1) / threads_per_block

    real:: x(n)
    real, device :: dX(n)
    integer :: istat

    x = 0

    dX = x

    call Carlo<<<blocks,threads_per_block>>>(dX,n)

    ! Kernel launches do not report failures by themselves: check the launch,
    ! then wait for the kernel and check its execution status.
    istat = cudaGetLastError()
    if (istat == cudaSuccess) istat = cudaDeviceSynchronize()
    if (istat /= cudaSuccess) then
        print *, 'Carlo kernel failed: ', cudaGetErrorString(istat)
        stop 1
    end if

    x = dX

    print *, x(1)


end program rnd

And I compiled it using:

pgfortran -c curan_m.cuf
pgfortran -o test test.cuf curan_m.o -lcurand

Hi bss50,

Try compiling with “-Mcuda=nollvm”.

Calling cuRAND routines from device code is one of the few cases where we need to use the CUDA C back-end code generator instead of the LLVM code generator, since we need to include a CUDA header file to get the device versions of these routines inlined.

Also, there is no need to write your own interface module. Using “curand_device” will include all the needed interfaces. For an example, please see the “trand2.cuf” code under: “$PGI/linux86-64-llvm/2019/examples/CUDA-Libraries/cuRAND/test_rand_cuf/”

Hope this helps,
Mat

Updated version of your code:

% cat test.cuf
! Device code for the cuRAND test: a single kernel that fills an array with
! uniform random numbers, one element per thread.
module monte
    use curand_device
    implicit none
    contains

    ! Kernel: thread with global 1-based index id stores one uniform random
    ! number in dX(id).
    !
    !   dX - device array of length n that receives the results
    !   n  - number of elements to fill (passed by value)
    !
    ! Threads whose index lies beyond n do nothing, so the launch may contain
    ! more threads than elements.
    attributes(global) subroutine Carlo(dX,n)

        use cudafor

        implicit none

        integer, value :: n
        real, device :: dX(n)
        integer :: seed, seq, offset, id
        type(curandStateXORWOW) :: h

        id = (blockidx%x-1)*blockdim%x + threadidx%x

        ! id is 1-based, so the last valid element is id == n.
        if (id <= n) then

            ! Same seed for every thread, but a distinct subsequence per
            ! thread; with a shared subsequence all threads would draw the
            ! same number.  Thread 1 keeps subsequence 0.
            seed = 1111
            seq = id - 1
            offset = 0

            ! Argument order is (seed, sequence, offset, state).
            call curand_init(seed,seq,offset,h)

            dX(id) = curand_uniform(h)

        end if

    end subroutine Carlo

end module monte

! Host driver: zero a device array, launch the Carlo kernel over it, copy the
! result back and print the first element.
program rnd

    use cudafor
    use curand_device
    use monte

    implicit none

    integer, parameter :: n = 4096
    integer, parameter :: threads_per_block = 128
    ! Round up so every one of the n elements is covered by a thread.
    integer, parameter :: blocks = (n + threads_per_block - 1) / threads_per_block

    real:: x(n)
    real, device :: dX(n)
    integer :: istat

    x = 0

    dX = x

    call Carlo<<<blocks,threads_per_block>>>(dX,n)

    ! Kernel launches do not report failures by themselves: check the launch,
    ! then wait for the kernel and check its execution status.
    istat = cudaGetLastError()
    if (istat == cudaSuccess) istat = cudaDeviceSynchronize()
    if (istat /= cudaSuccess) then
        print *, 'Carlo kernel failed: ', cudaGetErrorString(istat)
        stop 1
    end if

    x = dX

    print *, x(1)


end program rnd

% pgfortran -o test test.cuf -Mcudalib=curand -Mcuda
nvlink error   : Undefined reference to '__pgicudalib_curandUniformXORWOW' in '/tmp/pgfortran4C8ODcEBFe1.o'
nvlink error   : Undefined reference to '__pgicudalib_curandInitXORWOW' in '/tmp/pgfortran4C8ODcEBFe1.o'
pgacclnk: child process exit status 2: /proj/pgi/linux86-64-llvm/19.5/bin/pgnvd

% pgfortran -o test test.cuf -Mcudalib=curand -Mcuda=nollvm
% ./test
   0.6084599

Mat,

Thank you for the response. That makes a lot more sense to me now.

I found the example file trand2.cuf and tried to compile it using

$ pgfortran -o test trand2.cuf -Mcudalib=curand -Mcuda=nollvm

but unfortunately it ended up giving me this still:

nvlink error   : Undefined reference to '__pgicudalib_curandNormalXORWOW' in '/tmp/pgfortranqkshW0YzfAWA.o'
nvlink error   : Undefined reference to '__pgicudalib_curandUniformXORWOW' in '/tmp/pgfortranqkshW0YzfAWA.o'
nvlink error   : Undefined reference to '__pgicudalib_curandInitXORWOW' in '/tmp/pgfortranqkshW0YzfAWA.o'
pgacclnk: child process exit status 2: /ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgnvd

I am not exactly sure what to do at this point.

I really appreciate your help and patience as I am not super familiar with the PGI compiler.

Hi bss50,

Hmm, I tried the same compile line on three systems using PGI 18.10 with CUDA 9.0, 10.0, and 10.1, but in all cases the example code compiled fine for me. Not sure what’s different here.

This error usually only occurs when the compile line is missing “-Mcuda=nollvm”, so I’d first double check that the flag is being used.

Otherwise, what OS are you using? What CUDA version? What GPU? What CPU? (If you don’t know, try running the command “pgaccelinfo” and “pgcpuid” and post the results). I’ll try to find a system that matches yours and see if I can recreate the error.

Also, you might post the full output from the command line “pgfortran -o test trand2.cuf -Mcudalib=curand -Mcuda=nollvm -v”, where “-v” is verbose mode so I can see what the driver is doing under the hood.

-Mat

Hi Mat,

The CUDA version is 8.0.44. I can see about getting the latest version if that would help. I am using Linux with an Nvidia V100 PCIe and the following CPU:

vendor id  : GenuineIntel
model name      : Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz
cpu family      : 6
model           : 63
name            : Haswell
stepping        : 2
processors      : 12
threads         : 2
clflush size    : 8
L2 cache size   : 256KB
L3 cache size   : 15360KB

Here is the output using verbose mode:

Export PGI_CURR_CUDA_HOME=/ihome/crc/install/pgi/18.10/linux86-64/2018/cuda/9.1
Export PGI=/ihome/crc/install/pgi/18.10

/ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgf901 trand2.cuf -opt 1 -nohpf -nostatic -x 19 0x400000 -quad -x 59 4 -x 15 2 -x 49 0x400004 -x 51 0x20 -x 57 0x4c -x 58 0x10000 -x 124 0x1000 -tp haswell -x 57 0xfb0000 -x 58 0x78031040 -x 47 0x08 -x 48 4608 -x 49 0x100 -x 120 0x200 -stdinc /ihome/crc/install/pgi/18.10/linux86-64/18.10/include-gcc48:/ihome/crc/install/pgi/18.10/linux86-64/18.10/include:/ihome/crc/install/python/miniconda3-3.7/include/python3.7m:/usr/lib/gcc/x86_64-redhat-linux/4.8.5/include:/usr/local/include:/usr/include -cmdline '+pgfortran trand2.cuf -Mcuda=nollvm -Mcudalib=curand -v' -def unix -def __unix -def __unix__ -def linux -def __linux -def __linux__ -def __NO_MATH_INLINES -def __LP64__ -def __x86_64 -def __x86_64__ -def __LONG_MAX__=9223372036854775807L -def '__SIZE_TYPE__=unsigned long int' -def '__PTRDIFF_TYPE__=long int' -def __extension__= -def __amd_64__amd64__ -def __k8 -def __k8__ -def __SSE__ -def __MMX__ -def __SSE2__ -def __SSE3__ -def __SSSE3__ -def _CUDA -def _CUDA -def __CUDA_API_VERSION=9010 -freeform -x 137 1 -x 121 0xc00 -x 180 0x4000000 -cudaver 9010 -vect 48 -x 54 1 -def __CUDA_API_VERSION=9010 -cudaver 9.1 -x 70 0x40000000 -x 189 0x8000 -y 163 0xc0000000 -x 137 1 -modexport /tmp/pgfortranAMIcooVrk5IC.cmod -modindex /tmp/pgfortranQMIc_-O0fSvz.cmdx -output /tmp/pgfortran6MIcUufHtTGQ.ilm
  0 inform,   0 warnings,   0 severes, 0 fatal for mtests
  0 inform,   0 warnings,   0 severes, 0 fatal for testany
  0 inform,   0 warnings,   0 severes, 0 fatal for t
PGF90/x86-64 Linux 18.10-0: compilation successful

/ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgf902 /tmp/pgfortran6MIcUufHtTGQ.ilm -fn trand2.cuf -opt 1 -x 51 0x20 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -x 59 4 -tp haswell -x 120 0x1000 -x 124 0x1400 -y 15 2 -x 57 0x3b0000 -x 58 0x48000000 -x 49 0x100 -x 120 0x200 -astype 0 -x 137 1 -x 121 0xc00 -x 180 0x4000000 -cudaver 9010 -x 68 0x20 -x 176 0x100 -cudacap 35 -cudacap 50 -cudacap 60 -cudacap 70 -cudaver 9010 -x 70 0x40000000 -x 164 0x800000 -x 124 1 -x 189 0x10 -x 189 0x8000 -y 163 0xc0000000 -y 189 0x4000000 -cudaroot /ihome/crc/install/pgi/18.10/linux86-64/2018/cuda/9.1 -x 137 1 -x 121 0xc00 -x 180 0x4000000 -x 176 0x100 -cudacap 35 -cudacap 50 -cudacap 60 -cudacap 70 -cudaver 9010 -cmdline '+pgfortran trand2.cuf -Mcuda=nollvm -Mcudalib=curand -v' -asm /tmp/pgfortran6MIcU-WwwL97.s
  0 inform,   0 warnings,   0 severes, 0 fatal for mtests
  0 inform,   0 warnings,   0 severes, 0 fatal for testany
  0 inform,   0 warnings,   0 severes, 0 fatal for t
 /ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgnvd -dcuda /ihome/crc/install/pgi/18.10/linux86-64/2018/cuda/9.1 -usenvvm -reloc /tmp/pgcudafor5OIcRlB-bJb5.gpu -computecap=35 -ptx /tmp/pgcudaforjOIcBXzAru9L.ptx -o /tmp/pgcudaforrOIcZ8h2z2hV.bin -cuda9010
 /ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgnvd -dcuda /ihome/crc/install/pgi/18.10/linux86-64/2018/cuda/9.1 -usenvvm -reloc /tmp/pgcudafor5OIcRlB-bJb5.gpu -computecap=50 -ptx /tmp/pgcudaforXOIctO-g54Uz.ptx -o /tmp/pgcudafor5OIcRQCBbag9.bin -cuda9010
 /ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgnvd -dcuda /ihome/crc/install/pgi/18.10/linux86-64/2018/cuda/9.1 -usenvvm -reloc /tmp/pgcudafor5OIcRlB-bJb5.gpu -computecap=60 -ptx /tmp/pgcudaforrOIcZdcJzl2L.ptx -o /tmp/pgcudaforzOIclIYNHMhA.bin -cuda9010
 /ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgnvd -dcuda /ihome/crc/install/pgi/18.10/linux86-64/2018/cuda/9.1 -usenvvm -reloc /tmp/pgcudafor5OIcRlB-bJb5.gpu -computecap=70 -ptx /tmp/pgcudaforXOIctph45TfD.ptx -o /tmp/pgcudafor5OIcRjH1bDt7.bin -cuda9010
 /ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgnvd -dcuda /ihome/crc/install/pgi/18.10/linux86-64/2018/cuda/9.1 -reloc -cuda9010 -fat trand2.cuf -sm 35 /tmp/pgcudaforrOIcZ8h2z2hV.bin -sm 50 /tmp/pgcudafor5OIcRQCBbag9.bin -sm 60 /tmp/pgcudaforzOIclIYNHMhA.bin -sm 70 /tmp/pgcudafor5OIcRjH1bDt7.bin -compute 70 /tmp/pgcudaforXOIctph45TfD.ptx -o /tmp/pgaccbOIcdxkc1fp3.fat
nvlink error   : Undefined reference to '__pgicudalib_curandNormalXORWOW' in '/tmp/pgfortranAMIcoPaoH9U5.o'
nvlink error   : Undefined reference to '__pgicudalib_curandUniformXORWOW' in '/tmp/pgfortranAMIcoPaoH9U5.o'
nvlink error   : Undefined reference to '__pgicudalib_curandInitXORWOW' in '/tmp/pgfortranAMIcoPaoH9U5.o'
pgacclnk: child process exit status 2: /ihome/crc/install/pgi/18.10/linux86-64/18.10/bin/pgnvd
pgfortran-Fatal-linker completed with exit code 2

Unlinking /tmp/pgfortran6MIcUufHtTGQ.ilm
Unlinking /tmp/pgfortrankMIcE6VUp8G3.stb
Unlinking /tmp/pgfortranAMIcooVrk5IC.cmod
Unlinking /tmp/pgfortranQMIc_-O0fSvz.cmdx
Unlinking /tmp/pgfortran6MIcU-WwwL97.s
Unlinking /tmp/pgfortrankMIcE94XoIfP.ll
Unlinking /tmp/pgfortranAMIcoPaoH9U5.o

One thing that may be important to note is that the GPU is part of a cluster managed by SLURM.

Thanks,
Ben

Hi Ben,

I’m a bit confused by this one so have sent a note to a few of our compiler engineers for help.

From the verbose output, I see that the NVVM back-end device code generator is still being used even though “-Mcuda=nollvm” should disable this.

If debugging was enabled, i.e. “-g”, then NVVM will be used even in the presence of “-Mcuda=nollvm” since NVVM is required for debugging. But since you’re not using “-g” or “-Mcuda=debug”, I’m not sure why it’s being ignored.

When I hear back from our developers, hopefully this will give me more ideas.

-Mat