OpenACC-CUDA interoperability within the same file

Hi,

I was trying to reproduce the idea from https://forums.developer.nvidia.com/t/pgcc-compile-error-in-openacc-cuda-interoperabily-example/133388/1 of having CUDA kernels and OpenACC pragmas in the same file, using NVCC as the main compiler with PGC++ as the host compiler.

But I’m struggling to make it work with CUDA 8.0 and PGI 16.9.


#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

// Element-wise vector addition kernel: C[idx] = A[idx] + B[idx].
// Each thread handles at most one element; its index is derived from the
// x-dimension block and thread IDs only. Threads whose index is not below
// numElements return without touching memory.
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard the tail of the grid: the launch may cover more threads than elements.
    if (idx >= numElements)
    {
        return;
    }

    C[idx] = A[idx] + B[idx];
}

/**
 * Host main routine
 *
 * Fills two host vectors with random values, lets an OpenACC data region
 * create the device copies, and hands the device addresses to the vectorAdd
 * CUDA kernel through `host_data use_device`. The result vector is copied
 * back when the data region ends and is then verified on the host.
 *
 * Returns 0 on success. Exits with EXIT_FAILURE (after printing a diagnostic
 * to stderr) on host allocation failure, on a CUDA launch/execution error,
 * or when the result does not match the host reference.
 */
int
main(void)
{
    cudaError_t err = cudaSuccess;
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the inputs with values in [0, 1]
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    #pragma acc data copyin(h_A[0:numElements],h_B[0:numElements]), copyout(h_C[0:numElements])
    {
    // Inside host_data, h_A/h_B/h_C name the device copies, so they can be
    // passed straight to a CUDA kernel.
    #pragma acc host_data use_device(h_A,h_B,h_C)
    {
        // Launch the Vector Add CUDA Kernel
        int threadsPerBlock = 256;
        int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
        printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(h_A, h_B, h_C, numElements);

        // Launch-time errors (e.g. no device code for this GPU architecture)
        // are only visible through cudaGetLastError(); report them by name.
        err = cudaGetLastError();
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to launch vectorAdd kernel (error: %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }

        // Wait for the kernel so that execution errors surface here and the
        // kernel is known to be finished before the data region copies h_C
        // back (does not rely on which stream the OpenACC runtime uses).
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess)
        {
            fprintf(stderr, "vectorAdd kernel execution failed (error: %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    } // end host_data
    } // end acc data region

    // Verify the result against a host-side reference with a small tolerance
    for (int i = 0; i < numElements; ++i)
    {
        if (fabsf(h_A[i] + h_B[i] - h_C[i]) > 1e-5f)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return 0;
}

Compiled with:

/usr/local/cuda-8.0/bin/nvcc -x cu -ccbin pgc++ -Xcompiler -ta=tesla:cuda8.0 -Xcompiler -Mcuda -Xcompiler -Minfo=accel -gencode arch=compute_35,code=sm_35 vecAdd.cpp

and error:

"/usr/local/cuda-8.0/bin/..//include/host_config.h", line 119: catastrophic
          error: #error directive: -- unsupported GNU version! gcc versions
          later than 5 are not supported!
  #error -- unsupported GNU version! gcc versions later than 5 are not supported!
   ^

1 catastrophic error detected in the compilation of "vecAdd.cpp".
Compilation terminated.

It seems that pgc++ defines `__GNUC__` > 5. Is this behaviour correct? (Is it related to the fact that my system has g++ 6.1.1 installed?)

Thanks.

I tried regenerating the file

/opt/pgi/linux86-64/16.9/bin/localrc

using gcc 5 information (with makelocalrc).

I also added symlinks for some binaries (nvlink and fatbinary) that were missing from pgi/linux86_64/2016/cuda8.0/bin, pointing to the corresponding binaries of the CUDA 8 installation.

(I was trying to mimic the /opt/pgi/linux86-64/2016/cuda/7.5/bin/ layout -cudafe,fatbinary,nvlink & ptxas-)


Now I’m able to compile but the kernel launch breaks:

LD_LIBRARY_PATH=/usr/local/cuda-8.0/lib64 cuda-memcheck ./a.out 
========= CUDA-MEMCHECK
[Vector addition of 50000 elements]
CUDA kernel launch with 196 blocks of 256 threadsn
========= Program hit cudaErrorInvalidDeviceFunction (error 8) due to "invalid device function" on CUDA API call to cudaLaunch. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so [0x2e7c03]
=========     Host Frame:./a.out [0x3358e]
=========     Host Frame:./a.out [0x3af9]
=========     Host Frame:./a.out [0x3ae5]
=========     Host Frame:./a.out [0x3b09]
=========     Host Frame:./a.out [0x3745]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20700]
=========     Host Frame:./a.out [0x3299]
=========
========= Program hit cudaErrorInvalidDeviceFunction (error 8) due to "invalid device function" on CUDA API call to cudaGetLastError. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so [0x2e7c03]
=========     Host Frame:./a.out [0x35683]
=========     Host Frame:./a.out [0x374c]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20700]
=========     Host Frame:./a.out [0x3299]
=========
========= ERROR SUMMARY: 2 errors

compilation output:

"vecAdd.cpp", line 1: warning: unrecognized GCC pragma
  #pragma GCC diagnostic ignored "-Wunused-local-typedefs"
              ^

"vecAdd.cpp", line 1: warning: unrecognized GCC pragma
  #pragma GCC diagnostic push
              ^

"vecAdd.cpp", line 1: warning: unrecognized GCC pragma
  #pragma GCC diagnostic ignored "-Wunused-variable"
              ^

"vecAdd.cpp", line 1: warning: unrecognized GCC pragma
  #pragma GCC diagnostic ignored "-Wunused-function"
              ^

"vecAdd.cpp", line 1: warning: unrecognized GCC pragma
  #pragma GCC diagnostic pop
              ^

"vecAdd.cpp", line 1: warning: unrecognized GCC pragma
  #pragma GCC diagnostic ignored "-Wunused-variable"
              ^

"/usr/local/cuda-8.0/bin/..//include/cuda_runtime.h", line 56: warning:
          unrecognized GCC pragma
  #pragma GCC diagnostic push
              ^

"/usr/local/cuda-8.0/bin/..//include/cuda_runtime.h", line 59: warning:
          unrecognized GCC pragma
  #pragma GCC diagnostic ignored "-Wunused-function"
              ^

"/usr/local/cuda-8.0/bin/..//include/cuda_runtime.h", line 1899: warning:
          unrecognized GCC pragma
  #pragma GCC diagnostic pop
              ^

"/tmp/tmpxft_0000442c_00000000-4_vecAdd.cudafe1.stub.c", line 1: warning:
          unrecognized GCC pragma
  #pragma GCC diagnostic push
              ^

"/tmp/tmpxft_0000442c_00000000-4_vecAdd.cudafe1.stub.c", line 2: warning:
          unrecognized GCC pragma
  #pragma GCC diagnostic ignored "-Wunused-function"
              ^

"/tmp/tmpxft_0000442c_00000000-4_vecAdd.cudafe1.stub.c", line 3: warning:
          unrecognized GCC pragma
  #pragma GCC diagnostic ignored "-Wcast-qual"
              ^

"/tmp/tmpxft_0000442c_00000000-4_vecAdd.cudafe1.stub.c", line 4: warning:
          unrecognized GCC pragma
  #pragma GCC diagnostic pop
              ^

main:
      1, include "tmpxft_0000442c_00000000-4_vecAdd.cudafe1.cpp"
           4, include "vecAdd.cpp"
               60, include "device_types.h"
                    69, include "builtin_types.h"
                         65, include "host_runtime.h"
                              72, include "stddef.h"
                                  218, include "driver_types.h"
                                      1481, include "surface_types.h"
                                            114, include "texture_types.h"
                                                 212, include "vector_types.h"
                                                       33, include "vecAdd.cpp"
                                                            35, Generating copyin(h_A[:numElements],h_B[:numElements])
                                                                Generating copyout(h_C[:numElements])
"/usr/local/cuda-8.0/bin/crt/link.stub", line 6: warning: unrecognized GCC
          pragma
  #pragma GCC diagnostic push
              ^

"/usr/local/cuda-8.0/bin/crt/link.stub", line 9: warning: unrecognized GCC
          pragma
  #pragma GCC diagnostic ignored "-Wcast-qual"
              ^

"/usr/local/cuda-8.0/bin/crt/link.stub", line 10: warning: unrecognized GCC
          pragma
  #pragma GCC diagnostic ignored "-Wunused-parameter"
              ^

"/usr/local/cuda-8.0/bin/crt/link.stub", line 11: warning: unrecognized GCC
          pragma
  #pragma GCC diagnostic ignored "-Wunused-variable"
              ^

"/usr/local/cuda-8.0/bin/crt/link.stub", line 12: warning: unrecognized GCC
          pragma
  #pragma GCC diagnostic ignored "-Wunused-function"
              ^

"/usr/local/cuda-8.0/bin/crt/link.stub", line 127: warning: unrecognized GCC
          pragma
  #pragma GCC diagnostic pop
              ^







Hi pfarre83876,

It seems that pgc++ defines `__GNUC__` > 5. Is this behaviour correct? (Is it related to the fact that my system has g++ 6.1.1 installed?)

Something else is going on, since pgc++ version 16.9 sets `__GNUC__` = 4 because we only officially interoperate up to GNU 4.9.3. Most likely nvcc is getting it from your GNU install.


Also I added some symlinks (nvlink & fatbinary) of some binaries that were missing in pgi/linux86_64/2016/cuda8.0/bin pointing to binaries of cuda 8 installation.

(I was trying to mimic the /opt/pgi/linux86-64/2016/cuda/7.5/bin/ layout -cudafe,fatbinary,nvlink & ptxas-)

Can you post what modification you made to your installation as well as your compilation line?

I’m able to build and run the example without issue.

% nvcc -x cu -ccbin pgc++ -Xcompiler -ta=tesla:cuda8.0 -Xcompiler -Mcuda -Xcompiler -Minfo=accel -Xcompiler -w -Xcompiler -V16.9 -gencode arch=compute_60,code=sm_60 vecAdd.cpp
main:
      1, include "tmpxft_00003c18_00000000-4_vecAdd.cudafe1.cpp"
           4, include "vecAdd.cpp"
               60, include "device_types.h"
                    69, include "builtin_types.h"
                         65, include "host_runtime.h"
                              72, include "stddef.h"
                                  213, include "driver_types.h"
                                      1502, include "surface_types.h"
                                            114, include "texture_types.h"
                                                 212, include "vector_types.h"
                                                       33, include "vecAdd.cpp"
                                                            36, Generating copyin(h_A[:numElements],h_B[:numElements])
                                                                Generating copyout(h_C[:numElements])
% a.out
[Vector addition of 50000 elements]
__GNUC__ = 4
CUDA kernel launch with 196 blocks of 256 threadsn
Test PASSED
Done
  • Mat

/opt/pgi/linux86-64/16.9/bin/localrc was generated using makelocalrc

set LFC=-lgfortran;
set LDSO=/lib64/ld-linux-x86-64.so.2;
set GCCDIR=/usr/lib/gcc/x86_64-linux-gnu/5;
set GPPDIR32= /usr/include/c++/5 /usr/include/x86_64-linux-gnu/c++/5/32 /usr/include/c++/5/backward /usr/lib/gcc/x86_64-linux-gnu/5/include /usr/local/include /usr/lib/gcc/x86_64-linux-gnu/5/include-fixed /usr/include;
set GPPDIR64= /usr/include/c++/5 /usr/include/x86_64-linux-gnu/c++/5 /usr/include/c++/5/backward /usr/lib/gcc/x86_64-linux-gnu/5/include /usr/local/include /usr/lib/gcc/x86_64-linux-gnu/5/include-fixed /usr/include/x86_64-linux-gnu /usr/include;
set GCCINC32= /usr/lib/gcc/x86_64-linux-gnu/5/include /usr/local/include /usr/lib/gcc/x86_64-linux-gnu/5/include-fixed /usr/include;
set GCCINC64= /usr/lib/gcc/x86_64-linux-gnu/5/include /usr/local/include /usr/lib/gcc/x86_64-linux-gnu/5/include-fixed /usr/include/x86_64-linux-gnu /usr/include;
set G77DIR=/usr/lib/gcc/x86_64-linux-gnu/6/;
set OEM_INFO=64-bit target on x86-64 Linux $INFOTPVAL;
set GCCINC= /usr/lib/gcc/x86_64-linux-gnu/5/include /usr/local/include /usr/lib/gcc/x86_64-linux-gnu/5/include-fixed /usr/include/x86_64-linux-gnu /usr/include;
set GPPDIR= /usr/include/c++/5 /usr/include/x86_64-linux-gnu/c++/5 /usr/include/c++/5/backward /usr/lib/gcc/x86_64-linux-gnu/5/include /usr/local/include /usr/lib/gcc/x86_64-linux-gnu/5/include-fixed /usr/include/x86_64-linux-gnu /usr/include;
set LOCALRC=YES;
set THROW=__THROW=;
set EXTENSION=__extension__=;
set COMPGCCINCDIR=include-gcc50;
set LC=$if(-Bstatic,-lgcc -lgcc_eh -lc -lgcc -lgcc_eh -lc, -lgcc -lc -lgcc -lgcc_s);
set DEFSTDOBJDIR=/usr/lib/x86_64-linux-gnu;
# GLIBC version 2.23
# GCC version 5.4.1
set GCCVERSION=50401;
set LOCALDEFS=__STDC_HOSTED__;
export PGI=$COMPBASE;
# makelocalrc executed by root vie nov 4 17:32:40



I’m able to build and run the example without issue.

Yes, it was my fault. I was compiling for the Kepler architecture and the machine has Maxwell GPUs.
With

 /usr/local/cuda-8.0/bin/nvcc -x cu -ccbin pgc++ -Xcompiler -ta=tesla:cuda8.0 -Xcompiler -Mcuda -Xcompiler -Minfo=accel  -gencode arch=compute_50,code=sm_52 vecAdd.cpp

works well.

Thank you.

Yes, It was my fault. I was compiling for kepler architecture and the machine has maxwells.

No worries. Glad you got it working.