Struggling with CUDA, Clang and LLVM IR on PowerPC, and getting: CUDA failure: 'Invalid device function'

I am trying to optimize a CUDA code with LLVM passes on a PowerPC system (RHEL 7.6 with no root access) equipped with V100 GPUs, CUDA 10.1, and LLVM 11 (built from source). Also, I tested clang, lli, and opt on a simple C++ code, and everything works just fine.

After days of searching, reading, and trials-and-errors, I managed to compile a simple CUDA source. The code is the famous axpy:

#include <iostream>

#define cudaCheckError()                                       \
  {                                                            \
    cudaError_t e = cudaGetLastError();                        \
    if (e != cudaSuccess) {                                    \
      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
             cudaGetErrorString(e));                           \
      exit(EXIT_FAILURE);                                      \
    }                                                          \
  }

__global__ void axpy(float a, float* x, float* y) {
  y[threadIdx.x] = a * x[threadIdx.x];
}

int main(int argc, char* argv[]) {
  const int kDataLen = 4;

  float a = 2.0f;
  float host_x[kDataLen] = {1.0f, 2.0f, 3.0f, 4.0f};
  float host_y[kDataLen];

  // Copy input data to device.
  float* device_x;
  float* device_y;
  cudaMalloc(&device_x, kDataLen * sizeof(float));
  cudaMalloc(&device_y, kDataLen * sizeof(float));
  cudaMemcpy(device_x, host_x, kDataLen * sizeof(float),
             cudaMemcpyHostToDevice);
  
  // Launch the kernel.
  axpy<<<1, kDataLen>>>(a, device_x, device_y);
  cudaCheckError();
  
  // Copy output data to host.
  cudaDeviceSynchronize();
  cudaMemcpy(host_y, device_y, kDataLen * sizeof(float),
             cudaMemcpyDeviceToHost);
 
  // Print the results.
  for (int i = 0; i < kDataLen; ++i) {
    std::cout << "y[" << i << "] = " << host_y[i] << "\n";
  }

  cudaDeviceReset();
  return 0;
}

And I wrote the compilation steps in a Makefile like this (I know, its style can be improved!):
I haven’t added any LLVM passes yet.

BIN_FILE=axpy
SRC_FILE=$(BIN_FILE).cu

main: $(BIN_FILE)

$(BIN_FILE).ll: $(SRC_FILE)
	clang++ -stdlib=libc++ -Wall $(SRC_FILE) --cuda-host-only --cuda-gpu-arch=sm_70 -S -c -emit-llvm

$(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll: $(SRC_FILE)
	clang++ -stdlib=libc++ -Wall $(SRC_FILE) --cuda-device-only --cuda-gpu-arch=sm_70 -S -c -emit-llvm

$(BIN_FILE).ptx: $(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll
	llc -march=nvptx64 -mcpu=sm_70 $(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll -o $(BIN_FILE).ptx

$(BIN_FILE)_dlink.o: $(BIN_FILE).ptx
	ptxas -m64 --gpu-name=sm_70 $(BIN_FILE).ptx -o $(BIN_FILE).ptx.o
	fatbinary --64 --create $(BIN_FILE).fatbin --image=profile=sm_70,file=$(BIN_FILE).ptx.o \
                         --image=profile=compute_70,file=$(BIN_FILE).ptx
	nvcc $(BIN_FILE).fatbin -arch=sm_70 -dlink -o $(BIN_FILE)_dlink.o -rdc=true

# For the host code:
$(BIN_FILE).o: $(BIN_FILE).ll
	llc -mcpu=ppc64le $(BIN_FILE).ll -o $(BIN_FILE).s
	clang++ -c $(BIN_FILE).s -o $(BIN_FILE).o

# Link both object files together with a linker:
$(BIN_FILE): $(BIN_FILE).o $(BIN_FILE)_dlink.o
	nvcc $(BIN_FILE).o $(BIN_FILE)_dlink.o -o $(BIN_FILE) -arch=sm_70 -lc++

clean:
	rm *.ll *.s *.ptx *.ptx.o *.fatbin $(BIN_FILE) $(BIN_FILE).o $(BIN_FILE)_dlink.o

It seems all the steps run smoothly without any warning, but after running the executable file, I get the error:
Cuda failure axpy.cu:33: 'invalid device function'

I have also replaced the last linker command with the following, and it runs ok, but with the same error.

clang++ -stdlib=libc++ $(BIN_FILE).o $(BIN_FILE)_dlink.o -o $(BIN_FILE) -lcuda -lcudart -lcudadevrt \
		-L/path-to-gcc-lib/ 

I really appreciate any help. It’s worth mentioning that I need to utilize some of the modern features of CUDA, most of all cooperative groups, so I thought recent LLVM releases might be working. Wondering whether changing LLVM version might be helpful.


Edit:

The output of cuobjdump axpy.fatbin -ptx -sass:

Fatbin elf code:
================
arch = sm_70
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

code for sm_70
		Function : _Z4axpyfPfS_
.headerflags    @"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)"
   /*0000*/           MOV R1, c[0x0][0x28] ;                       /* 0x00000a0000017a02 */
                                                                   /* 0x000fd00000000f00 */
   /*0010*/      @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;                /* 0x000000fffffff389 */
                                                                   /* 0x000fe200000e00ff */
   /*0020*/           IADD3 R1, R1, -0x18, RZ ;                    /* 0xffffffe801017810 */
                                                                   /* 0x000fe20007ffe0ff */
   /*0030*/           IMAD.MOV.U32 R9, RZ, RZ, c[0x0][0x16c] ;     /* 0x00005b00ff097624 */
                                                                   /* 0x000fe200078e00ff */
   /*0040*/           MOV R8, c[0x0][0x168] ;                      /* 0x00005a0000087a02 */
                                                                   /* 0x000fe40000000f00 */
   /*0050*/           IADD3 R2, P0, R1, c[0x0][0x20], RZ ;         /* 0x0000080001027a10 */
                                                                   /* 0x000fc80007f1e0ff */
   /*0060*/           IADD3.X R3, RZ, c[0x0][0x24], RZ, P0, !PT ;  /* 0x00000900ff037a10 */
                                                                   /* 0x000fd000007fe4ff */
   /*0070*/           ST.E.64.SYS [R2+0x8], R8 ;                   /* 0x0000000802007385 */
                                                                   /* 0x0001e8000010eb08 */
   /*0080*/           LD.E.64.SYS R4, [R2+0x8] ;                   /* 0x0000000802047980 */
                                                                   /* 0x000ea2000010eb00 */
   /*0090*/           IMAD.MOV.U32 R10, RZ, RZ, c[0x0][0x170] ;    /* 0x00005c00ff0a7624 */
                                                                   /* 0x000fe200078e00ff */
   /*00a0*/           MOV R11, c[0x0][0x174] ;                     /* 0x00005d00000b7a02 */
                                                                   /* 0x000fe20000000f00 */
   /*00b0*/           IMAD.MOV.U32 R15, RZ, RZ, c[0x0][0x160] ;    /* 0x00005800ff0f7624 */
                                                                   /* 0x000fe200078e00ff */
   /*00c0*/           S2R R13, SR_TID.X ;                          /* 0x00000000000d7919 */
                                                                   /* 0x000eac0000002100 */
   /*00d0*/           ST.E.64.SYS [R2+0x10], R10 ;                 /* 0x0000001002007385 */
                                                                   /* 0x000fe8000010eb0a */
   /*00e0*/           LD.E.64.SYS R6, [R2+0x10] ;                  /* 0x0000001002067980 */
                                                                   /* 0x000ee8000010eb00 */
   /*00f0*/           ST.E.SYS [R2], R15 ;                         /* 0x0000000002007385 */
                                                                   /* 0x000fe8000010e90f */
   /*0100*/           LD.E.SYS R0, [R2] ;                          /* 0x0000000002007980 */
                                                                   /* 0x000e22000010e900 */
   /*0110*/           IMAD.WIDE.U32 R4, R13, 0x4, R4 ;             /* 0x000000040d047825 */
                                                                   /* 0x004fd400078e0004 */
   /*0120*/           LD.E.SYS R5, [R4] ;                          /* 0x0000000004057980 */
                                                                   /* 0x000e22000010e900 */
   /*0130*/           IMAD.WIDE.U32 R6, R13, 0x4, R6 ;             /* 0x000000040d067825 */
                                                                   /* 0x008fe400078e0006 */
   /*0140*/           FMUL R9, R0, R5 ;                            /* 0x0000000500097220 */
                                                                   /* 0x001fd00000400000 */
   /*0150*/           ST.E.SYS [R6], R9 ;                          /* 0x0000000006007385 */
                                                                   /* 0x000fe2000010e909 */
   /*0160*/           EXIT ;                                       /* 0x000000000000794d */
                                                                   /* 0x000fea0003800000 */
   /*0170*/           BRA 0x170;                                   /* 0xfffffff000007947 */
                                                                   /* 0x000fc0000383ffff */
	.......................

The output ends with dots. Also, nvdisasm axpy throws this error: nvdisasm fatal : axpy is not a supported Elf file

Hi Amir,

Unfortunately, I’m not sure how much help we can offer. This forum is for questions on the NVIDIA HPC Compilers and we don’t have insights with using clang. Your code seems to run correctly when built with nvcc.

Have you tried contacting the LLVM folks? How to submit an LLVM bug report — LLVM 12 documentation

-Mat

Hi Mat,

Thanks for answering and sharing LLVM bug report link. I’ll send them the problem, but before I do that, can you please take a look at the sequence of creating the fat binary and linking it to the final executable file. I think the problem is originated in these steps:ptxas, fatbinary, and nvccchain. Have I done this correctly as if there were no Clang was involved?

Unfortunately, I don’t know if the commands are correct or not.

Though, I’m not clear on why you need to add these extra steps. Shouldn’t you be able to use just clang++? Compiling CUDA with clang — LLVM 12 documentation

The reason is that I need to apply LLVM passes on both, host and device, generated IRs. Is there a simpler way to do that?
And it’s worth mentioning that using Clang++ or nvcc with proper options builds the functional code successfully.

I needed to pass the fatbin file to the host-side compilation command with -Xclang -fcuda-include-gpubinary -Xclang axpy.fatbin to replicate the whole compilation behavior.

Here is the corrected Makefile:

# Host Side

$(BIN_FILE).ll: $(SRC_FILE) $(BIN_FILE).fatbin
    clang++ -stdlib=libc++ -Wall -Werror $(BIN_FILE).cu -march=ppc64le --cuda-host-only -relocatable-pch \
        -Xclang -fcuda-include-gpubinary -Xclang $(BIN_FILE).fatbin -S -g -c -emit-llvm

$(BIN_FILE).o: $(BIN_FILE).ll
    llc -march=ppc64le $(BIN_FILE).ll -o $(BIN_FILE).s
    clang++ -c -Wall $(BIN_FILE).s -o $(BIN_FILE).o

# GPU Side
$(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll: $(SRC_FILE)
    clang++ -x cuda -stdlib=libc++ -Wall -Werror $(BIN_FILE).cu --cuda-device-only \
        --cuda-gpu-arch=sm_70 -S -g -emit-llvm

$(BIN_FILE).ptx: $(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll
    llc -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 $(BIN_FILE)-cuda-nvptx64-nvidia-cuda-sm_70.ll -o $(BIN_FILE).ptx

$(BIN_FILE).ptx.o: $(BIN_FILE).ptx
    ptxas -m64 --gpu-name=sm_70 $(BIN_FILE).ptx -o $(BIN_FILE).ptx.o

$(BIN_FILE).fatbin: $(BIN_FILE).ptx.o
    fatbinary --64 --create $(BIN_FILE).fatbin --image=profile=sm_70,file=$(BIN_FILE).ptx.o \
        --image=profile=compute_70,file=$(BIN_FILE).ptx -link

$(BIN_FILE)_dlink.o: $(BIN_FILE).fatbin
    nvcc $(BIN_FILE).fatbin -gencode arch=compute_70,code=sm_70 \
        -dlink -o $(BIN_FILE)_dlink.o -lcudart -lcudart_static -lcudadevrt

# Link both object files together (either nvcc or clang works here):
$(BIN_FILE): $(BIN_FILE).o $(BIN_FILE)_dlink.o
    nvcc $(BIN_FILE).o $(BIN_FILE)_dlink.o -o $(BIN_FILE) -arch=sm_70 -lc++