Illegal Instruction Error when PTX generated device function is used

Our goal is to execute a user-define function within a kernel. We are approaching this by generating a device function using PTX. We chose this road because in Python via Numba, because we can easily generate PTX from Python code. Since we do not have access to the address of the device function from host, we generate the PTX of file containing the device function and use the cuModule and cuLink interfaces to load the desired device function on the device and get its address using the driver API as shown in main.cpp.

Our approach seems to be working for simple device functions such as func_device() but if we try more complicated transformations such as the ones shown in illegal_func_device() the program in main() returns a CUDA error saying an illegal instruction was encountered. Inspecting the PTX generated for a transformation containing a sqrt() or sin() operations is visibly much more complex compared to the one of the func_device() but it still generates a valid PTX. That leaves me wondering if the illegal instruction is because of a linking issue or something else. Any help on the topic is greatly appreciated.

Below we provide a small example of our approach where func_device.cpp and illegal_func_device.cpp contain the devices functions out of which we generate the PTX and main.cpp contains the driver program, along with build example for a P100 GPU.

func_device.cpp

__device__ int func_device(float *return_value, float x) {
  *return_value = x;            // OK
  // *return_value = fabs(x)    // Also OK
  return 0;
}

__device__ op_func_t func_op = func_device;

illegal_func_device.cpp

__device__ int illegal_func_device(float *return_value, float x) {
  *return_value = sqrt(x);    // an illegal instruction was encountered
  // *return_value = sin(x);  // an illegal instruction was encountered
  return 0;
}

__device__ op_func_t func_op = illegal_func_device;
nvcc -ccbin g++ -x cu --ptx -m64 -Xcompiler -fPIC --std=c++11 -gencode arch=compute_60,code=sm_60 -dc -o func_device.ptx -c func_device.cpp
nvcc -ccbin g++ -x cu --ptx -m64 -Xcompiler -fPIC --std=c++11 -gencode arch=compute_60,code=sm_60 -dc -o illegal_func_device.ptx -c illegal_func_device.cpp

main.cpp

#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>

typedef int (*op_func_t)(float *, float);

__global__ void transform_kernel(int N, float *w, float *x, op_func_t op) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i < N) {
    // (*op)(w + i, x[i]);
    float out;
    (*op)(&out, x[i]);
    w[i] = out;
  }
}

void ptxJIT(std::string fname, void* module){
  // Load & link ptx using cuLink... and cuModule... interface
  // ...
}

int main() {
  cuInit(0);
  // Get the device and CUDA Context
  CUdevice cuDevice;
  CUcontext cuContext;
  CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
  CUDA_SAFE_CALL(cuCtxCreate(&cuContext, 0, cuDevice));

  CUmodule fmodule;
  // func_device.cpp or illegal_func_device.cpp
  ptxJIT("/path/to/ptx/func/device/ptx", (void *)&fmodule);

  // Load func_op_handle from global variable func_op
  CUdeviceptr func_op_handle, h_func_op_handle;
  CUDA_SAFE_CALL(cuModuleGetGlobal(&func_op_handle, NULL, fmodule, "func_op"));

  // Copy device function pointer to host side
  CUDA_SAFE_CALL(cuMemcpyDtoH(&h_func_op_handle, func_op_handle, sizeof(op_func_t)));

  // Data Management
  int N     = 1000000;
  float *x = (float *)malloc(N * sizeof(float));
  float *w = (float *)malloc(N * sizeof(float));

  for (int i = 0; i < N; i++) {
    x[i] = (float)i;
    w[i] = 0;
  }

  float *x_d, *w_d;
  gpuErrchk(cudaMalloc(&x_d, N * sizeof(float)));
  gpuErrchk(cudaMalloc(&w_d, N * sizeof(float)));

  gpuErrchk(cudaMemcpy(x_d, x, N * sizeof(float), cudaMemcpyHostToDevice));

  const size_t BLOCK_SIZE = 128;
  const size_t NUM_BLOCKS = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

  transform_kernel<<<NUM_BLOCKS, BLOCK_SIZE>>>(N, w, x, (op_func_t)op);
  gpuErrchk(cudaDeviceSynchronize());

  gpuErrchk(cudaMemcpy(w, w_d, N * sizeof(float), cudaMemcpyDeviceToHost));

  return 0;
}
nvcc -ccbin g++ -x cu -m64 -Xcompiler -fPIC --std=c++11 -gencode arch=compute_60,code=sm_60 -dc -o main.o -c main.cpp
nvcc -ccbin g++ -m64 -Xcompiler -fPIC --std=c++11 -L/path/to/cuda/lib64 -lcudart -lcuda -lcudadevrt -lm -gencode arch=compute_60,code=sm_60 -o main.exe main.o