NVCC 5.5 bug trying to pass pointer-to-member to kernel function

I’m trying to pass a pointer-to-member to a kernel (a `__global__` function) in CUDA. Since a pointer to a data member is really just an offset relative to the struct/class, there doesn’t seem to be any reason it wouldn’t work, but NVCC seems to generate incorrect code.

#include <stdio.h>
    
    
// Plain aggregate holding three int data members, declared in the order
// F1, F2, F3. Used below as the class type for the pointer-to-member
// (`int S::*`) that is handed to the print_S kernel.
struct S {
    int F1, F2, F3;
};
    
// Single instance of S declared with __device__; initialize_S writes its
// fields and print_S reads one of them through a pointer-to-member.
__device__ S x;
    
// Kernel: fills the __device__ global `x` with F1 = 100, F2 = 200, F3 = 300.
// Takes no arguments and does not use thread/block indices, so every thread
// that runs it stores the same three values.
__global__ void initialize_S() {
    S& target = x;
    target.F3 = 300;
    target.F2 = 200;
    target.F1 = 100;
}
    
// Kernel: prints one int member of the __device__ global `x`.
//   m - pointer-to-data-member of S selecting which field to read.
// Output is a single "val: <n>" line per executing thread via device printf.
__global__ void print_S(int S::* m) {
    const int selected = x.*m;
    printf("val: %d\n", selected);
}
    
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Reproducer entry point: initializes the device-side `x`, then launches
// print_S with the pointer-to-member &S::F1.
// Returns 0 on success, 1 if any launch or the final synchronization fails.
int main() {
    initialize_S<<<1, 1>>>();
    // A kernel launch returns no status itself; launch errors are picked up
    // with cudaGetLastError().
    if (!cuda_ok(cudaGetLastError(), "initialize_S launch")) {
        return 1;
    }

    print_S<<<1, 1>>>(&S::F1);
    if (!cuda_ok(cudaGetLastError(), "print_S launch")) {
        return 1;
    }

    // Wait for both kernels to finish so that execution errors are reported
    // here rather than being silently dropped at process exit.
    if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return 1;
    }

    return 0;
}

When compiling, I get the following error with NVCC v5.5:

/tmp/tmpxft_000068a5_00000000-16_ptm.o: In function `main':
tmpxft_000068a5_00000000-3_ptm.cudafe1.cpp:(.text+0xcf): undefined reference to `print_S(int S::*)'
/tmp/tmpxft_000068a5_00000000-16_ptm.o: In function `__device_stub__Z7print_SM1Si(long)':
tmpxft_000068a5_00000000-3_ptm.cudafe1.cpp:(.text+0x17f): undefined reference to `print_S(int S::*)'
tmpxft_000068a5_00000000-3_ptm.cudafe1.cpp:(.text+0x184): undefined reference to `print_S(int S::*)'
collect2: error: ld returned 1 exit status

Traipsing through the code generated by NVCC with the -cuda flag, it actually looks like the generated code is wrong: the kernel's parameter is declared as `long` instead of `int S::*`:

extern void __device_stub__Z7print_SM1Si(long);
void __device_stub__Z7print_SM1Si( long __par0) { if (cudaSetupArgument((void *)(char *)&__par0, sizeof(__par0), (size_t)0UL) != cudaSuccess) return; { volatile static char *__f __attribute__((unused)); __f = ((char *)((void ( *)(long))print_S)); (void)cudaLaunch(((char *)((void ( *)(long))print_S))); }; }
# 18 "ptm.cu"
void print_S( long __cuda_0)
# 18 "ptm.cu"
{__device_stub__Z7print_SM1Si( __cuda_0);
    
}

By patching the generated code to convert these "long"s to "int S::*"s it compiles and functions correctly.

extern void __device_stub__Z7print_SM1Si(int S::*);
void __device_stub__Z7print_SM1Si(int S::* __par0) { if (cudaSetupArgument((void *)(char *)&__par0, sizeof(__par0), (size_t)0UL) != cudaSuccess) return; { volatile static char *__f __attribute__((unused)); __f = ((char *)((void ( *)(int S::*))print_S)); (void)cudaLaunch(((char *)((void ( *)(int S::*))print_S))); }; }
# 18 "ptm.cu"
void print_S(int S::* __cuda_0)
# 18 "ptm.cu"
{__device_stub__Z7print_SM1Si( __cuda_0);
    
}

There is an answer on stackoverflow that has some additional information:

[url]C++ CUDA Pointer-to-member - Stack Overflow