How to pass an NVRTC-compiled device function pointer to an already compiled device function?

Source buffer passed to nvrtcCreateProgram:

// CUDA source text for the runtime-compiled kernel `nvrtcfunctest`.
// For every index tid < size the kernel loads the function pointer stored at
// `f` and writes c[tid] = (*f)(a[tid], b[tid]).
// Each line is its own string literal; adjacent literals are concatenated by
// the compiler, so no backslash line continuation is needed (the original
// continuation on the first line left the rest of the text outside the string).
std::string  text =
"                                          \n"
"#pragma once\n"
"#include \"test.h\"\n"
"extern \"C\" __global__ void nvrtcfunctest(double* a, double* b, double* c, size_t size, func* f)\n"
"{\n"
"    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n"
"    if (tid < size) {\n"
"        c[tid] = (*f)(a[tid], b[tid]);\n"
"    }\n"
"}\n"
"\n";

header

template <typename T>
struct cudaCallableFunctionPointer
{
public:
    cudaCallableFunctionPointer(T* f_)
    {
        T* host_ptr = (T*)malloc(sizeof(T));
        cudaMalloc((void**)&ptr, sizeof(T));

        cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
        cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);

        cudaFree(host_ptr);
    }

    ~cudaCallableFunctionPointer()
    {
        cudaFree(ptr);
    }

    T* ptr;
};



// Pointer-to-function type: takes two doubles, returns a double.
typedef double (*func)(double a, double b);

// Device function returning the sum of its two arguments.
__device__ double devpuls(double a, double b)
{
    const double sum = a + b;
    return sum;
}

main

// __device__ variables holding the addresses of device functions.
// NOTE(review): devminus is not defined in the visible part of the file —
// confirm it is declared before this point.
__device__ func devpulsptr  = &devpuls;
__device__ func devminusptr = &devminus;

// Problem size and the matching byte count for one array of doubles.
size_t n = 10;
size_t bufferSize = n * sizeof(double);

// Host-side input (hosta, hostb) and output (hostc) arrays.
double* hosta = new double[n];
double* hostb = new double[n];
double* hostc = new double[n];

// Inputs: hosta[i] = i, hostb[i] = 2 * i.
for (size_t i = 0; i < n; ++i) {
    hosta[i] = static_cast<double>(i);
    hostb[i] = static_cast<double>(i * 2);
}
double* devA, * devB, * devC;

// Device-memory copy of the function pointer stored in devpulsptr.
cudaCallableFunctionPointer<func> pulsptr(&devpulsptr);

// Device buffers for the two inputs and the output.
// NOTE(review): the cudaError_t results of these allocations are not checked.
cudaMalloc((void**)&devA, bufferSize);
cudaMalloc((void**)&devB, bufferSize);
cudaMalloc((void**)&devC, bufferSize);

// Driver-API views of the device pointers, used as kernel arguments.
// (The original declaration ended in a dangling ", ;", which is a syntax error.)
CUdeviceptr ddX, ddY, ddOut, ddfunc;
ddX         = (CUdeviceptr)devA;
ddY         = (CUdeviceptr)devB;
ddOut       = (CUdeviceptr)devC;
ddfunc      = (CUdeviceptr)pulsptr.ptr;

// The element count is a by-value size_t kernel argument, not a device
// address, so it is stored with its real type; &dsize is still what goes into
// the kernelParams array.
size_t dsize = n;

/////////////////////////////

~
~
~
// Array of addresses of the five argument variables, passed to cuLaunchKernel
// as its kernelParams argument.
// NOTE(review): presumably `kernel` is nvrtcfunctest(a, b, c, size, f), so the
// order here must match that parameter list — confirm against the elided
// module-loading code.
void *kernelParams[] = { &ddX, &ddY, &ddOut, &dsize ,&ddfunc };

// NOTE(review): the CUresult returned by cuLaunchKernel is discarded, so a
// failed launch goes unnoticed at this point.
cuLaunchKernel(kernel, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);


Does the code look plausible? I’d appreciate any feedback.