Source buffer passed to nvrtcCreateProgram:
// CUDA C++ source for the run-time-compiled kernel `nvrtcfunctest`.
// The kernel applies the binary function stored at `*f` element-wise:
// c[i] = (*f)(a[i], b[i]) for every i < size. It expects a 1-D launch
// (only the .x components of blockIdx/blockDim/threadIdx are used).
//
// The text is built from adjacent string literals only; the previous stray
// opening literal with a backslash continuation terminated the string early
// and left `#pragma once` outside of any literal.
std::string text =
"#pragma once\n"
"#include \"test.h\"\n"
"extern \"C\" __global__ void nvrtcfunctest(double* a, double* b, double* c, size_t size, func* f)\n"
"{\n"
// Widen before multiplying so the flat index cannot wrap in 32-bit arithmetic.
"    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;\n"
// Bounds guard: the grid may contain more threads than elements.
"    if (tid < size) {\n"
"        c[tid] = (*f)(a[tid], b[tid]);\n"
"    }\n"
"}\n"
"\n";
header
/**
 * Owns a device-memory copy of the value held by a `__device__` symbol
 * (in this file: a device function pointer), so that the address of that
 * copy (`ptr`) can be handed to a kernel as an argument.
 *
 * Construction:
 *   f_  - address of the `__device__` variable whose value is to be copied;
 *         it is only used to name the symbol (`*f_`), never dereferenced
 *         for its contents on the host.
 *
 * After construction `ptr` is either a valid device allocation of
 * sizeof(T) bytes containing the symbol's value, or NULL if any CUDA call
 * failed. Callers should check `ptr` before launching a kernel with it.
 */
template <typename T>
struct cudaCallableFunctionPointer
{
public:
    cudaCallableFunctionPointer(T* f_)
        : ptr(NULL)
    {
        if (cudaMalloc((void**)&ptr, sizeof(T)) != cudaSuccess) {
            ptr = NULL;
            return;
        }
        // Copy the symbol's value straight into the new allocation. The
        // former host staging buffer was obtained with malloc() but released
        // with cudaFree(), which is invalid for host memory and leaked it.
        if (cudaMemcpyFromSymbol(ptr, *f_, sizeof(T), 0,
                                 cudaMemcpyDeviceToDevice) != cudaSuccess) {
            cudaFree(ptr);
            ptr = NULL;
        }
    }

    ~cudaCallableFunctionPointer()
    {
        // cudaFree(NULL) is a no-op, so a failed construction is safe here.
        cudaFree(ptr);
    }

    // Device address of the copied value; NULL if construction failed.
    T* ptr;

private:
    // Non-copyable: a copy would share `ptr` and free it twice.
    cudaCallableFunctionPointer(const cudaCallableFunctionPointer&);
    cudaCallableFunctionPointer& operator=(const cudaCallableFunctionPointer&);
};
// Pointer to a binary function taking two doubles and returning a double.
typedef double (*func)(double a, double b);
// Device-side binary operation: yields the sum of its two arguments.
__device__ double devpuls(double a, double b)
{
    const double sum = a + b;
    return sum;
}
main
// `__device__` variables of type `func`, each initialised with a device
// function. They give the host a symbol whose value can be copied without
// the host taking the address of a device function itself.
// NOTE(review): `devminus` is not defined in the code shown here — confirm
// it is declared before this point.
__device__ func devpulsptr = devpuls;
__device__ func devminusptr = devminus;
// Element count and the byte size of one operand buffer.
size_t n = 10;
size_t bufferSize = sizeof(double) * n;

// Host-side operands and result storage.
double* hosta = new double[n];
double* hostb = new double[n];
double* hostc = new double[n];

// Fill the inputs: hosta[k] = k, hostb[k] = 2 * k.
for (size_t k = 0; k != n; ++k) {
    hosta[k] = static_cast<double>(k);
    hostb[k] = static_cast<double>(k * 2);
}

double* devA;
double* devB;
double* devC;

// Device-memory copy of the function pointer stored in devpulsptr.
cudaCallableFunctionPointer<func> pulsptr(&devpulsptr);

// One device buffer per operand/result, each bufferSize bytes.
cudaMalloc(reinterpret_cast<void**>(&devA), bufferSize);
cudaMalloc(reinterpret_cast<void**>(&devB), bufferSize);
cudaMalloc(reinterpret_cast<void**>(&devC), bufferSize);
// Driver-API handles for the device allocations made above; these are the
// values whose addresses are placed in the kernel parameter array.
// (The previous declaration ended in a dangling comma, which does not compile.)
CUdeviceptr ddX, ddY, ddOut, ddfunc;
// The element count is a plain value, not a device address: keep it as the
// size_t the kernel's `size` parameter expects instead of a CUdeviceptr.
size_t dsize;
ddX = (CUdeviceptr)devA;
ddY = (CUdeviceptr)devB;
ddOut = (CUdeviceptr)devC;
ddfunc = (CUdeviceptr)pulsptr.ptr;
dsize = n;
/////////////////////////////
~
~
~
// Kernel argument table: one address per kernel parameter, listed in the
// order of the kernel signature (a, b, c, size, f).
void *kernelParams[] = {
    &ddX,
    &ddY,
    &ddOut,
    &dsize,
    &ddfunc
};
// NOTE(review): the result of cuLaunchKernel is not inspected, and the driver
// API documents a launch with both kernelParams and extra non-NULL as
// invalid — confirm `extra` is null here.
cuLaunchKernel(kernel,
               gridDimX, gridDimY, gridDimZ,
               blockDimX, blockDimY, blockDimZ,
               sharedMemBytes,
               hStream,
               kernelParams,
               extra);
Does the code look plausible? I’d appreciate any feedback.