Having some problems copying function pointers from device to host and back. I am running CUDA 5.0 and compiling with Visual Studio 2010 with compute mode 3.0 on a GeForce 650 card. The code shown below is embedded within a shared library and the executable basically makes calls to the RenderKernel::execute method.
The error I receive is ‘unknown error’ after executing the kernel which I presume implies the function pointer was not valid. If I replace (*func) with (*symbol_CClipGroup_INTERSECT) in the code, it executes as expected. This is boiled down to the minimal code required to get the error so I believe it has something to do with copying from the symbol (but I recieve no error when I do so).
The similar CUDA function pointers example compiles and executes without errors on the same system.
Does anyone have any suggestions on how to resolve this?
namespace vox {
/** Format of clipping function */
typedef void(*ClipFunc)(void *, Ray3f &);
}
// ----------------------------------------------------------------------------
// Intersection callback function
// ----------------------------------------------------------------------------
__device__ void CClipGroup_INTERSECT(void * dataPtr, vox::Ray3f & ray)
{
ray.min = 1.0f;
// :TODO:
}
__device__ vox::ClipFunc symbol_CClipGroup_INTERSECT = CClipGroup_INTERSECT;
namespace vox {
namespace {
namespace filescope {
__global__ void renderKernel(ClipFunc func)
{
Ray3f ray;
(*func)(nullptr, ray);
}
} // namespace filescope
} // namespace anonymous
// --------------------------------------------------------------------
// Executes the rendering stage kernel on the active device
// --------------------------------------------------------------------
void RenderKernel::execute(size_t xstart, size_t ystart,
size_t width, size_t height)
{
// Setup the execution configuration
static const unsigned int BLOCK_SIZE = 32;
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 blocks(
(width + threads.x - 1) / threads.x,
(height + threads.y - 1) / threads.y
);
ClipFunc result;
VOX_CUDA_CHECK(cudaMemcpyFromSymbol(&result, symbol_CClipGroup_INTERSECT, sizeof(ClipFunc)));
// Execute the device rendering kernel
filescope::renderKernel<<<blocks,threads>>>(result);
}
} // namespace vox
– Lucas Sherman
Edit: Expanded my device macros for clarity