General note: Invoking functions via function pointers in device code requires compute capability 2.x.
Only __global__ functions and their addresses are visible inside host code; __device__ functions, and thus their addresses, are not. Therefore one cannot pass pointers to __device__ functions to a kernel call (which occurs in the host portion of the code).
However, one can pass the information needed to select the desired device function to the kernel. In the following example, a kernel finds either the minimum or the maximum element in an array of floats, with the argument findMin specifying which operation is desired. The kernel then selects a pointer to the appropriate selection function (either minimum or maximum) and passes that to a device function minmax() that does all the work.
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_COUNT 240
#define THREAD_COUNT 128
/* Wrap a CUDA runtime API call: on any non-success status, report the
   failing file/line plus the CUDA error string and abort the process. */
#define CUDA_SAFE_CALL(call) \
do { \
    cudaError_t status_ = call; \
    if (status_ != cudaSuccess) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(status_) ); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Macro to catch CUDA errors in kernel launches: checks both the
// synchronous (pre-launch) status and the asynchronous (execution)
// status, reporting file/line and aborting on any failure.
#define CHECK_LAUNCH_ERROR() \
do { \
    /* Check synchronous errors, i.e. pre-launch */ \
    cudaError_t err = cudaGetLastError(); \
    if (cudaSuccess != err) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) ); \
        exit(EXIT_FAILURE); \
    } \
    /* Check asynchronous errors, i.e. kernel failed (ULF). */ \
    /* cudaThreadSynchronize() is deprecated and was removed in CUDA 12; */ \
    /* cudaDeviceSynchronize() is the supported equivalent. */ \
    err = cudaDeviceSynchronize(); \
    if (cudaSuccess != err) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) ); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
/* Pointer to a binary float selection function (e.g. fminf / fmaxf). */
typedef float (*pf)(float a, float b);
/* Device-side selection function: returns the smaller of the two
   arguments, delegating to the standard fminf(). */
__device__ float minimum (float a, float b)
{
    float smaller = fminf (a, b);
    return smaller;
}
/* Device-side selection function: returns the larger of the two
   arguments, delegating to the standard fmaxf(). */
__device__ float maximum (float a, float b)
{
    float larger = fmaxf (a, b);
    return larger;
}
/* Device-side table of selection functions, indexed by the kernel's
   findmin argument: func_d[0] = maximum, func_d[1] = minimum. Mirrors
   the host-side func_h[] table in findExtremum(). */
__device__ pf func_d[2] = { maximum, minimum };
/* Per-block scratch buffer for the tree reduction in minmax(). Declared
   at file scope, but __shared__ storage is still per-block; one element
   per thread of the block. */
__shared__ float partExtr[THREAD_COUNT];
/*
 * Compute the extremum (min or max, selected by func) of the n floats
 * in x. Each thread folds a grid-strided slice of x into a private
 * value, the block then tree-reduces those values in shared memory, and
 * thread 0 writes the block's result to res[blockIdx.x]; the host is
 * responsible for combining the per-block results.
 * Preconditions: n >= 1 (x[0] is read unconditionally), blockDim.x ==
 * THREAD_COUNT, and res has room for gridDim.x elements.
 */
__device__ void minmax(float *x, float *res, int n, pf func)
{
int i;
int tid = threadIdx.x;
/* Seed with x[0] so threads whose strided slice is empty still hold a
   valid value for the shared-memory reduction below. */
float extr = x[0];
for (i = THREAD_COUNT*blockIdx.x+tid; i < n; i += gridDim.x*THREAD_COUNT) {
extr = func (extr, x[i]);
}
partExtr[tid] = extr;
/* Pairwise tree reduction; relies on THREAD_COUNT being a power of two.
   The barrier at the top of each pass also orders the initial stores
   above before the first combine. */
for (i = THREAD_COUNT >> 1; i > 0; i >>= 1) {
__syncthreads();
if (tid < i) {
partExtr[tid] = func (partExtr[tid], partExtr[tid+i]);
}
}
/* tid 0 performed the final combine into partExtr[0] itself, so no
   further barrier is needed before this read. */
if (tid == 0) {
res[blockIdx.x] = partExtr[tid];
}
}
/* Kernel entry point: select the desired device function from the
   func_d table (findmin: 1 -> minimum, 0 -> maximum) and hand it to
   minmax(), which does all the work. The indirect call requires
   compute capability 2.x. */
__global__ void minmax_kernel(float *x, float *res, int n, int findmin)
{
    pf op = func_d[findmin];
    minmax (x, res, n, op);
}
/*
 * Return the minimum (findmin != 0) or maximum (findmin == 0) of the n
 * floats in host array x, computed on the GPU: the kernel produces one
 * partial result per block, which the host folds with the matching host
 * function. Returns NaN when n < 1. Exits the process on any CUDA or
 * host allocation failure.
 */
float findExtremum (float *x, int n, int findmin)
{
    pf func_h[2] = { fmaxf, fminf };
    float *res_d;
    float *res_h;
    float *x_d;
    float r;
    if (n < 1) return sqrtf(-1.0f); // NaN
    /* Normalize once so both the kernel's func_d[] lookup and the
       host-side func_h[] lookup below stay in bounds for any nonzero
       findmin (previously only the kernel argument was normalized, so
       func_h[findmin] indexed out of bounds for findmin > 1). */
    findmin = !!findmin;
    CUDA_SAFE_CALL (cudaMalloc ((void**)&res_d, BLOCK_COUNT*sizeof(res_d[0])));
    CUDA_SAFE_CALL (cudaMalloc ((void**)&x_d, n * sizeof(x_d[0])));
    CUDA_SAFE_CALL (cudaMemcpy (x_d, x, n * sizeof(x_d[0]),
                                cudaMemcpyHostToDevice));
    /* Each block writes one partial extremum to res_d[blockIdx.x]. */
    minmax_kernel<<<BLOCK_COUNT,THREAD_COUNT>>>(x_d, res_d, n, findmin);
    CHECK_LAUNCH_ERROR();
    res_h = (float *)malloc (BLOCK_COUNT * sizeof(res_h[0]));
    if (!res_h) {
        fprintf (stderr, "res_h allocation failed\n");
        exit (EXIT_FAILURE);
    }
    CUDA_SAFE_CALL (cudaMemcpy (res_h, res_d, BLOCK_COUNT * sizeof(res_d[0]),
                                cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL (cudaFree (res_d));
    CUDA_SAFE_CALL (cudaFree (x_d));
    /* Fold the per-block partial results with the matching host function. */
    r = res_h[0];
    for (int i = 1; i < BLOCK_COUNT; i++) r = func_h[findmin](r, res_h[i]);
    free (res_h);
    return r;
}
#define ELEM_COUNT 8
/* Exercise both modes of findExtremum() on a small fixed array and
   print the results. Locals renamed so they no longer shadow the
   file-scope device functions minimum() and maximum(). */
int main (void)
{
    float x[ELEM_COUNT] = {-1.3f, 2.4f, 3.5f, -2.3f, 4.5f, 0.4f, -5.3f, -1.6f};
    float minVal = findExtremum (x, ELEM_COUNT, 1);
    float maxVal = findExtremum (x, ELEM_COUNT, 0);
    printf ("min=% 13.6e max=% 13.6e\n", minVal, maxVal);
    return EXIT_SUCCESS;
}