[Clarified the wording of the initial paragraph in a later edit – nj]
Function pointers to device functions are supported in CUDA 3.2 on sm_2x (compute capability 2.x) platforms, building on the ABI that was introduced with CUDA 3.1. Here is a very simple app showing that device-side function pointers are used in exactly the same way as function pointers in host code.
#include <stdio.h>
#include <stdlib.h>
#define N 5
// Device-side binary operation: returns the sum of x and y.
__device__ float add_func (float x, float y)
{
    const float sum = x + y;
    return sum;
}
// Device-side binary operation: returns the product of x and y.
__device__ float mul_func (float x, float y)
{
    const float product = x * y;
    return product;
}
// Device-side binary operation: returns the quotient x / y.
// No check is made for y == 0; the result then follows normal
// floating-point division rules.
__device__ float div_func (float x, float y)
{
    const float quotient = x / y;
    return quotient;
}
// Signature shared by all binary float operations that can be dispatched
// through a function pointer.
typedef float (*op_func) (float, float);
// Device-resident dispatch table; the index selects add (0), mul (1) or div (2).
__device__ op_func func[3] = { add_func, mul_func, div_func };
// Printable names in the same order as func[]. The elements point at string
// literals, so they are declared const char* (binding a literal to a plain
// char* is deprecated in C++03 and ill-formed from C++11 on).
__device__ const char* op_name[3] = { "add", "mul", "div" };
/*
 * Applies the binary operation selected by `op` element-wise to two arrays.
 *
 *   a, b : input arrays, each with at least n readable elements
 *   res  : output array, receives n results (res[i] = f(a[i], b[i]))
 *   op   : index into the func[] dispatch table; it is not range-checked
 *          here, so the caller must pass a valid table index
 *   n    : number of elements to process
 */
__device__ void op_array (const float *a, const float *b, float *res, int op, int n)
{
    // The selected operation is the same for every element, so look it up once.
    const op_func f = func[op];
    // Use the caller-supplied length n; the original looped to the file-level
    // constant N, which silently ignored this parameter.
    for (int i = 0; i < n; i++) {
        res[i] = f(a[i], b[i]);
    }
}
/*
 * Demo kernel: fills two N-element per-thread arrays with 10+i and 100+i,
 * then, for each of the three table entries, prints the operation name,
 * runs op_array over the inputs and prints every result.
 * It uses no thread or block indices, so every launched thread performs
 * (and prints) the full sequence.
 */
__global__ void kernel (void)
{
    float lhs[N];
    float rhs[N];
    float out[N];

    // Both inputs are independent of each other, so one pass fills them.
    for (int k = 0; k < N; ++k) {
        lhs[k] = (float)(10 + k);
        rhs[k] = (float)(100 + k);
    }

    int which = 0;
    while (which < 3) {
        printf ("\nop=%s\n", op_name[which]);
        op_array (lhs, rhs, out, which, N);
        for (int k = 0; k < N; ++k) {
            printf ("res = % 16.9e\n", out[k]);
        }
        ++which;
    }
}
/*
 * Host entry point: launches the demo kernel with a single thread, waits for
 * it to finish and reports any CUDA error.
 * Returns EXIT_SUCCESS when the kernel launched and ran without error,
 * EXIT_FAILURE otherwise.
 */
int main (void)
{
    kernel<<<1,1>>>();

    // A kernel launch returns no status; launch/configuration errors have to
    // be fetched explicitly.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf (stderr, "kernel launch failed: %s\n", cudaGetErrorString (status));
        return EXIT_FAILURE;
    }

    // Block until the kernel has finished. Errors raised while the kernel was
    // running are reported here, and waiting also gives the device-side printf
    // output a chance to be flushed before the process exits.
    // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf (stderr, "kernel execution failed: %s\n", cudaGetErrorString (status));
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
The build and run log (Linux64, C2050) looks as follows:
~ $ nvcc -arch=sm_20 -o funcptr funcptr.cu
~ $ funcptr
op=add
res = 1.100000000e+02
res = 1.120000000e+02
res = 1.140000000e+02
res = 1.160000000e+02
res = 1.180000000e+02
op=mul
res = 1.000000000e+03
res = 1.111000000e+03
res = 1.224000000e+03
res = 1.339000000e+03
res = 1.456000000e+03
op=div
res = 1.000000015e-01
res = 1.089108884e-01
res = 1.176470593e-01
res = 1.262135953e-01
res = 1.346153915e-01