Is there a possibility of CUDA supporting **device** function pointers within **global** functions in the future? Is there an alternative?

I have a **global** function which in the innermost loop I need to call a **device** function. But in different calls to the **global** I want to call a different **device** function inside that loop. Since function pointers aren’t supported for **device** functions, that means that I would need an if statement to decide, in the innermost part of my loop, but that’ll be unnecessarily slow. The only other alternative, which is not an attractive one is to duplicate a rather large **global** function once for each different **device** function I want to support.

I’m speaking in very general terms, so below I provided the particular code which I’m referring to.

```
__device__ float fourier_transform_sph(float r)
{
if(r >= 0.06f)
{
float m = pi*r;
float _2_m = 2.0f*m;
float cos_2_m, sin_2_m;
__sincosf(_2_m, &sin_2_m, &cos_2_m);
return ((2.3561944901923448f)/(m*m*m*m*m*m))*(cos_2_m-1.0f)*(cos_2_m+m*sin_2_m-1.0f);
}
return (pi + r*(-0.007968913156311f + r*(-18.293608272337678f))); }
__device__ float fourier_transform_wendland_d3_c2(float r)
{
if(r >= 0.24f)
{
float m = pi*r;
float cos_2_m, sin_2_m;
__sincosf(2.0f*m, &sin_2_m, &cos_2_m);
float m_2 = m*m;
float m_4 = m_2*m_2;
return ((pi*7.5f)/(m_4*m_4))*(4.0f*m_2 - 6.0f + (6.0f-m_2)*cos_2_m+4.5f*m*sin_2_m);
}
return 0.299199300341890f + r*(-0.002379178586124f + r*(-0.370530218163545f));
}
__global__ void fourier_transform_interpolant_half_domain(int first_term, int number_of_terms, MeshlessInterpolant meshless_interpolant, FVRConfig fvr_config)
{
extern __shared__ float shared[];
int block_area = fvr_config._block_area;
float3* ds_positions = (float3*)shared;
float* ds_weights = (float*)(ds_positions + block_area);
float* ds_radii = ds_weights + block_area;
int x = (blockDim.x*blockIdx.x + threadIdx.x);
int y = (blockDim.y*blockIdx.y + threadIdx.y);
int index_x = x;
if(x > fvr_config.cutoff_frequency.x)
{
x = x-2*fvr_config.cutoff_frequency.x;
index_x = (x+fvr_config.number_of_samples.x);
}
int index = index_x*(fvr_config.number_of_samples.y/2 + 1) + y;
float fu = fvr_config.step_size.x*x, fv = fvr_config.step_size.y*y;
float3 f_coord = make_float3(fu*fvr_config.u_axis.x + fv*fvr_config.v_axis.x, fu*fvr_config.u_axis.y + fv*fvr_config.v_axis.y, fu*fvr_config.u_axis.z + fv*fvr_config.v_axis.z);
float r = sqrtf((float)(fu*fu + fv*fv));
float2 sum = make_float2(0.0f, 0.0f);
float sin_v, cos_v;
int thread_index = blockDim.x*threadIdx.y + threadIdx.x;
for(unsigned int k = first_term; k < number_of_terms; )
{
if (k+thread_index < number_of_terms)
{
ds_positions[thread_index] = meshless_interpolant.d_positions[k + thread_index];
ds_weights[thread_index] = meshless_interpolant.d_weights[k + thread_index];
ds_radii[thread_index] = meshless_interpolant.d_radii[k + thread_index];
}
__syncthreads();
for(int j = 0; j != block_area && k < number_of_terms; ++j, ++k)
{
float term = ds_weights[j]*fourier_transform_sph(r*ds_radii[j]); //HERE is where I would like to call an arbitrary device function, but I would like to decide which one outside of this double loop
__sincosf(_2pi*dot(f_coord, ds_positions[j]), &sin_v, &cos_v);
sum.x += term*cos_v;
sum.y += term*sin_v;
}
__syncthreads();
}
float2 prev = fvr_config._d_half_image[index];
fvr_config._d_half_image[index].x = prev.x + sum.x;
fvr_config._d_half_image[index].y = prev.y - sum.y;
}
```

Also, if you happen to have looked at this code, and can offer some performance hints, they’re very much appreciated.