Passing __global__ function pointer as argument to __global__ function for dynamic parallelism

I want to pass a __global__ function as an argument to another __global__ function, e.g.:

// Trivial kernel used as a launch target: prints a fixed greeting from device code.
__global__
void a(void)
{
    printf("Hello from a");
}

// Trivial kernel used as a launch target: prints a fixed greeting from device code.
__global__
void b(void)
{
    printf("Hello from b");
}

// Uses dynamic parallelism: launches the kernel that `func` points to from
// device code, as a single block containing a single thread.
//
// NOTE(review): the CUDA programming guide states that the address of a
// __global__ function taken in host code cannot be used in device code
// (e.g. to launch the kernel), so `func` must not be a kernel address that
// was obtained on the host.
__global__
void kernel_launcher(void(*func)(void))
{
    func<<<1,1>>>();
}

// Selects kernel `a` or `b` from the single command-line argument and hands
// its address to kernel_launcher, which is expected to launch it on the device.
//
// NOTE(review): `a` and `b` are named in host code here; per the CUDA
// programming guide, a __global__ function address taken in host code cannot
// be used to launch the kernel from device code, so this is the pattern the
// question is asking how to replace.
int main(int argc, char **argv)
{
    // Exactly one argument (the kernel name) is required.
    assert(argc == 2);
    if (strcmp(argv[1], "a") == 0) {
        kernel_launcher<<<1,1>>>(a);
    } else {
        // Any argument other than "a" selects kernel b.
        kernel_launcher<<<1,1>>>(b);
    }
}

Is this possible to do? If not, what’s the alternative?
The problem is that the CUDA programming guide has this line:

"The address of a __global__ function taken in host code cannot be used in device code (e.g. to launch the kernel)."

If you are using C++, you can create a device lambda that launches the inner kernel and pass that lambda to the outer kernel:

// Child kernel: every thread prints its own index within the block together
// with `n`, the value supplied by whoever launched it (kernel1 passes the
// launching thread's index).
__global__ void kernel2(int n)
{
	const int self = threadIdx.x;
	printf("thread %d in kernel2 called by thread %d from kernel1\n", self, n);
}

// Parent kernel: each thread launches kernel2 from device code as one block
// of four threads, passing its own threadIdx.x as the argument.
__global__ void kernel1()
{
	const int parent = threadIdx.x;
	kernel2<<<1, 4>>>(parent);
}

// Child kernel for the lambda-based variant: every thread prints its own index
// within the block, the launcher-supplied value `n`, and the extra value `a`.
__global__ void kernel2_mod(int n, int a)
{
	const int self = threadIdx.x;
	printf("thread %d in kernel2_mod called by thread %d from kernel1_mod. a = %d\n", self, n, a);
}

// Generic parent kernel: each thread invokes the callable `f` with its own
// threadIdx.x. What `f` does with that index (for example, launching another
// kernel) is decided entirely by the caller.
template <class Func>
__global__ void kernel1_mod(Func f)
{
	const unsigned int tid = threadIdx.x;
	f(tid);
}

// Prints a diagnostic for a failed CUDA runtime call; returns true on cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what)
{
	if (status == cudaSuccess) {
		return true;
	}
	printf("%s failed: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Runs both variants of the nested launch:
//   1. kernel1, which launches kernel2 directly from device code;
//   2. kernel1_mod, which receives a device lambda that launches kernel2_mod.
// Returns 0 on success and 1 as soon as any launch or runtime call reports an error.
int main(){
	// Two parent threads, each launching a four-thread kernel2 child grid.
	kernel1<<<1,2>>>();
	// A launch reports configuration errors only through cudaGetLastError().
	if (!cudaOk(cudaGetLastError(), "kernel1 launch")) return 1;
	// Errors raised while the kernels execute surface at the next synchronization.
	if (!cudaOk(cudaDeviceSynchronize(), "kernel1 execution")) return 1;
	
	int a = 42;
	
	// Device lambda capturing `a` by value; the device-side call operator
	// launches kernel2_mod, so the kernel name is resolved in device code
	// rather than being passed in as a host-side function address.
	auto lambda = [=] __device__(int n){
		kernel2_mod<<<1,4>>>(n,a);
	};
	
	kernel1_mod<<<1,2>>>(lambda);
	if (!cudaOk(cudaGetLastError(), "kernel1_mod launch")) return 1;
	if (!cudaOk(cudaDeviceSynchronize(), "kernel1_mod execution")) return 1;
	
	if (!cudaOk(cudaDeviceReset(), "cudaDeviceReset")) return 1;
	return 0;
}

Compiled with: nvcc -std=c++14 -arch=sm_61 -rdc=true --expt-extended-lambda main.cu -o main