How can I use __device__ function pointer in CUDA ?

Occam’s Razor in action. That does work just fine. Thanks for the suggestion.

Could you, please, provide an example? I believe I tried this to no avail.

I’m thinking something like this (although I haven’t actually tested it):

__device__ int func1(int par1)

{

    ...

}

__device__ int func2(int par1)

{

    ...

}

__device__ int func3(int par1)

{

    ...

}

__constant__ int (*func_table_d[3])(int) = {&func1, &func2, &func3};

int (*func_table[3])(int);

__global__ void testkernel(int (*func)(int))

{

   ...

}

int main(void)

{

    int n=1;

    ...

    cudaMemcpyFromSymbol(func_table, "func_table_d", sizeof(func_table), 0, cudaMemcpyDeviceToHost);

testkernel<<<12, 34>>>(func_table[n]);

    ...

}

Caveat: Completely untested!

I was able to prove it to myself using this hacked version of Norbert’s example:

#include <stdio.h>

#include <stdlib.h>

#define N 5

// Device-side addition operator; matches the op_func signature.
__device__ float add_func (float x, float y)
{
    float sum = x + y;
    return sum;
}

// Device-side multiplication operator; matches the op_func signature.
__device__ float mul_func (float x, float y)
{
    float product = x * y;
    return product;
}

// Device-side division operator; matches the op_func signature.
// Division by zero follows IEEE-754 semantics (yields +/-inf or NaN).
__device__ float div_func (float x, float y)
{
    float quotient = x / y;
    return quotient;
}

// Signature shared by all device-side binary float operators in this example.
typedef float (*op_func) (float, float);

// Operator slot selected by the host before each launch (the driver-API host
// code copies one entry of `funcs` over this symbol); read by kernel().
__device__ op_func action;

// Table of all available operators, addressable from the host via its symbol.
__device__ op_func funcs[3] = { add_func, mul_func, div_func };

// Apply `f` elementwise: res[i] = f(a[i], b[i]) for i in [0, n).
// Fix: the loop previously iterated to the compile-time constant N and
// silently ignored the `n` parameter; it now honors `n` (the only caller
// in this file passes N, so observable behavior is unchanged).
__device__ void op_array (const float *a, const float *b, float *res, op_func f, int n)
{
    for (int i = 0; i < n; i++) {
        res[i] = f(a[i], b[i]);
    }
}

// Test driver kernel: builds two small vectors, applies the operator
// currently stored in the module-scope `action` pointer via op_array(),
// and prints every result.
__global__ void kernel (void)
{
    float x[N];
    float y[N];
    float res[N];

    // Populate both inputs in one pass: x = 10..10+N-1, y = 100..100+N-1.
    for (int idx = 0; idx < N; idx++) {
        x[idx] = (float)(10 + idx);
        y[idx] = (float)(100 + idx);
    }

    op_array (x, y, res, action, N);

    for (int idx = 0; idx < N; idx++) {
        printf ("res = % 16.9e\n", res[idx]);
    }
}
#include <stdio.h>

#include <cuda.h>

// Host driver (CUDA driver API): loads the cubin containing kernel() plus the
// operator table, then launches the kernel once per operator by copying each
// entry of `funcs` into the single `action` slot between launches.
// NOTE(review): no driver-API return codes are checked; a missing cubin or a
// wrong symbol name will only surface as missing/garbage output.
int main(void)
{
	CUmodule   mhandle;                      // loaded funcpointer.cubin module
	CUcontext  chandle;                      // context on device 0
	CUfunction khandle;                      // kernel entry point
	CUdeviceptr funcshandle, actionhandle;   // device addresses of `funcs` and `action`
	size_t op_funcsz;                        // byte size of the `action` symbol

	cuInit(0);
	cuCtxCreate( &chandle, 0, 0 );
	cuModuleLoad( &mhandle, "funcpointer.cubin");
	// "_Z6kernelv" is the C++-mangled name of `__global__ void kernel(void)`.
	cuModuleGetFunction( &khandle, mhandle, "_Z6kernelv");
	cuModuleGetGlobal( &funcshandle, NULL, mhandle, "funcs");
	// Also query the symbol's size so the copies below move exactly one pointer.
	cuModuleGetGlobal( &actionhandle, &op_funcsz, mhandle, "action");

	for(int i=0; i<3; i++) {
		// Select funcs[i] by device-to-device copying it over `action`.
		size_t offset = size_t(i) * op_funcsz;
		cuMemcpyDtoD(actionhandle, funcshandle+offset, op_funcsz);
		// NOTE(review): cuFuncSetBlockShape/cuLaunchGrid are the legacy
		// launch API; later toolkits deprecate them for cuLaunchKernel.
		cuFuncSetBlockShape( khandle, 1, 1, 1 );
		cuLaunchGrid( khandle, 1, 1);
		// Wait so this launch's printf output flushes before the next copy.
		cuCtxSynchronize();
	}

	cuCtxDestroy(chandle);
	return 0;
}
avidday@cuda:~$ nvcc -cubin -arch=sm_20 -Xptxas="-v" funcpointer.cu 

ptxas info    : Compiling entry function '_Z6kernelv' for 'sm_20'

ptxas info    : Used 22 registers, 32 bytes cmem[0], 28 bytes cmem[14]

avidday@cuda:~$ g++ funcpointmain.c -I $CUDA_INSTALL_PATH/include -lcuda -o funcpointer.exe

avidday@cuda:~$ ./funcpointer.exe 

res =  1.100000000e+02

res =  1.120000000e+02

res =  1.140000000e+02

res =  1.160000000e+02

res =  1.180000000e+02

res =  1.000000000e+03

res =  1.111000000e+03

res =  1.224000000e+03

res =  1.339000000e+03

res =  1.456000000e+03

res =  1.000000015e-01

res =  1.089108884e-01

res =  1.176470593e-01

res =  1.262135953e-01

res =  1.346153915e-01

This works, yes.

But it’s slightly different from the problem I had in mind.

Some pseudocode is below.

On device:

typedef float (*op_func) (float, float);

struct foo

{

   float a, b;

   op_func op;

};

__device__ foo* foo_array; 

__global__ void kernel( ... )

{

	float val = ( *foo_array[i].op )( .... );

}

On host:

// Allocate memory for foo_array

foo* foo_array_h;

cudaMalloc( foo_array_h, sizeof( foo )*N );

for( i = 0; i < N; i++ ) {

     foo_array_h[i].a = ..

     foo_array_h[i].b = ..

     foo_array_h[i].op = ??????

}

// Copy foo_array_h to device to

// be used in kernel

Since you cannot take an address of a device function

on host, I can’t set the op function pointer. It seems that this

cannot be done at all given current CUDA restrictions.

One way around it is to create a table of function pointers on

the device and pass an index into this table instead of the pointer

itself:

struct foo

{

   float a, b;

   int op;

};

__device__ foo* foo_array;

__device__ op_func op_array[] = { add_op, mul_op, ... }

__global__ void kernel( ... )

{

	float val = ( *op_array[foo_array[i].op] )( .... );

}

This works, but it’s not very flexible and requires maintaining

a table of all ops which becomes problematic if other users

(that don’t necessarily have access to the op table) are

allowed to write their own op_func and extend the table.

Suggestions?

But you can, and the code I posted does exactly that:

cuModuleGetGlobal( &funcshandle, NULL, mhandle, "funcs");

This is getting a pointer to the array of function pointers on the device side. You should be able to do the assignment you want using the same idea: read the function pointer value from a device symbol, then assign it to a value in host memory and copy it back into device memory. My code uses a device to device copy to do the same thing, but the idea is still the same.

Can we do it in CUDA runtime api?

I don’t see why not, although I haven’t tried it with the runtime API.

What would be the steps to do it using Runtime API?

I am trying to use your example for my application, but the difference is that my kernel is not included in the cubin. I am trying to call loaded device functions from a kernel that is compiled in my main code.

I am getting an unknown CUDA error when calling the function, most likely because the function addresses are not correct from the address space of my kernel. Is there some way I can do this?

Do you mean you are trying to use the driver API code I posted with runtime API code? That pretty clearly won’t work. Use the runtime API symbol manipulation functions instead, that is what they are there for.

Getting this to work with runtime API is still somehow foggy.

I have been staring at FunctionPointers example from Cuda SDK,

but I am still encountering lots of problems.

I am trying to get a variant of example that appeared

earlier in this thread:

typedef float (*op_func_t) (float, float);

typedef struct TestStruct

{

    op_func_t op; 

} TestStruct;

// Device-side addition operator (matches the op_func_t signature above).
__device__ float add_func (float x, float y)
{
    return x + y;
}

// Device-side multiplication operator (matches the op_func_t signature above).
__device__ float mul_func (float x, float y)
{
    return x * y;
}

#define N 10

// Applies the operator stored in ops->op elementwise to two locally
// generated test vectors.
// NOTE(review): the parameter type is spelled `OpStruct`, but the only
// struct declared above is `TestStruct` — as posted this does not compile.
// NOTE(review): `res` is computed but never written anywhere observable.
__global__ void kernel( OpStruct* ops )
{
    float x[N];
    float y[N];
    float res[N];

    // x = 10..10+N-1
    for (int i = 0; i < N; i++) {
        x[i] = (float)(10 + i);
    }

    // y = 100..100+N-1
    for (int i = 0; i < N; i++) {
        y[i] = (float)(100 + i);
    }

    // Call through the function pointer held in the struct.
    for (int i = 0; i < N; i++) {
       res[i] = ( *ops->op )( x[i], y[i] );
   }
}

// Host driver for the (broken) attempt above; kept as posted, with the two
// failure points annotated — the thread's later replies explain both.
int main()
{
    OpStruct* h_op = ( OpStruct* ) malloc( sizeof( OpStruct ) );

    // Copy device function pointer to host side
    // NOTE(review): this fails — `mul_func` is a __device__ *function*, not a
    // __device__ variable, so it has no runtime-readable symbol. A module-scope
    // __device__ variable initialized to the function's address is needed
    // (this is the fix adopted later in the thread).
    cudaMemcpyFromSymbol( &h_op->op, mul_func, sizeof( op_func_t ) );

    OpStruct* d_op;
    cudaMalloc( ( void**) &d_op, sizeof( OpStruct ) );

    // Copy device pointer from host side to device
    // NOTE(review): cudaMemcpyToSymbol expects a module-scope __device__ /
    // __constant__ symbol, not an address inside a cudaMalloc'd buffer; a
    // plain cudaMemcpy(d_op, h_op, sizeof(OpStruct), cudaMemcpyHostToDevice)
    // appears to be what was intended here.
    cudaMemcpyToSymbol( &d_op->op, &h_op->op, sizeof( op_func_t ) );

    kernel<<<1,1>>>( d_op );
    cudaThreadSynchronize();

    cudaFree( d_op );
    free( h_op );

    return EXIT_SUCCESS;
}

The above is clearly not working and any pointers about

how to fix it would save me some gray hair. What’s the right

way to get a device function pointer and assign it correctly?

When you say “not working”, I presume you mean “not compiling”, because clearly that won’t.

No, I mean conceptually (using symbol manipulation functions).

Going back to the posts on the first page - device functions don’t get the necessary elf index entries that would allow cudaMemcpyFromSymbol (or any of the other runtime/driver cubin access functions) to get a device pointer function directly. If, however, you declare a device variable which is a static function pointer to a given device function, then you can read that symbol.

So something like this

__device__ float mul_func (float x, float y)

{

    return x * y;

}

__device__ op_func_t h_mul_func = mul_func;

will give a symbol which can be read at runtime.

Thanks for your pointer. Creating a static function pointer variable was the key. A simplified (and working) example is:

#include <stdio.h>
#include <stdlib.h>

typedef float (*op_func_t) (float, float);

// Addition operator, callable only from device code (op_func_t signature).
__device__ float add_func (float x, float y)
{
    float total = x + y;
    return total;
}

// Multiplication operator, callable only from device code (op_func_t signature).
__device__ float mul_func (float x, float y)
{
    float scaled = x * y;
    return scaled;
}

// Static pointers to device functions.
// Unlike the __device__ functions themselves, these module-scope variables
// ARE runtime-readable symbols, so cudaMemcpyFromSymbol can copy their
// values (the functions' device addresses) back to the host.
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;

// Probe kernel: invokes the device-function pointer handed over from the
// host on a fixed argument pair and prints the outcome.
__global__ void kernel( op_func_t op )
{
        float outcome = op( 1.0, 2.0 );
        printf("Result: %f\n", outcome );
}

// Host driver: reads the device-side values of p_mul_func / p_add_func back
// to the host with cudaMemcpyFromSymbol, then passes one of them to the
// kernel as an ordinary argument.
// Improvements over the posted version: every runtime call is checked (the
// original silently ignored failures), and the deprecated
// cudaThreadSynchronize() is replaced by its documented successor
// cudaDeviceSynchronize().
int main()
{
    op_func_t h_add_func;
    op_func_t h_mul_func;

    // Copy device function pointer values to the host. The second argument
    // is the module-scope symbol itself (not a string).
    cudaError_t err = cudaMemcpyFromSymbol( &h_mul_func, p_mul_func, sizeof( op_func_t ) );
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyFromSymbol(p_mul_func): %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    err = cudaMemcpyFromSymbol( &h_add_func, p_add_func, sizeof( op_func_t ) );
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyFromSymbol(p_add_func): %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    // Select the multiply operator; swap in h_add_func to test addition.
    op_func_t d_myfunc = h_mul_func;

    kernel<<<1,1>>>( d_myfunc );

    err = cudaGetLastError();          // catches launch-configuration errors
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    err = cudaDeviceSynchronize();     // wait so the kernel's printf flushes
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

Referring to example above, would it be possible to have the two device functions in two different files? Further, can we compile those files individually and then link them with the .cu file containing the kernel?? I am thinking of something on the lines of,

File → fn1.cu

#include <cuda_runtime.h>

#include <stdio.h>

typedef float (*op_func_t) (float, float);

__device__ float add_func (float x, float y)

{

    return x + y;

}

// Static pointers to device functions

__device__ op_func_t p_add_func = add_func;

void set_fn_pointer_add(op_func_t h_add_func) {

    printf("Pointer of Function: %p", h_add_func);

    cudaMemcpyFromSymbol( &h_add_func, "p_add_func", sizeof( op_func_t ) );

    printf("Pointer of Function: %p", h_add_func);

}

File → fn2.cu

#include <cuda_runtime.h>

#include <stdio.h>

typedef float (*op_func_t) (float, float);

__device__ float mul_func (float x, float y)

{

    return x * y;

}

// Static pointers to device functions

__device__ op_func_t p_mul_func = mul_func;

void set_fn_pointer_mul(op_func_t h_mul_func) {

    printf("Pointer of Function: %p", h_mul_func);

    cudaMemcpyFromSymbol( &h_mul_func, "p_mul_func", sizeof( op_func_t ) );

    printf("Pointer of Function: %p", h_mul_func);

}

And the main file would look like,

File → main.cu

#include <stdio.h>

#include <cuda_runtime.h>

typedef float (*op_func_t) (float, float);

void set_fn_pointer_add(op_func_t h_add_func);

void set_fn_pointer_mul(op_func_t h_mul_func);

__global__ void kernel( op_func_t op )

{

   printf("Result: %f\n", ( *op )( 1.0, 2.0 ) );

}

int main()

{

op_func_t h_add_func;

op_func_t h_mul_func;

set_fn_pointer_add(h_add_func);

set_fn_pointer_mul(h_mul_func);

// Copy device function pointer to host side

op_func_t d_myfunc = h_mul_func;

kernel<<<1,1>>>( d_myfunc );

cudaThreadSynchronize();

return EXIT_SUCCESS;

}

I am not able to get this code to work (getting segmentation fault). However, would this concept of separating device functions into individual files work?? If it is not possible with Runtime API, can we use Driver API to do such a thing??

Many thanks in advance for the replies.

I run the code On GTX460 and cuda verison is 3.2 ,but it does not what we want and the result is 0

$nvcc -arch=sm_21 -o 1 test.cu

$./1

$0

I do not know where it goes wrong! Please help me — thanks.

Hello!

I am looking for a way to pass a pointer to a __device__ function from the main() function.

Post #18 looked relevant, but I cannot get it to compile.

Where can I find documentation for the hacked version of Norbert’s example?

Thanks for attention!