How can I use a __device__ function pointer in CUDA?

Can we do it with the CUDA runtime API?

I don’t see why not, although I haven’t tried it with the runtime API.

What would be the steps to do it using the runtime API?

I am trying to use your example for my application, but the difference is that my kernel is not included in the cubin. I am trying to call loaded device functions from a kernel that is compiled in my main code.

I am getting an unknown CUDA error when calling the function, most likely because the function addresses are not correct from the address space of my kernel. Is there some way I can do this?

Do you mean you are trying to use the driver API code I posted with runtime API code? That pretty clearly won’t work. Use the runtime API symbol manipulation functions instead, that is what they are there for.
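To be concrete, here is a minimal sketch of what those symbol manipulation functions look like, using a plain __device__ variable (the names here are illustrative only; the function-pointer case is discussed further down):

// d_value is an ordinary __device__ variable; host code addresses it
// through the symbol functions rather than through a raw pointer.
__device__ int d_value;

void demo()
{
    int h_value = 42;
    // Host -> device: write the __device__ variable through its symbol
    cudaMemcpyToSymbol( d_value, &h_value, sizeof( int ) );
    // Device -> host: read it back the same way
    cudaMemcpyFromSymbol( &h_value, d_value, sizeof( int ) );
}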

Getting this to work with the runtime API is still somewhat foggy to me. I have been staring at the FunctionPointers example from the CUDA SDK, but I am still encountering lots of problems. I am trying to get a variant of the example that appeared earlier in this thread working:

typedef float (*op_func_t) (float, float);

typedef struct OpStruct
{
    op_func_t op;
} OpStruct;

__device__ float add_func (float x, float y)
{
    return x + y;
}

__device__ float mul_func (float x, float y)
{
    return x * y;
}

#define N 10

__global__ void kernel( OpStruct* ops )
{
    float x[N];
    float y[N];
    float res[N];

    for (int i = 0; i < N; i++) {
        x[i] = (float)(10 + i);
    }

    for (int i = 0; i < N; i++) {
        y[i] = (float)(100 + i);
    }

    for (int i = 0; i < N; i++) {
        res[i] = ( *ops->op )( x[i], y[i] );
    }
}

int main()
{
    OpStruct* h_op = ( OpStruct* ) malloc( sizeof( OpStruct ) );

    // Copy device function pointer to host side
    cudaMemcpyFromSymbol( &h_op->op, mul_func, sizeof( op_func_t ) );

    OpStruct* d_op;
    cudaMalloc( ( void** ) &d_op, sizeof( OpStruct ) );

    // Copy device pointer from host side to device
    cudaMemcpyToSymbol( &d_op->op, &h_op->op, sizeof( op_func_t ) );

    kernel<<<1,1>>>( d_op );
    cudaThreadSynchronize();

    cudaFree( d_op );
    free( h_op );

    return EXIT_SUCCESS;
}

The above is clearly not working, and any pointers about how to fix it would save me some gray hair. What’s the right way to get a device function pointer and assign it correctly?

When you say “not working”, I presume you mean “not compiling”, because clearly that won’t.

No, I mean conceptually (using symbol manipulation functions).

Going back to the posts on the first page - device functions don’t get the necessary ELF index entries that would allow cudaMemcpyFromSymbol (or any of the other runtime/driver cubin access functions) to get a device function pointer directly. If, however, you declare a device variable which is a static function pointer to a given device function, then you can read that symbol.

So something like this

__device__ float mul_func (float x, float y)
{
    return x * y;
}

__device__ op_func_t h_mul_func = mul_func;

will give a symbol which can be read at runtime.
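Reading that symbol back on the host would then look roughly like this (a sketch reusing the op_func_t typedef from earlier in the thread):

op_func_t h_func;
// Fetch the device code address stored in the h_mul_func symbol
cudaMemcpyFromSymbol( &h_func, h_mul_func, sizeof( op_func_t ) );
// h_func now holds a device-side address: it can be passed to a kernel
// as an argument, but it must never be dereferenced or called on the host.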

Thanks for your pointer. Creating a static function pointer variable was the key. A simplified (and working) example is:

#include <stdio.h>
#include <stdlib.h>

typedef float (*op_func_t) (float, float);

__device__ float add_func (float x, float y)
{
    return x + y;
}

__device__ float mul_func (float x, float y)
{
    return x * y;
}

// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;

__global__ void kernel( op_func_t op )
{
    printf("Result: %f\n", ( *op )( 1.0, 2.0 ) );
}

int main()
{
    op_func_t h_add_func;
    op_func_t h_mul_func;

    // Copy device function pointers to host side
    cudaMemcpyFromSymbol( &h_mul_func, p_mul_func, sizeof( op_func_t ) );
    cudaMemcpyFromSymbol( &h_add_func, p_add_func, sizeof( op_func_t ) );

    op_func_t d_myfunc = h_mul_func;

    kernel<<<1,1>>>( d_myfunc );
    cudaThreadSynchronize();

    return EXIT_SUCCESS;
}
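One caveat worth repeating: both device-side function pointers and in-kernel printf require a GPU of compute capability 2.0 (Fermi) or newer, so the example has to be compiled for such a target, e.g.:

nvcc -arch=sm_20 example.cu

(example.cu is just a placeholder file name here.)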

Referring to the example above, would it be possible to have the two device functions in two different files? Further, can we compile those files individually and then link them with the .cu file containing the kernel? I am thinking of something along the lines of:

File → fn1.cu

#include <cuda_runtime.h>
#include <stdio.h>

typedef float (*op_func_t) (float, float);

__device__ float add_func (float x, float y)
{
    return x + y;
}

// Static pointer to device function
__device__ op_func_t p_add_func = add_func;

void set_fn_pointer_add(op_func_t h_add_func) {
    printf("Pointer of Function: %p", h_add_func);
    cudaMemcpyFromSymbol( &h_add_func, "p_add_func", sizeof( op_func_t ) );
    printf("Pointer of Function: %p", h_add_func);
}

File → fn2.cu

#include <cuda_runtime.h>
#include <stdio.h>

typedef float (*op_func_t) (float, float);

__device__ float mul_func (float x, float y)
{
    return x * y;
}

// Static pointer to device function
__device__ op_func_t p_mul_func = mul_func;

void set_fn_pointer_mul(op_func_t h_mul_func) {
    printf("Pointer of Function: %p", h_mul_func);
    cudaMemcpyFromSymbol( &h_mul_func, "p_mul_func", sizeof( op_func_t ) );
    printf("Pointer of Function: %p", h_mul_func);
}

And the main file would look like:

File → main.cu

#include <stdio.h>
#include <cuda_runtime.h>

typedef float (*op_func_t) (float, float);

void set_fn_pointer_add(op_func_t h_add_func);
void set_fn_pointer_mul(op_func_t h_mul_func);

__global__ void kernel( op_func_t op )
{
    printf("Result: %f\n", ( *op )( 1.0, 2.0 ) );
}

int main()
{
    op_func_t h_add_func;
    op_func_t h_mul_func;

    set_fn_pointer_add(h_add_func);
    set_fn_pointer_mul(h_mul_func);

    // Copy device function pointer to host side
    op_func_t d_myfunc = h_mul_func;

    kernel<<<1,1>>>( d_myfunc );
    cudaThreadSynchronize();

    return EXIT_SUCCESS;
}

I am not able to get this code to work (I get a segmentation fault). However, would this concept of separating device functions into individual files work? If it is not possible with the runtime API, can we use the driver API to do such a thing?

Many thanks in advance for the replies.
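A likely cause of the segmentation fault, for what it’s worth: set_fn_pointer_add receives h_add_func by value, so the pointer fetched with cudaMemcpyFromSymbol only overwrites the local copy and never reaches main(), which then launches the kernel with an uninitialized pointer. A sketch of an out-parameter variant (hypothetical signature) that lets the value propagate:

// fn1.cu -- take the destination by pointer instead of by value
void set_fn_pointer_add(op_func_t *h_add_func) {
    cudaMemcpyFromSymbol( h_add_func, "p_add_func", sizeof( op_func_t ) );
}

// main.cu
op_func_t h_add_func;
set_fn_pointer_add( &h_add_func );   // h_add_func is now valid on return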

I ran the code on a GTX 460, and the CUDA version is 3.2, but it does not do what we want and the result is 0:

$nvcc -arch=sm_21 -o 1 test.cu

$./1

$0

I do not know what is wrong. Please help me, thanks.
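For what it’s worth, failures like this are easier to diagnose with explicit error checking around the launch; a minimal sketch:

kernel<<<1,1>>>( d_myfunc );
cudaError_t err = cudaGetLastError();   // catches launch-time errors
if (err != cudaSuccess)
    printf( "Launch failed: %s\n", cudaGetErrorString( err ) );
err = cudaThreadSynchronize();          // catches errors raised during kernel execution
if (err != cudaSuccess)
    printf( "Kernel failed: %s\n", cudaGetErrorString( err ) );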

Hello!

I am looking for a way to pass a pointer to a __device__ function from the main() function.

Post #18 interested me, but I cannot compile it.

Where can I find a manual for the hacked version of Norbert’s example?

Thanks for your attention!

Passing a pointer to a device function into the kernel node of a computation graph:

#include <cuda_runtime.h>
#include <driver_types.h>   // Data types used by the CUDA Runtime (cudart). driver_types.h must come first,
#include <helper_cuda.h>    // and only then helper_cuda.h, otherwise checkCudaErrors from helper_cuda.h is not found

#include <iostream>
#include <stdexcept>
#include <string>

using funcFloat_t = float (*) (float, float);    // funcFloat_t is now the name of the type "pointer to a function taking two float arguments and returning float"

__device__ float add_func_g (float x, float y) // adds its two arguments
{
    return x + y;
}

__device__ float mul_func_g (float x, float y) // multiplies its two arguments
{
    return x * y;
}

// Passing __device__ functions to a __global__ kernel requires
// static pointers to those functions.
__device__ funcFloat_t func[2] = { add_func_g, mul_func_g };

__global__ void kernel(funcFloat_t *op, float * in, float * out)
{
    *out = (**op)(in[0], in[1]);
}

funcFloat_t & getFuncDevPtr(int opType){
    if (opType < 0 || opType >= 2){
        throw std::range_error("Operation type: 0 - addition, 1 - multiplication");
    }
    return func[opType];
}

void getFuncHostPtr(int opType, funcFloat_t *func_ptr){
    if (opType < 0 || opType >= 2){
        throw std::range_error("Operation type: 0 - addition, 1 - multiplication");
    }
    checkCudaErrors(cudaMemcpyFromSymbol(func_ptr, func, sizeof(funcFloat_t), opType*sizeof(funcFloat_t), cudaMemcpyDeviceToDevice));
}

void createGraph(cudaGraphExec_t &graphExec, cudaGraph_t &graph, funcFloat_t *func_ptr, int opType, float *in, float *out, float *outHost){
    // A separate stream, distinct from the default stream (0), is used later to execute the graph.
    checkCudaErrors(cudaGraphCreate(&graph, 0)); // Create the graph

    cudaGraphNode_t  kernel_node;
    cudaGraphNode_t  cpOutputDev2Hst_node;

    //-----------------------------------------------
    cudaKernelNodeParams kernelNodeParams;
    memset(&kernelNodeParams, 0, sizeof(kernelNodeParams)); // Zero the struct before filling in this node's parameters
    //getFuncHostPtr(opType, h_func_ptr);
    void *kernelFuncArgs[3] = {(void*)&func_ptr, (void*)&in, (void*)&out}; // Array of pointers to the kernel arguments
    kernelNodeParams.func = (void*)kernel;

    kernelNodeParams.gridDim = dim3();
    kernelNodeParams.blockDim = dim3();

    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = (void **)kernelFuncArgs;
    kernelNodeParams.extra = NULL;

    checkCudaErrors(cudaGraphAddKernelNode( &kernel_node, graph, nullptr, 0, &kernelNodeParams));

    //-----------------------------------------------

    cudaMemcpy3DParms memcpyParams;
    memset(&memcpyParams, 0, sizeof(memcpyParams));
    memcpyParams.srcArray = NULL;
    memcpyParams.srcPos = make_cudaPos(0, 0, 0);
    memcpyParams.srcPtr = make_cudaPitchedPtr(out, sizeof(float)*1u, 1u, 1u);
    memcpyParams.dstArray = NULL;
    memcpyParams.dstPos = make_cudaPos(0, 0, 0);
    memcpyParams.dstPtr = make_cudaPitchedPtr(outHost, sizeof(float)*1u, 1u, 1u);
    memcpyParams.extent = make_cudaExtent(sizeof(float)*1u, 1, 1); // extent - size of the data being copied
    memcpyParams.kind = cudaMemcpyDeviceToHost;

    checkCudaErrors(cudaGraphAddMemcpyNode(&cpOutputDev2Hst_node, graph, &kernel_node, 1, &memcpyParams)); // Add the output copy node to the graph

    //-----------------------------------------------

    checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); // Build an executable computation graph from the assembled parameters
}

void destroyGraph(cudaGraphExec_t &graphExec, cudaGraph_t &graph, cudaStream_t &stream){
    checkCudaErrors(cudaGraphExecDestroy(graphExec));
    graphExec = nullptr;
    checkCudaErrors(cudaGraphDestroy(graph));
    graph = nullptr;
    checkCudaErrors(cudaStreamDestroy(stream));
    stream = nullptr;
}

void test(float x, float y, int op = 0)
{
    size_t argCount = 2u;
    float * _in, * __in;
    float * _out, * __out;
    cudaMallocHost(&_in, sizeof(float) * argCount);
    cudaMalloc(&__in, sizeof(float) * argCount);
    cudaMallocHost(&_out, sizeof(float));
    cudaMalloc(&__out, sizeof(float));
    funcFloat_t *__func_ptr(nullptr);
    cudaMalloc(&__func_ptr, sizeof(funcFloat_t));
    getFuncHostPtr(op, __func_ptr);

    cudaGraphExec_t graphExec;
    cudaGraph_t graph;
    cudaStream_t stream;

    checkCudaErrors(cudaStreamCreate(&stream));
    createGraph(graphExec, graph, __func_ptr, op, __in, __out, _out);

    _in[0] = x;
    _in[1] = y;

    cudaMemcpy(__in, _in, sizeof(float)*argCount, cudaMemcpyHostToDevice);

    checkCudaErrors(cudaGraphLaunch(graphExec, stream)); // Launch the graph in the separate stream
    checkCudaErrors(cudaStreamSynchronize(stream)); // Wait for the stream to finish

    std::cout << "Function result is: " << *_out << std::endl;
    destroyGraph(graphExec, graph, stream);
    checkCudaErrors(cudaFree(__func_ptr));
    checkCudaErrors(cudaFree(__in));
    checkCudaErrors(cudaFree(__out));
    checkCudaErrors(cudaFreeHost(_in));
    checkCudaErrors(cudaFreeHost(_out));
}

extern "C"
int cuda_typed_graph()
{
    std::cout << "[TYPED FUNCTION POINTERS CUDA GRAPH]" << std::endl << std::endl;
    std::cout << "Test float mul ..." << std::endl;
    test(2.05, 10.00, 1);
    std::cout << std::endl;
    return 0;
}