How can I use a __device__ function pointer in CUDA?

Can we do it with the CUDA runtime API?

I don’t see why not, although I haven’t tried it with the runtime API.

What would be the steps to do it using the runtime API?

I am trying to use your example for my application, but the difference is that my kernel is not included in the cubin. I am trying to call loaded device functions from a kernel that is compiled in my main code.

I am getting an unknown CUDA error when calling the function, most likely because the function addresses are not correct from the address space of my kernel. Is there some way I can do this?

Do you mean you are trying to use the driver API code I posted with runtime API code? That pretty clearly won’t work. Use the runtime API symbol manipulation functions instead, that is what they are there for.
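To be concrete, here is a minimal sketch of what those symbol manipulation functions look like, using a plain __device__ variable (the names here are illustrative only; the function-pointer case is discussed further down):

// d_value is an ordinary __device__ variable; host code addresses it
// through the symbol functions rather than through a raw pointer.
__device__ int d_value;

void demo()
{
    int h_value = 42;
    // Host -> device: write the __device__ variable through its symbol
    cudaMemcpyToSymbol( d_value, &h_value, sizeof( int ) );
    // Device -> host: read it back the same way
    cudaMemcpyFromSymbol( &h_value, d_value, sizeof( int ) );
}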

Getting this to work with the runtime API is still somewhat foggy to me. I have been staring at the FunctionPointers example from the CUDA SDK, but I am still encountering lots of problems. I am trying to get a variant of the example that appeared earlier in this thread working:

typedef float (*op_func_t) (float, float);

typedef struct OpStruct
{
    op_func_t op;
} OpStruct;

__device__ float add_func (float x, float y)
{
    return x + y;
}

__device__ float mul_func (float x, float y)
{
    return x * y;
}

#define N 10

__global__ void kernel( OpStruct* ops )
{
    float x[N];
    float y[N];
    float res[N];

    for (int i = 0; i < N; i++) {
        x[i] = (float)(10 + i);
    }

    for (int i = 0; i < N; i++) {
        y[i] = (float)(100 + i);
    }

    for (int i = 0; i < N; i++) {
        res[i] = ( *ops->op )( x[i], y[i] );
    }
}

int main()
{
    OpStruct* h_op = ( OpStruct* ) malloc( sizeof( OpStruct ) );

    // Copy device function pointer to host side
    cudaMemcpyFromSymbol( &h_op->op, mul_func, sizeof( op_func_t ) );

    OpStruct* d_op;
    cudaMalloc( ( void** ) &d_op, sizeof( OpStruct ) );

    // Copy device pointer from host side to device
    cudaMemcpyToSymbol( &d_op->op, &h_op->op, sizeof( op_func_t ) );

    kernel<<<1,1>>>( d_op );
    cudaThreadSynchronize();

    cudaFree( d_op );
    free( h_op );

    return EXIT_SUCCESS;
}

The above is clearly not working, and any pointers about how to fix it would save me some gray hair. What’s the right way to get a device function pointer and assign it correctly?

When you say “not working”, I presume you mean “not compiling”, because clearly that won’t.

No, I mean conceptually (using symbol manipulation functions).

Going back to the posts on the first page - device functions don’t get the necessary ELF index entries that would allow cudaMemcpyFromSymbol (or any of the other runtime/driver cubin access functions) to get a device function pointer directly. If, however, you declare a device variable which is a static function pointer to a given device function, then you can read that symbol.

So something like this

__device__ float mul_func (float x, float y)
{
    return x * y;
}

__device__ op_func_t h_mul_func = mul_func;

will give a symbol which can be read at runtime.
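Reading that symbol back on the host would then look roughly like this (a sketch reusing the op_func_t typedef from earlier in the thread):

op_func_t h_func;
// Fetch the device code address stored in the h_mul_func symbol
cudaMemcpyFromSymbol( &h_func, h_mul_func, sizeof( op_func_t ) );
// h_func now holds a device-side address: it can be passed to a kernel
// as an argument, but it must never be dereferenced or called on the host.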

Thanks for your pointer. Creating a static function pointer variable was the key. A simplified (and working) example is:

#include <stdio.h>
#include <stdlib.h>

typedef float (*op_func_t) (float, float);

__device__ float add_func (float x, float y)
{
    return x + y;
}

__device__ float mul_func (float x, float y)
{
    return x * y;
}

// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;

__global__ void kernel( op_func_t op )
{
    printf("Result: %f\n", ( *op )( 1.0, 2.0 ) );
}

int main()
{
    op_func_t h_add_func;
    op_func_t h_mul_func;

    // Copy device function pointers to host side
    cudaMemcpyFromSymbol( &h_mul_func, p_mul_func, sizeof( op_func_t ) );
    cudaMemcpyFromSymbol( &h_add_func, p_add_func, sizeof( op_func_t ) );

    op_func_t d_myfunc = h_mul_func;

    kernel<<<1,1>>>( d_myfunc );
    cudaThreadSynchronize();

    return EXIT_SUCCESS;
}
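One caveat worth repeating: both device-side function pointers and in-kernel printf require a GPU of compute capability 2.0 (Fermi) or newer, so the example has to be compiled for such a target, e.g.:

nvcc -arch=sm_20 example.cu

(example.cu is just a placeholder file name here.)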

Referring to the example above, would it be possible to have the two device functions in two different files? Further, can we compile those files individually and then link them with the .cu file containing the kernel? I am thinking of something along the lines of:

File → fn1.cu

#include <cuda_runtime.h>
#include <stdio.h>

typedef float (*op_func_t) (float, float);

__device__ float add_func (float x, float y)
{
    return x + y;
}

// Static pointer to device function
__device__ op_func_t p_add_func = add_func;

void set_fn_pointer_add(op_func_t h_add_func) {
    printf("Pointer of Function: %p", h_add_func);
    cudaMemcpyFromSymbol( &h_add_func, "p_add_func", sizeof( op_func_t ) );
    printf("Pointer of Function: %p", h_add_func);
}

File → fn2.cu

#include <cuda_runtime.h>
#include <stdio.h>

typedef float (*op_func_t) (float, float);

__device__ float mul_func (float x, float y)
{
    return x * y;
}

// Static pointer to device function
__device__ op_func_t p_mul_func = mul_func;

void set_fn_pointer_mul(op_func_t h_mul_func) {
    printf("Pointer of Function: %p", h_mul_func);
    cudaMemcpyFromSymbol( &h_mul_func, "p_mul_func", sizeof( op_func_t ) );
    printf("Pointer of Function: %p", h_mul_func);
}

And the main file would look like:

File → main.cu

#include <stdio.h>
#include <cuda_runtime.h>

typedef float (*op_func_t) (float, float);

void set_fn_pointer_add(op_func_t h_add_func);
void set_fn_pointer_mul(op_func_t h_mul_func);

__global__ void kernel( op_func_t op )
{
    printf("Result: %f\n", ( *op )( 1.0, 2.0 ) );
}

int main()
{
    op_func_t h_add_func;
    op_func_t h_mul_func;

    set_fn_pointer_add(h_add_func);
    set_fn_pointer_mul(h_mul_func);

    // Copy device function pointer to host side
    op_func_t d_myfunc = h_mul_func;

    kernel<<<1,1>>>( d_myfunc );
    cudaThreadSynchronize();

    return EXIT_SUCCESS;
}

I am not able to get this code to work (I get a segmentation fault). However, would this concept of separating device functions into individual files work? If it is not possible with the runtime API, can we use the driver API to do such a thing?

Many thanks in advance for the replies.
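A likely cause of the segmentation fault, for what it’s worth: set_fn_pointer_add receives h_add_func by value, so the pointer fetched with cudaMemcpyFromSymbol only overwrites the local copy and never reaches main(), which then launches the kernel with an uninitialized pointer. A sketch of an out-parameter variant (hypothetical signature) that lets the value propagate:

// fn1.cu -- take the destination by pointer instead of by value
void set_fn_pointer_add(op_func_t *h_add_func) {
    cudaMemcpyFromSymbol( h_add_func, "p_add_func", sizeof( op_func_t ) );
}

// main.cu
op_func_t h_add_func;
set_fn_pointer_add( &h_add_func );   // h_add_func is now valid on return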

I ran the code on a GTX 460, and the CUDA version is 3.2, but it does not do what we want and the result is 0:

$nvcc -arch=sm_21 -o 1 test.cu

$./1

$0

I do not know what is wrong. Please help me, thanks.
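For what it’s worth, failures like this are easier to diagnose with explicit error checking around the launch; a minimal sketch:

kernel<<<1,1>>>( d_myfunc );
cudaError_t err = cudaGetLastError();   // catches launch-time errors
if (err != cudaSuccess)
    printf( "Launch failed: %s\n", cudaGetErrorString( err ) );
err = cudaThreadSynchronize();          // catches errors raised during kernel execution
if (err != cudaSuccess)
    printf( "Kernel failed: %s\n", cudaGetErrorString( err ) );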

Hello!

I am looking for a way to pass a pointer to a __device__ function from the main() function.

Post #18 interested me, but I cannot compile it.

Where can I find a manual for the hacked version of Norbert’s example?

Thanks for your attention!

Passing a pointer to a device function into the kernel node of a computation graph:

#include <cuda_runtime.h>
#include <driver_types.h>   // Data types used by the CUDA Runtime (cudart). driver_types.h must come first,
#include <helper_cuda.h>    // and only then helper_cuda.h, otherwise checkCudaErrors from helper_cuda.h is not found

#include <iostream>
#include <stdexcept>
#include <string>

using funcFloat_t = float (*) (float, float);    // funcFloat_t is now the name of the type "pointer to a function taking two float arguments and returning float"

__device__ float add_func_g (float x, float y) // adds its two arguments
{
    return x + y;
}

__device__ float mul_func_g (float x, float y) // multiplies its two arguments
{
    return x * y;
}

// Passing __device__ functions to a __global__ kernel requires
// static pointers to those functions.
__device__ funcFloat_t func[2] = { add_func_g, mul_func_g };

__global__ void kernel(funcFloat_t *op, float * in, float * out)
{
    *out = (**op)(in[0], in[1]);
}

funcFloat_t & getFuncDevPtr(int opType){
    if (opType < 0 || opType >= 2){
        throw std::range_error("Operation type: 0 - addition, 1 - multiplication");
    }
    return func[opType];
}

void getFuncHostPtr(int opType, funcFloat_t *func_ptr){
    if (opType < 0 || opType >= 2){
        throw std::range_error("Operation type: 0 - addition, 1 - multiplication");
    }
    checkCudaErrors(cudaMemcpyFromSymbol(func_ptr, func, sizeof(funcFloat_t), opType*sizeof(funcFloat_t), cudaMemcpyDeviceToDevice));
}

void createGraph(cudaGraphExec_t &graphExec, cudaGraph_t &graph, funcFloat_t *func_ptr, int opType, float *in, float *out, float *outHost){
    // A separate stream, distinct from the default stream (0), is used later to execute the graph.
    checkCudaErrors(cudaGraphCreate(&graph, 0)); // Create the graph

    cudaGraphNode_t  kernel_node;
    cudaGraphNode_t  cpOutputDev2Hst_node;

    //-----------------------------------------------
    cudaKernelNodeParams kernelNodeParams;
    memset(&kernelNodeParams, 0, sizeof(kernelNodeParams)); // Zero the struct before filling in this node's parameters
    //getFuncHostPtr(opType, h_func_ptr);
    void *kernelFuncArgs[3] = {(void*)&func_ptr, (void*)&in, (void*)&out}; // Array of pointers to the kernel arguments
    kernelNodeParams.func = (void*)kernel;

    kernelNodeParams.gridDim = dim3();
    kernelNodeParams.blockDim = dim3();

    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = (void **)kernelFuncArgs;
    kernelNodeParams.extra = NULL;

    checkCudaErrors(cudaGraphAddKernelNode( &kernel_node, graph, nullptr, 0, &kernelNodeParams));

    //-----------------------------------------------

    cudaMemcpy3DParms memcpyParams;
    memset(&memcpyParams, 0, sizeof(memcpyParams));
    memcpyParams.srcArray = NULL;
    memcpyParams.srcPos = make_cudaPos(0, 0, 0);
    memcpyParams.srcPtr = make_cudaPitchedPtr(out, sizeof(float)*1u, 1u, 1u);
    memcpyParams.dstArray = NULL;
    memcpyParams.dstPos = make_cudaPos(0, 0, 0);
    memcpyParams.dstPtr = make_cudaPitchedPtr(outHost, sizeof(float)*1u, 1u, 1u);
    memcpyParams.extent = make_cudaExtent(sizeof(float)*1u, 1, 1); // extent - size of the data being copied
    memcpyParams.kind = cudaMemcpyDeviceToHost;

    checkCudaErrors(cudaGraphAddMemcpyNode(&cpOutputDev2Hst_node, graph, &kernel_node, 1, &memcpyParams)); // Add the output copy node to the graph

    //-----------------------------------------------

    checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); // Build an executable computation graph from the assembled parameters
}

void destroyGraph(cudaGraphExec_t &graphExec, cudaGraph_t &graph, cudaStream_t &stream){
    checkCudaErrors(cudaGraphExecDestroy(graphExec));
    graphExec = nullptr;
    checkCudaErrors(cudaGraphDestroy(graph));
    graph = nullptr;
    checkCudaErrors(cudaStreamDestroy(stream));
    stream = nullptr;
}

void test(float x, float y, int op = 0)
{
    size_t argCount = 2u;
    float * _in, * __in;
    float * _out, * __out;
    cudaMallocHost(&_in, sizeof(float) * argCount);
    cudaMalloc(&__in, sizeof(float) * argCount);
    cudaMallocHost(&_out, sizeof(float));
    cudaMalloc(&__out, sizeof(float));
    funcFloat_t *__func_ptr(nullptr);
    cudaMalloc(&__func_ptr, sizeof(funcFloat_t));
    getFuncHostPtr(op, __func_ptr);

    cudaGraphExec_t graphExec;
    cudaGraph_t graph;
    cudaStream_t stream;

    checkCudaErrors(cudaStreamCreate(&stream));
    createGraph(graphExec, graph, __func_ptr, op, __in, __out, _out);

    _in[0] = x;
    _in[1] = y;

    cudaMemcpy(__in, _in, sizeof(float)*argCount, cudaMemcpyHostToDevice);

    checkCudaErrors(cudaGraphLaunch(graphExec, stream)); // Launch the graph in the separate stream
    checkCudaErrors(cudaStreamSynchronize(stream)); // Wait for the stream to finish

    std::cout << "Function result is: " << *_out << std::endl;
    destroyGraph(graphExec, graph, stream);
    checkCudaErrors(cudaFree(__func_ptr));
    checkCudaErrors(cudaFree(__in));
    checkCudaErrors(cudaFree(__out));
    checkCudaErrors(cudaFreeHost(_in));
    checkCudaErrors(cudaFreeHost(_out));
}

extern "C"
int cuda_typed_graph()
{
    std::cout << "[TYPED FUNCTION POINTERS CUDA GRAPH]" << std::endl << std::endl;
    std::cout << "Test float mul ..." << std::endl;
    test(2.05, 10.00, 1);
    std::cout << std::endl;
    return 0;
}