How to bind Host Functions to HostNodes?

https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaHostNodeParams.html#structcudaHostNodeParams
It shows the function member of cudaHostNodeParams as type cudaHostFn_t. I can’t make any sense of the documentation’s definition of cudaHostFn_t when I click through to it.
Are there any examples of binding host functions to nodes? Or explanations of what cudaHostFn_t is?

This is in the general category of CUDA Graphs programming. One of the types of nodes that can be created there is referred to as:

A graph node can be one of:

  • CPU function call

That is a host node. And when you define a host node in a graph, you must pass a pointer to the host function (via the pNodeParams struct/argument).

here is an example question about using cudaGraphAddHostNode

In the struct you asked about specifically, the fn parameter is the pointer to the host function and the userdata parameter is the pointer to the host function arguments.

Yes I’m trying to create some host function nodes in a graph.

I get the userData part, but I’m not sure exactly what I am supposed to pass to the fn parameter.

The example shown assigns a global function, whereas I want to bind a member function. I tried to make a lambda to get around this. Looks like this

cudaHostNodeParams hostParams={0};
auto fx = [this, arg]()
{
       // member fx
	memberFx(arg);
};
hostParams.fn = fx;

This yields a compiler error:

Error (active)	E0413	no suitable conversion function from "lambda []void ()->void" to "cudaHostFn_t" exists	

I expect it will need an actual function pointer, as the example shows.

If you want to provide a short complete example that I can compile and see the error, I will take another look.

kernel.cu (2.8 KB)

This is my attempt at an MRE. I can’t get the syntax in that Stack Overflow post to work either. I’ve included 3 different methods and each gets a similar error.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <exception>
#include <iostream>
#include <vector>
#include <numeric>

#include <stdio.h>

// Checks a CUDA runtime API return code.
// err   : status returned by a CUDA runtime call
// label : human-readable description of the call site, used in the message
// Logs to stdout and throws std::runtime_error when err != cudaSuccess.
void cudaErrCheck(cudaError_t err, const std::string label)
{
	if (err != cudaSuccess)
	{
		auto errorLabel = "CUDA API Error : " + label + " : " + cudaGetErrorName(err) + " : " + cudaGetErrorString(err);
		std::cout << "Fatal Error: " << errorLabel << " \n";
		throw std::runtime_error(errorLabel);
	}
}

// Writes data[i] = i for i = threadIdx.x.
// Assumes a one-block launch whose blockDim.x equals the element count
// (buildGraph uses gridDim {1,1,1}, blockDim {N,1,1}); no bounds check.
__global__ void iota(int * data)
{
    data[threadIdx.x] = threadIdx.x;
}

// global function that does the same thing as the member function
void gatherGlb(std::vector<int> * src, int * target)
{
	*target = std::accumulate(src->begin(), src->end(), 0);
}

// MRE from the thread: three attempts at filling cudaHostNodeParams::fn.
// NOTE(review): this listing is intentionally non-compiling — it reproduces
// the E0413 conversion error quoted above. The corrected version appears
// later in the thread.
class crab
{
	public:

		// Creates the stream, graph and device buffer, records the node
		// topology, and instantiates the executable graph.
		crab() : sum(0)
		{
			auto err = cudaStreamCreate(&stream);
			cudaErrCheck(err, "stream create fail");

			err = cudaGraphCreate(&graph, 0);
			cudaErrCheck(err, "graph create fail");

			err = cudaMalloc((void**)&deviceMem, N * sizeof(int));
			cudaErrCheck(err, "cudaMalloc failed");

			hostMem.resize(N);

			buildGraph();

			err = cudaGraphInstantiate(&execGraph, graph, 0);
			cudaErrCheck(err, "graph instatiate fail");
		}

		// Launches the instantiated graph and prints the sum.
		// NOTE(review): no stream synchronization before reading `sum`, so
		// the host node may not have executed yet when it is printed.
		void run()
		{
			cudaGraphLaunch(execGraph, stream);

			std::cout << sum << "\n";
		}

	private:

		void buildGraph()
		{
			cudaGraphNode_t kerNode, d2hNode, hostNode;

			// kernel
			// NOTE(review): P.kernelParams is never set, so the kernel node
			// has no way to receive the deviceMem argument.
			{
				cudaKernelNodeParams P = { 0 };
				P.blockDim = dim3{N,1,1};
				P.gridDim = dim3{1,1,1};
				P.func = (void*) &iota;
				P.sharedMemBytes = 0;

				auto err = cudaGraphAddKernelNode(&kerNode, graph, NULL, 0, &P);
				cudaErrCheck(err, "kernel node graph add fail");
			}

			// D2H
			{
				std::vector<cudaGraphNode_t> dep = {kerNode};
				auto err = cudaGraphAddMemcpyNode1D(&d2hNode, graph, dep.data(), dep.size(), hostMem.data(), deviceMem, N * sizeof(int), cudaMemcpyDeviceToHost);
				cudaErrCheck(err, "1d D2H graph add fail");
			}

			// HOST
			{
				std::vector<cudaGraphNode_t> dep = { kerNode, d2hNode };

				cudaHostNodeParams P = {0};

				// Attempt 1: capturing lambda.
				// NOTE(review): a lambda with captures has no conversion to
				// a plain function pointer, so it cannot convert to
				// cudaHostFn_t — this is the E0413 error shown earlier.
				{
					auto fx = [this]()
					{
						gather();
					};

					P.fn = fx;
				}


				// Attempt 2: global function.
				// NOTE(review): gatherGlb's signature above does not match
				// cudaHostFn_t (void(*)(void*)). Also, `args` is a local
				// array, so userData would dangle once this scope exits.
				{
					P.fn = gatherGlb;
					void* args[2] =
					{
						(void*)&hostMem,
						(void*)&sum
					};
					P.userData = args;
				}

				// Attempt 3: pointer to member function.
				// NOTE(review): a pointer-to-member is not convertible to an
				// ordinary function pointer, so it cannot be cudaHostFn_t
				// either.
				{
					P.fn = &crab::gather;
					P.userData = {0};
				}


				auto err = cudaGraphAddHostNode(&hostNode, graph, dep.data(), dep.size(), &P);
				cudaErrCheck(err, "host node graph add fail");
			}

			auto err = cudaGraphDebugDotPrint(graph, "graphFile", 0);

		}

		// Member function the author wants the host node to call.
		void gather()
		{
			sum = std::accumulate(hostMem.begin(), hostMem.end(), 0);
		}

		std::vector<int> hostMem;   // host-side copy of the kernel output
		int* deviceMem;             // N ints allocated with cudaMalloc

		int sum;                    // result written by gather()

		cudaStream_t stream;
		cudaGraph_t graph;
		cudaGraphExec_t execGraph;

		const size_t N = 128;
};

// Builds and runs the pipeline once.
// NOTE(review): cudaErrCheck throws std::runtime_error, and nothing here
// catches it — a CUDA failure terminates the process via an uncaught
// exception.
int main()
{
	crab test;

	test.run();

    return 0;
}

In the future, please post code inline, rather than an attachment. It makes it searchable and promotes discussion. I’ve edited your post in this case.

This is like using a CUDA callback.

The function literally has to have this kind of prototype:

void func(void *ptr);

(obviously it can be something other than func and the name of the pointer can be something other than ptr)

The arguments you pass need to be typically arranged in a struct, and you pass a pointer to that struct. Then in the function body you cast that void pointer to a pointer of that struct type, and retrieve your arguments.

The example I linked shows this.

I don’t think I’ve used CUDA callbacks before. A note in the cudaHostNodeParams documentation about having to match that signature probably would have helped.


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <exception>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>

#include <stdio.h>

// Checks a CUDA runtime API return code.
// err   : status returned by a CUDA runtime call
// label : human-readable description of the call site, used in the message
// Logs to stdout and throws std::runtime_error when err != cudaSuccess.
// `label` is taken by const reference to avoid a copy per call.
void cudaErrCheck(cudaError_t err, const std::string& label)
{
	if (err != cudaSuccess)
	{
		auto errorLabel = "CUDA API Error : " + label + " : " + cudaGetErrorName(err) + " : " + cudaGetErrorString(err);
		std::cout << "Fatal Error: " << errorLabel << " \n";
		throw std::runtime_error(errorLabel);
	}
}

// Argument packet handed to the host-node callback through
// cudaHostNodeParams::userData (a single void*). The pointed-to instance
// must outlive every launch of the graph, since the graph retains only the
// pointer (crab keeps one as the member `glAr` for this reason).
struct accumulationArgs
{
	std::vector<int>* vecPtr;   // input: values to sum (filled by the D2H copy node)
	int* sumPtr;                // output: where the computed sum is written
};

// Writes data[i] = i for i = threadIdx.x.
// Assumes a one-block launch whose blockDim.x equals the element count
// (buildGraph uses gridDim {1,1,1}, blockDim {N,1,1}); no bounds check.
__global__ void iota(int * data)
{
    data[threadIdx.x] = threadIdx.x;
}

// global function that does the same thing as the member function
void gatherGlb(void * arg)
{
	accumulationArgs * argPtr = static_cast<accumulationArgs *>(arg);

	*argPtr->sumPtr = std::accumulate(argPtr->vecPtr->begin(), argPtr->vecPtr->end(), 0);

}

// Owns a small CUDA graph pipeline: a kernel node writes 0..N-1 into device
// memory, a D2H memcpy node copies it back, and a host node sums the values
// on the CPU. Resources acquired in the constructor are released in the
// destructor.
class crab
{
	public:

		// Creates the stream, graph and device buffer, records the node
		// topology, and instantiates the executable graph.
		// Throws std::runtime_error (via cudaErrCheck) on any CUDA failure.
		crab() : sum(0)
		{
			auto err = cudaStreamCreate(&stream);
			cudaErrCheck(err, "stream create fail");

			err = cudaGraphCreate(&graph, 0);
			cudaErrCheck(err, "graph create fail");

			err = cudaMalloc((void**)&deviceMem, N * sizeof(int));
			cudaErrCheck(err, "cudaMalloc failed");

			hostMem.resize(N);

			buildGraph();

			err = cudaGraphInstantiate(&execGraph, graph, 0);
			cudaErrCheck(err, "graph instantiate fail");
		}

		// Releases CUDA resources in reverse order of acquisition.
		// Destructors must not throw, so return codes are deliberately
		// ignored here.
		~crab()
		{
			cudaGraphExecDestroy(execGraph);
			cudaGraphDestroy(graph);
			cudaFree(deviceMem);
			cudaStreamDestroy(stream);
		}

		// Launches the instantiated graph, waits for completion, and prints
		// the sum computed by the host node.
		void run()
		{
			auto err = cudaGraphLaunch(execGraph, stream);
			cudaErrCheck(err, "graph launch fail");

			// The host node runs asynchronously on the stream; synchronize
			// before reading `sum` from this thread.
			err = cudaStreamSynchronize(stream);
			cudaErrCheck(err, "stream sync fail");

			std::cout << sum << "\n";
		}

	private:

		// Records the three-node topology:
		// iota kernel -> D2H memcpy -> host accumulation callback.
		void buildGraph()
		{
			cudaGraphNode_t kerNode, d2hNode, hostNode;

			// kernel node, equivalent to iota<<<1, N>>>(deviceMem)
			{
				cudaKernelNodeParams P = { 0 };
				P.blockDim = dim3{(unsigned int)N,1,1};
				P.gridDim = dim3{1,1,1};
				P.func = (void*) iota;
				P.sharedMemBytes = 0;

				// kernelParams is an array of pointers, one per kernel argument
				void* input[1];
				input[0]= (void*)&deviceMem;

				P.kernelParams = input;
				P.extra = NULL;

				auto err = cudaGraphAddKernelNode(&kerNode, graph, NULL, 0, &P);
				cudaErrCheck(err, "kernel node graph add fail");
			}

			// D2H copy node, dependent on the kernel node
			{
				std::vector<cudaGraphNode_t> dep = {kerNode};
				auto err = cudaGraphAddMemcpyNode1D(&d2hNode, graph, dep.data(), dep.size(), hostMem.data(), deviceMem, N * sizeof(int), cudaMemcpyDeviceToHost);
				cudaErrCheck(err, "1d D2H graph add fail");
			}

			// host node, dependent on the copy
			{
				std::vector<cudaGraphNode_t> dep = { d2hNode };

				cudaHostNodeParams P = {0};

				// fn must match cudaHostFn_t: void(*)(void*). The arguments
				// travel through userData; glAr is a class member so the
				// pointer stays valid for every launch of the graph.
				glAr.sumPtr = &sum;
				glAr.vecPtr = &hostMem;

				P.fn = gatherGlb;
				P.userData = (void*) &glAr;

				auto err = cudaGraphAddHostNode(&hostNode, graph, dep.data(), dep.size(), &P);
				cudaErrCheck(err, "host node graph add fail");
			}

			auto err = cudaGraphDebugDotPrint(graph, "graphFile", 0);
			cudaErrCheck(err, "graph printout fail");

		}

		// Member-function variant of the callback. A pointer-to-member
		// cannot be used as cudaHostFn_t directly, so this is unused here;
		// a captureless lambda trampoline (see the end of the thread) is
		// needed to route the callback into a member function.
		void gather(void * arg)
		{
			sum = std::accumulate(hostMem.begin(), hostMem.end(), 0);
		}

		std::vector<int> hostMem;   // host-side copy of the kernel output
		int* deviceMem;             // N ints allocated with cudaMalloc

		// persistent storage area for the arguments to the host function;
		// must outlive every graph launch (the graph keeps a pointer to it)
		accumulationArgs glAr;

		int sum;                    // result written by the host node

		cudaStream_t stream;
		cudaGraph_t graph;
		cudaGraphExec_t execGraph;

		const size_t N = 128;
};

// Builds the graph-driven pipeline and runs it once.
// cudaErrCheck throws std::runtime_error on CUDA failures, so catch it here
// instead of letting an uncaught exception terminate the process.
int main()
{
	try
	{
		crab test;

		test.run();
	}
	catch (const std::exception& e)
	{
		std::cout << "Aborting: " << e.what() << "\n";
		return 1;
	}

    return 0;
}

This updated MRE is now working with a global function, but similar errors still come from the member-function attempts. I could probably store member functions as lambdas and have a global function that just calls the lambda passed to it by reference, but hopefully there is an easier way.

You could pass your class object to a callback and call the member within.

{
    auto callback = [](void* args){
        crab* obj = (crab*)args;
        obj->gather(nullptr);
    };
	P.fn = callback;
	P.userData = (void*)this;
}
1 Like

This works. Thanks.

A little annoying to not be able to bind a member function like when launching a host thread, but it works.