In this example, I create a graph that simulates work done in 10 chunks, where the kernels are executed serially, and the HtoD memcpys are performed to a buffer with space for two chunks. The graph looks like this:
Here’s the code:
#include <cstdio>
#include <cstdlib>
#include <vector>
// Wraps a CUDA runtime call and routes its cudaError_t through gpuAssert,
// tagging the failure with the call site's file and line.
// do { } while (0) makes the macro a single statement, so it composes safely
// with unbraced if/else (the original bare { } block did not).
#define gpuErrchk(ans) \
  do { \
    gpuAssert((ans), __FILE__, __LINE__); \
  } while (0)
// Reports a failed CUDA runtime call to stderr and, when `abort` is true
// (the default), terminates the process. Aborting matters: a sticky CUDA
// error otherwise makes every subsequent call fail confusingly far from the
// original fault. The original accepted `abort` but never acted on it.
__host__ inline void gpuAssert(cudaError_t code, const char* file, int line,
                               bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}
// Busy-wait kernel standing in for real per-chunk work; it spins for ~10M
// device clock ticks. `indices`/`num_indices` mirror a realistic signature
// but are intentionally unused in this example.
__global__ void kernel(int* indices, size_t num_indices) {
  (void)indices;
  (void)num_indices;
  const long long sleep_time = 10'000'000;
  // clock64() instead of clock(): the 32-bit clock() counter wraps every few
  // seconds at GPU clock rates, which could end (or extend) the spin
  // incorrectly if a wrap happens mid-wait.
  long long start = clock64();
  while (clock64() - start < sleep_time) {
  }
}
// Builds and launches a CUDA graph that processes `num_chunks` chunks of
// work serially on the GPU, while staging the HtoD input copies through a
// `num_buffers`-deep (double-buffered) device scratch area so copies can
// overlap with the kernels of earlier chunks.
int main() {
  cudaDeviceProp prop;
  gpuErrchk(cudaGetDeviceProperties(&prop, 0));
  // Size the workload relative to device memory so the example scales.
  size_t num_indices = prop.totalGlobalMem / (30 * sizeof(int));
  size_t num_chunks = 10;
  // NOTE(review): any remainder (num_indices % num_chunks) is silently
  // dropped; acceptable for a demo.
  size_t chunk_sizes = num_indices / num_chunks;
  size_t num_buffers = 2;

  cudaGraph_t graph;
  gpuErrchk(cudaGraphCreate(&graph, 0));

  // The device staging area holds only `num_buffers` chunks, not the full
  // input; d_results holds all chunks' outputs.
  int* d_indices;
  gpuErrchk(cudaMalloc(&d_indices, num_buffers * chunk_sizes * sizeof(int)));
  int* d_results;
  gpuErrchk(cudaMalloc(&d_results, num_indices * sizeof(int)));

  std::vector<int> indices(num_indices);
  std::vector<int> results(num_indices);
  // If the host results buffer is not registered/mapped, the memcpys do not
  // run parallel to the kernels (the behavior this example demonstrates).
  // gpuErrchk(cudaHostRegister(indices.data(), num_indices * sizeof(int),
  //                            cudaHostRegisterMapped));
  gpuErrchk(cudaHostRegister(results.data(), num_indices * sizeof(int),
                             cudaHostRegisterMapped));

  std::vector<cudaGraphNode_t> copy_indices_nodes(num_chunks);
  std::vector<cudaGraphNode_t> kernel_nodes(num_chunks);
  std::vector<cudaGraphNode_t> copy_results_nodes(num_chunks);
  for (size_t i = 0; i < num_chunks; i++) {
    // Chunks rotate through the staging buffers.
    int* current_chunk = d_indices + chunk_sizes * (i % num_buffers);
    gpuErrchk(cudaGraphAddMemcpyNode1D(
        &copy_indices_nodes[i], graph, nullptr, 0, current_chunk,
        indices.data() + i * chunk_sizes, chunk_sizes * sizeof(int),
        cudaMemcpyHostToDevice));
    if (i >= num_buffers) {
      // A staging buffer may be overwritten only after the kernel that read
      // it `num_buffers` iterations ago has finished.
      gpuErrchk(cudaGraphAddDependencies(graph, &kernel_nodes[i - num_buffers],
                                         &copy_indices_nodes[i], 1));
    }
    // Zero-initialize so any fields added by future CUDA versions are
    // well-defined instead of indeterminate.
    cudaKernelNodeParams kernel_params = {};
    kernel_params.func = (void*)kernel;
    kernel_params.gridDim = dim3(10, 1, 1);
    kernel_params.blockDim = dim3(256, 1, 1);
    kernel_params.sharedMemBytes = 0;
    // Kernel parameter values are copied when the node is added, so pointing
    // at loop-local variables here is safe.
    void* args[2] = {&current_chunk, &chunk_sizes};
    kernel_params.kernelParams = args;
    kernel_params.extra = nullptr;
    gpuErrchk(cudaGraphAddKernelNode(&kernel_nodes[i], graph,
                                     &copy_indices_nodes[i], 1,
                                     &kernel_params));
    if (i > 0) {
      // Force sequential execution of the kernels.
      gpuErrchk(cudaGraphAddDependencies(graph, &kernel_nodes[i - 1],
                                         &kernel_nodes[i], 1));
    }
    gpuErrchk(cudaGraphAddMemcpyNode1D(
        &copy_results_nodes[i], graph, &kernel_nodes[i], 1,
        results.data() + i * chunk_sizes, d_results + i * chunk_sizes,
        chunk_sizes * sizeof(int), cudaMemcpyDeviceToHost));
  }

  cudaGraphExec_t graphExec;
  gpuErrchk(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  gpuErrchk(cudaGraphLaunch(graphExec, 0));
  gpuErrchk(cudaDeviceSynchronize());

  // Release everything acquired above (the original leaked the graph, the
  // instantiated exec, and the host registration).
  gpuErrchk(cudaGraphExecDestroy(graphExec));
  gpuErrchk(cudaGraphDestroy(graph));
  gpuErrchk(cudaHostUnregister(results.data()));
  gpuErrchk(cudaFree(d_indices));
  gpuErrchk(cudaFree(d_results));
  return 0;
}
The code is compiled with: nvcc -o example -arch=sm_75 test.cu
with CUDA 12.1, and run on an RTX 2080 Ti. When the host memory of the destination of the DtoH memcpy (results
in the code) is not mapped, everything executes serially. When it is mapped, the graph executes as expected. Mapping the source of the HtoD memcpy does not affect anything. In the images you can see the execution as well as confirmation of which memory is mapped or pageable:
What’s the reason for this? Is this expected behaviour? Here is also a link to the profiling files. Thank you for the help!