Refactoring CDP1 code to use CDP2

I have an algorithm that uses CDP1, where each block will occasionally and repeatedly (depending on some condition) launch a child kernel with a larger blockSize, then do something with the results. I’m looking to refactor this to work with CDP2, but am a little unclear on the optimal/intended way to achieve the same behavior. Is there a canonical CDP2 way to do this?

Currently, when the condition is hit, the parent kernel will:

  1. Preprocess data to be passed to child kernel
  2. Launch the child kernel with child blockSize = parent blockSize * CHILD_BLOCKSIZE_FACTOR and numBlocks = 1
  3. Wait for the child kernel to finish (with cudaDeviceSynchronize)
  4. Postprocess data returned from child kernel
  5. Continue with parent kernel execution

For example:

#include <cstdio>
#include <cuda_runtime.h>

/* upscale factor for child kernel */
constexpr uint CHILD_BLOCKSIZE_FACTOR = 8;

/* state from parent */
struct State {
    /* e.g. parent blockID, pointers to global memory etc */
    uint parent_block, *d_my_ptr;

    /* update state somehow, e.g. writing to global memory */
    __device__ void update(uint samples) {
        /* do something */
        if (threadIdx.x == 0)
            printf("updating parent_block %u with %u samples\n", parent_block,
                   samples);
    }
};

/* child kernel, updates state at a different resolution */
__global__ void child_kernel(State state) {
    /* update state with more samples */
    state.update(/*samples=*/blockDim.x);
}

/* called by parent kernel before child_kernel */
__device__ void upsample(State state) { /* do something */ }

/* called by parent kernel after child_kernel */
__device__ void downsample(State state) { /* do something */ }

/* some condition to determine when to run child kernel */
__device__ bool child_condition(uint step) {
    return /*some condition, e.g.*/ (step % 2) == 1;
}

/* using CDP1 and cudaDeviceSynchronize -- not supported anymore */
__global__ void parent_kernel(uint nsteps) {
    /* parent initializes some state */
    State state{.parent_block = blockIdx.x, /* ... */};

    for (uint step = 0; step < nsteps; step++) {
        if (child_condition(step)) {
            /* prepare data for child kernel */
            upsample(state);
            /* make all threads' upsample writes visible before thread 0
               launches the child */
            __syncthreads();
            /* on some condition, call child kernel, passing state */
            if (threadIdx.x == 0)
                child_kernel<<<1, blockDim.x * CHILD_BLOCKSIZE_FACTOR>>>(state);
            /* ensure the launch has been issued before any thread
               synchronizes on it */
            __syncthreads();
            /* need to synchronize to see results for CDP1 */
            cudaDeviceSynchronize();
            /* convert data back to use in parent kernel */
            downsample(state);
        } else {
            /* otherwise update normally */
            state.update(/*samples=*/blockDim.x);
        }
    }
}

From what I can tell, this works:

// ... as above

__global__ void scheduler_kernel(State state, uint next_step, uint nsteps,
                                 bool after_child) {
    /* if we came after a child kernel, we need to downsample */
    if (after_child)
        downsample(state);

    for (uint step = next_step; step < nsteps; step++) {
        if (child_condition(step)) {
            /* prepare data for child kernel */
            upsample(state);
            /* make all threads' upsample writes visible before thread 0
               launches the child */
            __syncthreads();
            if (threadIdx.x == 0) {
                /* on some condition, call child kernel, passing state */
                child_kernel<<<1, blockDim.x * CHILD_BLOCKSIZE_FACTOR>>>(state);
                /* queue up the scheduler again; the tail launch only starts
                   after this grid and the child have completed. Relaunch even
                   on the last step so the final downsample still runs. */
                scheduler_kernel<<<1, blockDim.x, 0, cudaStreamTailLaunch>>>(
                    state, step + 1, nsteps, /*after_child=*/true);
            }
            /* and return */
            return;
        } else {
            /* otherwise update normally */
            state.update(/*samples=*/blockDim.x);
        }
    }
}

__global__ void cdp2_parent_kernel(uint nsteps) {
    /* parent initializes some state */
    State state{.parent_block = blockIdx.x, /* ... */};

    if (threadIdx.x == 0)
        scheduler_kernel<<<1, blockDim.x>>>(state, 0, nsteps,
                                            /*after_child=*/false);
}
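
For reference, I'm driving the CDP2 version from the host with something like the following; the block/grid dimensions and step count are placeholders, and dynamic parallelism needs relocatable device code (e.g. nvcc -rdc=true example.cu):

int main() {
    /* placeholder launch configuration */
    constexpr uint NUM_BLOCKS = 4;
    constexpr uint BLOCK_SIZE = 32;
    constexpr uint NSTEPS = 4;

    cdp2_parent_kernel<<<NUM_BLOCKS, BLOCK_SIZE>>>(NSTEPS);

    /* host-side sync still works: a parent grid is not considered complete
       until its whole launch tree (tail launches included) has finished */
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        printf("error: %s\n", cudaGetErrorString(err));
    return 0;
}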
My questions:

  • Is this a valid use of CDP2?
  • Is there a better approach?
  • Are there any obvious performance issues with this approach? I’m now launching a lot more kernels.
  • When should I be using cudaStreamFireAndForget? Should I be using it to launch the child kernels? If I launch a cudaStreamFireAndForget kernel and then a cudaStreamTailLaunch kernel, will the tail-launched kernel wait for the fire-and-forget kernel? (See the sketch after this list for the pattern I mean.)
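
To make the last question concrete, this is the pattern I'm asking about (worker_a and worker_b are hypothetical stand-ins, not part of the code above):

/* hypothetical kernels, just to illustrate the ordering question */
__global__ void worker_a(State state) { /* do something */ }
__global__ void worker_b(State state) { /* do something */ }

__global__ void combined_launcher(State state) {
    if (threadIdx.x == 0) {
        /* fire-and-forget: no ordering guarantees with the rest of this
           grid's work */
        worker_a<<<1, 32, 0, cudaStreamFireAndForget>>>(state);
        /* tail launch: starts only after this grid completes -- but does
           it also wait for worker_a? */
        worker_b<<<1, 32, 0, cudaStreamTailLaunch>>>(state);
    }
}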

Some recent posts on this topic:

1 2

Each of those also links to other resources.