I have an algorithm that uses CDP1, where each block will occasionally and repeatedly (depending on some condition) launch a child kernel with a larger blockSize, then do something with the results. I’m looking to refactor this to work with CDP2, but am a little unclear on the optimal/intended way to get achieve behavior. Is there a canonical CDP2 way to do this?
Currently, when the condition is hit, the parent kernel will:
- Preprocess data to be passed to child kernel
- Launch the child kernel with
child blockSize = parent blockSize * SOME_UPSCALE_FACTOR
andnumBlocks = 1
- Wait for the child kernel to finish (with
cudaDeviceSynchronize
) - Postprocess data returned from child kernel
- Continue with parent kernel execution
For example:
#include <cstdio>
#include <cuda_runtime.h>
/* upscale factor for child kernel */
constexpr uint CHILD_BLOCKSIZE_FACTOR = 8;
/* state from parent */
struct State {
/* e.g. parent blockID, pointers to global memory etc */
uint parent_block, *d_my_ptr;
/* update state somehow, e.g. writing to global memory */
__device__ void update(uint samples) {
/* do something */
if (threadIdx.x == 0)
printf("updating parent_block %u with %u samples\n", parent_block,
samples);
}
};
/* child kernel, updates state at a different resolution */
__global__ void child_kernel(State state) {
/* update state with more samples */
state.update(/*samples=*/blockDim.x);
}
/* called by parent kernel before child_kernel */
__device__ void upsample(State state) { /* do something */ }
/* called by parent kernel after child_kernel */
__device__ void downsample(State state) { /* do something */ }
/* some condition to determine when to run child kernel */
__device__ bool child_condition(uint step) {
return /*some condition, e.g.*/ (step % 2) == 1;
}
/* using CDP1 and cudaDeviceSynchronize -- not supported anymore */
__global__ void parent_kernel(uint nsteps) {
/* parent initializes some state */
State state{.parent_block = blockIdx.x, /* ... */};
for (uint step = 0; step < nsteps; step++) {
if (child_condition(step)) {
/* prepare data for child kernel */
upsample(state);
/* on some condition, call child kernel, passing state */
if (threadIdx.x == 0)
child_kernel<<<1, blockDim.x * CHILD_BLOCKSIZE_FACTOR>>>(state);
/* need to synchronize to see results for CDP1 */
cudaDeviceSynchronize();
/* convert data back to use in parent kernel */
downsample(state);
} else {
/* otherwise update normally */
state.update(/*samples=*/blockDim.x);
}
}
}