Dynamic Parallelism synchronization between kernel launches

I am trying to use CUDA Dynamic Parallelism (CDP) to launch multiple child kernels and am not sure how to achieve synchronization between the different kernels. My code looks something like this:

__global__ void Parent_kernel()
{
    child_kernel_1<<<num_blocks, num_threads>>>();
    //1. Some computation using the results of the child_kernel_1 launch.
    child_kernel_2<<<num_blocks, num_threads>>>();
    //2. Some computation using the results of both the child_kernel_1 and child_kernel_2 launch.
}

Code is running fine and both the kernels are launching fine. I am not able to figure out how to synchronize it such that (1. Some computation using the results of the child_kernel_1 launch. ) will only run after child_kernel_1<<<num_blocks, num_threads>>>() completes and so on.

cudaDeviceSynchronize() and __syncthreads() do not work for this inside device kernels, and the cudaStreamTailLaunch stream would not work for me, as using it would stop the launched kernel from executing until all other work associated with the parent kernel is complete.

Sorry for the crude explanation, the actual code is very big. Thanks in advance for the suggestions.

With the removal of cudaDeviceSynchronize(), it is no longer possible to access the modifications made by the threads in the child grid from the parent grid. The only way to access the modifications made by the threads in the child grid before the parent grid exits is via a kernel launched into the cudaStreamTailLaunch stream.

See also cudaDeviceSynchronize from device code is deprecated - #16 by Robert_Crovella

I do understand that.
The issue is “Grids launched into the tail launch stream will not launch until the completion of all work by the parent grid, including all other grids (and their descendants) launched by the parent in all non-tail launched streams, including work executed or launched after the tail launch.”

Also Shared memory with Cuda Dynamic Parallelism(CDP ) and CDP 2 - #9 by Robert_Crovella states the same.

But in my case, I want the //2. Some computation using the results of both the child_kernel_1 and child_kernel_2 launch to execute AFTER child_kernel_2<<<num_blocks, num_threads>>>() has completed.

Launching child_kernel_2<<<num_blocks, num_threads>>>() in cudaStreamTailLaunch will result in it being launched only after the rest of the parent's code has executed.

Launching all kernels in the tail launch stream should give you the desired result, shouldn’t it? Stream ordering ensures that results of child1 and child2 are computed before using them in the next kernel.

This code seems to work:

#include <cstdio>
#include <thrust/device_vector.h>

__global__
void childkernel1(int* child1Result){
    // Single-thread child grid: produce the first child's result.
    *child1Result = 10;
}

__global__
void someComputationUsingChild1Results(int* child1Result){
    // Consume and report the value written by childkernel1 (debug output).
    const int v = child1Result[0];
    printf("someComputationUsingChild1Results %d\n", v);
}

__global__
void childkernel2(int* child2Result){
    // Single-thread child grid: produce the second child's result.
    *child2Result = 20;
}

__global__
void someComputationUsingBothChildrenResults(int* child1Result, int* child2Result){
    // Consume and report both children's values (debug output).
    const int a = child1Result[0];
    const int b = child2Result[0];
    printf("someComputationUsingBothChildrenResults %d %d\n", a, b);
}

// Parent grid: every child grid is launched into cudaStreamTailLaunch.
// Tail-launched grids run after the parent grid completes, and stream
// ordering within the tail-launch stream serializes the four launches,
// so each consumer sees the producers' writes (output: 10, then 10 20).
__global__ void parentkernel(int* child1Result, int* child2Result){
    childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
    someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}

int main(){
    // Two device ints: slot 0 holds child1's result, slot 1 holds child2's.
    thrust::device_vector<int> d_vec(2);
    parentkernel<<<1,1>>>(d_vec.data().get(), d_vec.data().get() + 1);
    // Launch-configuration errors surface via cudaGetLastError(), not the
    // launch statement itself — check immediately.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Execution errors (including from the tail-launched child grids)
    // surface at the synchronizing call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
someComputationUsingChild1Results 10
someComputationUsingBothChildrenResults 10 20

If you want the parent kernel to perform computations while the child kernels are executing, you could use the fire-and-forget stream (cudaStreamFireAndForget):

#include <cstdio>
#include <thrust/device_vector.h>

__global__
void childkernel1(int* child1Result){
    // One thread writes the first child's value.
    *child1Result = 10;
}

__global__
void someComputationUsingChild1Results(int* child1Result){
    // Print the value produced by childkernel1.
    const int r1 = child1Result[0];
    printf("someComputationUsingChild1Results %d\n", r1);
}

__global__
void childkernel2(int* child2Result){
    // One thread writes the second child's value.
    *child2Result = 20;
}

__global__
void someComputationUsingBothChildrenResults(int* child1Result, int* child2Result){
    // Print the values produced by both child kernels.
    const int r1 = child1Result[0];
    const int r2 = child2Result[0];
    printf("someComputationUsingBothChildrenResults %d %d\n", r1, r2);
}

// Wraps the whole ordered child pipeline in one kernel. When this kernel is
// itself launched fire-and-forget from parentkernel2 below, the pipeline runs
// concurrently with the parent's own work, while tail-launch stream ordering
// inside this kernel still serializes the four child grids.
__global__
void sideComputations(int* child1Result, int* child2Result){
    childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
    someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}

// Same strictly-ordered tail-launch pipeline as in the first example.
// NOTE: main() in this listing launches parentkernel2, not this kernel;
// it is kept here only for comparison with the fire-and-forget variant.
__global__ void parentkernel(int* child1Result, int* child2Result){
    childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
    someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}

// Launches the child pipeline into cudaStreamFireAndForget so it proceeds
// independently, then does its own work (the printf loop). The interleaved
// output shown below this listing demonstrates that the parent's prints and
// the children's prints overlap in time.
__global__ void parentkernel2(int* child1Result, int* child2Result){
    sideComputations<<<1,1,0,cudaStreamFireAndForget>>>(child1Result, child2Result);
    for(int i = 0; i < 10; i++){
        printf("in parent kernel\n");
    }
}

int main(){
    // Slot 0 receives child1's result, slot 1 receives child2's.
    thrust::device_vector<int> d_vec(2);
    int* base = thrust::raw_pointer_cast(d_vec.data());
    parentkernel2<<<1,1>>>(base, base + 1);
    cudaDeviceSynchronize();
}
in parent kernel
in parent kernel
in parent kernel
someComputationUsingChild1Results 10
in parent kernel
someComputationUsingBothChildrenResults 10 20
in parent kernel
in parent kernel
in parent kernel
in parent kernel
in parent kernel
in parent kernel

In my case the someComputationUsingChild1Results and someComputationUsingBothChildrenResults are __device__ functions, not __global__ kernels. Hence, I was not able to achieve synchronization using cudaStreamTailLaunch.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdio>
#include <thrust/device_vector.h>

// Child grid: writes the first result, then prints it. In the broken run
// below, this printf appears LAST — evidence the child executed only after
// the parent grid finished.
__global__
void childkernel1(int* child1Result) {
    child1Result[0] = 10;
	printf("childkernel1 %d\n", child1Result[0]);
}

// __device__ function: runs inline in the calling thread as ordinary code,
// NOT as a separate grid — so it cannot be ordered against child launches
// via a stream.
__device__
void someComputationUsingChild1Results(int* child1Result) {
    printf("someComputationUsingChild1Results %d\n", child1Result[0]);
}

// Child grid: writes the second result.
__global__
void childkernel2(int* child2Result) {
    child2Result[0] = 20;
}

// __device__ function: executes immediately in the parent's instruction
// stream; it does not wait for any tail-launched child grids.
__device__
void someComputationUsingBothChildrenResults(int* child1Result, int* child2Result) {
    printf("someComputationUsingBothChildrenResults %d %d\n", child1Result[0], child2Result[0]);
}

// BUG (intentionally shown in this post): the tail-launched child grids do
// not run until this parent grid completes, but the __device__ calls below
// execute right away as part of the parent itself. They therefore read the
// result slots BEFORE the children write them, printing "0" / "0 0", and
// childkernel1's own printf only appears afterwards (see the output below).
// The fix, as discussed in the thread, is to make the consumers __global__
// kernels and tail-launch them too.
__global__ void parentkernel(int* child1Result, int* child2Result) {
    childkernel1 << <1, 1, 0, cudaStreamTailLaunch >> > (child1Result);
    someComputationUsingChild1Results(child1Result);
    childkernel2 << <1, 1, 0, cudaStreamTailLaunch >> > (child2Result);
    someComputationUsingBothChildrenResults(child1Result, child2Result);
}

// Allocates the two device result slots (index 0 for child1, index 1 for
// child2 — device_vector zero-initializes them, which is why the broken
// run prints zeros), launches the parent, and blocks until all device
// work — including the deferred children — has finished.
int main() {
    thrust::device_vector<int> d_vec(2);
    parentkernel << <1, 1 >> > (d_vec.data().get(), d_vec.data().get() + 1);
    cudaDeviceSynchronize();
}

Was giving the output


someComputationUsingChild1Results 0
someComputationUsingBothChildrenResults 0 0
childkernel1 10

Converting the device code to global kernel and calling it using << <1, 1, 0, cudaStreamTailLaunch >> > did the job. Thanks!!

someComputationUsingChild1Results 10
someComputationUsingBothChildrenResults 10 20