Dynamic Parallelism synchronization between kernel launches

I am trying to use CUDA Dynamic Parallelism (CDP) to launch multiple child kernels and am not sure how to achieve synchronization between the different kernels. My code looks something like this:

__global__ void Parent_kernel()
{
    child_kernel_1<<<num_blocks, num_threads>>>();
    //1. Some computation using the results of the child_kernel_1 launch.
    child_kernel_2<<<num_blocks, num_threads>>>();
    //2. Some computation using the results of both the child_kernel_1 and child_kernel_2 launch.
}

Code is running fine and both the kernels are launching fine. I am not able to figure out how to synchronize it such that (1. Some computation using the results of the child_kernel_1 launch. ) will only run after child_kernel_1<<<num_blocks, num_threads>>>() completes and so on.

cudaDeviceSynchronize() and __syncthreads() do not work for this inside device kernels, and the cudaStreamTailLaunch stream would not work for me, as using it would stop the launched kernel from executing until all other work associated with the parent kernel is complete.

Sorry for the crude explanation, the actual code is very big. Thanks in advance for the suggestions.

With the removal of cudaDeviceSynchronize(), it is no longer possible to access the modifications made by the threads in the child grid from the parent grid. The only way to access the modifications made by the threads in the child grid before the parent grid exits is via a kernel launched into the cudaStreamTailLaunch stream.

See also cudaDeviceSynchronize from device code is deprecated - #16 by Robert_Crovella

I do understand that.
The issue is “Grids launched into the tail launch stream will not launch until the completion of all work by the parent grid, including all other grids (and their descendants) launched by the parent in all non-tail launched streams, including work executed or launched after the tail launch.”

Also Shared memory with Cuda Dynamic Parallelism(CDP ) and CDP 2 - #9 by Robert_Crovella states the same.

But in my case, I want the //2. Some computation using the results of both the child_kernel_1 and child_kernel_2 launch to execute AFTER child_kernel_2<<<num_blocks, num_threads>>>() has completed.

Launching child_kernel_2<<<num_blocks, num_threads>>>() in cudaStreamTailLaunch will result in it being launched only after the rest of the parent's code has executed.

Launching all kernels in the tail launch stream should give you the desired result, shouldn’t it? Stream ordering ensures that results of child1 and child2 are computed before using them in the next kernel.

This code seems to work:

#include <cstdio>
#include <thrust/device_vector.h>

__global__
void childkernel1(int* child1Result){
    // Single-thread child grid: produce the first child's result.
    *child1Result = 10;
}

__global__
void someComputationUsingChild1Results(int* child1Result){
    // Consume and report the value written by childkernel1 (debug output).
    const int v = child1Result[0];
    printf("someComputationUsingChild1Results %d\n", v);
}

__global__
void childkernel2(int* child2Result){
    // Single-thread child grid: produce the second child's result.
    *child2Result = 20;
}

__global__
void someComputationUsingBothChildrenResults(int* child1Result, int* child2Result){
    // Consume and report both children's values (debug output).
    const int a = child1Result[0];
    const int b = child2Result[0];
    printf("someComputationUsingBothChildrenResults %d %d\n", a, b);
}

// Parent grid: every child grid is launched into cudaStreamTailLaunch.
// Tail-launched grids run after the parent grid completes, and stream
// ordering within the tail-launch stream serializes the four launches,
// so each consumer sees the producers' writes (output: 10, then 10 20).
__global__ void parentkernel(int* child1Result, int* child2Result){
    childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
    someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}

int main(){
    // Two device ints: slot 0 holds child1's result, slot 1 holds child2's.
    thrust::device_vector<int> d_vec(2);
    parentkernel<<<1,1>>>(d_vec.data().get(), d_vec.data().get() + 1);
    // Launch-configuration errors surface via cudaGetLastError(), not the
    // launch statement itself — check immediately.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Execution errors (including from the tail-launched child grids)
    // surface at the synchronizing call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
someComputationUsingChild1Results 10
someComputationUsingBothChildrenResults 10 20

If you want the parent kernel to perform computations while the child kernels are executing, you could use the fire-and-forget stream (cudaStreamFireAndForget):

#include <cstdio>
#include <thrust/device_vector.h>

__global__
void childkernel1(int* child1Result){
    // One thread writes the first child's value.
    *child1Result = 10;
}

__global__
void someComputationUsingChild1Results(int* child1Result){
    // Print the value produced by childkernel1.
    const int r1 = child1Result[0];
    printf("someComputationUsingChild1Results %d\n", r1);
}

__global__
void childkernel2(int* child2Result){
    // One thread writes the second child's value.
    *child2Result = 20;
}

__global__
void someComputationUsingBothChildrenResults(int* child1Result, int* child2Result){
    // Print the values produced by both child kernels.
    const int r1 = child1Result[0];
    const int r2 = child2Result[0];
    printf("someComputationUsingBothChildrenResults %d %d\n", r1, r2);
}

// Wraps the whole ordered child pipeline in one kernel. When this kernel is
// itself launched fire-and-forget from parentkernel2 below, the pipeline runs
// concurrently with the parent's own work, while tail-launch stream ordering
// inside this kernel still serializes the four child grids.
__global__
void sideComputations(int* child1Result, int* child2Result){
    childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
    someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}

// Same strictly-ordered tail-launch pipeline as in the first example.
// NOTE: main() in this listing launches parentkernel2, not this kernel;
// it is kept here only for comparison with the fire-and-forget variant.
__global__ void parentkernel(int* child1Result, int* child2Result){
    childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
    childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
    someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}

// Launches the child pipeline into cudaStreamFireAndForget so it proceeds
// independently, then does its own work (the printf loop). The interleaved
// output shown below this listing demonstrates that the parent's prints and
// the children's prints overlap in time.
__global__ void parentkernel2(int* child1Result, int* child2Result){
    sideComputations<<<1,1,0,cudaStreamFireAndForget>>>(child1Result, child2Result);
    for(int i = 0; i < 10; i++){
        printf("in parent kernel\n");
    }
}

int main(){
    // Slot 0 receives child1's result, slot 1 receives child2's.
    thrust::device_vector<int> d_vec(2);
    int* base = thrust::raw_pointer_cast(d_vec.data());
    parentkernel2<<<1,1>>>(base, base + 1);
    cudaDeviceSynchronize();
}
in parent kernel
in parent kernel
in parent kernel
someComputationUsingChild1Results 10
in parent kernel
someComputationUsingBothChildrenResults 10 20
in parent kernel
in parent kernel
in parent kernel
in parent kernel
in parent kernel
in parent kernel

In my case the someComputationUsingChild1Results and someComputationUsingBothChildrenResults are __device__ functions, not __global__ kernels. Hence, I was not able to achieve synchronization using cudaStreamTailLaunch.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdio>
#include <thrust/device_vector.h>

// Child grid: writes the first result, then prints it. In the broken run
// below, this printf appears LAST — evidence the child executed only after
// the parent grid finished.
__global__
void childkernel1(int* child1Result) {
    child1Result[0] = 10;
	printf("childkernel1 %d\n", child1Result[0]);
}

// __device__ function: runs inline in the calling thread as ordinary code,
// NOT as a separate grid — so it cannot be ordered against child launches
// via a stream.
__device__
void someComputationUsingChild1Results(int* child1Result) {
    printf("someComputationUsingChild1Results %d\n", child1Result[0]);
}

// Child grid: writes the second result.
__global__
void childkernel2(int* child2Result) {
    child2Result[0] = 20;
}

// __device__ function: executes immediately in the parent's instruction
// stream; it does not wait for any tail-launched child grids.
__device__
void someComputationUsingBothChildrenResults(int* child1Result, int* child2Result) {
    printf("someComputationUsingBothChildrenResults %d %d\n", child1Result[0], child2Result[0]);
}

// BUG (intentionally shown in this post): the tail-launched child grids do
// not run until this parent grid completes, but the __device__ calls below
// execute right away as part of the parent itself. They therefore read the
// result slots BEFORE the children write them, printing "0" / "0 0", and
// childkernel1's own printf only appears afterwards (see the output below).
// The fix, as discussed in the thread, is to make the consumers __global__
// kernels and tail-launch them too.
__global__ void parentkernel(int* child1Result, int* child2Result) {
    childkernel1 << <1, 1, 0, cudaStreamTailLaunch >> > (child1Result);
    someComputationUsingChild1Results(child1Result);
    childkernel2 << <1, 1, 0, cudaStreamTailLaunch >> > (child2Result);
    someComputationUsingBothChildrenResults(child1Result, child2Result);
}

// Allocates the two device result slots (index 0 for child1, index 1 for
// child2 — device_vector zero-initializes them, which is why the broken
// run prints zeros), launches the parent, and blocks until all device
// work — including the deferred children — has finished.
int main() {
    thrust::device_vector<int> d_vec(2);
    parentkernel << <1, 1 >> > (d_vec.data().get(), d_vec.data().get() + 1);
    cudaDeviceSynchronize();
}

Was giving the output


someComputationUsingChild1Results 0
someComputationUsingBothChildrenResults 0 0
childkernel1 10

Converting the device code to global kernel and calling it using << <1, 1, 0, cudaStreamTailLaunch >> > did the job. Thanks!!

someComputationUsingChild1Results 10
someComputationUsingBothChildrenResults 10 20