Launching all kernels in the tail launch stream should give you the desired result, shouldn’t it? Stream ordering ensures that results of child1 and child2 are computed before using them in the next kernel.
This code seems to work:
#include <cstdio>
#include <thrust/device_vector.h>
__global__
void childkernel1(int* child1Result){
child1Result[0] = 10;
}
__global__
void someComputationUsingChild1Results(int* child1Result){
printf("someComputationUsingChild1Results %d\n", child1Result[0]);
}
__global__
void childkernel2(int* child2Result){
child2Result[0] = 20;
}
__global__
void someComputationUsingBothChildrenResults(int* child1Result, int* child2Result){
printf("someComputationUsingBothChildrenResults %d %d\n", child1Result[0], child2Result[0]);
}
__global__ void parentkernel(int* child1Result, int* child2Result){
childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}
int main(){
thrust::device_vector<int> d_vec(2);
parentkernel<<<1,1>>>(d_vec.data().get(), d_vec.data().get() + 1);
cudaDeviceSynchronize();
}
someComputationUsingChild1Results 10
someComputationUsingBothChildrenResults 10 20
If you want the parent kernel to perform computations while the child kernels are executing, you could use the fireandforget stream
#include <cstdio>
#include <thrust/device_vector.h>
__global__
void childkernel1(int* child1Result){
child1Result[0] = 10;
}
__global__
void someComputationUsingChild1Results(int* child1Result){
printf("someComputationUsingChild1Results %d\n", child1Result[0]);
}
__global__
void childkernel2(int* child2Result){
child2Result[0] = 20;
}
__global__
void someComputationUsingBothChildrenResults(int* child1Result, int* child2Result){
printf("someComputationUsingBothChildrenResults %d %d\n", child1Result[0], child2Result[0]);
}
__global__
void sideComputations(int* child1Result, int* child2Result){
childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}
__global__ void parentkernel(int* child1Result, int* child2Result){
childkernel1<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
someComputationUsingChild1Results<<<1,1,0,cudaStreamTailLaunch>>>(child1Result);
childkernel2<<<1,1,0,cudaStreamTailLaunch>>>(child2Result);
someComputationUsingBothChildrenResults<<<1,1,0,cudaStreamTailLaunch>>>(child1Result, child2Result);
}
__global__ void parentkernel2(int* child1Result, int* child2Result){
sideComputations<<<1,1,0,cudaStreamFireAndForget>>>(child1Result, child2Result);
for(int i = 0; i < 10; i++){
printf("in parent kernel\n");
}
}
int main(){
thrust::device_vector<int> d_vec(2);
parentkernel2<<<1,1>>>(d_vec.data().get(), d_vec.data().get() + 1);
cudaDeviceSynchronize();
}
in parent kernel
in parent kernel
in parent kernel
someComputationUsingChild1Results 10
in parent kernel
someComputationUsingBothChildrenResults 10 20
in parent kernel
in parent kernel
in parent kernel
in parent kernel
in parent kernel
in parent kernel