kernel launched via cudaLaunchCooperativeKernel runs in different stream

Hi.
For grid-wide synchronization, a kernel must be launched via the API cudaLaunchCooperativeKernel.
Is it not possible for two kernels launched via this API to run concurrently?

I noticed that the stream parameter passed to cudaLaunchCooperativeKernel is used somewhat differently than in a common <<<>>> launch. While it is possible to call cudaStreamSynchronize on this stream, all kernels launched via cudaLaunchCooperativeKernel seem to be executed sequentially in a separate, extra stream.
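For reference, this is how the two launch styles correspond (a minimal sketch, assuming a kernel void k(int* a, int b, int n), a grid/block configuration, and a stream s):

k<<<grid, block, 0, s>>>(a, b, n); // triple-chevron launch: arguments passed directly

// API launch: arguments passed as an array of pointers to the parameters
void* params[] = { (void*)&a, (void*)&b, (void*)&n };
cudaLaunchCooperativeKernel((void*)k, grid, block, params, 0 /* dynamic smem */, s);

Both forms take the same stream argument, which is why I expected the same concurrency behavior.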

This profiler screenshot shows the behavior:

https://abload.de/img/nvvpxhuxa.png

kernel8 is launched via <<<>>>, kernel9 via the API. If I run two kernel9 kernels in the same streams that were used for the concurrent kernel8 runs, the two kernels are executed sequentially in a new, third stream.

The test code; compile with:

nvcc -arch=sm_60 -rdc=true cooptest.cu -o cooptest

#include <stdio.h>
#include <assert.h>
#include <cooperative_groups.h>

using namespace cooperative_groups;

#define CUERR {                                                            \
    cudaError_t err;                                                       \
    if ((err = cudaGetLastError()) != cudaSuccess) {                       \
        printf("CUDA error: %s : %s, line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(1);                                                           \
    }                                                                      \
}

__global__
void kernel8(int* a, int b, int n){
    for(int k = 0; k < 64; k++){
        for(int i = threadIdx.x + blockDim.x * blockIdx.x; i < n; i += blockDim.x * gridDim.x){
            a[i] *= b;
        }
        __syncthreads();
    }
}

__global__
void kernel9(int* a, int b, int n){
    for(int k = 0; k < 64; k++){
        for(int i = threadIdx.x + blockDim.x * blockIdx.x; i < n; i += blockDim.x * gridDim.x){
            a[i] *= b;
        }
        this_grid().sync();
    }
}




int main(){
    int deviceId = 0;
    cudaSetDevice(deviceId);
    
    int N = 1 << 27;
    int b = 2;
    int* a, *a1, *a2, *c1, *c2;
    
    a = (int*)malloc(sizeof(int) * N);
    a1 = (int*)malloc(sizeof(int) * N);
    a2 = (int*)malloc(sizeof(int) * N);
    
    cudaMalloc(&c1, sizeof(int) * N); CUERR;
    cudaMalloc(&c2, sizeof(int) * N); CUERR;
    
    for(int i = 0; i < N; i++)
        a[i] = 1;
            
    cudaStream_t s1,s2;
    cudaStreamCreate(&s1); CUERR;
    cudaStreamCreate(&s2); CUERR;
    
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceId);  CUERR;
    
    assert(deviceProp.concurrentKernels == 1);
    assert(deviceProp.cooperativeLaunch == 1);
    
    int threadsPerBlock = 256;
    
    int numBlocksPerSm1, numBlocksPerSm2;
    // note: the fourth argument is the dynamic shared memory size per block, not the device id
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm1, kernel8, threadsPerBlock, 0); CUERR;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm2, kernel9, threadsPerBlock, 0); CUERR;
        
    dim3 block(threadsPerBlock,1,1);
    // use half of the maximum number of co-resident blocks so that,
    // in principle, two such grids could fit on the device at the same time
    dim3 kernel8grid(deviceProp.multiProcessorCount * numBlocksPerSm1 / 2, 1, 1);
    dim3 kernel9grid(deviceProp.multiProcessorCount * numBlocksPerSm2 / 2, 1, 1);
     
    void* params1[3];
    params1[0] = (void*)&c1;
    params1[1] = (void*)&b;
    params1[2] = (void*)&N;
    
    void* params2[3];
    params2[0] = (void*)&c2;
    params2[1] = (void*)&b;
    params2[2] = (void*)&N;    
    
    
    
    // two normal launches
    
    cudaMemcpy(c1, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
    cudaMemcpy(c2, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
    
    kernel8<<<kernel8grid, block, 0, s1>>>(c1, b, N); CUERR; 
    kernel8<<<kernel8grid, block, 0, s2>>>(c2, b, N); CUERR; 
    
    cudaStreamSynchronize(s1); CUERR;
    cudaStreamSynchronize(s2); CUERR;
    
    cudaMemcpy(a1, c1, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
    cudaMemcpy(a2, c2, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
    
    for(int i = 0; i < N; i++){
        if(a1[i] != a2[i]){
                printf("error\n"); 
                break;
        }
    }
    
    // two API launches   

    cudaMemcpy(c1, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
    cudaMemcpy(c2, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;   
    
    cudaLaunchCooperativeKernel((void*) kernel9, kernel9grid, block, params1, 0, s1); CUERR;
    cudaLaunchCooperativeKernel((void*) kernel9, kernel9grid, block, params2, 0, s2); CUERR;

    cudaStreamSynchronize(s1); CUERR;
    cudaStreamSynchronize(s2);   CUERR;
    
    cudaMemcpy(a1, c1, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
    cudaMemcpy(a2, c2, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
    
    for(int i = 0; i < N; i++){
        if(a1[i] != a2[i]){
                printf("error\n"); 
                break;
        }
    }    
    
    // one normal launch and one API launch
    
    cudaMemcpy(c1, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
    cudaMemcpy(c2, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;   
    
    cudaLaunchCooperativeKernel((void*) kernel9, kernel9grid, block, params1, 0, s1); CUERR;
    kernel8<<<kernel8grid, block, 0, s2>>>(c2, b, N); CUERR; 

    cudaStreamSynchronize(s1); CUERR;
    cudaStreamSynchronize(s2);   CUERR;
    
    cudaMemcpy(a1, c1, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
    cudaMemcpy(a2, c2, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
    
    for(int i = 0; i < N; i++){
        if(a1[i] != a2[i]){
                printf("error\n"); 
                break;
        }
    }      

    cudaStreamDestroy(s1); CUERR;
    cudaStreamDestroy(s2); CUERR;
    
    free(a);
    free(a1);
    free(a2);
    cudaFree(c1); CUERR;
    cudaFree(c2); CUERR;
    
    cudaDeviceReset(); CUERR;
}

I recently got a response from NVIDIA which I want to share with you.

It is currently not possible to have multiple concurrent cooperative kernels.

This is also mentioned in the programming guide:
“This function uses standard default stream semantics.”
I thought this just meant that the default stream 0 is used if no stream is passed to the function, but I was wrong.
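Given that cooperative kernels cannot run concurrently, the safe pattern seems to be to serialize the launches yourself, for example by issuing both into the same stream (a minimal sketch based on the test code above):

cudaLaunchCooperativeKernel((void*) kernel9, kernel9grid, block, params1, 0, s1); CUERR;
cudaLaunchCooperativeKernel((void*) kernel9, kernel9grid, block, params2, 0, s1); CUERR;
cudaStreamSynchronize(s1); CUERR;

The two grid-synchronizing kernels then run one after the other by stream order, which matches what the profiler shows anyway.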

Hi,

I want to follow up on this thread about cooperative kernels.

Is there any update on using cooperative kernels with multiple regular streams in CUDA 10?

Specifically, if I have two cooperative kernels running on two different regular streams (created by cuStreamCreate), are they implicitly synchronized with each other? (Note that I have grid synchronization inside both of these cooperative kernels.)

It is illegal for a cooperative kernel to run concurrently with another kernel (cooperative or otherwise).

They must be synchronous with each other for correct behavior guarantees.
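For completeness, cooperative-launch support can also be queried at runtime before choosing a launch strategy (a minimal sketch using the runtime attribute query):

int coopSupported = 0;
cudaDeviceGetAttribute(&coopSupported, cudaDevAttrCooperativeLaunch, deviceId);
// cooperative launches are only valid when coopSupported == 1; even then,
// a cooperative kernel must not overlap with any other kernel on the device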

Thank you for the clarification!