Hi.
For kernel synchronization, the kernel must be launched via API cudaLaunchCooperativeKernel.
Is it not possible that two kernels which are launched via API run concurrently?
I noticed that the stream parameter which is passed to cudaLaunchCooperativeKernel is used in a somewhat different way than in the common <<<>>> launch. While it is possible to use cudaStreamSynchronize with this stream, all kernels launched via cudaLaunchCooperativeKernel seem to be executed sequentially in a separate, extra stream.
This picture shows this.
https://abload.de/img/nvvpxhuxa.png
kernel8 is launched with via <<<>>>, kernel9 via API. If I run two kernel9 kernels, in the same streams which were used for concurrent kernel8 runs, the two kernels are run sequentially in a new third stream.
The testcode; compile with
nvcc -arch=sm_60 -rdc=true cooptest.cu -o cooptest
#include <stdio.h>
#include <assert.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;
#define CUERR { \
cudaError_t err; \
if ((err = cudaGetLastError()) != cudaSuccess) { \
printf("CUDA error: %s : %s, line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
exit(1); \
} \
}
__global__
void kernel8(int* a, int b, int n){
for(int k = 0; k < 64; k++){
for(int i = threadIdx.x + blockDim.x * blockIdx.x; i < n; i += blockDim.x * gridDim.x){
a[i] *= b;
}
__syncthreads();
}
}
__global__
void kernel9(int* a, int b, int n){
for(int k = 0; k < 64; k++){
for(int i = threadIdx.x + blockDim.x * blockIdx.x; i < n; i += blockDim.x * gridDim.x){
a[i] *= b;
}
this_grid().sync();
}
}
int main(){
int deviceId = 0;
cudaSetDevice(deviceId);
int N = 1 << 27;
int b = 2;
int* a, *a1, *a2, *c1, *c2;
a = (int*)malloc(sizeof(int) * N);
a1 = (int*)malloc(sizeof(int) * N);
a2 = (int*)malloc(sizeof(int) * N);
cudaMalloc(&c1, sizeof(int) * N); CUERR;
cudaMalloc(&c2, sizeof(int) * N); CUERR;
for(int i = 0; i < N; i++)
a[i] = 1;
cudaStream_t s1,s2;
cudaStreamCreate(&s1); CUERR;
cudaStreamCreate(&s2); CUERR;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, deviceId); CUERR;
assert(deviceProp.concurrentKernels == 1);
assert(deviceProp.cooperativeLaunch == 1);
int threadsPerBlock = 256;
int numBlocksPerSm1, numBlocksPerSm2;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm1, kernel8, threadsPerBlock, deviceId); CUERR;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm2, kernel9, threadsPerBlock, deviceId); CUERR;
dim3 block(threadsPerBlock,1,1);
dim3 kernel8grid(deviceProp.multiProcessorCount * numBlocksPerSm1 / 2, 1, 1);
dim3 kernel9grid(deviceProp.multiProcessorCount * numBlocksPerSm2 / 2, 1, 1);
void* params1[3];
params1[0] = (void*)&c1;
params1[1] = (void*)&b;
params1[2] = (void*)&N;
void* params2[3];
params2[0] = (void*)&c2;
params2[1] = (void*)&b;
params2[2] = (void*)&N;
// two normal launches
cudaMemcpy(c1, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
cudaMemcpy(c2, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
kernel8<<<kernel8grid, block, 0, s1>>>(c1, b, N); CUERR;
kernel8<<<kernel8grid, block, 0, s2>>>(c2, b, N); CUERR;
cudaStreamSynchronize(s1); CUERR;
cudaStreamSynchronize(s2); CUERR;
cudaMemcpy(a1, c1, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
cudaMemcpy(a2, c2, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
for(int i = 0; i < N; i++){
if(a1[i] != a2[i]){
printf("error\n");
break;
}
}
// two API launches
cudaMemcpy(c1, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
cudaMemcpy(c2, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
cudaLaunchCooperativeKernel((void*) kernel9, kernel9grid, block, params1, 0, s1); CUERR;
cudaLaunchCooperativeKernel((void*) kernel9, kernel9grid, block, params2, 0, s2); CUERR;
cudaStreamSynchronize(s1); CUERR;
cudaStreamSynchronize(s2); CUERR;
cudaMemcpy(a1, c1, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
cudaMemcpy(a2, c2, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
for(int i = 0; i < N; i++){
if(a1[i] != a2[i]){
printf("error\n");
break;
}
}
// one normal launch and one API launch
cudaMemcpy(c1, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
cudaMemcpy(c2, a, sizeof(int) * N, cudaMemcpyHostToDevice); CUERR;
cudaLaunchCooperativeKernel((void*) kernel9, kernel9grid, block, params1, 0, s1); CUERR;
kernel8<<<kernel8grid, block, 0, s2>>>(c2, b, N); CUERR;
cudaStreamSynchronize(s1); CUERR;
cudaStreamSynchronize(s2); CUERR;
cudaMemcpy(a1, c1, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
cudaMemcpy(a2, c2, sizeof(int) * N, cudaMemcpyDeviceToHost); CUERR;
for(int i = 0; i < N; i++){
if(a1[i] != a2[i]){
printf("error\n");
break;
}
}
cudaStreamDestroy(s1); CUERR;
cudaStreamDestroy(s2); CUERR;
free(a); CUERR;
free(a1); CUERR;
free(a2); CUERR;
cudaFree(c1); CUERR;
cudaFree(c2); CUERR;
cudaDeviceReset(); CUERR;
}