Launching several kernels on one stream while another kernel runs persistently in the background

Platform : Fedora Linux 4.1.13-100.fc21.x86_64
Nvidia Driver : 367.44
CUDA Driver Version/RT Version: 8.0/8.0
Compute Capability : 5.2

I have run into some weird behaviour in CUDA. The following is the pseudocode.

int main() {
    int nstreams = 2;

    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++)
    {
        checkCudaErrors(cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));
    }

    cudaEvent_t* kernelevents = (cudaEvent_t*)malloc(nstreams * sizeof(cudaEvent_t));
    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaEventCreateWithFlags(kernelevents + i, cudaEventDisableTiming));
    }

    clock_block<<<1,1,0,streams[0]>>>(100 * 1000 * 1000); // persistent kernel, runs approx. 100 ms
    cudaEventRecord(kernelevents[0], streams[0]);

    cudaError_t cudastatus;
    while (true) {
        clock_block<<<1,1,0,streams[1]>>>(20 * 1000 * 1000); // repeated kernel, runs approx. 20 ms
        std::this_thread::sleep_for(std::chrono::milliseconds(20)); // wait 20 ms before issuing the next launch
        if ((cudastatus = cudaEventQuery(kernelevents[0])) == cudaSuccess) break;
    }
}

__global__ void clock_block(clock_t clock_count){
    unsigned int start_clock = (unsigned int) clock();

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();
        clock_offset = (clock_t)(end_clock - start_clock);
    }
}

When waiting 20 ms before issuing another kernel launch on stream 14, the profiler gives this:

Here you can see that the operation has been issued by the host thread but is not executed by the GPU until the persistent kernel launched on stream 13 has finished.

When the waiting period is set to 10 ms, we get the following:

More of the kernels launched on stream 14 are executed while the persistent kernel is running, but the last kernel launched on stream 14 still has to wait until the persistent kernel finishes before it is executed.

However, if we set the running period of the persistent kernel to 60 ms, keep the repeatedly launched kernel at 10 ms, and keep the waiting period at 10 ms as well, we get the following:

Finally, the concurrency appears as expected.
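For reference, since the screenshots may not show exact numbers, here is a small sketch of how the timeline could be cross-checked with CUDA events instead of the profiler (my own sketch; it reuses the clock_block kernel and the two streams from above, and the count of 8 iterations is arbitrary). The event recorded after each short kernel completes only once that kernel has finished, so the printed times show whether the short kernels overlapped the persistent one or were serialized behind it.

cudaEvent_t ref, done[8];
checkCudaErrors(cudaEventCreate(&ref));
for (int i = 0; i < 8; i++) checkCudaErrors(cudaEventCreate(&done[i]));

checkCudaErrors(cudaEventRecord(ref, streams[0]));    // reference timestamp just before the persistent kernel
clock_block<<<1,1,0,streams[0]>>>(100 * 1000 * 1000); // persistent kernel, approx. 100 ms

for (int i = 0; i < 8; i++) {
    clock_block<<<1,1,0,streams[1]>>>(20 * 1000 * 1000); // short kernel, approx. 20 ms
    checkCudaErrors(cudaEventRecord(done[i], streams[1]));
    std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
checkCudaErrors(cudaDeviceSynchronize());

for (int i = 0; i < 8; i++) {
    float ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&ms, ref, done[i])); // ms between the reference event and the end of short kernel i
    printf("short kernel %d finished %.1f ms after the persistent kernel was enqueued\n", i, ms);
}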

My guess is that the resources used to launch kernels are exhausted, and the GPU scheduler has to wait for the persistent kernel to terminate and release its resources before it can launch another kernel from a different stream. However, since only a handful of blocks are in use, the SMs should not be so busy that they prevent concurrency from appearing.
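For completeness, this hypothesis can at least be sanity-checked against the device properties. Here is a small standalone sketch (my own, not part of the code I profiled) that prints whether the device reports support for concurrent kernels and how many SMs it has:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("concurrentKernels   : %d\n", prop.concurrentKernels);   // 1 if the device can run kernels concurrently
    printf("multiProcessorCount : %d\n", prop.multiProcessorCount); // number of SMs
    printf("asyncEngineCount    : %d\n", prop.asyncEngineCount);    // copy engines for copy/compute overlap
    return 0;
}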

Does anyone have an idea what is happening here?

Thank you

Now I have changed the code a little and removed the CUDA event API from it.
The real code (not pseudocode) looks like this:

#include <cstdio>
#include <chrono>
#include <thread>
#include <cuda_runtime.h>
#include <helper_cuda.h> // checkCudaErrors, from the CUDA samples

__global__ void clock_block(clock_t clock_count)
{
    unsigned int start_clock = (unsigned int) clock();

    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();

        clock_offset = (clock_t)(end_clock - start_clock);
    }
}

int main(int argc, char** argv) {
    int nstreams = 2;

    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++)
    {
        checkCudaErrors(cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));
    }

    std::chrono::duration<double> timer(0); // elapsed host time in seconds
    std::chrono::time_point<std::chrono::system_clock> start, end;

    clock_block<<<1,1,0,streams[0]>>>(60 * 1000 * 1000); // persistent kernel, approx. 60 ms
    start = std::chrono::system_clock::now();

    while (timer.count() < 60.0 / 1000.0) // keep launching for 60 ms
    {
        end = std::chrono::system_clock::now();
        timer = end - start;
        clock_block<<<1,1,0,streams[1]>>>(10 * 1000 * 1000); // repeated kernel, approx. 10 ms
        std::this_thread::sleep_for(std::chrono::milliseconds(10)); // wait 10 ms before the next launch
    }

    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    }

    cudaDeviceReset();

    free(streams); // allocated with malloc, so free() rather than delete[]

    printf("End.\n");
    return 0;
}

The persistent kernel lasts 60 ms, the repeated kernel lasts 10 ms, and the host thread issues a launch of the repeated kernel on stream 14 every 10 ms over a period of 60 ms.

And here we have the profiler snapshot [1]:

and snapshot [2]:

Whether you get [1] or [2] is totally random. After a fresh reboot, running the program gives either [1] or [2], and running it again immediately after the first run behaves just as randomly.
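To make it easier to correlate the host-side launches with the profiler timeline while chasing this randomness, one option would be to wrap each launch in an NVTX range (my own suggestion, not part of the code that produced the snapshots; it requires including nvToolsExt.h and linking with -lnvToolsExt):

#include <nvToolsExt.h> // NVTX markers, visible in the nvvp timeline

// inside the launch loop:
nvtxRangePushA("issue repeated kernel");
clock_block<<<1,1,0,streams[1]>>>(10 * 1000 * 1000);
nvtxRangePop();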

Also, I have tested the code with the cuda-memcheck tools and none of them reports any errors (normally, undefined or unexplainable behaviour is caused by silent errors that cuda-gdb is not aware of).
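In case it helps rule out silent launch failures, this is the kind of per-launch check that could be dropped in right after each kernel launch inside the loop (illustration only, not part of the code that produced the snapshots):

clock_block<<<1,1,0,streams[1]>>>(10 * 1000 * 1000);
cudaError_t launchErr = cudaPeekAtLastError(); // reports launch errors without clearing the error state
if (launchErr != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(launchErr));
}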

Any ideas?