Platform : Fedora Linux 4.1.13-100.fc21.x86_64
Nvidia Driver : 367.44
CUDA Driver Version/RT Version: 8.0/8.0
Compute Capability : 5.2
Here I have found a weird behaviour of CUDA. Following is the pseudocode.
// Forward declaration: clock_block is defined below Main in this snippet,
// so it must be declared before its first <<<...>>> launch.
__global__ void clock_block(clock_t clock_count);

// Host driver: runs a long "persistent" kernel (~100 ms) on stream 0 while
// repeatedly issuing short (~20 ms) kernels on stream 1, polling an event to
// detect when the persistent kernel has finished.
// NB(review): the C++ entry point must be spelled `main`, not `Main`.
int Main() {
    const int nstreams = 2;

    // Non-blocking streams so neither stream synchronizes with the
    // legacy default stream.
    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
    for (int i = 0; i < nstreams; i++)
    {
        checkCudaErrors(cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking));
    }

    // Timing disabled: the events are pure completion markers, which makes
    // cudaEventQuery cheaper.  (Original loop hard-coded `2`; use nstreams.)
    cudaEvent_t *kernelevents = (cudaEvent_t *) malloc(nstreams * sizeof(cudaEvent_t));
    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaEventCreateWithFlags(kernelevents + i, cudaEventDisableTiming));
    }

    // Persistent kernel, ~100 ms, on stream 0; record an event right after it.
    clock_block<<<1, 1, 0, streams[0]>>>(100 * 1000 * 1000); // run approx. 100 ms
    checkCudaErrors(cudaGetLastError()); // catch launch-configuration errors
    checkCudaErrors(cudaEventRecord(kernelevents[0], streams[0]));

    cudaError_t cudastatus;
    while (true) {
        clock_block<<<1, 1, 0, streams[1]>>>(20 * 1000 * 1000); // run approx. 20 ms
        checkCudaErrors(cudaGetLastError());

        // Wait 20 ms before issuing another kernel launch.
        std::this_thread::sleep_for(std::chrono::milliseconds(20));

        cudastatus = cudaEventQuery(kernelevents[0]);
        if (cudastatus == cudaSuccess)
            break;                              // persistent kernel finished
        if (cudastatus != cudaErrorNotReady)
            checkCudaErrors(cudastatus);        // real error: don't spin forever
    }

    // Drain stream 1 and release all resources (missing in the original).
    checkCudaErrors(cudaStreamSynchronize(streams[1]));
    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaEventDestroy(kernelevents[i]));
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    }
    free(kernelevents);
    free(streams);

    return 0; // was missing: main must return an exit status
}
// Device kernel: busy-waits for roughly `clock_count` device clock cycles.
// Launched here as <<<1,1>>>, so a single thread spins; it exists only to
// simulate a long-running kernel. On the device, clock() reads the per-SM
// cycle counter; the unsigned 32-bit subtraction keeps the elapsed-cycle
// computation correct across a single counter wrap-around.
// NB(review): the __global__ qualifier was missing in the original snippet —
// without it the function cannot be launched with <<<...>>>.
__global__ void clock_block(clock_t clock_count)
{
    unsigned int start_clock = (unsigned int) clock();
    clock_t clock_offset = 0;
    while (clock_offset < clock_count)
    {
        unsigned int end_clock = (unsigned int) clock();
        // Unsigned wrap-safe delta, widened back to clock_t for the compare.
        clock_offset = (clock_t)(end_clock - start_clock);
    }
}
When waiting 20 ms before issuing another kernel launch on stream 14, the profiler shows this:
Here you can see that the operation has been issued by the host thread, yet it is not executed by the GPU until the persistent kernel launched on stream 13 has finished.
When the waiting period is set to 10 ms instead, we get the following:
More kernels launched on stream 14 can now execute while the persistent kernel is running, but the last kernel launched on stream 14 still has to wait until the persistent kernel finishes before it executes.
However, if we set the running period of the persistent kernel to 60 ms, keep the repeatedly launched kernels at 10 ms, and keep the waiting period at 10 ms as well, we get the following:
The concurrency appears correctly finally.
My thought is that the resources used to launch kernels are exhausted, and the GPU scheduler has to wait for those resources to be released when the persistent kernel terminates before it can launch another kernel from a different stream. However, since only a handful of blocks are in use, the SMs should not be so fully occupied that they prevent concurrency from appearing.
Does anyone have any idea what is happening here?
Thank you