I found that CUDA stream will block when I launch lots of kernels (more than 1000). I am wondering is there any configuration that I can change?
In my experiments, I launch a small kernel 10000 times. This kernel ran shortly(about 190us). The kernel launched very fast when launching the first 1000 kernels. It takes 4~5us to launch a kernel. But after that, The launch process becomes slow. It takes about 190 us to launch a new kernel. The CUDA stream seems to wait for the previous kernel complete, and the buffer size is about 1000 kernels. When I created 3 streams, each stream can launch 1000 kernel asynchrony. I want to make this buffer bigger. I try to set cudaLimitDevRuntimePendingLaunchCount, but not work. Is there any way?
#include <stdio.h>
#include "cuda_runtime.h"
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
printf("Failed: Cuda error %s:%d '%s'\n", \
__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while(0)
// a dummy kernel for test
__global__ void add(float *a, int n) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
for (int i=0; i<n; i++) {
a[id] = sqrt(a[id] + 1);
}
}
int main(int argc, char* argv[])
{
//managing 1 devices
int nDev = 1;
int nStream = 1;
int size = 32*1024*1024;
//allocating and initializing device buffers
float** buffer = (float**)malloc(nDev * sizeof(float*));
cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev*nStream);
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
//CUDACHECK(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, 10000));
CUDACHECK(cudaMalloc(buffer + i, size * sizeof(float)));
CUDACHECK(cudaMemset(buffer[i], 1, size * sizeof(float)));
for (int j = 0; j<nStream; j++)
CUDACHECK(cudaStreamCreate(s+i*nStream+j));
}
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
for (int j=0; j<10000; j++) {
for (int k=0; k<nStream; k++)
add<<<32, 1024, 0, s[i*nStream+k]>>>(buffer[i], 1000);
}
}
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
cudaDeviceSynchronize();
}
//free device buffers
for (int i = 0; i < nDev; ++i) {
CUDACHECK(cudaSetDevice(i));
CUDACHECK(cudaFree(buffer[i]));
}
printf("Success \n");
return 0;
}
Here is the nvprof results:
When I create 3 streams, the first 3000 kernel launched quickly and then become slow
When I create 1 streams, the first 1000 kernel launched quickly and then become slow
I also create a stackoverflow question: https://stackoverflow.com/questions/53970187/cuda-stream-are-blocked-when-luanch-lots-of-kernels-1000