I’m trying to use streams to run the same kernel on several sets of data, but can’t seem to get it to work when I set the number of streams to greater than one.
I’ve extracted the relevant portions of the code below.
// Create the two events that bracket the timed region (kernel launches + copy back).
cudaEvent_t start_event, stop_event;
CUDA_SAFE_CALL(cudaEventCreate(&start_event));
CUDA_SAFE_CALL(cudaEventCreate(&stop_event));
float total_time = 0.0f;  // filled in by cudaEventElapsedTime (milliseconds)

// Allocate and create one stream handle per data set.
// The array is sized by nstreams because every loop below indexes it with
// i < nstreams; sizing it by an unrelated count (previously bigBlocks) overruns
// the allocation as soon as nstreams exceeds that count.
cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
    CUDA_SAFE_CALL(cudaStreamCreate(&(streams[i])));
}

// Copy all input data sets to the device in one blocking transfer each.
// NOTE(review): data_size is used here as a BYTE count, but below it is used as a
// pointer-arithmetic stride (Td + i*data_size), which is in ELEMENTS unless Td/d are
// byte pointers. If Td/d are e.g. float*, every slice with i >= 1 starts past the
// region copied here (same concern for outsize/outputd) -- confirm the units, since
// that would fail only when nstreams > 1.
CUDA_SAFE_CALL(cudaMemcpy(Td, T, data_size * nstreams, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d, a, data_size * nstreams, cudaMemcpyHostToDevice));

// Start timing. The event is recorded on stream 0 (the default stream).
CUDA_SAFE_CALL(cudaEventRecord(start_event, 0));

// Launch the same kernel once per stream, each on its own slice of the buffers.
for (int i = 0; i < nstreams; i++)
{
    cholesky_kernel<<<nBlocks, blockSize, 0, streams[i]>>>(outputd + i * outsize,
                                                           Td + i * data_size,
                                                           d + i * data_size,
                                                           padM);
    // NOTE(review): depending on the cutil build, this macro may synchronize the
    // device (serializing the streams) or compile to nothing -- confirm.
    CUT_CHECK_ERROR("Kernel execution failed.");
}

// Blocking copy of all results back to the host; an error raised by any of the
// kernels above is reported here at the latest.
CUDA_SAFE_CALL(cudaMemcpy(output, outputd, outsize * nstreams, cudaMemcpyDeviceToHost));

// Stop timing and wait for the stop event before reading the elapsed time.
// These calls are now checked: a failed record/synchronize would otherwise leave
// total_time meaningless without any diagnostic.
CUDA_SAFE_CALL(cudaEventRecord(stop_event, 0));
CUDA_SAFE_CALL(cudaEventSynchronize(stop_event));
CUDA_SAFE_CALL(cudaEventElapsedTime(&total_time, start_event, stop_event));
When I run the above with nstreams > 1, the reported execution time is 0.000000 ms, which seems to indicate that the kernel hasn’t launched. When nstreams = 1, the execution time is ~8 ms.
I can get simpleStreams from the SDK to run fine.
Is there something I’m missing? Are there limits to the number of streams that can run at one time?