cudaStreamSynchronize

• RTX2060
• 11.4

// Queues `kernel` on `stream` (one block of 64 threads, no dynamic shared
// memory) and then waits on the host until that stream has finished.
void launch_kernel(float* data, cudaStream_t stream)
{
	kernel<<<1, 64, 0, stream>>>(data, N);
	//std::this_thread::sleep_for(std::chrono::milliseconds(1));
	// Host-side wait for the work submitted to `stream`.
	cudaStreamSynchronize(stream);
}

The above code does not run in parallel when it is called with multiple streams. If you uncomment “std::this_thread::sleep_for(std::chrono::milliseconds(1));”, the streams do compute in parallel. Likewise, if “cudaStreamSynchronize” is not called, the kernels also run concurrently. Why does “cudaStreamSynchronize” block other streams?

Are you running this on Windows? If so, that is probably a factor. Try both settings for Hardware Accelerated GPU Scheduling.

Otherwise, I don’t have any trouble witnessing kernel overlap with your posted code on an L4 GPU with CUDA 12.2 on Linux:


# cat t227.cu
#include <cstdio>
#include <thread>

// Element count used for the device buffer and passed to the kernel: 32 * 2^20.
constexpr size_t N = static_cast<size_t>(32) * 1024 * 1024;
// Sets data[0 .. my_N) to zero.
//
// Launch layout: any 1-D grid of 1-D blocks; no shared memory is used.
// Each thread starts at its own global index and advances by the total number
// of launched threads, so every element is written exactly once regardless of
// the launch configuration. (Previously every thread started at index 0 and
// stepped by blockDim.x, so all threads rewrote the same elements and only
// every blockDim.x-th element was ever cleared.)
//
// data  : buffer of at least my_N elements, dereferenced on the device.
// my_N  : number of elements to clear; 0 is a no-op.
template <typename T>
__global__ void kernel(T *data, size_t my_N){
  // Widen before multiplying so the index cannot overflow 32 bits.
  const size_t first  = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

  for (size_t i = first; i < my_N; i += stride) data[i] = 0;
}

// Queues `kernel` on `stream` (one block of 64 threads, no dynamic shared
// memory) and waits on the host until that stream has finished.
//
// data   : buffer handed to the kernel together with the element count N.
// stream : stream the kernel is submitted to and then synchronized on.
//
// Launch and execution failures are reported on stderr instead of being
// dropped; the function still returns normally so the caller can join the
// thread it runs on.
void launch_kernel(float* data, cudaStream_t stream)
{
        kernel << <1, 64, 0, stream >> > (data, N);
        // A kernel launch has no return value; a bad launch configuration is
        // only visible through cudaGetLastError().
        cudaError_t status = cudaGetLastError();
        if (status != cudaSuccess) {
                std::fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
                return;
        }
        //std::this_thread::sleep_for(std::chrono::milliseconds(1));
        // Errors raised while the kernel runs surface at this synchronizing call.
        status = cudaStreamSynchronize(stream);
        if (status != cudaSuccess) {
                std::fprintf(stderr, "cudaStreamSynchronize failed: %s\n", cudaGetErrorString(status));
        }
}

// Creates two streams and one device buffer of N floats, runs launch_kernel
// on two host threads (one stream each, both on the same buffer), then
// releases the buffer and the streams.
//
// Returns 0 on success and 1 if any CUDA runtime call reported an error.
int main(){
  // Reports a failed runtime call on stderr; returns true when `err` is success.
  auto ok = [](cudaError_t err, const char *what) {
    if (err == cudaSuccess) return true;
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
  };

  cudaStream_t stream1, stream2;
  if (!ok(cudaStreamCreate(&stream1), "cudaStreamCreate(stream1)")) return 1;
  if (!ok(cudaStreamCreate(&stream2), "cudaStreamCreate(stream2)")) {
    cudaStreamDestroy(stream1);
    return 1;
  }

  float *data = nullptr;
  if (!ok(cudaMalloc(&data, N*sizeof(data[0])), "cudaMalloc")) {
    // Do not start the workers without a valid buffer.
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    return 1;
  }

  std::thread t1(launch_kernel, data, stream1);
  std::thread t2(launch_kernel, data, stream2);
  t1.join();
  t2.join();

  // Both workers have synchronized their streams before returning, so the
  // buffer and the streams are idle and can be released.
  bool clean = ok(cudaFree(data), "cudaFree");
  clean = ok(cudaStreamDestroy(stream1), "cudaStreamDestroy(stream1)") && clean;
  clean = ok(cudaStreamDestroy(stream2), "cudaStreamDestroy(stream2)") && clean;
  return clean ? 0 : 1;
}
# nvcc -o t227 t227.cu
# nsys nvprof --print-gpu-trace ./t227
WARNING: t227 and any of its children processes will be profiled.

Generating '/tmp/nsys-report-7ac8.qdstrm'
[1/3] [========================100%] report53.nsys-rep
[2/3] [========================100%] report53.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report

 Start (ns)   Duration (ns)  CorrId  GrdX  GrdY  GrdZ  BlkX  BlkY  BlkZ  Reg/Trd  StcSMem (MB)  DymSMem (MB)  Bytes (MB)  Throughput (MBps)  SrcMemKd  DstMemKd     Device      Ctx  Strm                   Name
 -----------  -------------  ------  ----  ----  ----  ----  ----  ----  -------  ------------  ------------  ----------  -----------------  --------  --------  -------------  ---  ----  ---------------------------------------
 686,404,237     10,281,673     121     1     1     1    64     1     1       16         0.000         0.000                                                     NVIDIA L4 (0)    1    13  void kernel<float>(T1 *, unsigned long)
 686,410,925     10,281,545     122     1     1     1    64     1     1       16         0.000         0.000                                                     NVIDIA L4 (0)    1    14  void kernel<float>(T1 *, unsigned long)

Generated:
    /root/bobc/report53.nsys-rep
    /root/bobc/report53.sqlite
#

The kernels start within 10 microseconds of each other, and each kernel has a duration of ~10 milliseconds, so they are largely overlapped.

1 Like

Sorry, I didn’t describe it clearly. Yes, the operating system is Windows 10. I tried running the code you provided, but the kernels still did not run in parallel; the overlap was zero. I also tried compiling with “--default-stream per-thread”, but that didn’t help either.

You are right: I tried the settings for Hardware Accelerated GPU Scheduling and the kernels started running in parallel. Thank you very much.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.