• GPU: RTX 2060
• CUDA version: 11.4
// Enqueues `kernel` on `stream` using a single block of 64 threads and no
// dynamic shared memory, then blocks the calling host thread until all work
// queued on that stream has completed.
void launch_kernel(float* data, cudaStream_t stream)
{
    const dim3 grid(1);
    const dim3 block(64);
    kernel<<<grid, block, 0, stream>>>(data, N);
    //std::this_thread::sleep_for(std::chrono::milliseconds(1));
    cudaStreamSynchronize(stream);
}
With the code above, kernels launched into multiple streams do not execute in parallel. If I uncomment “std::this_thread::sleep_for(std::chrono::milliseconds(1));”, the kernels in the different streams do run in parallel. They also run concurrently if I remove the “cudaStreamSynchronize” call. Why does “cudaStreamSynchronize” on one stream block the other streams?
Are you running this on Windows? If so, that is probably a factor. Try both settings (on and off) for Hardware-Accelerated GPU Scheduling.
Otherwise, I don’t have any trouble witnessing kernel overlap with your posted code on an L4 GPU with CUDA 12.2 on Linux:
# cat t227.cu
#include <thread>
const size_t N = 1048576*32;
// Sets the first my_N elements of `data` to zero.
//
// Uses a 1-D grid-stride loop: each thread starts at its own global index and
// advances by the total number of launched threads, so every element in
// [0, my_N) is written exactly once regardless of the grid/block size chosen
// at launch. (Previously every thread started at index 0 and stepped by
// blockDim.x, so all threads redundantly wrote the same elements and the rest
// were never touched.)
//
// Preconditions: `data` must point to at least my_N elements of device-
// accessible memory; only the x dimension of the launch configuration is used.
template <typename T>
__global__ void kernel(T *data, size_t my_N){
  // Widen before multiplying so the index math cannot overflow 32 bits.
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < my_N; i += stride)
    data[i] = 0;
}
// Host-side helper run by each worker thread: queues one `kernel` launch
// (1 block x 64 threads, no dynamic shared memory) on the given stream and
// then waits for that stream to finish before returning.
void launch_kernel(float* data, cudaStream_t stream)
{
    const dim3 blocksPerGrid(1);
    const dim3 threadsPerBlock(64);
    kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(data, N);
    //std::this_thread::sleep_for(std::chrono::milliseconds(1));
    cudaStreamSynchronize(stream);
}
// Creates two streams and one device buffer of N floats, then runs
// launch_kernel from two host threads (one per stream) so that a profiler
// trace can show whether the two kernels overlap.
//
// Returns 0 on success and 1 if any CUDA call reports an error.
int main(){
  cudaStream_t stream1, stream2;
  if (cudaStreamCreate(&stream1) != cudaSuccess) return 1;
  if (cudaStreamCreate(&stream2) != cudaSuccess) {
    cudaStreamDestroy(stream1);
    return 1;
  }

  float *data = nullptr;
  if (cudaMalloc(&data, N*sizeof(data[0])) != cudaSuccess) {
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    return 1;
  }

  // Both threads intentionally share the same buffer; each uses its own stream.
  std::thread t1(launch_kernel, data, stream1);
  std::thread t2(launch_kernel, data, stream2);
  t1.join();
  t2.join();

  // launch_kernel discards its CUDA status codes, so check here whether the
  // device reports any error from the work issued by the two threads.
  const bool ok = (cudaDeviceSynchronize() == cudaSuccess);

  // Release device resources explicitly instead of relying on process exit.
  cudaFree(data);
  cudaStreamDestroy(stream1);
  cudaStreamDestroy(stream2);
  return ok ? 0 : 1;
}
# nvcc -o t227 t227.cu
# nsys nvprof --print-gpu-trace ./t227
WARNING: t227 and any of its children processes will be profiled.
Generating '/tmp/nsys-report-7ac8.qdstrm'
[1/3] [========================100%] report53.nsys-rep
[2/3] [========================100%] report53.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report
Start (ns) Duration (ns) CorrId GrdX GrdY GrdZ BlkX BlkY BlkZ Reg/Trd StcSMem (MB) DymSMem (MB) Bytes (MB) Throughput (MBps) SrcMemKd DstMemKd Device Ctx Strm Name
----------- ------------- ------ ---- ---- ---- ---- ---- ---- ------- ------------ ------------ ---------- ----------------- -------- -------- ------------- --- ---- ---------------------------------------
686,404,237 10,281,673 121 1 1 1 64 1 1 16 0.000 0.000 NVIDIA L4 (0) 1 13 void kernel<float>(T1 *, unsigned long)
686,410,925 10,281,545 122 1 1 1 64 1 1 16 0.000 0.000 NVIDIA L4 (0) 1 14 void kernel<float>(T1 *, unsigned long)
Generated:
/root/bobc/report53.nsys-rep
/root/bobc/report53.sqlite
#
The kernels start within 10 microseconds of each other, and each kernel has a duration of ~10 milliseconds, so they are largely overlapped.
1 Like
Sorry, I didn’t describe it clearly. Yes, the operating system is Windows 10. I tried running the code you provided, but the kernels still did not run in parallel; the overlap was zero. I also tried compiling with “--default-stream per-thread”, but that didn’t help either.
You are right: I tried changing the Hardware-Accelerated GPU Scheduling setting and the kernels started running in parallel. Thank you very much.
system
Closed
November 13, 2024, 4:39am
5
This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.