Hi, I have a persistent kernel function that keeps checking a variable until the condition is satisfied. However, kernels after it is no longer executed even they are assigned to another stream and the program stuck on async kernel launch. But when I ran it under Nsight System, it works just fine.
I am running on Ubuntu20.04, GPU A6000 with compute_86 and CUDA12.2. The following is a sample code that cause the stuck.
__global__ void myKernel(size_t total_windows) {
printf("my kernel executed\n");
if (!total_windows) return;
}
__global__ void pkCheck(volatile size_t *curr_window, size_t total_windows) {
while (true)
{
__nanosleep(10000000);
size_t num_windows = *curr_window;
if (num_windows >= total_windows) {
printf("===all windows processed\n");
break;
}
}
}
int run(FFTPlan &plan) {
// windows: 2048*5000
size_t data[2048] = {0};
size_t *in, *out;
cudaMalloc(&in, sizeof(size_t) * plan.total_samples);
cudaMalloc(&out, sizeof(size_t) * plan.total_samples);
size_t total_windows = plan.total_samples / plan.window_size;
size_t *curr_window;
cudaMallocManaged(&curr_window, sizeof(size_t), cudaMemAttachHost);
curr_window[0] = 0;
cudaCheck();
cudaStream_t proc_stream, control_stream;
cudaStreamCreateWithFlags(&control_stream, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&proc_stream, cudaStreamNonBlocking);
cudaCheck();
printf("pkCheck\n");
pkCheck<<<1, 1, 0, control_stream>>>(curr_window, total_windows);
cudaCheck();
printf("myKernel\n");
myKernel<<<1, 1, 0, proc_stream>>>(total_windows);
cudaCheck();
printf("copying\n");
for (int i = 0; i < total_windows; i++) {
// cudaMemset(in+i*plan.window_size, 0, sizeof(size_t) * plan.window_size); // This stuck on window 1016
cudaMemcpy(in+i*plan.window_size, data, sizeof(size_t) * plan.window_size, cudaMemcpyHostToDevice); // This stuck on window 0
// cudaMemcpyAsync(in+i*plan.window_size, data, sizeof(size_t) * plan.window_size, cudaMemcpyHostToDevice, proc_stream); // This stuck on window 249
curr_window[0]++;
printf("window: %d\n", i);
cudaCheck();
}
cudaDeviceSynchronize();
return 0;
}
The output is like
$ ./mycode
pkCheck
myKernel
copying
window: 0
And it won’t proceed anymore, myKernel
is not executed.
However, if I run it under nsight system, it finished as I expected,
pkCheck
myKernel
copying
my kernel executed
window: 0
window: 1
window: 2
window: 3
....
window: 4998
window: 4999
===all windows processed
Generating '/home/liuxs/tmp/nsys-report-79f3.qdstrm'
[1/1] [0% ] nsys_report.nsys-rep
[1/1] [0% ] nsys_report.nsys-rep
[1/1] [8% ] nsys_report.nsys-rep
[1/1] [=16% ] nsys_report.nsys-rep
[1/1] [==================78% ] nsys_report.nsys-rep
[1/1] [========================100%] nsys_report.nsys-rep
[1/1] [========================100%] nsys_report.nsys-rep
Generated:
XX/nsys_report.nsys-rep
Any idea on cause of this weird behavior? What is the difference between running it directly / under nsys?