Also, if the second kernel, myKernel, has already been launched once beforehand, it works as well. Here is an example:
// Stand-in "processing" kernel used by the experiment.
//
// A launch with total_windows == 0 returns immediately (run() uses this as a
// "prelaunch"). Otherwise the kernel calls kernel_sleep(total_windows * 1000)
// and then prints a marker line via device-side printf.
// NOTE(review): kernel_sleep is defined elsewhere; the unit of its argument is
// not visible from here.
__global__ void myKernel(size_t total_windows) {
    // Guard clause: nothing to do for the zero-window prelaunch.
    if (total_windows == 0) {
        return;
    }

    kernel_sleep(total_windows * 1000);
    printf("my kernel executed\n");
}
// Persistent "control" kernel: polls a counter until it reaches total_windows.
//
// curr_window   - counter that some other agent advances; read through a
//                 volatile pointer so every poll performs a fresh load.
// total_windows - value at which polling stops.
//
// Each iteration first sleeps via __nanosleep(10000000) and then samples the
// counter once. When the sampled value is >= total_windows the kernel prints a
// completion line and returns.
// NOTE(review): __nanosleep needs compute capability 7.0 or newer - confirm the
// build targets.
__global__ void pkCheck(volatile size_t *curr_window, size_t total_windows) {
    size_t seen;
    do {
        __nanosleep(10000000);
        seen = *curr_window;  // exactly one volatile read per poll
    } while (seen < total_windows);

    printf("===all windows processed\n");
}
// Drives the persistent-kernel experiment.
//
// Sequence:
//   1. Allocate device buffers `in`/`out` (plan.total_samples elements each) and
//      a managed counter `curr_window`, initialised to 0.
//   2. Create two non-blocking streams. Launch myKernel(0) on proc_stream (the
//      "prelaunch"), pkCheck on control_stream, then myKernel(total_windows) on
//      proc_stream.
//   3. For every window, copy one window of host data into `in` and bump the
//      managed counter from the host, so pkCheck can observe progress.
//   4. Wait for the device, then release everything that was acquired here.
//
// plan: reads plan.total_samples and plan.window_size.
//
// Returns 0 on success. Returns 1 without launching anything when
// plan.window_size is 0 or when the current device does not report
// concurrent managed access (the host increments managed memory while
// pkCheck is still running, which requires that capability).
//
// NOTE(review): cudaCheck() is defined elsewhere; it is assumed to report the
// most recent CUDA runtime error - confirm.
int run(FFTPlan &plan) {
    const size_t window_size = plan.window_size;
    if (window_size == 0) {
        // Avoids dividing by zero when computing the window count.
        printf("window_size must be non-zero\n");
        return 1;
    }
    const size_t total_windows = plan.total_samples / window_size;

    // The copy loop below writes curr_window from the host while pkCheck is
    // resident on the GPU. That is only legal when the device supports
    // concurrent managed access; otherwise the host access faults.
    int device = 0;
    int concurrent_managed = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&concurrent_managed, cudaDevAttrConcurrentManagedAccess, device);
    cudaCheck();
    if (!concurrent_managed) {
        printf("device does not support concurrent managed access\n");
        return 1;
    }

    // Zero-filled pageable host staging buffer holding exactly one window.
    // (A fixed 2048-element array would be over-read for larger windows.)
    size_t *data = new size_t[window_size]();

    size_t *in = nullptr;
    size_t *out = nullptr;  // allocated for parity with the original experiment; not written here
    cudaMalloc(&in, sizeof(size_t) * plan.total_samples);
    cudaMalloc(&out, sizeof(size_t) * plan.total_samples);

    size_t *curr_window = nullptr;
    // cudaMallocManaged(&curr_window, sizeof(size_t), cudaMemAttachHost);
    cudaMallocManaged(&curr_window, sizeof(size_t));
    cudaCheck();
    curr_window[0] = 0;

    cudaStream_t proc_stream, control_stream;
    cudaStreamCreateWithFlags(&control_stream, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&proc_stream, cudaStreamNonBlocking);
    cudaCheck();

    printf("myKernel\n");
    myKernel<<<1, 1, 0, proc_stream>>>(0); // prelaunch
    cudaCheck();

    printf("pkCheck\n");
    pkCheck<<<1, 1, 0, control_stream>>>(curr_window, total_windows);
    cudaCheck();

    printf("myKernel\n");
    myKernel<<<1, 1, 0, proc_stream>>>(total_windows);
    cudaCheck();

    printf("copying\n");
    for (size_t i = 0; i < total_windows; i++) {
        size_t *dst = in + i * window_size;
        const size_t bytes = sizeof(size_t) * window_size;
        // cudaMemset(dst, 0, bytes); // This stuck on window 1016
        cudaMemcpy(dst, data, bytes, cudaMemcpyHostToDevice); // This stuck on window 0
        // cudaMemcpyAsync(dst, data, bytes, cudaMemcpyHostToDevice, proc_stream); // This stuck on window 249
        curr_window[0]++;  // publish progress to the polling pkCheck kernel
        printf("window: %zu\n", i);
        cudaCheck();
    }

    // Wait for both kernels to finish before tearing anything down.
    cudaDeviceSynchronize();
    cudaCheck();

    // Release everything acquired above.
    cudaStreamDestroy(proc_stream);
    cudaStreamDestroy(control_stream);
    cudaFree(curr_window);
    cudaFree(out);
    cudaFree(in);
    delete[] data;
    return 0;
}
The output looks like this:
myKernel
pkCheck
myKernel
copying
window: 0
window: 1
window: 2
window: 3
window: 4
window: 5
window: 6
....
window: 240
window: 241
my kernel executed
window: 242
window: 243
window: 244
window: 245
....
window: 4999
===all windows processed
This suggests that if the kernel’s instructions have already been loaded onto the device (e.g., into the L2 cache), the kernel is able to proceed. Otherwise, it seems that some engine is taken up by the persistent kernel.