Kernels launched after a persistent kernel are not executed unless running under Nsight Systems

It also works if the second kernel, myKernel, has already been launched once before the persistent kernel starts. For example:

// Test kernel used to observe whether a launch makes progress while the
// polling kernel is resident.
//
// total_windows: when zero the kernel exits immediately without printing
//                (used as a "prelaunch"); otherwise the kernel calls
//                kernel_sleep(total_windows * 1000) and then prints a marker.
//                kernel_sleep is defined elsewhere; the unit of its argument
//                is not visible here.
__global__ void myKernel(size_t total_windows) {
    if (total_windows != 0) {
        kernel_sleep(total_windows * 1000);
        printf("my kernel executed\n");
    }
}


// Persistent polling kernel: spins until the counter behind curr_window
// reaches total_windows, then prints a completion marker and exits.
//
// curr_window:   counter read through a volatile pointer so that every poll
//                performs a fresh load; the caller updates it while this
//                kernel is running.
// total_windows: value the counter must reach (>=) for the kernel to finish.
//
// Uses __nanosleep, which requires compute capability 7.0 or newer.
__global__ void pkCheck(volatile size_t *curr_window, size_t total_windows) {
    bool finished = false;
    do {
        // Back off before each poll instead of hammering the counter.
        __nanosleep(10000000);
        finished = (*curr_window >= total_windows);
    } while (!finished);

    printf("===all windows processed\n");
}

// Host-side reproduction driver.
//
// Launches the polling kernel pkCheck on control_stream and myKernel on
// proc_stream, then copies the input to the device one window at a time,
// incrementing a managed counter after each copy so that pkCheck can observe
// progress.
//
// plan: supplies total_samples and window_size. window_size must be non-zero
//       and no larger than the 2048-element host staging buffer.
//
// Returns 0 on completion, or -1 if window_size is unusable.
int run(FFTPlan &plan) {

    // Host staging buffer; every iteration copies one window out of it.
    size_t data[2048] = {0};
    const size_t data_capacity = sizeof(data) / sizeof(data[0]);

    // Validate before dividing by window_size or reading window_size elements
    // from data: zero would divide by zero, and anything above the buffer
    // capacity would read past the end of data in the copy loop.
    const size_t window_size = static_cast<size_t>(plan.window_size);
    if (window_size == 0 || window_size > data_capacity) {
        fprintf(stderr, "run: window_size must be in [1, %zu]\n", data_capacity);
        return -1;
    }

    size_t *in, *out;
    cudaMalloc(&in, sizeof(size_t) * plan.total_samples);
    cudaMalloc(&out, sizeof(size_t) * plan.total_samples);
    const size_t total_windows = plan.total_samples / window_size;
    size_t *curr_window;
    // cudaMallocManaged(&curr_window, sizeof(size_t), cudaMemAttachHost);
    cudaMallocManaged(&curr_window, sizeof(size_t));
    curr_window[0] = 0;
    cudaCheck();

    // Non-blocking streams so neither kernel implicitly synchronizes with the
    // legacy default stream.
    cudaStream_t proc_stream, control_stream;
    cudaStreamCreateWithFlags(&control_stream, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&proc_stream, cudaStreamNonBlocking);
    cudaCheck();

    printf("myKernel\n");
    // prelaunch: with total_windows == 0 the kernel returns immediately.
    // NOTE(review): this warm-up is what lets the later launch make progress
    // next to pkCheck; likely related to CUDA lazy module loading — confirm.
    myKernel<<<1, 1, 0, proc_stream>>>(0);
    cudaCheck();

    printf("pkCheck\n");
    pkCheck<<<1, 1, 0, control_stream>>>(curr_window, total_windows);
    cudaCheck();

    printf("myKernel\n");
    myKernel<<<1, 1, 0, proc_stream>>>(total_windows);
    cudaCheck();

    printf("copying\n");
    // size_t index: matches the type of total_windows, so there is no
    // signed/unsigned comparison and no overflow for large window counts.
    for (size_t i = 0; i < total_windows; i++) {
        // cudaMemset(in+i*window_size, 0, sizeof(size_t) * window_size); // This stuck on window 1016
        cudaMemcpy(in + i * window_size, data, sizeof(size_t) * window_size, cudaMemcpyHostToDevice); // This stuck on window 0
        // cudaMemcpyAsync(in+i*window_size, data, sizeof(size_t) * window_size, cudaMemcpyHostToDevice, proc_stream); // This stuck on window 249

        // Publish progress to pkCheck.
        // NOTE(review): the host writes managed memory while a kernel is
        // running; this relies on the device supporting concurrent managed
        // access — confirm on the target platform.
        curr_window[0]++;
        printf("window: %zu\n", i);
        cudaCheck();
    }

    // Wait for both kernels to finish, then surface any asynchronous error.
    cudaDeviceSynchronize();
    cudaCheck();

    // Release everything this function created; all work has drained above.
    cudaStreamDestroy(proc_stream);
    cudaStreamDestroy(control_stream);
    cudaFree(in);
    cudaFree(out);
    cudaFree(curr_window);
    cudaCheck();
    return 0;
}

The output looks like this:

myKernel
pkCheck
myKernel
copying
window: 0
window: 1
window: 2
window: 3
window: 4
window: 5
window: 6
....
window: 240
window: 241
my kernel executed
window: 242
window: 243
window: 244
window: 245
....
window: 4999
===all windows processed

This suggests that if the kernel's instructions have already been loaded onto the device (e.g. into the L2 cache), the kernel is able to proceed; otherwise, some engine appears to be occupied by the persistent kernel.