Persistent kernel runs slower with more threads

Hi, I’m new to CUDA. I’m trying to use a persistent kernel to reduce the scheduling and synchronization time of a high-frequency operation, but the persistent kernel gets slower as I increase the total number of threads, while an equivalent one-time kernel barely changes. What is the reason for this, and how can I fix it? Thank you.

#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>      // clock(), clock_t
#include <iostream>

// Empty kernel used only to warm up the GPU before the timed launches.
__global__ void warmup(int* data){
	int id = threadIdx.x + blockIdx.x * blockDim.x;
	(void)id;   // intentionally unused
}


// Baseline: one launch per operation; each thread writes its own index.
__global__ void onetime_kernel(int* data){
	int id = threadIdx.x + blockIdx.x * blockDim.x;
	data[id] = id;
}

// Persistent kernel: every thread spins until the host raises check_flag,
// does one round of work, and thread 0 clears the flag to signal completion.
__global__ void persistent_kernel(volatile int* exit_flag, volatile int* check_flag, int* data){
	int id = threadIdx.x + blockIdx.x * blockDim.x;

	while(*exit_flag == 1){
		if(*check_flag == 1){
			data[id] = id;
			__threadfence_system();           // make the writes visible to the host
			if(id == 0) *check_flag = 0;      // signal the host that this step is done
		}
	}
}


int main(int argc, char **argv){

	int block;
	int thread_per_block;

	if(argc==3){
		sscanf(argv[1],"%d",&block);
		sscanf(argv[2],"%d",&thread_per_block);
	}
	else{
		block = 1;
		thread_per_block = 20;
	}

	int threads = block*thread_per_block;
	printf("block: %d; thread_per_block: %d; total_threads: %d\n", block, thread_per_block, threads);

	int *data_;
	cudaMallocManaged(&data_, sizeof(int)*threads);
	for(int i=0; i<threads; ++i) data_[i] = 0;

	cudaStream_t *stream_ = new cudaStream_t[1];
	cudaStreamCreate(&stream_[0]);

	// Zero-copy (mapped) host memory for the host <-> device flags.
	cudaSetDeviceFlags(cudaDeviceMapHost);
	volatile int *exit_flag_d, *exit_flag_h;
	cudaHostAlloc((void **)&exit_flag_h, sizeof(int), cudaHostAllocMapped);
	cudaHostGetDevicePointer((int **)&exit_flag_d, (int *)exit_flag_h, 0);

	volatile int *check_flag_d, *check_flag_h;
	cudaHostAlloc((void **)&check_flag_h, sizeof(int), cudaHostAllocMapped);
	cudaHostGetDevicePointer((int **)&check_flag_d, (int *)check_flag_h, 0);

	warmup<<< 32, 64, 0, stream_[0] >>>(data_);
	cudaDeviceSynchronize();

	onetime_kernel<<< block, thread_per_block, 0, stream_[0] >>>(data_);
	cudaDeviceSynchronize();
	for(int i=0; i<threads; ++i) data_[i] = 0;

	*exit_flag_h = 1;
	*check_flag_h = 0;

	// Launch the persistent kernel; it spins until exit_flag is cleared.
	persistent_kernel<<< block, thread_per_block, 0, stream_[0] >>>(exit_flag_d, check_flag_d, data_);

	clock_t time_1 = clock();

	// Trigger one round of work and wait until the kernel clears the flag.
	*check_flag_h = 1;
	while(*check_flag_h == 1);

	clock_t time_2 = clock();
	float timer =  (float)(time_2-time_1)/CLOCKS_PER_SEC;

	printf("timer: %f(ms)\n", timer*1000);

	*exit_flag_h = 0;               // let the persistent kernel exit
	cudaDeviceSynchronize();
}

And I got the following results:

$ nsys profile --stats=true ./test
block: 1; thread_per_block: 20; total_threads: 20
timer: 0.006000(ms)
...
Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)     GridXYZ         BlockXYZ                               Name                          
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------  --------------  --------------------------------------------------------
     80.6           33,952          1  33,952.0  33,952.0    33,952    33,952          0.0     1    1    1    20    1    1  persistent_kernel(volatile int *, volatile int *, int *)
     11.0            4,640          1   4,640.0   4,640.0     4,640     4,640          0.0     1    1    1    20    1    1  onetime_kernel(int *)                                   
      8.4            3,552          1   3,552.0   3,552.0     3,552     3,552          0.0    32    1    1    64    1    1  warmup(int *)
$ nsys profile --stats=true ./test 8 32
block: 8; thread_per_block: 32; total_threads: 256
timer: 0.011000(ms)
...
Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)     GridXYZ         BlockXYZ                               Name                          
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------  --------------  --------------------------------------------------------
     83.5           43,072          1  43,072.0  43,072.0    43,072    43,072          0.0     8    1    1    32    1    1  persistent_kernel(volatile int *, volatile int *, int *)
      9.3            4,800          1   4,800.0   4,800.0     4,800     4,800          0.0     8    1    1    32    1    1  onetime_kernel(int *)                                   
      7.2            3,712          1   3,712.0   3,712.0     3,712     3,712          0.0    32    1    1    64    1    1  warmup(int *)
$ nsys profile --stats=true ./test 16 32
block: 16; thread_per_block: 32; total_threads: 512
timer: 0.022000(ms)
...
Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)     GridXYZ         BlockXYZ                               Name                          
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------  --------------  --------------------------------------------------------
     88.0           60,960          1  60,960.0  60,960.0    60,960    60,960          0.0    16    1    1    32    1    1  persistent_kernel(volatile int *, volatile int *, int *)
      6.8            4,736          1   4,736.0   4,736.0     4,736     4,736          0.0    16    1    1    32    1    1  onetime_kernel(int *)                                   
      5.2            3,584          1   3,584.0   3,584.0     3,584     3,584          0.0    32    1    1    64    1    1  warmup(int *)
$ nsys profile --stats=true ./test 16 128
block: 16; thread_per_block: 128; total_threads: 2048
timer: 0.090000(ms)
...
Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)     GridXYZ         BlockXYZ                               Name                          
 --------  ---------------  ---------  ---------  ---------  --------  --------  -----------  --------------  --------------  --------------------------------------------------------
     95.4          169,600          1  169,600.0  169,600.0   169,600   169,600          0.0    16    1    1   128    1    1  persistent_kernel(volatile int *, volatile int *, int *)
      2.7            4,864          1    4,864.0    4,864.0     4,864     4,864          0.0    16    1    1   128    1    1  onetime_kernel(int *)                                   
      1.9            3,360          1    3,360.0    3,360.0     3,360     3,360          0.0    32    1    1    64    1    1  warmup(int *)
$ nsys profile --stats=true ./test 16 512
block: 16; thread_per_block: 512; total_threads: 8192
timer: 0.320000(ms)
...
Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)     GridXYZ         BlockXYZ                               Name                          
 --------  ---------------  ---------  ---------  ---------  --------  --------  -----------  --------------  --------------  --------------------------------------------------------
     98.4          546,208          1  546,208.0  546,208.0   546,208   546,208          0.0    16    1    1   512    1    1  persistent_kernel(volatile int *, volatile int *, int *)
      0.9            5,120          1    5,120.0    5,120.0     5,120     5,120          0.0    16    1    1   512    1    1  onetime_kernel(int *)                                   
      0.6            3,488          1    3,488.0    3,488.0     3,488     3,488          0.0    32    1    1    64    1    1  warmup(int *)

FYI, I’m working on a Jetson Orin NX 16GB with JetPack 5.1.1 and CUDA 11.4. I tried commenting out __threadfence_system(), but that didn’t change the results much. I also tried using __syncthreads() and letting the first thread of each block atomicAdd to a flag for the host, and that didn’t change the behavior much either.
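
Roughly, that atomicAdd variant looked like this (a simplified sketch, not my exact code; block_done is another zero-copy counter that the host resets to 0 before raising check_flag and then polls until it equals the number of blocks, and the shared-memory broadcast is just to keep __syncthreads() out of divergent code):

__global__ void persistent_kernel_blocksignal(volatile int* exit_flag, volatile int* check_flag,
                                              int* block_done, int* data){
	int id = threadIdx.x + blockIdx.x * blockDim.x;
	__shared__ int run;              // decision broadcast to the whole block
	bool done = false;               // each block signals at most once (enough for one trigger)

	while(true){
		if(threadIdx.x == 0)
			run = (*exit_flag != 1) ? -1 : (*check_flag == 1 ? 1 : 0);
		__syncthreads();             // every thread sees the same decision
		if(run == -1) break;         // exit requested by the host

		if(run == 1 && !done){
			data[id] = id;
			__threadfence_system();  // publish the writes before signalling
			__syncthreads();         // whole block has finished this step
			if(threadIdx.x == 0) atomicAdd(block_done, 1);   // one tick per block
			done = true;
		}
		__syncthreads();             // keep loop iterations in lock-step
	}
}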

Hi,

GPU resources on Jetson are shared in a time-slicing manner, so if you run more threads, they have to share those resources.

For the one-time kernel, the scheduler allocates resources as soon as they become available.
But for the persistent kernel, work may have to wait, since all of its threads stay active, busy-waiting, for the whole time.
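
If the busy-wait itself is the problem, one thing you could try (just a sketch, assuming the kernel is built for sm_70 or newer so __nanosleep() is available; Orin is sm_87) is to poll the flag less aggressively:

__global__ void persistent_kernel_throttled(volatile int* exit_flag,
                                            volatile int* check_flag, int* data){
	int id = threadIdx.x + blockIdx.x * blockDim.x;
	while(*exit_flag == 1){
		if(*check_flag == 1){
			data[id] = id;
			__threadfence_system();
			if(id == 0) *check_flag = 0;
		} else {
			__nanosleep(200);        // back off instead of hammering the host flag
		}
	}
}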

Thanks.

Hi, thank you for your response. Do you think there’s anything I can do to fix this, at least when the total number of threads is not too large for the GPU, say below 2k or 3k? I got these results when no other program was using the GPU except the display, so I thought the SMs or cores should be available for the persistent kernel, at least when the total thread count is under 4k. Also, in my actual application I repeat the code between clock_t time_1 = clock(); and clock_t time_2 = clock(); hundreds of times before setting the exit flag to zero, and I get the data from all threads correctly, so it doesn’t look like some threads are waiting for other threads to finish before they even run. I did hit that problem when I set the total thread count too high (somewhere above 8k, I think), but it looks different from this slowdown, which is roughly proportional to the total number of threads even when the thread count is small.

Hi,

The warp size on Orin is 32, so at least 32 threads’ worth of resources are used by each kernel launch.

Could you share more info about your use case?
For example, why do you need the persistent kernel, and how does it reduce the synchronization time?

Thanks.

Hi, I’m running an iterative algorithm. In each iteration I use the kernel to process some data in parallel and then do a sequential operation on the CPU based on the kernel’s results, so I cannot parallelize all the work into a single call; I have to run the GPU operations one after another, each with some degree of parallelism, and I need this to be as fast as possible. My current rate with the persistent kernel is around 500 iterations in about 60 milliseconds.

I used Nsight to check the kernel times. If I use a one-time kernel and launch it in every iteration, the latency between calling the kernel from the host and the actual start of the kernel is about 15 microseconds, the kernel takes about 20 microseconds to finish, and cudaStreamSynchronize() takes another 20 to 40 microseconds after the kernel ends, while the sequential CPU operation in each iteration is much shorter than all of this, so the whole algorithm takes much longer than the persistent-kernel version. I figured that with a persistent kernel and zero-copy memory I can skip the scheduling and synchronization time, and I did get the shortest total run time for the algorithm with the persistent kernel (in each iteration I use check_flag to trigger the GPU operation, and exit_flag to end the kernel once the algorithm finishes).
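
Roughly, the timed part looks like this (a simplified sketch of what I described above, not my exact code; sequential_cpu_work() is a placeholder for the CPU step, and iterations is a few hundred in my case):

// Driven from the host; the persistent kernel from the first post is already running.
for (int iter = 0; iter < iterations; ++iter) {
	*check_flag_h = 1;              // tell the persistent kernel to run one step
	while (*check_flag_h == 1);     // the kernel clears the flag when the step is done
	sequential_cpu_work(data_);     // placeholder for the sequential CPU part
}
*exit_flag_h = 0;                   // end the persistent kernel after the last iteration
cudaDeviceSynchronize();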

Now I’m trying to further optimize the algorithm by reducing the number of iterations and using more threads per iteration, but I’m stuck here: if I increase the thread count, it takes longer to finish the GPU operation (the code inside the if block), and the total time increases even though there are fewer iterations. I’m currently using 16 blocks with 32 threads per block, which I don’t think is a lot of threads for an Orin GPU, and my actual kernel doesn’t use many registers or much shared memory. I’d like to know if there’s any way to reduce the time increase when I add threads. FYI, I also tried the code above on my laptop and the results are quite similar.

Hi,

Kernels submitted to the same CUDA stream are guaranteed to execute in order.

So you can create several CUDA streams to get parallelism that suits your algorithm, e.g. N streams for N independent camera inputs.

And launch the tasks that must run sequentially into the same stream, e.g. use the k-th stream for pre-processing, inference, and post-processing of the k-th camera input, as in the sketch below.
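
A rough sketch of that pattern (the kernel names, grids, and buffers here are placeholders, not from your code):

const int N = 4;                                   // e.g. N independent camera inputs
cudaStream_t streams[N];
for (int i = 0; i < N; ++i) cudaStreamCreate(&streams[i]);

for (int i = 0; i < N; ++i) {
	// Within one stream the three kernels run in order; across streams they may overlap.
	preprocess <<< grid, blockSize, 0, streams[i] >>>(input[i]);
	inference  <<< grid, blockSize, 0, streams[i] >>>(input[i], scratch[i]);
	postprocess<<< grid, blockSize, 0, streams[i] >>>(scratch[i], output[i]);
}
for (int i = 0; i < N; ++i) cudaStreamSynchronize(streams[i]);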

It also looks like your kernel is relatively small, so you may want to wrap it in a CUDA graph.
CUDA graphs reduce kernel launch/teardown overhead and might help your use case; see the sketch after the links below.

Blog: Getting Started with CUDA Graphs | NVIDIA Technical Blog
Example: cuda-samples/Samples/3_CUDA_Features/simpleCudaGraphs at master · NVIDIA/cuda-samples · GitHub
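
A minimal stream-capture sketch for your one-time kernel (using the variables from your code above; iterations is a placeholder and error checks are omitted):

cudaGraph_t graph;
cudaGraphExec_t graph_exec;

// Record the launch into a graph instead of executing it right away.
cudaStreamBeginCapture(stream_[0], cudaStreamCaptureModeGlobal);
onetime_kernel<<< block, thread_per_block, 0, stream_[0] >>>(data_);
cudaStreamEndCapture(stream_[0], &graph);
cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0);

for (int iter = 0; iter < iterations; ++iter) {
	cudaGraphLaunch(graph_exec, stream_[0]);   // replay with lower per-launch overhead
	cudaStreamSynchronize(stream_[0]);
	// ... sequential CPU step based on data_ ...
}

cudaGraphExecDestroy(graph_exec);
cudaGraphDestroy(graph);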

Thanks.