Hi, I’m new to CUDA, and I’m trying to use a persistent kernel to reduce the scheduling and synchronizing time for a high frequency operation, but the kernel seems to run slower when I increase the total number of threads, while the one-time kernel does not change a lot. What is the reason for this issue, and how can I fix this? Thank you.
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
__global__ void warmup(int* data){
int id = threadIdx.x + blockIdx.x * blockDim.x;
}
__global__ void onetime_kernel(int* data){
int id = threadIdx.x + blockIdx.x * blockDim.x;
data[id] = id;
}
__global__ void persistent_kernel( volatile int* exit_flag,volatile int* check_flag, int* data){
int id = threadIdx.x + blockIdx.x * blockDim.x;
while(*exit_flag==1){
if(*check_flag==1){
data[id] = id;
__threadfence_system();
if(id==0) *check_flag = 0;
}
}
}
int main(int argc, char **argv){
int block;
int thread_per_block;
if(argc==3){
sscanf(argv[1],"%d",&block);
sscanf(argv[2],"%d",&thread_per_block);
}
else{
block = 1;
thread_per_block = 20;
}
int threads = block*thread_per_block;
printf("block: %d; thread_per_block: %d; total_threads: %d\n", block, thread_per_block, threads);
int *data_;
cudaMallocManaged(&data_, sizeof(int)*threads);
for(int i=0; i<threads; ++i) data_[i] = 0;
cudaStream_t *stream_;
stream_ = (cudaStream_t*) new cudaStream_t[1];
cudaStreamCreate(&stream_[0]);
cudaSetDeviceFlags(cudaDeviceMapHost);
volatile int *exit_flag_d, *exit_flag_h;
cudaHostAlloc((void **)&exit_flag_h,sizeof(int),cudaHostAllocMapped);
cudaHostGetDevicePointer((int **)&exit_flag_d, (int *)exit_flag_h, 0);
volatile int *check_flag_d, *check_flag_h;
cudaHostAlloc((void **)&check_flag_h,sizeof(int),cudaHostAllocMapped);
cudaHostGetDevicePointer((int **)&check_flag_d, (int *)check_flag_h, 0);
warmup<<< 32, 64, 0, stream_[0] >>>(data_);
cudaDeviceSynchronize();
onetime_kernel<<< block, thread_per_block, 0, stream_[0] >>>(data_);
cudaDeviceSynchronize();
for(int i=0; i<threads; ++i) data_[i] = 0;
*exit_flag_h = 1;
*check_flag_h = 0;
persistent_kernel<<< block, thread_per_block, 0, stream_[0] >>>(exit_flag_d,check_flag_d,data_);
clock_t time_1 = clock();
*check_flag_h = 1;
while(*check_flag_h == 1);
clock_t time_2 = clock();
float timer = (float)(time_2-time_1)/CLOCKS_PER_SEC;
printf("timer: %f(ms)\n", timer*1000);
*exit_flag_h = 0;
cudaDeviceSynchronize();
}
And I got the following results:
$ nsys profile --stats=true ./test
block: 1; thread_per_block: 20; total_threads: 20
timer: 0.006000(ms)
...
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) GridXYZ BlockXYZ Name
-------- --------------- --------- -------- -------- -------- -------- ----------- -------------- -------------- --------------------------------------------------------
80.6 33,952 1 33,952.0 33,952.0 33,952 33,952 0.0 1 1 1 20 1 1 persistent_kernel(volatile int *, volatile int *, int *)
11.0 4,640 1 4,640.0 4,640.0 4,640 4,640 0.0 1 1 1 20 1 1 onetime_kernel(int *)
8.4 3,552 1 3,552.0 3,552.0 3,552 3,552 0.0 32 1 1 64 1 1 warmup(int *)
$ nsys profile --stats=true ./test 8 32
block: 8; thread_per_block: 32; total_threads: 256
timer: 0.011000(ms)
...
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) GridXYZ BlockXYZ Name
-------- --------------- --------- -------- -------- -------- -------- ----------- -------------- -------------- --------------------------------------------------------
83.5 43,072 1 43,072.0 43,072.0 43,072 43,072 0.0 8 1 1 32 1 1 persistent_kernel(volatile int *, volatile int *, int *)
9.3 4,800 1 4,800.0 4,800.0 4,800 4,800 0.0 8 1 1 32 1 1 onetime_kernel(int *)
7.2 3,712 1 3,712.0 3,712.0 3,712 3,712 0.0 32 1 1 64 1 1 warmup(int *)
$ nsys profile --stats=true ./test 16 32
block: 16; thread_per_block: 32; total_threads: 512
timer: 0.022000(ms)
...
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) GridXYZ BlockXYZ Name
-------- --------------- --------- -------- -------- -------- -------- ----------- -------------- -------------- --------------------------------------------------------
88.0 60,960 1 60,960.0 60,960.0 60,960 60,960 0.0 16 1 1 32 1 1 persistent_kernel(volatile int *, volatile int *, int *)
6.8 4,736 1 4,736.0 4,736.0 4,736 4,736 0.0 16 1 1 32 1 1 onetime_kernel(int *)
5.2 3,584 1 3,584.0 3,584.0 3,584 3,584 0.0 32 1 1 64 1 1 warmup(int *)
$ nsys profile --stats=true ./test 16 128
block: 16; thread_per_block: 128; total_threads: 2048
timer: 0.090000(ms)
...
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) GridXYZ BlockXYZ Name
-------- --------------- --------- --------- --------- -------- -------- ----------- -------------- -------------- --------------------------------------------------------
95.4 169,600 1 169,600.0 169,600.0 169,600 169,600 0.0 16 1 1 128 1 1 persistent_kernel(volatile int *, volatile int *, int *)
2.7 4,864 1 4,864.0 4,864.0 4,864 4,864 0.0 16 1 1 128 1 1 onetime_kernel(int *)
1.9 3,360 1 3,360.0 3,360.0 3,360 3,360 0.0 32 1 1 64 1 1 warmup(int *)
$ nsys profile --stats=true ./test 16 512
block: 16; thread_per_block: 512; total_threads: 8192
timer: 0.320000(ms)
...
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) GridXYZ BlockXYZ Name
-------- --------------- --------- --------- --------- -------- -------- ----------- -------------- -------------- --------------------------------------------------------
98.4 546,208 1 546,208.0 546,208.0 546,208 546,208 0.0 16 1 1 512 1 1 persistent_kernel(volatile int *, volatile int *, int *)
0.9 5,120 1 5,120.0 5,120.0 5,120 5,120 0.0 16 1 1 512 1 1 onetime_kernel(int *)
0.6 3,488 1 3,488.0 3,488.0 3,488 3,488 0.0 32 1 1 64 1 1 warmup(int *)
FYI, I’m working on a Jetson Orin NX 16Gb with Jetpck 5.1.1 and CUDA 11.4.