CUDA Algorithm slows down with runtime

Hi,

I programmed a simple filter in CUDA.
Unfortunately, its measured execution time keeps increasing the longer the program runs.
Plotted over successive launches, the execution time behaves like a staircase function.

What can this be?

It is not optimized for performance yet, but that would not explain the constantly increasing execution time.

// Horizontal (x-direction) Gaussian filter pass for octave O using filter L.
// Expects a 2D launch with one thread per output pixel.
// NOTE(review): assumes pcf->Size is odd and that pcf->p_Filter / pcf->Norm
// describe an unnormalized integer-ish kernel and its normalization factor —
// confirm against the filter setup code.
__global__ void KCNL_EXECUTE_Gauss_Filter_x(unsigned int O, unsigned int L)
{
	// Buffer pointers: filter descriptor, destination plane, source picture.
	s_CUDA_Filter_t * __restrict__	pcf = &(s_CUDA_CLASS_DEVICE_Holder.sarr_CUDA_Filter[L]);
	unsigned char *	__restrict__	dst = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].ui_GX[L];
	unsigned char *	__restrict__	src = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].s_Picture.p_Picture;

	// Grid coordinates
	unsigned int const ui_GC_X = blockIdx.x * blockDim.x + threadIdx.x;
	unsigned int const ui_GC_Y = blockIdx.y * blockDim.y + threadIdx.y;

	// Width of THIS octave's picture. The original mixed Oktave[0] (for the
	// destination index) with Oktave[O] (for the bounds); use O consistently.
	unsigned int const ui_WIDTH = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].s_Picture.ui_WIDTH;

	// Calculate linear destination index
	unsigned int const ui_DST_IDX = ui_GC_Y * ui_WIDTH + ui_GC_X;
	// Index of the filter's center tap
	unsigned int const ui_FILTER_CENTER_IDX = (pcf->Size - 1u) / 2u;
	// First valid x coordinate (left border)
	unsigned int const ui_LB = ui_FILTER_CENTER_IDX;
	// One PAST the last valid x coordinate (right border). The original used
	// `ui_GC_X <= WIDTH - center`, which lets the rightmost thread read one
	// element past the end of the row; the comparison below must be strict.
	unsigned int const ui_UB = ui_WIDTH - ui_FILTER_CENTER_IDX;

	// Border handling: skip pixels whose filter window would leave the row.
	// NOTE(review): there is no guard on ui_GC_Y against the picture height,
	// so a grid taller than the image writes out of bounds. Add a
	// `ui_GC_Y < height` check once the height field name is confirmed.
	if (ui_LB <= ui_GC_X && ui_GC_X < ui_UB)
	{
		// Accumulate in float: `long double` is not supported in CUDA device
		// code (nvcc demotes it to double with a warning), and double-precision
		// math is needlessly slow for an 8-bit image filter.
		float f_Val = 0.0f;
		unsigned int const ui_FILTER_START_IDX = ui_DST_IDX - ui_FILTER_CENTER_IDX;
		for (unsigned int i = 0; i < pcf->Size; i++)
		{
			f_Val += src[ui_FILTER_START_IDX + i] * pcf->p_Filter[i];
		}
		f_Val /= (float)pcf->Norm;

		// Round to nearest instead of truncating on the uchar store.
		dst[ui_DST_IDX] = (unsigned char)(f_Val + 0.5f);
	}
}

Best Regards,
TinTin

You might be doing something like this:

https://stackoverflow.com/questions/53970187/cuda-stream-is-blocked-when-launching-many-kernels-1000

which would be a measurement methodology error