Hi,
I programmed a simple filter in CUDA.
Unfortunately, it keeps getting slower: its execution time continues to increase.
The execution time behaves like a staircase function.
What could be causing this?
The kernel is not optimized for performance yet, but I cannot explain the constantly increasing execution time.
/**
 * 1-D filter pass along x (image rows) for one octave image.
 *
 * Each thread handles the pixel (x, y) given by its position in a 2-D launch
 * grid. It reads pcf->Size consecutive source pixels of that row, weights them
 * with pcf->p_Filter, divides the sum by pcf->Norm and stores the result as an
 * unsigned char in the destination buffer. Pixels whose filter window would
 * reach past the left or right end of the row are left untouched.
 *
 * @param O  Index into ps_CUDA_CLASS_DEVICE_Oktave; selects the source picture
 *           (s_Picture.p_Picture) and the destination buffer set (ui_GX).
 * @param L  Index into sarr_CUDA_Filter and into the octave's ui_GX array.
 *
 * NOTE(review): no check against the image height is made here (no height
 * field is visible in this block), so the launch must not create threads with
 * y >= image height - confirm at the launch site.
 * NOTE(review): the row stride is taken from octave O (the original used
 * octave 0); this assumes every octave buffer is laid out with its own width.
 */
__global__ void KCNL_EXECUTE_Gauss_Filter_x(unsigned int O, unsigned int L)
{
    // Filter description and image buffers for the requested level / octave.
    const s_CUDA_Filter_t * __restrict__ pcf = &(s_CUDA_CLASS_DEVICE_Holder.sarr_CUDA_Filter[L]);
    unsigned char * __restrict__ dst = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].ui_GX[L];
    const unsigned char * __restrict__ src = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].s_Picture.p_Picture;

    // Grid coordinates of this thread (pixel x / y).
    unsigned int const ui_GC_X = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int const ui_GC_Y = blockIdx.y * blockDim.y + threadIdx.y;

    // Read width and filter size once instead of on every use.
    unsigned int const ui_WIDTH = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].s_Picture.ui_WIDTH;
    unsigned int const ui_SIZE = pcf->Size;

    // An empty filter, or one wider than the row, can never fit; bailing out
    // here also keeps the unsigned subtractions below from wrapping around.
    if (ui_SIZE == 0 || ui_SIZE > ui_WIDTH)
    {
        return;
    }

    // Offset of the filter centre inside the window.
    unsigned int const ui_FILTER_CENTER_IDX = (ui_SIZE - 1) / 2;
    // Largest x for which the whole window [x - centre, x - centre + size - 1]
    // still lies inside the row. For odd sizes this is width - centre - 1.
    unsigned int const ui_LAST_X = ui_WIDTH - ui_SIZE + ui_FILTER_CENTER_IDX;

    // Skip the left / right border where the window would leave the row.
    if (ui_GC_X < ui_FILTER_CENTER_IDX || ui_GC_X > ui_LAST_X)
    {
        return;
    }

    // Linear pixel index; the stride is the width of the same octave the
    // buffers belong to. size_t avoids 32-bit overflow on very large images.
    size_t const DST_IDX = (size_t)ui_GC_Y * ui_WIDTH + ui_GC_X;
    size_t const FILTER_START_IDX = DST_IDX - ui_FILTER_CENTER_IDX;

    // long double is not supported in CUDA device code (nvcc treats it as
    // double), so double is spelled out explicitly.
    double d_Val = 0.0;
    for (unsigned int i = 0; i < ui_SIZE; i++)
    {
        d_Val += src[FILTER_START_IDX + i] * pcf->p_Filter[i];
    }

    // Normalise and narrow to the 8-bit output pixel.
    dst[DST_IDX] = (unsigned char)(d_Val / (double)pcf->Norm);
}
Best Regards,
TinTin