Hi,
I have a slightly weak graphics card, and a stronger CPU.
When I outsource image processing to the GPU, the GPU actually performs worse and worse.
Is that due to my weak Grafigkarte ??
Or have I not yet fully maxed out the GPU?
I have already reviewed the algorithms with Visual Profiler.
Now and then there are changes that Visual Profiler should have made an improvement after a real measured.
Incidentally, if I do not use shared memory, the algorithm runs the fastest. I made some effort
to load the image accordingly in a shared memory buffer.
BLOCK[0] = 16u;
BLOCK[1] = 12u;
GRID[0] = 40u;
GRID[1] = 40u;
s_CUDA_CLASS_Gauss.fptr_CUDA_EXECUTE_Gauss(GRID, BLOCK, 0);
cudaStream_t streams[10];
void fptr_CUDA_EXECUTE_Gauss(unsigned int ui_GRID[2], unsigned int ui_BLOCK[2],unsigned int O)
{
static char IsInit = 0;
dim3 dimGrid(ui_GRID[0], ui_GRID[1], 1);
dim3 dimBlock(ui_BLOCK[0], ui_BLOCK[1], 1);
unsigned char StreamIDX = 0;
unsigned char StreamIDXPrev = 0;
cudaError_t CUDAError;
if(IsInit==0)
CUDAError = cudaStreamCreate(&streams[0]);
KCNL_EXECUTE_Gauss_Filter_x << <dimGrid, dimBlock, sizeof(unsigned short)*s_CUDA_CLASS_Gauss.arr_FilterSizes[0] + (ui_BLOCK[0] + (s_CUDA_CLASS_Gauss.arr_FilterSizes[0] - 1))*ui_BLOCK[1], streams[0] >> > (O, 0);
CUDAError = cudaGetLastError();
for (unsigned int i = 0; i < s_CUDA_CLASS_Sift.ui_NUM_LEVELS_Q + 2; i++)
{
if (IsInit == 0)
CUDAError = cudaStreamCreate(&streams[i+1]);
KCNL_EXECUTE_Gauss_Filter_x << <dimGrid, dimBlock, sizeof(unsigned short)*s_CUDA_CLASS_Gauss.arr_FilterSizes[i+1]+ (ui_BLOCK[0] + (s_CUDA_CLASS_Gauss.arr_FilterSizes[i + 1] - 1))*ui_BLOCK[1], streams[i + 1] >> > (O, i + 1);
}
CUDAError = cudaStreamSynchronize(streams[0]);
for (unsigned int i = 0; i < s_CUDA_CLASS_Sift.ui_NUM_LEVELS_Q + 2; i++)
CUDAError = cudaStreamSynchronize(streams[i + 1]);
IsInit = 1;
}
#define MAX_THREADS_PER_BLOCK (16u*12u)
#define MIN_BLOCKS_PER_MP 2
__global__ void
__launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
KCNL_EXECUTE_Gauss_Filter_x
(unsigned int O, unsigned int L)
{
// Get buffer pointer
s_CUDA_Filter_t * __restrict__ pcf = &(s_CUDA_CLASS_DEVICE_Holder.sarr_CUDA_Filter[L]);
unsigned char * __restrict__ dst = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].ui_GX[L];
unsigned char * __restrict__ src = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].s_Picture.p_Picture;
extern __shared__ unsigned char ScratchPadMEM[];
// SIZE: sizeof(unsigned short)*pcf->Size
#define SratchPadFilter ((unsigned short * const __restrict__)&(ScratchPadMEM[0]))
// SIZE: blockDim.x + (pcf->Size - 1))*blockDim.y
#define SratchPadImgBuff ((unsigned char * const __restrict__)&(ScratchPadMEM))
// Grid koordinaten
unsigned int const ui_GC_X = (blockIdx.x * blockDim.x + threadIdx.x);
unsigned int const ui_GC_Y_MUL_WIDTH = ((blockIdx.y * blockDim.y + threadIdx.y) * s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[0].s_Picture.ui_WIDTH);
// Block koordinaten
unsigned int const ScratchPadDim_x = (blockDim.x + (pcf->Size - 1));
// Calcualte filter center index
unsigned int const ui_FILTER_CENTER_IDX = ((pcf->Size - 1) / 2);
// destination scratchpad idx
unsigned int const ui_DS_y_MUL_ScratchPadDim_x = (threadIdx.y*ScratchPadDim_x);
unsigned int ui_DS;
{
unsigned int const ui_BLK_IDX = threadIdx.y*blockDim.x + threadIdx.x;
if (ui_BLK_IDX < pcf->Size)
{
SratchPadFilter[ui_BLK_IDX] = pcf->p_Filter[ui_BLK_IDX];
}
}
{
unsigned int const ui_LOOP_SIZE_X = ceil((double)(ScratchPadDim_x) / (double)blockDim.x);
unsigned int const ui_DS_xTmp = threadIdx.x * ui_LOOP_SIZE_X;
for (unsigned int i = 0; i < ui_LOOP_SIZE_X; i++)
{
#define ui_DS_x (ui_DS_xTmp + i)
#define ui_DS (ui_DS_y_MUL_ScratchPadDim_x + ui_DS_x)
unsigned int const ui_SG_x = ui_GC_X - (ui_FILTER_CENTER_IDX + threadIdx.x) + ui_DS_x;
//unsigned int const &ui_SG_y = ui_GC_Y;
unsigned int const ui_SG = ui_GC_Y_MUL_WIDTH + ui_SG_x;
if (ScratchPadDim_x > ui_DS_x && ui_GC_X > ui_FILTER_CENTER_IDX)
{
SratchPadImgBuff[ui_DS] = src[ui_SG];
}
}
#undef ui_DS_x
#undef ui_DS
}
__syncthreads();
{
// calculate lower bound
unsigned int const &ui_LB = (ui_FILTER_CENTER_IDX);
// calculate upper bound
unsigned int const ui_UB = (s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].s_Picture.ui_WIDTH - (ui_FILTER_CENTER_IDX + 1));
long double d_Val = 0.0;
//dst[ui_DG] = SratchPadImgBuff[ui_SS];
if (ui_LB <= ui_GC_X && ui_GC_X <= ui_UB)
{
for (unsigned int i = 0; i < pcf->Size; i++)
{
unsigned int const ui_DS_x = (threadIdx.x + i);
unsigned int const ui_DS = (ui_DS_y_MUL_ScratchPadDim_x + ui_DS_x);
d_Val += SratchPadImgBuff[ui_DS] * SratchPadFilter[i];
}
dst[ui_GC_Y_MUL_WIDTH + ui_GC_X] = d_Val / (long double)pcf->Norm;
}
}
}
#undef MAX_THREADS_PER_BLOCK
#undef MIN_BLOCKS_PER_MP