Quadro K2000M versus i7-3820QM ??

Hi,

I have a slightly weak graphics card, and a stronger CPU.
When I offload image processing to the GPU, the GPU actually performs worse than the CPU.
Is that due to my weak graphics card?
Or have I not yet fully maxed out the GPU?
I have already reviewed the algorithms with Visual Profiler.
Now and then I make changes that, according to Visual Profiler, should be an improvement, but a real measurement shows no gain.
Incidentally, if I do not use shared memory, the algorithm runs the fastest. I made some effort
to load the image accordingly in a shared memory buffer.

// Launch configuration: 16x12 = 192 threads per block. NOTE(review): 192 is
// a multiple of 32, but a 16x16 or 32x8 block may map better onto warps for
// this access pattern -- worth profiling.
BLOCK[0] = 16u;
	BLOCK[1] = 12u;
	// 40x40 = 1600 blocks per launch.
	GRID[0] = 40u;
	GRID[1] = 40u;
	// Run the Gauss filter chain for octave 0 (third argument O = 0).
	s_CUDA_CLASS_Gauss.fptr_CUDA_EXECUTE_Gauss(GRID, BLOCK, 0);
// Global stream pool, created lazily on the first call and reused: stream 0
// plus one stream per level. Capacity is hard-coded to 10, which assumes
// ui_NUM_LEVELS_Q + 2 <= 9 -- TODO confirm against the SIFT configuration.
cudaStream_t streams[10];

// Launches the x-direction Gauss kernel once for level 0 and once per SIFT
// level (ui_NUM_LEVELS_Q + 2 launches), each on its own stream, then waits
// for all of them.
//
// ui_GRID  - grid dimensions  {x, y}
// ui_BLOCK - block dimensions {x, y}
// O        - octave index passed straight through to the kernel
//
// Streams are created lazily on the first call and reused afterwards.
void fptr_CUDA_EXECUTE_Gauss(unsigned int ui_GRID[2], unsigned int ui_BLOCK[2],unsigned int O)
{
	static char IsInit = 0;
	dim3 dimGrid(ui_GRID[0], ui_GRID[1], 1);
	dim3 dimBlock(ui_BLOCK[0], ui_BLOCK[1], 1);
	cudaError_t CUDAError;

	// Number of per-level launches after the level-0 launch.
	unsigned int const NumLevels = s_CUDA_CLASS_Sift.ui_NUM_LEVELS_Q + 2;

	// BUG FIX: the loop below indexes streams[1 .. NumLevels]; guard against
	// overflowing the global streams[] array (10 entries) instead of
	// silently corrupting memory.
	if (NumLevels + 1 > sizeof(streams) / sizeof(streams[0]))
		return;

	if (IsInit == 0)
	{
		CUDAError = cudaStreamCreate(&streams[0]);
		// FIX: previously the result was assigned but never inspected.
		if (CUDAError != cudaSuccess)
			return;
	}

	// Dynamic shared memory per launch = filter taps (unsigned short each)
	// + one image tile row set of (blockDim.x + filterSize - 1) * blockDim.y
	// bytes for the halo. Must match the kernel's scratchpad layout.
	KCNL_EXECUTE_Gauss_Filter_x << <dimGrid, dimBlock, sizeof(unsigned short)*s_CUDA_CLASS_Gauss.arr_FilterSizes[0] + (ui_BLOCK[0] + (s_CUDA_CLASS_Gauss.arr_FilterSizes[0] - 1))*ui_BLOCK[1], streams[0] >> > (O, 0);

	// Catches launch-configuration errors (bad grid/block/shared-mem size).
	CUDAError = cudaGetLastError();
	if (CUDAError != cudaSuccess)
		return;

	for (unsigned int i = 0; i < NumLevels; i++)
	{
		if (IsInit == 0)
		{
			CUDAError = cudaStreamCreate(&streams[i + 1]);
			if (CUDAError != cudaSuccess)
				return;
		}

		KCNL_EXECUTE_Gauss_Filter_x << <dimGrid, dimBlock, sizeof(unsigned short)*s_CUDA_CLASS_Gauss.arr_FilterSizes[i+1]+ (ui_BLOCK[0] + (s_CUDA_CLASS_Gauss.arr_FilterSizes[i + 1] - 1))*ui_BLOCK[1], streams[i + 1] >> > (O, i + 1);
	}

	// Wait for every launched kernel; this also surfaces asynchronous
	// execution errors from the launches above.
	CUDAError = cudaStreamSynchronize(streams[0]);
	for (unsigned int i = 0; i < NumLevels; i++)
		CUDAError = cudaStreamSynchronize(streams[i + 1]);

	// Mark streams as created only after a fully successful first pass.
	IsInit = 1;
}
#define MAX_THREADS_PER_BLOCK (16u*12u)
#define MIN_BLOCKS_PER_MP     2


// Separable Gaussian filter, x-direction pass, for octave O and level L.
//
// Launch contract (see fptr_CUDA_EXECUTE_Gauss): 2-D grid and block, dynamic
// shared memory sized as
//     sizeof(unsigned short) * pcf->Size              -- filter taps
//   + (blockDim.x + pcf->Size - 1) * blockDim.y       -- image tile incl. halo
__global__ void
__launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
KCNL_EXECUTE_Gauss_Filter_x
(unsigned int O, unsigned int L)
{
	// Filter descriptor plus source/destination image buffers for this level.
	s_CUDA_Filter_t * __restrict__	pcf = &(s_CUDA_CLASS_DEVICE_Holder.sarr_CUDA_Filter[L]);
	unsigned char *	__restrict__	dst = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].ui_GX[L];
	unsigned char *	__restrict__	src = s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].s_Picture.p_Picture;

	extern __shared__ unsigned char ScratchPadMEM[];

	// Shared-memory layout: filter taps first, image tile directly behind.
	// SIZE: sizeof(unsigned short)*pcf->Size
	#define SratchPadFilter ((unsigned short * const __restrict__)&(ScratchPadMEM[0]))
	// SIZE: (blockDim.x + (pcf->Size - 1))*blockDim.y
	// BUG FIX: this previously pointed at ScratchPadMEM[0] as well, so the
	// image tile overlapped -- and overwrote -- the filter taps. It must
	// start AFTER the taps, as the host-side size computation assumes.
	#define SratchPadImgBuff ((unsigned char * const __restrict__)&(ScratchPadMEM[sizeof(unsigned short)*pcf->Size]))

	// Grid coordinates (output pixel) of this thread.
	unsigned int const ui_GC_X = (blockIdx.x * blockDim.x + threadIdx.x);
	unsigned int const ui_GC_Y_MUL_WIDTH = ((blockIdx.y * blockDim.y + threadIdx.y) * s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[0].s_Picture.ui_WIDTH);
	// Width of one shared-memory tile row: block width plus filter halo.
	unsigned int const ScratchPadDim_x = (blockDim.x + (pcf->Size - 1));

	// Calculate the index of the filter's center tap.
	unsigned int const ui_FILTER_CENTER_IDX = ((pcf->Size - 1) / 2);

	// Row offset of this thread inside the shared-memory image tile.
	unsigned int const ui_DS_y_MUL_ScratchPadDim_x = (threadIdx.y*ScratchPadDim_x);

	// Stage 1: the first pcf->Size threads of the block copy the filter taps
	// into shared memory.
	{
		unsigned int const ui_BLK_IDX = threadIdx.y*blockDim.x + threadIdx.x;

		if (ui_BLK_IDX < pcf->Size)
		{
			SratchPadFilter[ui_BLK_IDX] = pcf->p_Filter[ui_BLK_IDX];
		}
	}

	// Stage 2: cooperatively load the image tile (including halo) into
	// shared memory; each thread copies up to ui_LOOP_SIZE_X pixels.
	{
		// FIX: integer ceiling division instead of double-precision ceil().
		unsigned int const ui_LOOP_SIZE_X = (ScratchPadDim_x + blockDim.x - 1) / blockDim.x;
		unsigned int const ui_DS_xTmp = threadIdx.x * ui_LOOP_SIZE_X;

		for (unsigned int i = 0; i < ui_LOOP_SIZE_X; i++)
		{
			// Destination column inside the tile and flattened tile index
			// (plain locals replace the former ui_DS_x/ui_DS macros).
			unsigned int const ui_DS_x = (ui_DS_xTmp + i);
			unsigned int const ui_DS = (ui_DS_y_MUL_ScratchPadDim_x + ui_DS_x);

			// Global source column, shifted left by the halo.
			unsigned int const ui_SG_x = ui_GC_X - (ui_FILTER_CENTER_IDX + threadIdx.x) + ui_DS_x;
			unsigned int const ui_SG = ui_GC_Y_MUL_WIDTH + ui_SG_x;

			// NOTE(review): this guard skips loads when
			// ui_GC_X <= ui_FILTER_CENTER_IDX, yet stage 3 computes output
			// for ui_GC_X == ui_FILTER_CENTER_IDX -- the left image border
			// may read uninitialized shared memory; verify the bound.
			if (ScratchPadDim_x > ui_DS_x && ui_GC_X > ui_FILTER_CENTER_IDX)
			{
				SratchPadImgBuff[ui_DS] = src[ui_SG];
			}
		}
	}

	// All taps and tile pixels must be visible block-wide before the
	// convolution reads them.
	__syncthreads();

	// Stage 3: 1-D convolution along x; interior columns only.
	{
		// Calculate lower bound of writable output columns.
		unsigned int const &ui_LB = (ui_FILTER_CENTER_IDX);
		// Calculate upper bound of writable output columns.
		unsigned int const ui_UB = (s_CUDA_CLASS_DEVICE_Holder.ps_CUDA_CLASS_DEVICE_Oktave[O].s_Picture.ui_WIDTH - (ui_FILTER_CENTER_IDX + 1));
		// FIX: 'long double' is not supported in device code (it is demoted
		// to 'double'); declare what is actually computed.
		double d_Val = 0.0;

		if (ui_LB <= ui_GC_X && ui_GC_X <= ui_UB)
		{
			for (unsigned int i = 0; i < pcf->Size; i++)
			{
				unsigned int const ui_DS_x = (threadIdx.x + i);
				unsigned int const ui_DS = (ui_DS_y_MUL_ScratchPadDim_x + ui_DS_x);

				d_Val += SratchPadImgBuff[ui_DS] * SratchPadFilter[i];
			}
			// Normalize and truncate back to 8 bits.
			dst[ui_GC_Y_MUL_WIDTH + ui_GC_X] = d_Val / (double)pcf->Norm;
		}
	}

	#undef SratchPadFilter
	#undef SratchPadImgBuff
}

#undef MAX_THREADS_PER_BLOCK
#undef MIN_BLOCKS_PER_MP

Without seeing what NVVP says, there are only guesses to be made, so you will need to review your code and identify what can be changed:

  • Check if you are launching blocks that are multiples of 32 (up to 1024) for full warps, if your problem allows for it.
  • Make sure your input and output are being read/written in an aligned manner (coalesced). If not, performance can drop drastically. It may not be possible in 100% of the cases, such as when your output is like 5% larger than the input and you need to write the samples either to the beginning (and leave some trailing elements) or to the end (leaving some heading elements). This implies that you have a shift such as output[tid + SHIFT] = input[tid]. NVVP will say the operation is not aligned and there is a performance penalty.
  • The amount of threads that are up and running at a given time depends on many factors, one of them being related to line #14 of your 2nd code block. I have no idea how big this shared memory request is, but if it is large, then the grids/blocks launched at a time on a SM will be reduced, as the shared memory is a very limited resource.

I didn’t really go through your entire code, but as far as I know, image processing is a highly parallel field and GPUs outperform CPUs without problems. So there is definitely something here. Try to spend more time interpreting the profiler results and adjust your code little by little.