Kernel code is attached in the end.
I ran this kernel 200 times, and on average each run takes 3000 us (measured with clock() before and after the 200 repeated runs).
I also ran this kernel in Visual Profiler, and on average each run takes about 1400 us. Here is the Visual Profiler result for one run; the results for the other runs are close to this one.
method gputime cputime occupancy
memcopy 305.216
memcopy 305.632
GPU_ComputeDiff 251.84 264.833 0.333
GPU_ReduceDiff 249.568 262.784 0.333
GPU_SortDiff 183.168 231.063 0.125
memcopy 28.928
Is there an explanation for where those 1600 us go?
Thanks
/////kernel code ///////////////
// GPU Block Based Motion Estimation (color image).
//
// Host-side driver: uploads the left/right RGB float images, runs three
// kernels in sequence (per-pixel diff -> block reduction -> sort/select),
// and downloads the per-block motion vector X components into MVX.
//
// Parameters:
//   MVX      - host output buffer, 240*320 UCHAR motion-vector X values
//   imL_data - host left image, IM_HEIGHT*IM_WIDTH*3 floats
//   imR_data - host right image, IM_HEIGHT*IM_WIDTH*3 floats
//
// NOTE(perf): every call pays for CUT_DEVICE_INIT, four cudaMalloc's,
// five cudaFree's/unbind, and pageable-memory cudaMemcpy's. None of that
// shows up as kernel/memcopy rows in Visual Profiler, which is why the
// wall-clock time per call (~3000us) is much larger than the sum of the
// profiled rows (~1400us). To close the gap: initialize the device once,
// allocate the device buffers once and reuse them across the 200 runs,
// and use pinned host memory (cudaMallocHost) for the transfers.
void GPU_Block_ME_Clr (UCHAR* MVX, float * imL_data, float * imR_data)
{
CUT_DEVICE_INIT();
unsigned int mem_size = ( IM_HEIGHT * IM_WIDTH );
// allocate device memory for the two input images (3 floats per pixel)
float * d_imL;
float * d_imR;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_imL, sizeof(float)*mem_size*3) );
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_imR, sizeof(float)*mem_size*3) );
// copy host memory to device
// NOTE(review): the sources look swapped -- d_imL receives imR_data and
// d_imR receives imL_data. Confirm this crossing is intentional (e.g. a
// deliberate L/R matching direction) and not a copy/paste bug.
CUDA_SAFE_CALL( cudaMemcpy( d_imL, imR_data, sizeof(float)*mem_size*3, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy( d_imR, imL_data, sizeof(float)*mem_size*3, cudaMemcpyHostToDevice) );
// allocate device memory for the per-pixel DIFF result
float* DIFF;
CUDA_SAFE_CALL( cudaMalloc( (void**) &DIFF, DIFF_SIZE ));
// NOTE: 320 here is a hard-coded image width; must match IM_WIDTH.
int GPU_COMPUTE_GRID_HEIGHT = (int)((IM_HEIGHT)/GPU_COMPUTE_BLOCK_HEIGHT) ;
int GPU_COMPUTE_GRID_WIDTH = (int)(320/32 );
int GPU_COMPUTE_SMEM_size = (32+64)*4*3; //96 pixels * 4 bytes * 3 channels: 32 from im_L, 64 from im_R
// setup execution parameters for the diff kernel
dim3 GPU_COMPUTE_threads(GPU_COMPUTE_BLOCK_WIDTH, GPU_COMPUTE_BLOCK_HEIGHT);
dim3 GPU_COMPUTE_grid(GPU_COMPUTE_GRID_WIDTH , GPU_COMPUTE_GRID_HEIGHT);
// execute the per-pixel difference kernel
GPU_ComputeDiff<<< GPU_COMPUTE_grid, GPU_COMPUTE_threads ,GPU_COMPUTE_SMEM_size >>> ( DIFF, d_imL, d_imR);
cudaThreadSynchronize(); // wait for kernel to finish
// check if kernel execution generated any error
CUT_CHECK_ERROR("GPU_ComputeDiff execution failed");
// setup execution parameters for the reduction kernel (4x4 blocks, last
// 4-pixel border discarded)
int GPU_REDUCE_GRID_HEIGHT = (int)((IM_HEIGHT-4)/4) ;
int GPU_REDUCE_GRID_WIDTH = (int)((320-4)/4) ;
dim3 GPU_REDUCE_threads(GPU_REDUCE_BLOCK_WIDTH, GPU_REDUCE_BLOCK_HEIGHT);
dim3 GPU_REDUCE_grid(GPU_REDUCE_GRID_WIDTH , GPU_REDUCE_GRID_HEIGHT);
float* S_DIFF;
CUDA_SAFE_CALL( cudaMalloc( (void**) &S_DIFF, S_DIFF_SIZE/* sizeof(float) * GPU_REDUCE_GRID_HEIGHT * GPU_REDUCE_GRID_WIDTH *32*/ ));
GPU_ReduceDiff <<<GPU_REDUCE_grid, GPU_REDUCE_threads>>>(S_DIFF, DIFF);
cudaThreadSynchronize(); // wait for kernel to finish
// check if kernel execution generated any error
CUT_CHECK_ERROR("GPU_ReduceDiff execution failed");
// bind the reduced diffs to a texture so GPU_SortDiff can read them
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
CUDA_SAFE_CALL(cudaBindTexture(0, tex_S_DIFF, S_DIFF, desc, S_DIFF_SIZE));
// allocate and zero device memory for the d_o_MVX result
UCHAR* d_o_MVX;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_o_MVX, sizeof(UCHAR) * 240*320 ));
CUDA_SAFE_CALL( cudaMemset( (void*) d_o_MVX, 0, sizeof(UCHAR) * 240*320 ));
// setup execution parameters for the sort/select kernel
int GPU_SORT_GRID_HEIGHT = (int)((240-4)/(4*GPU_SORT_BLOCK_HEIGHT)) ;//discard the last row
int GPU_SORT_GRID_WIDTH = (int)((320-4)/(4*GPU_SORT_BLOCK_WIDTH))+1 ;
dim3 GPU_SORT_threads(GPU_SORT_BLOCK_WIDTH, GPU_SORT_BLOCK_HEIGHT);
dim3 GPU_SORT_grid(GPU_SORT_GRID_WIDTH , GPU_SORT_GRID_HEIGHT);
int GPU_SORT_SMEM_size = MAX_Offset*GPU_SORT_BLOCK_WIDTH*GPU_SORT_BLOCK_HEIGHT*4;
GPU_SortDiff <<<GPU_SORT_grid, GPU_SORT_threads, GPU_SORT_SMEM_size>>>(d_o_MVX);
cudaThreadSynchronize(); // wait for kernel to finish
// check if kernel execution generated any error (was missing for this
// launch while the other two launches were checked)
CUT_CHECK_ERROR("GPU_SortDiff execution failed");
// copy result from device to host
CUDA_SAFE_CALL( cudaMemcpy( MVX, d_o_MVX, sizeof(UCHAR) * 240*320, cudaMemcpyDeviceToHost) );
// unbind the texture BEFORE freeing its backing store (the original code
// freed S_DIFF first, leaving tex_S_DIFF briefly bound to freed memory)
CUDA_SAFE_CALL(cudaUnbindTexture(tex_S_DIFF));
CUDA_SAFE_CALL(cudaFree(d_imL));
CUDA_SAFE_CALL(cudaFree(d_imR));
CUDA_SAFE_CALL(cudaFree(DIFF));
CUDA_SAFE_CALL(cudaFree(S_DIFF));
CUDA_SAFE_CALL(cudaFree(d_o_MVX));
}