Hi everyone:
I run the “separable convolution code”(included in the sdk example) today. And I want to time how much time it costs.
The following is my code:
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_SAFE_CALL( cutStartTimer(hTimer) );
for(hoho = 0; hoho < numInteration; hoho ++)
{
CUDA_SAFE_CALL( cudaMemcpyToSymbol(d_Kernel_X, h_Kernel_x, KERNEL_SIZE_X) );
CUDA_SAFE_CALL( cudaMemcpyToSymbol(d_Kernel_Y, h_Kernel_y, KERNEL_SIZE_Y) );
convolutionRowGPU<<<blockGridRows, threadBlockRows>>>(
d_DataB,
d_DataA,
DATA_W,
DATA_H
);
CUT_CHECK_ERROR("convolutionRowGPU() execution failed\n");
convolutionColumnGPU<<<blockGridColumns, threadBlockColumns>>>(
d_Temp,
d_DataB,
DATA_W,
DATA_H,
COLUMN_TILE_W * threadBlockColumns.y,
DATA_W * threadBlockColumns.y
);
CUT_CHECK_ERROR(“convolutionColumnGPU() execution failed\n”);
}
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL(cutStopTimer(hTimer));
gpuTime = cutGetTimerValue(hTimer) / (float)numInteration;
printf(“GPU anigaussian time : %f msec //%f Mpixels/sec\n, i = %d”, gpuTime, 1e-6 * DATA_W * DATA_H / (gpuTime * 0.001), hoho);
The result is that: the whole process(memory copy & rowfilter && columnfilter) costs 0.335ms. And I time them separably. Strange things happen!
Memory copy costs 0.102ms, while rowfilter&columnfilter costs 0.117ms, which mean that 0.102 + 0.117 != 0.335. External Image
I have no idea of what happens. Do you have some explanation?
I put the “cudaThreadSynchronize()” before and afterthe kernel execution.
:mellow: Time the memory copy:
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_SAFE_CALL( cutStartTimer(hTimer) );
for(hoho = 0; hoho < numInteration; hoho ++)
{
CUDA_SAFE_CALL( cudaMemcpyToSymbol(d_Kernel_X, h_Kernel_x, KERNEL_SIZE_X) );
CUDA_SAFE_CALL( cudaMemcpyToSymbol(d_Kernel_Y, h_Kernel_y, KERNEL_SIZE_Y) );
}
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL(cutStopTimer(hTimer));
gpuTime = cutGetTimerValue(hTimer) / (float)numInteration;
printf(“GPU anigaussian time : %f msec //%f Mpixels/sec\n, i = %d”, gpuTime, 1e-6 * DATA_W * DATA_H / (gpuTime * 0.001), hoho);
:mellow: Time rowfilter&columnfilter:
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_SAFE_CALL( cutStartTimer(hTimer) );
for(hoho = 0; hoho < numInteration; hoho ++)
{
convolutionRowGPU<<<blockGridRows, threadBlockRows>>>(
d_DataB,
d_DataA,
DATA_W,
DATA_H
);
CUT_CHECK_ERROR(“convolutionRowGPU() execution failed\n”);
convolutionColumnGPU<<<blockGridColumns, threadBlockColumns>>>(
d_Temp,
d_DataB,
DATA_W,
DATA_H,
COLUMN_TILE_W * threadBlockColumns.y,
DATA_W * threadBlockColumns.y
);
CUT_CHECK_ERROR(“convolutionColumnGPU() execution failed\n”);
}
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL(cutStopTimer(hTimer));
gpuTime = cutGetTimerValue(hTimer) / (float)numInteration;
printf(“GPU anigaussian time : %f msec //%f Mpixels/sec\n, i = %d”, gpuTime, 1e-6 * DATA_W * DATA_H / (gpuTime * 0.001), hoho);