Hi,
// Time one device-to-host copy of SizeDataForGrid bytes into grid[0].blocks.
//
// Kernel launches are asynchronous, and a blocking cudaMemcpy first waits for
// previously issued device work to finish. If the timer is started while such
// work is still pending, the reported "copy time" also contains that wait.
// Synchronizing before the timer starts makes the measurement cover only the
// transfer itself.
SizeDataForGrid = 36;
unsigned int hTimer1;
CUT_SAFE_CALL( cutCreateTimer(&hTimer1) );

// Drain any pending device work so it is not charged to the copy.
// (On CUDA 4.0 and later this call is named cudaDeviceSynchronize.)
CUDA_SAFE_CALL( cudaThreadSynchronize() );

CUT_SAFE_CALL( cutResetTimer(hTimer1) );
CUT_SAFE_CALL( cutStartTimer(hTimer1) );
// NOTE(review): SizeDataForGrid is used both as the pointer offset (counted in
// elements of device_DataForGrid) and as the byte count of the copy. That is
// only consistent if device_DataForGrid points to 1-byte elements -- confirm.
CUDA_SAFE_CALL( cudaMemcpy( grid[0].blocks,
                            (device_DataForGrid+0*SizeDataForGrid),
                            SizeDataForGrid,
                            cudaMemcpyDeviceToHost) );
CUT_SAFE_CALL( cutStopTimer(hTimer1) );
printf("\nMemory copy time(for tracking. From device): %f msec",
       cutGetTimerValue(hTimer1));
// NOTE(review): hTimer1 is never released in the visible code; call
// cutDeleteTimer(hTimer1) once the timer is no longer needed.
This code runs on the host side and takes 36 msec to execute.
Why is it so slow?