Thanks a lot for this very useful example! I compiled it on my machine (GTX 295, AMD Phenom™ 9550 Quad-Core processor, 2200.149 MHz), and for the first two cases I get very similar results:
[codebox]
ZERO-COPY test
Zero copy initialization time = 0.166 ms
Zero copy kernel execution time = 0.384 ms
PAGEABLE memory test
Pageable memory initialization time = 0.859 ms
Pageable memory execution time = 0.036 ms
[/codebox]
But for the second case, where the data is copied explicitly with cudaMemcpy, we should use pinned (page-locked) memory, which speeds up transfers between host and device:
[codebox]
// Pageable memory version
std::cout << "PAGEABLE memory test" << std::endl;
cudaMalloc((void **)&a_d, size);
checkCUDAError("cudaMalloc");
a_m = (float *)malloc(size);
for(i = 0; i < N; i++)
    check_h[i] = (float)i;
timerGPU = 0;
CUT_SAFE_CALL(cutCreateTimer(&timerGPU));
CUT_SAFE_CALL(cutResetTimer(timerGPU));
CUT_SAFE_CALL(cutStartTimer(timerGPU));
memcpy(a_m, check_h, size);
cudaMemcpy(a_d, a_m, size, cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutStopTimer(timerGPU));
time = cutGetTimerValue(timerGPU);
std::cout << "Pageable memory initialization time = " << time << " ms" << std::endl;
CUT_SAFE_CALL(cutResetTimer(timerGPU));
CUT_SAFE_CALL(cutStartTimer(timerGPU));
incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(timerGPU));
time = cutGetTimerValue(timerGPU);
std::cout << "Pageable memory execution time = " << time << " ms" << std::endl;
cudaMemcpy(a_m, a_d, size, cudaMemcpyDeviceToHost);
checkCUDAError("incrementArrayOnDevice");
incrementArrayOnHost(check_h, N);
for(i = 0; i < N; i++)
    assert(check_h[i] == a_m[i]);
free(a_m);
cudaFree(a_d);
//
// Page-locked (pinned) memory version
//
std::cout << "PAGEABLE (pinned) memory test" << std::endl;
cudaMalloc((void **)&a_d, size);
checkCUDAError("cudaMalloc");
cudaMallocHost((void **)&a_m, size);   // page-locked host buffer
checkCUDAError("cudaMallocHost");
for(i = 0; i < N; i++)
    check_h[i] = (float)i;
timerGPU = 0;
CUT_SAFE_CALL(cutCreateTimer(&timerGPU));
CUT_SAFE_CALL(cutResetTimer(timerGPU));
CUT_SAFE_CALL(cutStartTimer(timerGPU));
memcpy(a_m, check_h, size);
cudaMemcpy(a_d, a_m, size, cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutStopTimer(timerGPU));
time = cutGetTimerValue(timerGPU);
std::cout << "Pageable memory initialization time = " << time << " ms" << std::endl;
CUT_SAFE_CALL(cutResetTimer(timerGPU));
CUT_SAFE_CALL(cutStartTimer(timerGPU));
incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(timerGPU));
time = cutGetTimerValue(timerGPU);
std::cout << "Pageable memory execution time = " << time << " ms" << std::endl;
cudaMemcpy(a_m, a_d, size, cudaMemcpyDeviceToHost);
checkCUDAError("incrementArrayOnDevice");
incrementArrayOnHost(check_h, N);
for(i = 0; i < N; i++)
    assert(check_h[i] == a_m[i]);
cudaFreeHost(a_m);   // pinned memory must be released with cudaFreeHost, not cudaFree
cudaFree(a_d);
[/codebox]
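As a side note, the cutil timers (CUT_SAFE_CALL / cutCreateTimer) come from the old SDK utility library; the same measurement can be made with plain CUDA events. Below is a minimal, standalone sketch of the pinned-memory test timed with events. It assumes the same incrementArrayOnDevice kernel and the same N, size, nBlocks and blockSize as in the listing above, so take it as an illustration rather than a drop-in replacement:
[codebox]
// Standalone sketch: pinned-memory copy + kernel timed with CUDA events.
// Assumes incrementArrayOnDevice, N, size, nBlocks and blockSize as above.
float *a_d = 0, *a_m = 0;
float msCopy = 0.0f, msKernel = 0.0f;
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaMalloc((void **)&a_d, size);
cudaMallocHost((void **)&a_m, size);              // page-locked host buffer
for (int i = 0; i < N; i++)
    a_m[i] = (float)i;

cudaEventRecord(evStart, 0);
cudaMemcpy(a_d, a_m, size, cudaMemcpyHostToDevice);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);
cudaEventElapsedTime(&msCopy, evStart, evStop);   // host-to-device copy time

cudaEventRecord(evStart, 0);
incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);
cudaEventElapsedTime(&msKernel, evStart, evStop); // kernel execution time

std::cout << "Pinned copy time = " << msCopy << " ms, "
          << "kernel time = " << msKernel << " ms" << std::endl;

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);
cudaFreeHost(a_m);
cudaFree(a_d);
[/codebox]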
With the pinned-memory test added, I get the following runtimes:
[codebox]
ZERO-COPY test
Zero copy initialization time = 0.166 ms
Zero copy kernel execution time = 0.384 ms
PAGEABLE memory test
Pageable memory initialization time = 0.859 ms
Pageable memory execution time = 0.036 ms
PAGEABLE (pinned) memory test
Pageable memory initialization time = 0.369 ms
Pageable memory execution time = 0.036 ms
[/codebox]
As you can see, explicit transfers with pinned memory end up faster than zero-copy here: with zero-copy the kernel itself slows down significantly, since every access to the mapped buffer has to go over the PCIe bus. But which integrated card is the best/fastest one nowadays? Maybe it’s possible to get a better zero-copy runtime on one of those…
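By the way, whether zero-copy pays off depends mainly on whether the GPU is integrated (i.e. shares physical memory with the CPU) and whether it can map host memory at all. A minimal sketch for checking this with the runtime API, independent of the example above:
[codebox]
#include <cstdio>
#include <cuda_runtime.h>

// List every CUDA device and report whether it is integrated and
// whether it supports mapped (zero-copy) host memory.
int main()
{
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int dev = 0; dev < count; ++dev) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        printf("Device %d: %s, integrated = %d, canMapHostMemory = %d\n",
               dev, prop.name, prop.integrated, prop.canMapHostMemory);
    }
    return 0;
}
[/codebox]
On an integrated part the mapped buffer lives in the same physical memory the CPU uses, so the kernel-side penalty seen above on the GTX 295 should largely disappear.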