I’ve been experimenting with Windows 7, VS2008 and the CUDA Project Wizard 2.0 (posted in this forum).
I noticed a performance hit when using timers instead of cudaEvents (using the default simple.cu from
the wizard).
With the following code, I got an execution time between 0.22ms and 0.26ms:
/************************************************************
************/
/* HelloCUDA */
/************************************************************
************/
/*
 * Host entry point (host-timer variant).
 *
 * Launches HelloCUDA once with a single block of a single thread and
 * reports how long the launch took according to a CUTIL host timer.
 * The timed interval runs from just before the launch to just after
 * cudaThreadSynchronize() returns, so it also covers the launch error
 * check in between.
 *
 * Returns 0 in every case, including when InitCUDA() reports failure.
 */
int main(int argc, char* argv[])
{
    // Nothing to do when device initialisation did not succeed.
    const bool deviceReady = InitCUDA();
    if (!deviceReady) {
        return 0;
    }

    // Number of bytes the kernel is asked to produce; the host buffer is
    // one byte larger and zero-filled, so it stays NUL-terminated.
    const int textLen = 11;
    char h_text[12] = {0};
    char *d_text = 0;
    CUDA_SAFE_CALL( cudaMalloc((void**) &d_text, sizeof(char) * textLen));

    // Host-side stopwatch bracketing launch + synchronisation.
    unsigned int stopwatch = 0;
    CUT_SAFE_CALL( cutCreateTimer( &stopwatch));
    CUT_SAFE_CALL( cutStartTimer( stopwatch));

    HelloCUDA<<<1, 1, 0>>>(d_text, textLen);
    CUT_CHECK_ERROR("Kernel execution failed\n");
    // Wait for the kernel to finish before stopping the stopwatch.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    CUT_SAFE_CALL( cutStopTimer( stopwatch));
    const float elapsedMs = cutGetTimerValue( stopwatch);
    printf("Processing time: %f (ms)\n", elapsedMs);
    CUT_SAFE_CALL( cutDeleteTimer( stopwatch));

    // Fetch the kernel's output and print it as a C string.
    CUDA_SAFE_CALL( cudaMemcpy(h_text, d_text, sizeof(char) * textLen, cudaMemcpyDeviceToHost));
    printf("%s\n", h_text);

    CUDA_SAFE_CALL( cudaFree(d_text));
    CUT_EXIT(argc, argv);
    return 0;
}
With this code, I got an execution time between 0.11ms and 0.14ms:
/************************************************************
************/
/* HelloCUDA */
/************************************************************
************/
/*
 * Host entry point (cudaEvent variant).
 *
 * Launches HelloCUDA once with a single block of a single thread and
 * reports the time between two events recorded in stream 0, one
 * immediately before and one immediately after the launch.
 *
 * Every event API call is routed through cutilSafeCall, like the other
 * runtime calls here, so a failed record/synchronize/elapsed-time query
 * is reported instead of silently printing a meaningless value.
 *
 * Returns 0 in every case, including when InitCUDA() reports failure.
 */
int main(int argc, char* argv[])
{
    if(!InitCUDA()) {
        return 0;
    }

    char *device_result = 0;
    // One byte larger than the 11 bytes copied back and zero-filled, so
    // the buffer is always NUL-terminated when printed with %s.
    char host_result[12] ={0};
    CUDA_SAFE_CALL( cudaMalloc((void**) &device_result, sizeof(char) * 11));

    cudaEvent_t start;
    cudaEvent_t stop;
    // Initialised so the printf below never reads an indeterminate value.
    float elapsed = 0.0f;
    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    cutilSafeCall(cudaEventRecord(start, 0));
    // Invoke kernel
    HelloCUDA<<<1, 1, 0>>>(device_result, 11);
    cutilCheckMsg("Kernel invocation failed");
    // Measure time
    cutilSafeCall(cudaEventRecord(stop, 0));
    // Block the host until the stop event has actually been reached;
    // the elapsed-time query is only valid after that.
    cutilSafeCall(cudaEventSynchronize(stop));
    cutilSafeCall(cudaEventElapsedTime(&elapsed, start, stop));
    printf("Processing time: %f (ms)\n", elapsed);

    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));

    // Fetch the kernel's output and print it as a C string.
    CUDA_SAFE_CALL( cudaMemcpy(&host_result, device_result, sizeof(char) * 11, cudaMemcpyDeviceToHost));
    printf("%s\n", host_result);

    CUDA_SAFE_CALL( cudaFree(device_result));
    CUT_EXIT(argc, argv);
    return 0;
}
What's the reason for this performance hit? Both versions are based on CUDA 2.2.