hi ereryone,
I modify cuda/projects/transpose.cu ,the demo of CUDA SDK ,use gettimeofday() to measure speed of transpose a matrix on gpu.
code:
...
// execute the kernel
timeval start;
gettimeofday(&start,NULL);
printf("gpu start: %d-%d\n",start.tv_sec,start.tv_usec);
transpose<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);//half_data_size);
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
// copy result from device to host
float* h_odata = (float*) malloc(mem_size);
CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, mem_size,
cudaMemcpyDeviceToHost) );
timeval end;
gettimeofday(&end,NULL);
printf("gpu end: %d-%d\n",end.tv_sec,end.tv_usec);
float val;
val=(float)(end.tv_sec-start.tv_sec);
if(end.tv_usec>=start.tv_usec)
val+=(float)(end.tv_usec-start.tv_usec)/(float)1000000;
else
val+=(float)(end.tv_usec-start.tv_usec+1000000)/(float)1000000;
printf("gpu run time:%0.6f\n",val);
gettimeofday(&start,NULL);
printf("\ncpu start: %d-%d\n",start.tv_sec,start.tv_usec);
computeGold( reference, h_idata, size_x, size_y);
gettimeofday(&end,NULL);
printf("cpu end: %d-%d\n",end.tv_sec,end.tv_usec);
val=(float)(end.tv_sec-start.tv_sec);
if(end.tv_usec>=start.tv_usec)
val+=(float)(end.tv_usec-start.tv_usec)/(float)1000000;
else
val+=(float)(end.tv_usec-start.tv_usec+1000000)/(float)1000000;
printf("cpu run time:%0.6f\n",val);
....
result:
$ transpose
Transposing a 256 by 4096 matrix of floats…
gpu start: 1186558862-22570
gpu end: 1186558862-51328
gpu run time:0.028758
cpu start: 1186558862-51388
cpu end: 1186558862-63622
cpu run time:0.012234
Test PASSED
Press ENTER to exit…
doubt:
Why GPU spend more time than CPU ? How can I use GUP in real time system?