I keep reading that CUDA is really fast, but I can’t seem to find a simple demonstration
of a GPU function outperforming a CPU version.
I made a simple multiplication function for the CPU using VS2008 that does a million
multiplications on ints over a thousand trials, and then I made a GPU version that
does one trial of a million multiplications. I have a Pentium 4 at 3.00GHz and a 9800GT.
My system has 1GB of RAM and the GPU has 512MB of RAM. I get around a
microsecond or two with the CPU implementation and anywhere from 45-60
milliseconds with the GPU implementation.
I’ve heard that there’s a lot of overhead in initializing GPU functions; is that what I’m seeing?
Am I doing enough computation to see any difference? Where should I be looking
to optimize GPU code in general?
Could someone provide a simple example of a GPU calculating data faster than a CPU?
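On the initialization point, my best guess is that the very first CUDA call is paying for context creation, so maybe I should force that to happen before starting the timer? The snippet below is what I had in mind (untested, and the cudaFree(0) warm-up is just an idiom I’ve seen mentioned, so I may be off base):

[codebox]// Untested guess: force CUDA context creation before any timing
#include <cuda_runtime.h>
#include <stdio.h>

int main()
{
    cudaSetDevice(0);          // pick a device explicitly
    cudaFree(0);               // dummy call, supposedly forces context creation
    cudaThreadSynchronize();   // make sure initialization has finished

    // ...start the timer and launch the kernel only after this point...
    printf("context should be initialized now\n");
    return 0;
}[/codebox]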
Anyways, here’s my code
[codebox]//CPU Implementation
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include "hr_time.h"   // my simple stopwatch helper (stopWatch, startTimer, etc.)

int increProduct( int nIncrementTo )
{
    int a = 0, i;
    for ( i = 0; i < nIncrementTo; i++ )
        a = 7 * i;               // one multiplication per iteration
    return a;
}

int _tmain(int argc, _TCHAR* argv[])
{
    const int TRIALS = 1000;
    int j, a = 0;
    double total = 0;
    stopWatch s;

    startTimer( &s );
    for ( j = 0; j < TRIALS; j++ )
    {
        a = increProduct(1000000);   // a million multiplications per trial
    }
    stopTimer( &s );
    total = getElapsedTime( &s );

    printf("1000000 multiplications took on average of %d trials: %f milliseconds\n", TRIALS, total);
    printf("final product: %d\n", a);
    system("PAUSE");
    return 0;
}
//GPU Implementation
#include <stdio.h>
#include <cutil_inline.h>   // SDK helpers: cut* timer functions, cutilCheckError, etc.

#define IMUL(a, b) __umul24(a, b)

__global__ void MultiplyGPUTest(int N)
{
    int a;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if ( idx < N )
        a = IMUL(7, idx);   // the product stays under 24 bits, so use the 24-bit mul; result is not stored
}

void runMultiplyGPUTest( int argc, char** argv )
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if ( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    // 32768 blocks x 32 threads = 1,048,576 threads, enough to cover 1,000,000 elements
    dim3 dimGrid(32768, 1, 1);
    dim3 dimBlock(32, 1, 1);

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer ) );
    cutilCheckError( cutStartTimer( timer ) );

    int num_elements = 1000000;
    MultiplyGPUTest<<<dimGrid, dimBlock>>>(num_elements);
    cutilCheckMsg("Kernel execution failed");
    cudaThreadSynchronize();   // launch is asynchronous; wait for the kernel before stopping the timer

    cutilCheckError( cutStopTimer( timer ) );
    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer ) );
    cutilCheckError( cutDeleteTimer( timer ) );
}

// minimal driver
int main( int argc, char** argv )
{
    runMultiplyGPUTest(argc, argv);
    return 0;
}[/codebox]
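One more thing I wasn’t sure about: since the kernel never stores `a` anywhere, could the compiler be throwing the multiply away entirely? Here’s an untested sketch of a variant that writes the results to global memory and copies one value back to check (MultiplyStoreGPUTest, d_out, etc. are just names I made up for the sketch), in case that’s what I should actually be timing:

[codebox]// Untested sketch: same multiply, but the result is stored so it can't be optimized away
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void MultiplyStoreGPUTest(int *out, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if ( idx < N )
        out[idx] = __umul24(7, idx);   // store the product in global memory
}

int main()
{
    const int N = 1000000;
    int *d_out = 0;
    cudaMalloc((void**)&d_out, N * sizeof(int));

    dim3 dimBlock(32, 1, 1);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, 1, 1);   // enough blocks to cover N

    MultiplyStoreGPUTest<<<dimGrid, dimBlock>>>(d_out, N);
    cudaThreadSynchronize();   // wait for the kernel before reading anything back

    int h_last = 0;
    cudaMemcpy(&h_last, d_out + (N - 1), sizeof(int), cudaMemcpyDeviceToHost);
    printf("last element: %d (expected %d)\n", h_last, 7 * (N - 1));

    cudaFree(d_out);
    return 0;
}[/codebox]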
multiplyGPU.zip (9.66 KB)
multiplyCPU.zip (3.59 KB)