Hi, this is my first post, written a couple of hours after I started playing with CUDA, so my question will no doubt be trivial; sorry in advance!
I was quite amazed at the particle simulator's performance, and having written my own in plain old C, I started wondering if I could translate it to use CUDA.
To get myself familiar with the tools, I started writing a simple program that takes two vectors and does some math between them.
I write both vectors once to device memory, then loop the kernel 10000 times, and compare that against the same loop on the CPU, only to find that the execution times are pretty much the same.
Here is the code:
#include <stdio.h>
#include <math.h>
#include <cutil.h>

#define THREAD_CNT 256
#define NUM_PARTICULES 104200
#define DT 0.001f

// Round up so every particle gets a thread.
const int BLOCK_CNT = (int)ceil((float)NUM_PARTICULES / (float)THREAD_CNT);

float4 partPos[NUM_PARTICULES];
float4 partForces[NUM_PARTICULES];
float4 *gpuPos, *gpuForce;   // device buffers, allocated in cudaInit()

__global__ void integrate( float4* g_data, float4* g_force, float dt )
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    // BLOCK_CNT rounds up, so the last block has threads past the end of the arrays.
    if (index < NUM_PARTICULES)
        g_data[index].z += g_force[index].z;
}
// initParticules() and render() live elsewhere in my project.
void initParticules();
void cudaInit();
void update();
void onCpu();
void render();

int main(int argc, char** argv)
{
    initParticules();
    cudaInit();

    // Time 10000 iterations on the GPU...
    unsigned int timer0;
    CUT_SAFE_CALL( cutCreateTimer( &timer0));
    CUT_SAFE_CALL( cutStartTimer( timer0));
    for (int i = 0; i < 10000; i++)
    {
        update();
        render();
    }
    CUT_SAFE_CALL( cutStopTimer( timer0));
    printf("gpu time: %f\n", cutGetTimerValue(timer0));

    // ...then the same 10000 iterations on the CPU.
    unsigned int timer1;
    CUT_SAFE_CALL( cutCreateTimer( &timer1));
    CUT_SAFE_CALL( cutStartTimer( timer1));
    for (int i = 0; i < 10000; i++)
    {
        onCpu();
        render();
    }
    CUT_SAFE_CALL( cutStopTimer( timer1));
    printf("cpu time: %f\n", cutGetTimerValue(timer1));

    return 0;
}
// The same update on the host, for comparison.
void onCpu()
{
    for (int i = 0; i < NUM_PARTICULES; i++)
    {
        partPos[i].z += partForces[i].z;
    }
}
void cudaInit()
{
    CUT_DEVICE_INIT();
    int mem_size = NUM_PARTICULES * sizeof(float4);
    CUDA_SAFE_CALL(cudaMalloc((void**) &gpuPos, mem_size));
    CUDA_SAFE_CALL(cudaMalloc((void**) &gpuForce, mem_size));
    // Upload both vectors once; after this, everything stays in device memory.
    CUDA_SAFE_CALL(cudaMemcpy(gpuPos, partPos, mem_size, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(gpuForce, partForces, mem_size, cudaMemcpyHostToDevice));
}
void update()
{
    // No per-iteration copies: positions and forces stay on the device.
    integrate<<< BLOCK_CNT, THREAD_CNT >>>(gpuPos, gpuForce, DT);
}
Which results in:
gpu time: 2151.280029
cpu time: 2044.373657
This is on a Core 2 Duo E6550 and a GeForce 8800 GT. I was expecting a much lower GPU time, and I'm guessing it's due to some rookie mistake on my part. Can anyone tell me what it is?
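One thing I wasn't sure about: I've read that kernel launches return before the GPU is actually done, so maybe the timers aren't measuring what I think. A variant of my GPU loop with an explicit sync before stopping the timer (just a sketch; I'm assuming cudaThreadSynchronize() is the right call here) would look like:

CUT_SAFE_CALL( cutStartTimer( timer0));
for (int i = 0; i < 10000; i++)
{
    update();
    render();
}
// Launches are asynchronous: block here until the GPU has actually
// finished all queued kernels, then read the timer.
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer( timer0));

With 10000 launches queued back to back I suppose the driver ends up blocking anyway, but I'd rather not trust the numbers without it.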
Sorry about the messy code; I'm only just learning, not writing anything useful at the moment.
Thanks!
Sami
–EDIT
After some more testing, I see the GPU performance start to pull ahead when I quadruple the number of particles:
gpu time: 8268.873047
cpu time: 32456.363281
Am I not using the GPU fully with an array of only 104200? It would seem unlikely to me, but hey… rookie!
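For reference, 104200 particles at 256 threads per block is ceil(104200/256) = 408 blocks. I figured I could at least check that against the number of multiprocessors the runtime reports (a little standalone sketch; I'm assuming cudaGetDeviceProperties and its multiProcessorCount field are the right way to ask):

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);       // device 0, the 8800 GT

    int blocks = (104200 + 256 - 1) / 256;   // same rounding as BLOCK_CNT -> 408
    printf("blocks launched  : %d\n", blocks);
    printf("multiprocessors  : %d\n", prop.multiProcessorCount);
    printf("max threads/block: %d\n", prop.maxThreadsPerBlock);
    return 0;
}

408 blocks sounds like more than enough to keep the multiprocessors busy, so maybe the kernel is just too small for the launch overhead to be worthwhile at this size?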