Please test the following code sample:
#include <cutil.h>
#define ARRAY_SIZE 10000000
#define SHARED_BLOCK_SIZE 1000
// Forward declaration of the benchmark kernel (defined below).
// NOTE: the original post had `global` — the qualifier must be `__global__`.
__global__ void Calc_GPU();

// Launches Calc_GPU once and reports its wall-clock execution time.
//
// timeDelta: out-parameter receiving the elapsed time in milliseconds.
//
// NOTE(review): the kernel is launched as <<<1, 1>>> — ONE block with ONE
// thread. A GTX 280 has 240 scalar cores and hides latency with thousands
// of threads; a single-thread launch leaves the GPU almost entirely idle,
// which is why the measured throughput is so low. This, not shared memory,
// is what is "wrong". Launch many blocks of 128-256 threads to benchmark
// real throughput.
extern "C" void CUDA_Calculate(unsigned long* timeDelta)
{
    unsigned int timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));
    CUT_SAFE_CALL(cutStartTimer(timer));
    Calc_GPU<<<1, 1>>>();
    // Kernel launches are asynchronous: synchronize so the timer measures
    // the full kernel execution, not just the launch overhead.
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_SAFE_CALL(cutStopTimer(timer));
    *timeDelta = cutGetTimerValue(timer);
    // Release the timer handle — the original leaked it on every call.
    CUT_SAFE_CALL(cutDeleteTimer(timer));
}
// Element-wise multiply of two shared-memory arrays into a third, repeated
// numCycles times so the total element count equals ARRAY_SIZE.
//
// NOTE: the qualifiers must be `__global__` / `__shared__` — the bare
// `global` / `shared` in the original post do not compile; the underscores
// were stripped by the forum formatting.
//
// NOTE(review): sharedArrayOne/Two are never written before being read, so
// the products are garbage values — this kernel can only be used for timing,
// not for checking results.
__global__ void Calc_GPU()
{
    __shared__ float sharedArrayOne[SHARED_BLOCK_SIZE];
    __shared__ float sharedArrayTwo[SHARED_BLOCK_SIZE];
    __shared__ float sharedArrayThree[SHARED_BLOCK_SIZE];
    int numCycles = ARRAY_SIZE / SHARED_BLOCK_SIZE;
    for (int i = 0; i < numCycles; i++)
    {
        // Stride by blockDim.x so the work is divided across however many
        // threads the block has; with the current <<<1, 1>>> launch this
        // degenerates to the original serial loop.
        // The original bound was `j < SHARED_BLOCK_SIZE - 1`, which skipped
        // the last element (off-by-one); all elements are processed now.
        for (int j = threadIdx.x; j < SHARED_BLOCK_SIZE; j += blockDim.x)
        {
            sharedArrayThree[j] = sharedArrayOne[j] * sharedArrayTwo[j];
        }
        // Block-wide barrier (reached by all threads — it is outside any
        // divergent branch) so iterations stay ordered under a
        // multi-threaded launch. With one thread it is a no-op.
        __syncthreads();
    }
}
On my GTX 280 I get *timeDelta = 1100 ms, or a little more than one second. So the calculation speed is about 10 million operations per second — and that is with the data in shared memory.
I think that is too slow, even for a single GPU thread.
What is wrong?