Performance question

Hi,
I’m using a GTX 280 card and wrote a simple kernel to test the memory bandwidth I get. The results are really poor as far as I can tell.
I get 13 ms for copying 400 MB to another 400 MB buffer - this gives me approximately 30 GB/s instead of 140 GB/s, no?
Can anyone shed light?

// Copies one float per thread from pSource to pDest.
//
// Launch layout: 2D grid of 1D blocks. The flat element index is built so
// that blockIdx.y varies fastest between blocks, i.e.
//   pos = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
// Consecutive threads of a block therefore touch consecutive floats.
//
// Preconditions:
//   - There is no bounds guard: gridDim.x * gridDim.y * blockDim.x must not
//     exceed the number of elements in either buffer.
//   - pos is a signed int, so the total thread count has to fit in an int.
__global__ void Test( float *pSource, float *pDest )
{
    int pos = blockIdx.x * gridDim.y * blockDim.x + blockIdx.y * blockDim.x + threadIdx.x;
    pDest[ pos ] = pSource[ pos ];
}

// Device-to-device copy benchmark for the Test kernel.
// Fills a host buffer, uploads it, times one launch of Test over the whole
// buffer, then reads the result back and logs the elapsed times.
int iSize = 1024 * 1024 * 100 * sizeof( float );   // bytes per buffer
int iCount = iSize / sizeof( float );              // float elements per buffer
float *pSource, *pDest, *pHostData, *pHostDataDest;
pHostData = new float[ iCount ];
pHostDataDest = new float[ iCount ];
for ( int i = 0; i < iCount; i++ )
    pHostData[ i ] = ( ( i % 1024 ) + 0.2 ) / ( ( i % 700 ) + 22.355f );

// First timed region: both device allocations plus the host-to-device upload.
CUT_SAFE_CALL( cutCreateTimer( &iKernelTest ) );
CUT_SAFE_CALL( cutResetTimer( iKernelTest ) );
CUT_SAFE_CALL( cutStartTimer( iKernelTest ) );
CUDA_SAFE_CALL( cudaMalloc( ( void ** )&pSource, iSize ) );
CUDA_SAFE_CALL( cudaMalloc( ( void ** )&pDest, iSize ) );
CUDA_SAFE_CALL( cudaMemcpy( pSource, pHostData, iSize, cudaMemcpyHostToDevice ) );
// cudaThreadSynchronize is superseded by cudaDeviceSynchronize on CUDA 4.0+.
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer( iKernelTest ) );
fKernelTime = cutGetTimerValue( iKernelTest );
sprintf_s( buffLogData, "[CalculateInGPU_7] - Alloc 800MB: [%0.6f] ms\n", fKernelTime );
LogData( buffLogData );

// Launch configuration: 1024 x iBlockY blocks of iThreadCount threads, one
// element per thread. The kernel has no bounds guard, so iCount must be an
// exact multiple of iThreadCount * 1024 for the grid to cover the buffer.
int iThreadCount = 256;
int iBlockY = iCount / iThreadCount / 1024;
dim3 blocksTest( 1024, iBlockY );
sprintf_s( buffLogData, "Kernel sizes: Block.x[%u], Block.y[%u], Thread.x[%d]\n", blocksTest.x, blocksTest.y, iThreadCount );
LogData( buffLogData );

// Second timed region: the kernel only. It reads iSize bytes and writes
// iSize bytes, so the memory traffic to use for a bandwidth figure is
// 2 * iSize divided by the elapsed time.
CUT_SAFE_CALL( cutResetTimer( iKernelTest ) );
CUT_SAFE_CALL( cutStartTimer( iKernelTest ) );
Test<<< blocksTest, iThreadCount >>>( pSource, pDest );
// A launch does not return a status itself; pick up configuration errors here
// so a kernel that never ran is not reported as a (very fast) timing.
CUDA_SAFE_CALL( cudaGetLastError() );
// The launch is asynchronous: wait for completion before stopping the timer.
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutStopTimer( iKernelTest ) );
fKernelTime = cutGetTimerValue( iKernelTest );
CUDA_SAFE_CALL( cudaMemcpy( pHostDataDest, &(pDest[0]), iSize, cudaMemcpyDeviceToHost ) );
sprintf_s( buffLogData, "[CalculateInGPU_7] - After copying 800MB: [%0.6f] ms\n [%f]", fKernelTime, pHostDataDest[0] );
LogData( buffLogData );

// Release everything acquired above.
CUDA_SAFE_CALL( cudaFree( pSource ) );
CUDA_SAFE_CALL( cudaFree( pDest ) );
delete[] pHostData;
delete[] pHostDataDest;
CUT_SAFE_CALL( cutDeleteTimer( iKernelTest ) );

Most likely you’re not getting coalescing. Are you sure your indexing is correct?

I get the same results when this is the only line in the kernel:

// The index depends only on threadIdx.x, so every block reads and writes the
// same first blockDim.x elements instead of a distinct slice of the buffer.
pDest[ threadIdx.x ] = pSource[ threadIdx.x ];