Performance question

Hi all,

I’m using the GTX280 card, and get the following results which I dont understand why I dont get the full ~140 (or so) GB/s.

I’d appriciate any assistance - Thanks a lot !! :)

I ran each kernel 500 times and took the full and average run time.

Kernel name Full (ms) Average (ms) Data size (bytes) GB/s

TestEmptyKernel [2206.326]ms [4.412]ms 891,584,512 N/A

TestWrite1 [11639.913]ms [23.280]ms 891,584,512 ~43GB/s

TestWrite2 [9852.309]ms [19.704]ms 891,584,512 ~50GB/s

TestWrite3 [57236.895]ms [114.474]ms 891,584,512 ~8GB/s

TestWrite4 [26470.006]ms [52.940]ms 891,584,512 ~19GB/s

Here is the test code I’ve used:

[codebox]

constant int ConstProjectParams[ 6 ];

global void TestEmptyKernel( float *pDeviceVAGCOutput1 )

{

}

global void TestWrite1( float *pDeviceVAGCOutput1 )

{

float fVal = 1;

pDeviceVAGCOutput1[ threadIdx.x ] = fVal;

}

global void TestWrite2( float *pDeviceVAGCOutput1 )

{

int iIndex = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y + threadIdx.x;

float fVal = 1;

pDeviceVAGCOutput1[ iIndex ] = fVal;

}

global void TestWrite3( float *pDeviceVAGCOutput1 )

{

float fVal = 1;

int iV = blockIdx.x / ConstProjectParams[ 3 ];

int iC = blockIdx.x % ConstProjectParams[ 3 ];

int iA = blockIdx.y / ConstProjectParams[ 2 ];

int iG = blockIdx.y % ConstProjectParams[ 2 ];

int iOutputPos = iV * ConstProjectParams[ 1 ] * ConstProjectParams[ 2 ] * ConstProjectParams[ 3 ];

iOutputPos += iA * ConstProjectParams[ 2 ] * ConstProjectParams[ 3 ];

iOutputPos += iG * ConstProjectParams[ 3 ] + iC;

iOutputPos *= ConstProjectParams[ 4 ];

pDeviceVAGCOutput1[ iOutputPos ] = fVal;

}

global void TestWrite4( float *pDeviceVAGCOutput1 )

{

float fVal = 1;

__shared__ int iV;

__shared__ int iA;

__shared__ int iG;

__shared__ int iC;

if ( 0 == threadIdx.x )

	iV = blockIdx.x / ConstProjectParams[ 3 ];

if ( 1 == threadIdx.x )

	iC = blockIdx.x % ConstProjectParams[ 3 ];

if ( 2 == threadIdx.x )

	iA = blockIdx.y / ConstProjectParams[ 2 ];

if ( 3 == threadIdx.x )

	iG = blockIdx.y % ConstProjectParams[ 2 ];

__syncthreads();

int iOutputPos = iV * ConstProjectParams[ 1 ] * ConstProjectParams[ 2 ] * ConstProjectParams[ 3 ];

iOutputPos += iA * ConstProjectParams[ 2 ] * ConstProjectParams[ 3 ];

iOutputPos += iG * ConstProjectParams[ 3 ] + iC;

iOutputPos *= ConstProjectParams[ 4 ];

pDeviceVAGCOutput1[ iOutputPos ] = fVal;

}

void runTest1()

{

char buffLogData[ 1000 ];

int m_V = 104;

int m_A = 161;

int m_G = 4;

int m_C = 13;

int m_S = 256; 

int iOutputVAGCSize = m_V * m_A * m_G;

iOutputVAGCSize *= m_C * m_S;  // 222,896,128



int constHostProjectParams[ 6 ];

constHostProjectParams[ 0 ] = m_V;

constHostProjectParams[ 1 ] = m_A;

constHostProjectParams[ 2 ] = m_G;

constHostProjectParams[ 3 ] = m_C;

constHostProjectParams[ 4 ] = m_S;

cudaMemcpyToSymbol( ConstProjectParams, &constHostProjectParams[0], 6 * sizeof( int ), 0 );



float *pDeviceVAGCOutput1 = NULL;

CUDA_SAFE_CALL( cudaMalloc( ( void ** )&pDeviceVAGCOutput1, iOutputVAGCSize * sizeof( float ) ) );

CUDA_SAFE_CALL( cudaMemset( pDeviceVAGCOutput1, 0, iOutputVAGCSize * sizeof( float ) ) );

CUT_CHECK_ERROR_LOG_TO_FILE( "Error preparing output." );

unsigned int iTimer = 0, iTimer1 = 0;

float fKernelTime = 0, f1 = 0;

CUT_SAFE_CALL( cutCreateTimer( &iTimer ) );

CUT_SAFE_CALL( cutCreateTimer( &iTimer1 ) );

dim3 blocks( m_V * m_C, m_A * m_G );

CUT_SAFE_CALL( cutResetTimer( iTimer1 ) );

CUT_SAFE_CALL( cutStartTimer( iTimer1 ) );

int iIterCount = 500;

for ( int i = 0; i < iIterCount; i++ )

{

	CUT_SAFE_CALL( cutResetTimer( iTimer ) );

	CUT_SAFE_CALL( cutStartTimer( iTimer ) );

	//TestEmptyKernel<<< blocks, 256 >>>( pDeviceVAGCOutput1 );

	TestWrite4<<< blocks, 256 >>>( pDeviceVAGCOutput1 );

	CUT_CHECK_ERROR_LOG_TO_FILE( "[] - Error running kernel." );

	CUT_SAFE_CALL( cutStopTimer( iTimer ) );

	fKernelTime += cutGetTimerValue( iTimer );

}

CUT_SAFE_CALL( cutStopTimer( iTimer1 ) );

f1 = cutGetTimerValue( iTimer1 );

sprintf_s( buffLogData, "TestWrite4: Full: [%0.3f], Average: [%0.3f]ms, Data output size: [%d] (bytes)\n", f1, fKernelTime / iIterCount, iOutputVAGCSize * sizeof( float ) );

LogData( buffLogData );

}

define CUT_CHECK_ERROR_LOG_TO_FILE(errorMessage) do { \

cudaError_t err = cudaGetLastError();                                    \

char buff[ 1000 ];														 \

if( cudaSuccess != err) {                                                \

sprintf_s( buff, "Cuda error: %s in file '%s' in line %i : %s[%d].\n",    \

errorMessage, __FILE__, __LINE__, cudaGetErrorString( err), err );\

LogData( buff );													\

exit(EXIT_FAILURE);                                                  \

}                                                                        \

err = cudaThreadSynchronize();                                           \

if( cudaSuccess != err) {                                                \

sprintf_s( buff, "Cuda error: %s in file '%s' in line %i : %s[%d].\n",    \

errorMessage, __FILE__, __LINE__, cudaGetErrorString( err), err );\

LogData( buff );													\

exit(EXIT_FAILURE);                                                  \

} } while (0)

[/codebox]

Don’t you have to use __cudathreadsync() after the kernel call ?

I’m using a custom CUT_CHECK_ERROR_LOG_TO_FILE macro after the kernel code which does exactly this.

Hi,

Any assistance would be much appriciated :)

thanks

eyal

The bandwidth tester I wrote pegs 100+ GiB/s copy bandwidth for the GTX280. I don’t recall what it got for the write only bmarks and am away from that system for the holidays. You can download it from the forums (search for bw_test posted by me).