Hi all,
I’m using the GTX280 card, and get the following results which I dont understand why I dont get the full ~140 (or so) GB/s.
I’d appriciate any assistance - Thanks a lot !! :)
I ran each kernel 500 times and took the full and average run time.
Kernel name Full (ms) Average (ms) Data size (bytes) GB/s
TestEmptyKernel [2206.326]ms [4.412]ms 891,584,512 N/A
TestWrite1 [11639.913]ms [23.280]ms 891,584,512 ~43GB/s
TestWrite2 [9852.309]ms [19.704]ms 891,584,512 ~50GB/s
TestWrite3 [57236.895]ms [114.474]ms 891,584,512 ~8GB/s
TestWrite4 [26470.006]ms [52.940]ms 891,584,512 ~19GB/s
Here is the test code I’ve used:
[codebox]
constant int ConstProjectParams[ 6 ];
global void TestEmptyKernel( float *pDeviceVAGCOutput1 )
{
}
global void TestWrite1( float *pDeviceVAGCOutput1 )
{
float fVal = 1;
pDeviceVAGCOutput1[ threadIdx.x ] = fVal;
}
global void TestWrite2( float *pDeviceVAGCOutput1 )
{
int iIndex = blockIdx.x * blockDim.x + blockIdx.y * blockDim.y + threadIdx.x;
float fVal = 1;
pDeviceVAGCOutput1[ iIndex ] = fVal;
}
global void TestWrite3( float *pDeviceVAGCOutput1 )
{
float fVal = 1;
int iV = blockIdx.x / ConstProjectParams[ 3 ];
int iC = blockIdx.x % ConstProjectParams[ 3 ];
int iA = blockIdx.y / ConstProjectParams[ 2 ];
int iG = blockIdx.y % ConstProjectParams[ 2 ];
int iOutputPos = iV * ConstProjectParams[ 1 ] * ConstProjectParams[ 2 ] * ConstProjectParams[ 3 ];
iOutputPos += iA * ConstProjectParams[ 2 ] * ConstProjectParams[ 3 ];
iOutputPos += iG * ConstProjectParams[ 3 ] + iC;
iOutputPos *= ConstProjectParams[ 4 ];
pDeviceVAGCOutput1[ iOutputPos ] = fVal;
}
global void TestWrite4( float *pDeviceVAGCOutput1 )
{
float fVal = 1;
__shared__ int iV;
__shared__ int iA;
__shared__ int iG;
__shared__ int iC;
if ( 0 == threadIdx.x )
iV = blockIdx.x / ConstProjectParams[ 3 ];
if ( 1 == threadIdx.x )
iC = blockIdx.x % ConstProjectParams[ 3 ];
if ( 2 == threadIdx.x )
iA = blockIdx.y / ConstProjectParams[ 2 ];
if ( 3 == threadIdx.x )
iG = blockIdx.y % ConstProjectParams[ 2 ];
__syncthreads();
int iOutputPos = iV * ConstProjectParams[ 1 ] * ConstProjectParams[ 2 ] * ConstProjectParams[ 3 ];
iOutputPos += iA * ConstProjectParams[ 2 ] * ConstProjectParams[ 3 ];
iOutputPos += iG * ConstProjectParams[ 3 ] + iC;
iOutputPos *= ConstProjectParams[ 4 ];
pDeviceVAGCOutput1[ iOutputPos ] = fVal;
}
void runTest1()
{
char buffLogData[ 1000 ];
int m_V = 104;
int m_A = 161;
int m_G = 4;
int m_C = 13;
int m_S = 256;
int iOutputVAGCSize = m_V * m_A * m_G;
iOutputVAGCSize *= m_C * m_S; // 222,896,128
int constHostProjectParams[ 6 ];
constHostProjectParams[ 0 ] = m_V;
constHostProjectParams[ 1 ] = m_A;
constHostProjectParams[ 2 ] = m_G;
constHostProjectParams[ 3 ] = m_C;
constHostProjectParams[ 4 ] = m_S;
cudaMemcpyToSymbol( ConstProjectParams, &constHostProjectParams[0], 6 * sizeof( int ), 0 );
float *pDeviceVAGCOutput1 = NULL;
CUDA_SAFE_CALL( cudaMalloc( ( void ** )&pDeviceVAGCOutput1, iOutputVAGCSize * sizeof( float ) ) );
CUDA_SAFE_CALL( cudaMemset( pDeviceVAGCOutput1, 0, iOutputVAGCSize * sizeof( float ) ) );
CUT_CHECK_ERROR_LOG_TO_FILE( "Error preparing output." );
unsigned int iTimer = 0, iTimer1 = 0;
float fKernelTime = 0, f1 = 0;
CUT_SAFE_CALL( cutCreateTimer( &iTimer ) );
CUT_SAFE_CALL( cutCreateTimer( &iTimer1 ) );
dim3 blocks( m_V * m_C, m_A * m_G );
CUT_SAFE_CALL( cutResetTimer( iTimer1 ) );
CUT_SAFE_CALL( cutStartTimer( iTimer1 ) );
int iIterCount = 500;
for ( int i = 0; i < iIterCount; i++ )
{
CUT_SAFE_CALL( cutResetTimer( iTimer ) );
CUT_SAFE_CALL( cutStartTimer( iTimer ) );
//TestEmptyKernel<<< blocks, 256 >>>( pDeviceVAGCOutput1 );
TestWrite4<<< blocks, 256 >>>( pDeviceVAGCOutput1 );
CUT_CHECK_ERROR_LOG_TO_FILE( "[] - Error running kernel." );
CUT_SAFE_CALL( cutStopTimer( iTimer ) );
fKernelTime += cutGetTimerValue( iTimer );
}
CUT_SAFE_CALL( cutStopTimer( iTimer1 ) );
f1 = cutGetTimerValue( iTimer1 );
sprintf_s( buffLogData, "TestWrite4: Full: [%0.3f], Average: [%0.3f]ms, Data output size: [%d] (bytes)\n", f1, fKernelTime / iIterCount, iOutputVAGCSize * sizeof( float ) );
LogData( buffLogData );
}
define CUT_CHECK_ERROR_LOG_TO_FILE(errorMessage) do { \
cudaError_t err = cudaGetLastError(); \
char buff[ 1000 ]; \
if( cudaSuccess != err) { \
sprintf_s( buff, "Cuda error: %s in file '%s' in line %i : %s[%d].\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err), err );\
LogData( buff ); \
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
sprintf_s( buff, "Cuda error: %s in file '%s' in line %i : %s[%d].\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err), err );\
LogData( buff ); \
exit(EXIT_FAILURE); \
} } while (0)
[/codebox]