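The CUDA Best Practices Guide gives the following formulas (Br and Bw are the bytes read and written, time is in seconds, clockRate is the memory clock in MHz):
Effective bandwidth (GB/s) = ((Br + Bw) / 10^9) / time
Theoretical bandwidth (GB/s) = (clockRate * 10^6 * (bitwidth / 8) * 2) / 10^9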
So the theoretical bandwidth of my GTX 260 (216 SP) is
(1175 * 10^6 * (448 / 8) * 2) / 10^9 = 131.6 GB/s
In practice, my measured effective bandwidth is only 1.832 GB/s. Is that too low?
[codebox] #define z_uint8 unsigned char
#define z_float32 float
#define z_int32 int
// Edge length (in elements) of the square input buffer.
#define N 1024
// Scale factor applied to N to obtain the edge length of the output buffer.
#define R 1.23456789

// Global host buffers.
// The input holds N*N elements; the output holds int(N*R)*int(N*R) elements.
// The 8-bit arrays are the caller-facing data, the float arrays are the
// staging copies that are actually transferred to and from the device.
z_uint8 G_Input[N*N];
z_uint8 G_Output[int(N*R)*int(N*R)];
z_float32 G_Input2[N*N];
z_float32 G_Output2[int(N*R)*int(N*R)];
…
…
{
    // Measures host<->device copy throughput: each iteration uploads the
    // float input buffer and downloads the float output buffer, and the
    // elapsed time is taken with CUDA events.
    //
    // NOTE: this times cudaMemcpy between host globals and device memory, so
    // the result is bounded by the host<->device link, not by the GPU's
    // on-board memory bandwidth. It is therefore not comparable with the
    // "theoretical bandwidth" formula, which describes device memory.

    const z_int32 size1 = N*N;                  // input element count
    const z_int32 size2 = int(N*R)*int(N*R);    // output element count
    const size_t srcBytes = size1 * sizeof(z_float32);
    const size_t dstBytes = size2 * sizeof(z_float32);
    const z_int32 iter = 10;                    // number of timed round trips

    // Host-side widening to float is done before the timer starts so that
    // CPU work does not count against the transfer bandwidth.
    for (z_int32 i = 0; i < size1; i++) G_Input2[i] = z_float32(G_Input[i]);

    // Allocation is slow and synchronous; keep it outside the timed region.
    z_float32* d_src = 0;
    z_float32* d_dest = 0;
    cudaMalloc( (void**)&d_src, srcBytes );
    cudaMalloc( (void**)&d_dest, dstBytes );
    // Nothing in this block writes d_dest, so give it defined contents
    // (all-zero bytes == 0.0f) before it is copied back to the host.
    cudaMemset( d_dest, 0, dstBytes );

    cudaEvent_t start, stop;
    float time = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // ---- timed region: transfers only ----
    cudaEventRecord( start, 0 );
    for (z_int32 i = 0; i < iter; i++)
    {
        cudaMemcpy( d_src, G_Input2, srcBytes, cudaMemcpyHostToDevice );
        cudaMemcpy( G_Output2, d_dest, dstBytes, cudaMemcpyDeviceToHost );
    }
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    // ---- end of timed region ----

    // Pick up any error raised by the runtime calls above; a failed
    // allocation or copy would make the timing meaningless.
    cudaError_t err = cudaGetLastError();

    // Narrow the downloaded floats back into the 8-bit output buffer.
    for (z_int32 i = 0; i < size2; i++) G_Output[i] = z_uint8(G_Output2[i]);

    cudaFree( d_dest );
    cudaFree( d_src );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    if (err != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }
    else if (time > 0.0f)
    {
        printf("Gpu time: %f milliseconds. \n", time );
        // bytes per iteration * iterations / (ms * 1e6) == GB/s
        printf("Gpu bandwidth: %f GB/s. \n",
               double(srcBytes + dstBytes) * iter / 1e6 / time );
    }
    else
    {
        printf("Gpu time: %f milliseconds. \n", time );
    }
}[/codebox]