Effective bandwidth differs from the Best Practices Guide

Hi,

I am trying to reproduce the experiment that measures the effect of misaligned accesses on effective bandwidth, which is nicely explained in the CUDA C Programming Best Practices Guide, but I am getting different numbers from the guide. I have a GTX 280 card. For this card, the memory bandwidth values in Figure 3.7 of the guide fall at three distinct levels: 66, 80, and 120 GB/s. In my experiment I get 47, 57, and 75 GB/s for the same levels. Here is my code, which is mostly copied from the guide:

[codebox]

/*
 * Copy kernel used to measure the cost of misaligned global-memory access.
 *
 * Each thread copies one float from idata to odata; every access is shifted
 * by `offset` elements relative to the thread's global index.
 *
 * Launch layout: 1D grid of 1D blocks; only the x dimension is used. Threads
 * whose global index is >= n do nothing, so the grid may overshoot n.
 *
 * Preconditions: both buffers must hold at least n + offset floats, because
 * the highest element touched is index (n - 1) + offset.
 *
 * odata  - destination buffer (device memory)
 * idata  - source buffer (device memory)
 * n      - number of elements to copy
 * offset - element shift applied to both the load and the store
 */
__global__ void offsetCopy(float *odata, float *idata, int n, int offset)
{
    const int xid = blockIdx.x * blockDim.x + threadIdx.x;

    if (xid < n)
    {
        odata[xid + offset] = idata[xid + offset];
    }
}

/* Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess. */
static bool thruputCudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Measures the effective bandwidth of offsetCopy for element offsets
 * 0..maxOffset and prints one line per offset.
 *
 * For each offset the kernel is launched once untimed (warm-up) and then
 * numItr times between two CUDA events. Effective bandwidth is computed as
 * (bytes read + bytes written) / 1e9 / elapsed seconds.
 *
 * Any CUDA failure is reported on stderr and ends the measurement early;
 * device buffers and events are released on every path.
 */
void testMemAccessThruput()
{
    const int maxOffset = 16;
    const int numItr = 10;
    const int numDataElems = 1024 * 1024 * 32;

    /* Buffers are padded by maxOffset so the shifted accesses stay in bounds. */
    const size_t bufBytes = sizeof(float) * (size_t(numDataElems) + size_t(maxOffset));

    float *didata = NULL;
    float *dodata = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;

    const dim3 bSize(256);
    /* Ceil-division so the grid still covers every element if the sizes change. */
    const dim3 gSize((numDataElems + bSize.x - 1) / bSize.x);

    bool ok = thruputCudaOk(cudaMalloc((void**) &didata, bufBytes), "cudaMalloc(didata)")
           && thruputCudaOk(cudaMalloc((void**) &dodata, bufBytes), "cudaMalloc(dodata)")
           /* Give the source buffer defined contents before it is read. */
           && thruputCudaOk(cudaMemset(didata, 0, bufBytes), "cudaMemset(didata)")
           /* Events are reused for every offset instead of being recreated per iteration. */
           && thruputCudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")
           && thruputCudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    for (int o = 0; ok && o <= maxOffset; o++)
    {
        /* Untimed warm-up launch. Checking it also catches an invalid launch
         * configuration: this grid has 131072 blocks in x, which exceeds the
         * 65535 limit on devices below compute capability 3.0. Without the
         * check a rejected launch would be timed as if it had run. */
        offsetCopy<<<gSize, bSize>>>(dodata, didata, numDataElems, o);
        ok = thruputCudaOk(cudaGetLastError(), "offsetCopy warm-up launch");
        if (!ok)
            break;

        ok = thruputCudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
        for (int i = 0; ok && i < numItr; i++)
        {
            offsetCopy<<<gSize, bSize>>>(dodata, didata, numDataElems, o);
            ok = thruputCudaOk(cudaGetLastError(), "offsetCopy timed launch");
        }
        ok = ok && thruputCudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
        /* Waiting on the stop event also surfaces errors raised while the kernels ran. */
        ok = ok && thruputCudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

        float elapsedMs = 0.0f;
        ok = ok && thruputCudaOk(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime");
        if (!ok)
            break;

        if (elapsedMs <= 0.0f)
        {
            fprintf(stderr, "For offset %d, the elapsed time was not positive; skipping\n", o);
            continue;
        }

        const double seconds = double(elapsedMs) / 1000.0; // convert from msec to sec
        /* Each element is read once and written once per launch, hence the factor 2. */
        const double bytesMoved = double(numItr) * double(sizeof(float)) * double(numDataElems) * 2.0;
        const double thruput = (bytesMoved / (1000.0 * 1000.0 * 1000.0)) / seconds;

        printf("For offset %d, the bandwidth is %f\n", o, thruput);
    }

    /* Release everything that was successfully created. */
    if (start != NULL)
        cudaEventDestroy(start);
    if (stop != NULL)
        cudaEventDestroy(stop);
    cudaFree(dodata); /* cudaFree on a null pointer is a no-op */
    cudaFree(didata);
}

[/codebox]

Any idea why my effective bandwidth is different?