Hi,
I am trying to reproduce the experiment that measures the effect of misaligned accesses on effective bandwidth, which is nicely explained in the CUDA C Programming Best Practices Guide, but my numbers differ from the guide's. I have a GTX 280 card. For this card, Figure 3.7 of the guide shows memory bandwidth at three distinct levels: 66, 80, and 120 GB/s. In my experiment, I get 47, 57, and 75 GB/s for the corresponding levels. Here is my code, which is mostly copied from the guide:
[codebox]
// Copies n floats from idata to odata, with every access shifted by `offset`
// elements, so the alignment of the accesses can be varied by the caller.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element; threads whose
// global index is >= n do nothing. Only blockIdx.x / threadIdx.x are used.
// Precondition: both buffers must hold at least (n + offset) floats, since the
// highest index touched is (n - 1 + offset).
__global__ void offsetCopy(float *odata, float *idata, int n, int offset)
{
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
    if (xid < n)
        odata[xid + offset] = idata[xid + offset];
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Times numItr back-to-back launches of offsetCopy at one offset and stores the
// elapsed time in milliseconds in *elapsedMs. Returns false on any CUDA error.
static bool timeOffsetCopy(float *dodata, float *didata, int numDataElems, int offset,
                           int numItr, dim3 gSize, dim3 bSize,
                           cudaEvent_t start, cudaEvent_t stop, float *elapsedMs)
{
    // Untimed warm-up launch so one-time launch overhead is not part of the measurement.
    offsetCopy<<<gSize, bSize>>>(dodata, didata, numDataElems, offset);
    // Kernel launches do not return a status; a rejected launch configuration
    // is only visible through cudaGetLastError().
    if (!cudaOk(cudaGetLastError(), "offsetCopy warm-up launch"))
        return false;

    if (!cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)"))
        return false;
    for (int i = 0; i < numItr; i++)
    {
        offsetCopy<<<gSize, bSize>>>(dodata, didata, numDataElems, offset);
    }
    if (!cudaOk(cudaGetLastError(), "offsetCopy timed launch"))
        return false;
    if (!cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)"))
        return false;
    // Blocks until the stop event (and therefore every timed kernel) has completed;
    // also surfaces errors raised while the kernels were executing.
    if (!cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)"))
        return false;
    return cudaOk(cudaEventElapsedTime(elapsedMs, start, stop), "cudaEventElapsedTime");
}

// Measures the effective bandwidth of offsetCopy for element offsets 0..maxOffset
// and prints one line per offset. Bandwidth is (bytes read + bytes written) / 1e9,
// divided by the elapsed seconds, i.e. GB/s with 1 GB = 10^9 bytes.
//
// On any CUDA error the sweep stops, the error is printed to stderr, and all
// resources acquired so far are released.
//
// NOTE(review): the grid is 1-D with numDataElems / 256 = 131072 blocks. Devices
// of compute capability below 3.0 limit gridDim.x to 65535 and will reject this
// launch; the error check after the launch reports that instead of printing a
// meaningless bandwidth figure.
void testMemAccessThruput()
{
    const int maxOffset = 16;
    const int numItr = 10;
    const int numDataElems = 1024 * 1024 * 32;
    // Each buffer carries maxOffset extra elements so the shifted accesses stay in bounds.
    const size_t numBytes = sizeof(float) * (size_t(numDataElems) + maxOffset);

    float *didata = NULL;
    float *dodata = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;

    dim3 bSize, gSize;
    bSize.x = 256;
    // Ceil-div so every element is covered even if the element count is later
    // changed to something that is not a multiple of the block size.
    gSize.x = (numDataElems + bSize.x - 1) / bSize.x;

    // Short-circuit chain: the first failure stops further setup.
    bool ok = cudaOk(cudaMalloc((void**) &didata, numBytes), "cudaMalloc(didata)")
           && cudaOk(cudaMalloc((void**) &dodata, numBytes), "cudaMalloc(dodata)")
           // Give the input defined contents so the kernel does not read uninitialized memory.
           && cudaOk(cudaMemset(didata, 0, numBytes), "cudaMemset(didata)")
           // Events are created once and reused for every offset.
           && cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    for (int o = 0; ok && o <= maxOffset; o++)
    {
        float elapsedMs = 0.f;
        ok = timeOffsetCopy(dodata, didata, numDataElems, o, numItr,
                            gSize, bSize, start, stop, &elapsedMs);
        if (!ok)
            break;

        const double seconds = elapsedMs / 1000.0; // convert from msec to sec
        // Factor 2: every element is read once and written once per launch.
        const double gigabytes =
            double(numItr) * sizeof(float) * double(numDataElems) * 2.0 / (1000.0 * 1000.0 * 1000.0);
        const double thruput = gigabytes / seconds;
        printf("For offset %d, the bandwidth is %f\n", o, thruput);
    }

    // Release everything that was acquired, whether or not the sweep succeeded.
    if (stop)
        cudaEventDestroy(stop);
    if (start)
        cudaEventDestroy(start);
    cudaFree(dodata); // cudaFree on a null pointer performs no operation
    cudaFree(didata);
}
[/codebox]
Any idea why my effective bandwidth is different?