Offset copy Bandwidth on GTX460 and GTX295

Hi, all

I used the following code to test offset-copy bandwidth on a GTX460 and a GTX295. I ran into the problem that the peak bandwidth on the GTX460 is only 75 GB/s, while the GTX295 reaches 90 GB/s.

The GTX460 has 336 CUDA cores and a memory bandwidth of 115 GB/s, while the GTX295 has 448 CUDA cores and a memory bandwidth of 119 GB/s.

The GTX460 is a Fermi-architecture card and the GTX295 is a Tesla-architecture card.

What caused the lower bandwidth on GTX460?

/* copy with offset */

// Includes

#include <stdio.h>

#include <cutil_inline.h>

// Variables

// Host-side source vector (filled with random data by RandomInit).
float* h_A;

// Host-side destination buffer used to verify the device copy.
float* h_C;

// Device-side source vector (uploaded from h_A).
float* d_A;

// Device-side destination vector written by the offset_copy kernel.
float* d_C;

// When true, Cleanup() exits without waiting for the user to press ENTER.
bool noprompt = false;

// Functions

// Frees device/host allocations, shuts down the CUDA context, and exits.
void Cleanup(void);

// Fills a float array with uniform random values in [0, 1].
void RandomInit(float*, int);

// Command-line parsing helper (declared but not called in this test).
void ParseArguments(int, char**);

// Device code

// Copies A into C starting at `offset`: each thread handles the element at
// (global thread id + offset), so elements [offset, N) are written and the
// first `offset` elements of C are left untouched. Expects a 1-D launch;
// the bounds guard makes any grid size safe.
__global__ void offset_copy(const float* A,  float* C, int N, int offset)
{
    int idx = offset + threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N)
    {
        C[idx] = A[idx];
    }
}

// Host code

// Offset-copy bandwidth microbenchmark: for each offset in [0, 32], time 1000
// launches of offset_copy and report the effective read+write throughput.
//
// Bug fixed: the original sweep loop declared a loop variable (`offse`) but
// always launched the kernel with a separate `offset` variable stuck at 0, so
// every iteration measured the aligned (offset = 0) case. The loop variable
// is now the offset actually passed to the kernel.
int main(int argc, char** argv)
{
    printf("Offset copy bandwidth test\n");

    int N = 10240000;                    // elements per vector
    const int size = N * sizeof(float);  // bytes per vector

    cudaEvent_t start;
    cudaEvent_t stop;

    // Allocate host buffers; Cleanup() exits the process on failure.
    h_A = (float*)malloc(size);
    if (h_A == 0) Cleanup();
    h_C = (float*)malloc(size);
    if (h_C == 0) Cleanup();

    // Initialize the source vector with random data.
    RandomInit(h_A, N);

    // Allocate device buffers and upload the source vector once.
    cutilSafeCall( cudaMalloc((void**)&d_A, size) );
    cutilSafeCall( cudaMalloc((void**)&d_C, size) );
    cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaEventCreate(&start) );
    cutilSafeCall( cudaEventCreate(&stop) );

    // Launch configuration: 1-D grid covering all N elements.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Sweep misalignment offsets 0..32 (one full warp plus one).
    for (int offset = 0; offset < 33; offset++) {

        // Untimed warm-up launch so the timed loop excludes one-time costs.
        offset_copy<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, N, offset);

        cutilSafeCall( cudaEventRecord(start, 0) );
        for (int i = 0; i < 1000; i++)
        {
            offset_copy<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, N, offset);
        }
        cutilSafeCall( cudaEventRecord(stop, 0) );
        cutilSafeCall( cudaEventSynchronize(stop) );

        float outerTime;  // elapsed milliseconds for the 1000 timed launches
        cutilSafeCall( cudaEventElapsedTime(&outerTime, start, stop) );
        cutilCheckMsg("kernel launch failure");

#ifdef _DEBUG
        cutilSafeCall( cudaThreadSynchronize() );
#endif

        // GB/s: factor 2 for read+write, 1000 timed iterations, bytes -> GiB,
        // and outerTime converted from milliseconds to seconds.
        float outerBandwidth =
            2.0f * 1000.0f * size / (1024.0f * 1024.0f * 1024.0f) / (outerTime / 1000.0f);

        printf("\n");
        printf( "transpose-Outer-%s, Throughput = %.4f GB/s, Time = %.5f s, offset= %.4d, size= %.9d",
               "offsetCopy",
               outerBandwidth,
               outerTime / 1000.0f,  // total seconds for the 1000 launches (was outerTime/10, a unit error)
               offset, size);
    }

    // Copy the result back once, after the sweep (the original copied it every
    // iteration, which only added untimed PCIe traffic). Elements below the
    // final offset were already written by the earlier offset-0 runs, so every
    // element of C should match A.
    cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );

    // Verify result against the host source vector.
    int i;
    for (i = 0; i < N; ++i) {
        if (fabs(h_C[i] - h_A[i]) > 1e-5)
            break;
    }
    printf("%s \n", (i == N) ? "PASSED" : "FAILED");

    cutilSafeCall( cudaEventDestroy(start) );
    cutilSafeCall( cudaEventDestroy(stop) );

    Cleanup();
}

	// Releases every allocation this program made, tears down the CUDA
	// context, optionally waits for the user, then terminates the process.
void Cleanup(void)
{
    /* Device allocations (pointers are file-scope, null until cudaMalloc). */
    if (d_A)
        cudaFree(d_A);
    if (d_C)
        cudaFree(d_C);

    /* Host allocations. */
    if (h_A)
        free(h_A);
    if (h_C)
        free(h_C);

    /* Tear down the CUDA context for this host thread. */
    cutilSafeCall( cudaThreadExit() );

    /* Keep the console window open unless the user asked not to prompt. */
    if (!noprompt) {
        printf("\nPress ENTER to exit...\n");
        fflush( stdout);
        fflush( stderr);
        getchar();
    }

    exit(0);
}

// Allocates an array with random float entries.

// Fills data[0..n-1] with pseudo-random floats uniformly drawn from [0, 1].
// Uses the C library PRNG, so the sequence is reproducible via srand().
void RandomInit(float* data, int n)
{
    int k = 0;
    while (k < n)
    {
        data[k] = rand() / (float)RAND_MAX;
        ++k;
    }
}

Thx.