Hi, all
I use the following codes for testing offset copy bandwidth on GTX460 and GTX295. I came up with the problem that peak bandwidth on GTX460 is 75GB/s, and GTX295 90Gb/s.
GTX460 has 336 cuda cores and memory bandwidth is 115, and GTX 448 cuda cores and memory bandwidth is 119,
GTX460 is Fermi and GTX295 is Tesla,
What caused the lower bandwidth on GTX460?
/* copy with offset */
// Includes
#include <stdio.h>
#include <cutil_inline.h>
// Variables
float* h_A;
float* h_C;
float* d_A;
float* d_C;
bool noprompt = false;
// Functions
void Cleanup(void);
void RandomInit(float*, int);
void ParseArguments(int, char**);
// Device code
__global__ void offset_copy(const float* A, float* C, int N, int offset)
{
int i = blockDim.x * blockIdx.x + threadIdx.x+offset;
if (i < N)
C[i] = A[i] ;
}
// Host code
int main(int argc, char** argv)
{
printf("Vector addition\n");
int N = 10240000;
int offset=0;
const int size = N * sizeof(float);
// ParseArguments(argc, argv);
cudaEvent_t start;
cudaEvent_t stop;
// Allocate input vectors h_A and h_B in host memory
h_A = (float*)malloc(size);
if (h_A == 0) Cleanup();
//h_B = (float*)malloc(size);
//if (h_B == 0) Cleanup();
h_C = (float*)malloc(size);
if (h_C == 0) Cleanup();
// Initialize input vectors
RandomInit(h_A, N);
// RandomInit(h_B, N);
// Allocate vectors in device memory
cutilSafeCall( cudaMalloc((void**)&d_A, size) );
cutilSafeCall( cudaMalloc((void**)&d_C, size) );
// Copy vectors from host memory to device memory
cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaEventCreate(&start) );
cutilSafeCall( cudaEventCreate(&stop) );
//kernel initial
void (*kernel)(const float *, float *, int,int);
//
char *kernelName;
for (int offse = 0; offse<33; offse++) {
kernel = &offset_copy;
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, N,offset);
cutilSafeCall( cudaEventRecord(start, 0) );
for(int i=0;i<1000;i++)
{
kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, N,offset);
}
cutilSafeCall( cudaEventRecord(stop, 0) );
cutilSafeCall( cudaEventSynchronize(stop) );
float outerTime;
cutilSafeCall( cudaEventElapsedTime(&outerTime, start, stop) );
cutilCheckMsg("kernel launch failure");
#ifdef _DEBUG
cutilSafeCall( cudaThreadSynchronize() );
#endif
// Copy result from device memory to host memory
// h_C contains the result in host memory
cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
// Verify result
float outerBandwidth = 2.0f * 1000.0f * size/(1024*1024*1024)/(outerTime/1000);
printf("\n");
printf( "transpose-Outer-%s, Throughput = %.4f GB/s, Time = %.5f s, offset= %.4d, size= %.9d",
"offsetCopy",
outerBandwidth,
outerTime/10,
offset,size);
}
int i;
for (i = 0; i <N; ++i) {
float sum = h_A[i] ;
if (fabs(h_C[i] - sum) > 1e-5)
break;
}
printf("%s \n", (i == N) ? "PASSED" : "FAILED");
Cleanup();
}
void Cleanup(void)
{
// Free device memory
if (d_A)
cudaFree(d_A);
//if (d_B)
// cudaFree(d_B);
if (d_C)
cudaFree(d_C);
// Free host memory
if (h_A)
free(h_A);
//if (h_B)
// free(h_B);
if (h_C)
free(h_C);
cutilSafeCall( cudaThreadExit() );
if (!noprompt) {
printf("\nPress ENTER to exit...\n");
fflush( stdout);
fflush( stderr);
getchar();
}
exit(0);
}
// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX;
}
Thx.