I tried to measure the memory bandwidth of my A800 80GB GPU and observed a strange phenomenon.
When I used the FP32 format to test bandwidth, the read bandwidth reached only about half of the peak performance. When I changed FP32 to double, I could reach peak performance.
But when I tested write bandwidth, the format I used had no influence on the memory bandwidth.
To avoid L1/L2 cache effects, I access each data element only once.
So is there some difference between FP32 and double when the GPU reads data from DRAM?
My test code is as follows:
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
// Read-bandwidth probe: each thread loads exactly one float from MatA at its
// global 1D index.
//
// Parameters:
//   MatA - device buffer that is read (and, only if the loaded value equals
//          123.0f, written back with the same value).
//   MatB - unused by this kernel; kept so existing launches keep compiling.
//
// Preconditions: 1D launch. There is no bounds check (the signature carries no
// element count), so gridDim.x * blockDim.x must not exceed the number of
// floats in MatA.
__global__ void copyRow(float * MatA,float * MatB)
{
    // Widen before multiplying so the index cannot wrap for very large grids.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    float tmp = MatA[idx];
    // The conditional store gives the load an observable effect, which prevents
    // the compiler from optimizing the read away. The literal carries the 'f'
    // suffix so the comparison stays in single precision instead of being
    // promoted to double.
    if (tmp == 123.0f)
    {
        MatA[idx] = tmp;
    }
}
// Reports a failed CUDA runtime call on stderr and terminates the process.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Test driver: allocates two device buffers of 128 Mi floats, launches copyRow
// ten times over the first one, then copies the second one back to the host.
// Returns 0 on success; any allocation, launch or runtime failure is reported
// on stderr and ends the process with a failure status.
int main(int argc,char** argv)
{
    (void)argc;
    (void)argv;
    printf("strating...\n");

    const int nxy = 128*1024*1024;
    const int threadsPerBlock = 1024;
    // Compute the byte count in size_t so that larger element counts do not
    // overflow a 32-bit int.
    const size_t nBytes = (size_t)nxy * sizeof(float);

    // Host destination for the device-to-host copy below.
    float* B_host = (float*)malloc(nBytes);
    if (B_host == NULL)
    {
        fprintf(stderr, "malloc of %zu bytes failed\n", nBytes);
        return EXIT_FAILURE;
    }

    float* A_dev = NULL;
    float* B_dev = NULL;
    checkCuda(cudaMalloc((void**)&A_dev, nBytes), "cudaMalloc(A_dev)");
    checkCuda(cudaMalloc((void**)&B_dev, nBytes), "cudaMalloc(B_dev)");

    // cudaMalloc does not initialize memory. Zero both buffers so the kernel
    // reads defined data (and never takes its conditional-store branch) and
    // the copy back to the host transfers defined data.
    checkCuda(cudaMemset(A_dev, 0, nBytes), "cudaMemset(A_dev)");
    checkCuda(cudaMemset(B_dev, 0, nBytes), "cudaMemset(B_dev)");

    // nxy is an exact multiple of threadsPerBlock, so the grid covers the
    // buffer exactly; copyRow has no bounds check and relies on this.
    for (int test = 0; test < 10; test++)
    {
        copyRow<<<nxy/threadsPerBlock, threadsPerBlock>>>(A_dev, B_dev);
        // Launch-configuration errors are only visible through cudaGetLastError.
        checkCuda(cudaGetLastError(), "copyRow launch");
    }
    // Surface any asynchronous execution error before moving on.
    checkCuda(cudaDeviceSynchronize(), "copyRow execution");

    checkCuda(cudaMemcpy(B_host, B_dev, nBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(B_dev -> B_host)");

    checkCuda(cudaFree(A_dev), "cudaFree(A_dev)");
    checkCuda(cudaFree(B_dev), "cudaFree(B_dev)");
    free(B_host);
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}