According to Table 2 in the CUDA C Programming Guide (http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions), the number of operations per clock cycle per multiprocessor is 128 for 32-bit floating-point add but only 4 for 64-bit floating-point add — that is, 64-bit floating-point add should be 32 times slower.
However, when I used the following code to measure the speed difference, the float version was only 9 to 10 times faster than the double version. Does anyone know the reason?
#define N 1000
typedef double Real;// double or float
// Device code
__global__ void VecAdd(Real* A, Real* B, Real* C)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) {
Real a = A[i];
Real b = B[i];
Real c = 0.0f;
for (int j = 0; j < 10000; j++)
{
c += (a + b);
}
C[i] = c;
}
}
// Host code
int main()
{
size_t size = N * sizeof(Real);
// Allocate input vectors h_A and h_B in host memory
Real* h_A = (Real*)malloc(size);
Real* h_B = (Real*)malloc(size);
Real* h_C = (Real*)malloc(size);
// Initialize input vectors
for (int i = 0; i < N; i++)
{
h_A[i] = 1.0f + i * 0.1f;
h_B[i] = 100.0f + i * 0.1f;
}
// Allocate vectors in device memory
Real* d_A;
cudaMalloc(&d_A, size);
Real* d_B;
cudaMalloc(&d_B, size);
Real* d_C;
cudaMalloc(&d_C, size);
// Copy vectors from host memory to device memory
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// Invoke kernel
int threadsPerBlock = 256;
int blocksPerGrid =
(N + threadsPerBlock - 1) / threadsPerBlock;
// Time measurement starts
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaEventSynchronize(start);
for (int i = 0; i < 10000; i++)
{
VecAdd << <blocksPerGrid, threadsPerBlock >> >(d_A, d_B, d_C);
}
// Time measurement ends
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Time to generate: %3.8f ms\n", elapsedTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Copy result from device memory to host memory
// h_C contains the result in host memory
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
}
I used the following compile command:
nvcc -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin" -O0 main.cu -o main
with CUDA Toolkit 8.0. My system is 64-bit Windows 10 with a GeForce 1080, driver version 372.90.
Here is the PTX file:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-20732876
// Cuda compilation tools, release 8.0, V8.0.26
// Based on LLVM 3.4svn
//
.version 5.0
.target sm_20
.address_size 32
// .globl _Z6VecAddPfS_S_
.visible .entry _Z6VecAddPfS_S_(
.param .u32 _Z6VecAddPfS_S__param_0,
.param .u32 _Z6VecAddPfS_S__param_1,
.param .u32 _Z6VecAddPfS_S__param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<57>;
.reg .b32 %r<20>;
ld.param.u32 %r5, [_Z6VecAddPfS_S__param_0];
ld.param.u32 %r6, [_Z6VecAddPfS_S__param_1];
ld.param.u32 %r7, [_Z6VecAddPfS_S__param_2];
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r1, %r8, %r9, %r10;
setp.gt.s32 %p1, %r1, 999;
@%p1 bra BB0_4;
cvta.to.global.u32 %r2, %r7;
cvta.to.global.u32 %r12, %r5;
shl.b32 %r13, %r1, 2;
add.s32 %r14, %r12, %r13;
cvta.to.global.u32 %r15, %r6;
add.s32 %r16, %r15, %r13;
ld.global.f32 %f5, [%r16];
ld.global.f32 %f6, [%r14];
add.f32 %f1, %f6, %f5;
mov.f32 %f56, 0f00000000;
mov.u32 %r19, 10000;
BB0_2:
add.f32 %f7, %f1, %f56;
add.f32 %f8, %f1, %f7;
add.f32 %f9, %f1, %f8;
add.f32 %f10, %f1, %f9;
add.f32 %f11, %f1, %f10;
add.f32 %f12, %f1, %f11;
add.f32 %f13, %f1, %f12;
add.f32 %f14, %f1, %f13;
add.f32 %f15, %f1, %f14;
add.f32 %f16, %f1, %f15;
add.f32 %f17, %f1, %f16;
add.f32 %f18, %f1, %f17;
add.f32 %f19, %f1, %f18;
add.f32 %f20, %f1, %f19;
add.f32 %f21, %f1, %f20;
add.f32 %f22, %f1, %f21;
add.f32 %f23, %f1, %f22;
add.f32 %f24, %f1, %f23;
add.f32 %f25, %f1, %f24;
add.f32 %f26, %f1, %f25;
add.f32 %f27, %f1, %f26;
add.f32 %f28, %f1, %f27;
add.f32 %f29, %f1, %f28;
add.f32 %f30, %f1, %f29;
add.f32 %f31, %f1, %f30;
add.f32 %f32, %f1, %f31;
add.f32 %f33, %f1, %f32;
add.f32 %f34, %f1, %f33;
add.f32 %f35, %f1, %f34;
add.f32 %f36, %f1, %f35;
add.f32 %f37, %f1, %f36;
add.f32 %f38, %f1, %f37;
add.f32 %f39, %f1, %f38;
add.f32 %f40, %f1, %f39;
add.f32 %f41, %f1, %f40;
add.f32 %f42, %f1, %f41;
add.f32 %f43, %f1, %f42;
add.f32 %f44, %f1, %f43;
add.f32 %f45, %f1, %f44;
add.f32 %f46, %f1, %f45;
add.f32 %f47, %f1, %f46;
add.f32 %f48, %f1, %f47;
add.f32 %f49, %f1, %f48;
add.f32 %f50, %f1, %f49;
add.f32 %f51, %f1, %f50;
add.f32 %f52, %f1, %f51;
add.f32 %f53, %f1, %f52;
add.f32 %f54, %f1, %f53;
add.f32 %f55, %f1, %f54;
add.f32 %f56, %f1, %f55;
add.s32 %r19, %r19, -50;
setp.ne.s32 %p2, %r19, 0;
@%p2 bra BB0_2;
add.s32 %r18, %r2, %r13;
st.global.f32 [%r18], %f56;
BB0_4:
ret;
}