Hello All,

I did some micro-benchmarking of the addition operation for both fp16 and fp32 on a GTX 1080 Ti, with -Xptxas -O0 to turn off PTX assembler optimization.

kernel for half_add()

```
// Micro-benchmark for a single half-precision add (__hadd).
// Records three (start, end) clock() pairs per thread:
//   pair 0: back-to-back clock() reads -> measurement overhead baseline
//   pair 1: clock() reads bracketing the __hadd -> overhead + hadd latency
//   pair 2: back-to-back clock() reads again -> sanity check of the baseline
// my_array[0..1] supply the operands; the result is stored to my_array[2]
// so the compiler cannot eliminate the add as dead code.
__global__ void kern_hadd (half *my_array, uint *start_t, uint *end_t)
{
unsigned int start_time1;
unsigned int start_time2;
unsigned int start_time3;
unsigned int end_time1;
unsigned int end_time2;
unsigned int end_time3;
half a = my_array[0];  // load operands from global memory BEFORE timing starts
half b = my_array[1];
half c;
__syncthreads();  // barrier so all threads in the block begin timing together
start_time1 = clock();  // pair 0: empty interval (clock-read overhead only)
end_time1 = clock();
__syncthreads();
start_time2 = clock();  // pair 1: interval containing the operation under test
c= __hadd(a, b);
end_time2 = clock();
__syncthreads();
start_time3 = clock();  // pair 2: empty interval again (baseline repeat)
end_time3 = clock();
start_t[0] = start_time1;  // write all samples out after timing is done,
start_t[1] = start_time2;  // so the stores don't perturb the measured intervals
start_t[2] = start_time3;
end_t[0] = end_time1;
end_t[1] = end_time2;
end_t[2] = end_time3;
my_array[2] = c;  // keep the result live so the add is not optimized away
}
```

Sass for half_add

```
/*0228*/ MOV R8, R8; /* 0x5c98078000870008 */
/*0230*/ BAR.SYNC 0x0; /* 0xf0a81b8000070000 */
/*0238*/ CS2R R10, SR_CLOCKLO; /* 0x50c800000507000a */
/* 0x00643c03fde01fef */
/*0248*/ MOV R10, R10; /* 0x5c98078000a7000a */
/*0250*/ MOV R11, R10; /* 0x5c98078000a7000b */
/*0258*/ HADD2 R4, R4.H0_H0, R9.H0_H0; /* 0x5d11000020970404 */
/* 0x007fbc03fde01fef */
/*0268*/ CS2R R9, SR_CLOCKLO; /* 0x50c8000005070009 */
/*0270*/ MOV R9, R9; /* 0x5c98078000970009 */
/*0278*/ MOV R10, R9; /* 0x5c9807800097000a */
/* 0x007fbc03fde019ef */
/*0288*/ BAR.SYNC 0x0; /* 0xf0a81b8000070000 */
/*0290*/ CS2R R9, SR_CLOCKLO; /* 0x50c8000005070009 */
/*0298*/ MOV R9, R9; /* 0x5c98078000970009 */
```

syn1: 45 clk/warp (2 MOV)

syn2: 131 clk/warp (2 MOV + HADD2)

syn3: 45 clk/warp (2 MOV)

__hadd() consumes around 131 − 45 = 86 clocks

kernel for float_add()

```
// Micro-benchmark for a single single-precision add (a + b -> FADD).
// Mirrors kern_hadd: records three (start, end) clock() pairs per thread:
//   pair 0: back-to-back clock() reads -> measurement overhead baseline
//   pair 1: clock() reads bracketing the add -> overhead + fadd latency
//   pair 2: back-to-back clock() reads again -> sanity check of the baseline
// my_array[0..1] supply the operands; the result is stored to my_array[2]
// so the compiler cannot eliminate the add as dead code.
__global__ void kern_fadd (float *my_array, uint *start_t, uint *end_t)
{
unsigned int start_time1;
unsigned int start_time2;
unsigned int start_time3;
unsigned int end_time1;
unsigned int end_time2;
unsigned int end_time3;
float a = my_array[0];  // load operands from global memory BEFORE timing starts
float b = my_array[1];
float c;
__syncthreads();  // barrier so all threads in the block begin timing together
start_time1 = clock();  // pair 0: empty interval (clock-read overhead only)
end_time1 = clock();
__syncthreads();
start_time2 = clock();  // pair 1: interval containing the operation under test
c = a + b;
end_time2 = clock();
__syncthreads();
start_time3 = clock();  // pair 2: empty interval again (baseline repeat)
end_time3 = clock();
start_t[0] = start_time1;  // write all samples out after timing is done,
start_t[1] = start_time2;  // so the stores don't perturb the measured intervals
start_t[2] = start_time3;
end_t[0] = end_time1;
end_t[1] = end_time2;
end_t[2] = end_time3;
my_array[2] = c;  // keep the result live so the add is not optimized away
}
```

Sass for float_add()

```
/*0228*/ MOV R8, R8; /* 0x5c98078000870008 */
/*0230*/ BAR.SYNC 0x0; /* 0xf0a81b8000070000 */
/*0238*/ CS2R R10, SR_CLOCKLO; /* 0x50c800000507000a */
/* 0x007fbc03fde01fef */
/*0248*/ MOV R10, R10; /* 0x5c98078000a7000a */
/*0250*/ MOV R11, R10; /* 0x5c98078000a7000b */
/*0258*/ FADD R7, R7, R9; /* 0x5c58000000970707 */
/* 0x007fbc03fde01fef */
/*0268*/ MOV R7, R7; /* 0x5c98078000770007 */
/*0270*/ CS2R R9, SR_CLOCKLO; /* 0x50c8000005070009 */
/*0278*/ MOV R9, R9; /* 0x5c98078000970009 */
/* 0x007fbc033de01fef */
/*0288*/ MOV R10, R9; /* 0x5c9807800097000a */
/*0290*/ BAR.SYNC 0x0; /* 0xf0a81b8000070000 */
/*0298*/ CS2R R9, SR_CLOCKLO; /* 0x50c8000005070009 */
```

syn1: 45 clk/warp (2 MOV)

syn2: 75 clk/warp (2 MOV + FADD + MOV)

syn3: 45 clk/warp (2 MOV)

Float addition consumes around 15 clocks (the 75 − 45 = 30 clock delta also includes one extra MOV)

It appears that fp16 addition is not as fast as fp32. Is this true? Can we say that the benefit of using fp16 comes mainly from reducing memory bandwidth?