I have rewritten your code to avoid some of the issues I mentioned. Here is a complete test case, run on a Tesla V100 (CentOS 7, CUDA 10.1.243), which suggests to me that there is no significant difference in kernel execution time between the two cases as I have written them:
$ cat t1641.cu
#include <iostream>
#include <cuda_fp16.h>
__device__ __half w(__half half_a, __half half_b){
  // all arithmetic done in fp16
  return __hadd(half_a, half_b) + hsin(half_a)*hsin(half_a) + hcos(half_b)*hcos(half_b) + hexp(half_b);
}

__global__ void add_sin_cos_half_host_pinned(__half* a, __half* b, __half* c, int N){
  int idx = blockIdx.x*blockDim.x+threadIdx.x;
  if(idx < N){
    // inputs are already fp16, so no conversions are needed
    __half half_a = a[idx];
    __half half_b = b[idx];
    __half half_c = 0;
    for(int i=0; i<1000; ++i)
      half_c += w(half_a, half_b);
    c[idx] = half_c;
  }
}

__global__ void add_sin_cos_half_pinned(float* a, float* b, float* c, int N){
  int idx = blockIdx.x*blockDim.x+threadIdx.x;
  if(idx < N){
    // convert float->half once on load, half->float once on store
    __half half_a = __float2half(a[idx]);
    __half half_b = __float2half(b[idx]);
    __half half_c = 0;
    for(int i=0; i<1000; ++i)
      half_c += w(half_a, half_b);
    c[idx] = __half2float(half_c);
  }
}
const int ds = 32*1048576;   // 32M elements per array

int main(){
  __half *d_ha, *d_hb, *d_hc;
  float *d_fa, *d_fb, *d_fc;
  const int hds = ds*sizeof(__half);
  const int fds = ds*sizeof(float);
  cudaMalloc(&d_ha, hds);
  cudaMalloc(&d_hb, hds);
  cudaMalloc(&d_hc, hds);
  cudaMalloc(&d_fa, fds);
  cudaMalloc(&d_fb, fds);
  cudaMalloc(&d_fc, fds);
  cudaMemset(d_ha, 0, hds);
  cudaMemset(d_hb, 0, hds);
  cudaMemset(d_fa, 0, fds);
  cudaMemset(d_fb, 0, fds);
  // launch each kernel twice; nvprof reports min/avg/max over the two calls
  add_sin_cos_half_pinned<<<ds/1024, 1024>>>(d_fa, d_fb, d_fc, ds);
  add_sin_cos_half_host_pinned<<<ds/1024, 1024>>>(d_ha, d_hb, d_hc, ds);
  add_sin_cos_half_pinned<<<ds/1024, 1024>>>(d_fa, d_fb, d_fc, ds);
  add_sin_cos_half_host_pinned<<<ds/1024, 1024>>>(d_ha, d_hb, d_hc, ds);
  cudaDeviceSynchronize();
}
$ nvcc -arch=sm_70 -o t1641 t1641.cu
$ nvprof ./t1641
==12946== NVPROF is profiling process 12946, command: ./t1641
==12946== Profiling application: ./t1641
==12946== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   50.00%  192.42ms         2  96.209ms  96.086ms  96.331ms  add_sin_cos_half_pinned(float*, float*, float*, int)
                   49.99%  192.38ms         2  96.189ms  96.082ms  96.296ms  add_sin_cos_half_host_pinned(__half*, __half*, __half*, int)
                    0.00%  5.2800us         4  1.3200us  1.1840us  1.5680us  [CUDA memset]
      API calls:   54.80%  385.16ms         1  385.16ms  385.16ms  385.16ms  cudaDeviceSynchronize
                   43.64%  306.72ms         6  51.119ms  325.74us  304.83ms  cudaMalloc
                    0.77%  5.3857ms         4  1.3464ms  677.62us  3.3237ms  cuDeviceTotalMem
                    0.71%  4.9661ms       388  12.799us     341ns  525.10us  cuDeviceGetAttribute
                    0.06%  444.14us         4  111.03us  102.39us  126.06us  cuDeviceGetName
                    0.02%  119.44us         4  29.860us  12.701us  75.906us  cudaMemset
                    0.01%  62.672us         4  15.668us  10.173us  29.536us  cudaLaunchKernel
                    0.00%  26.885us         4  6.7210us  3.7720us  12.116us  cuDeviceGetPCIBusId
                    0.00%  8.1800us         8  1.0220us     480ns  1.5930us  cuDeviceGet
                    0.00%  6.3160us         3  2.1050us     336ns  4.0530us  cuDeviceGetCount
                    0.00%  2.4490us         4     612ns     530ns     713ns  cuDeviceGetUuid
$
Also, the kernels as you have written them are not really identical: the half version's result depends on the previous values in c, whereas the float version does not (c is purely overwritten).
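To make that concrete, here is a hypothetical sketch of the asymmetry I mean (illustrative kernels only, not your actual code):

#include <cuda_fp16.h>
// half flavor: accumulates into c, so the result depends on whatever c held before the launch
__global__ void accumulate_half(const __half* a, const __half* b, __half* c, int N){
  int idx = blockIdx.x*blockDim.x+threadIdx.x;
  if(idx < N) c[idx] += __hadd(a[idx], b[idx]);   // reads the old c[idx]
}
// float flavor: purely overwrites c, so its prior contents never matter
__global__ void overwrite_float(const float* a, const float* b, float* c, int N){
  int idx = blockIdx.x*blockDim.x+threadIdx.x;
  if(idx < N) c[idx] = a[idx] + b[idx];           // ignores the old c[idx]
}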
Your posted code also has a typo in it: the first function uses _hadd, which doesn't exist. So this evidently isn't the code you are actually running.
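For reference, the fp16 add intrinsic in cuda_fp16.h is spelled with two leading underscores; a minimal sketch:

#include <cuda_fp16.h>
__device__ __half add_halves(__half x, __half y){
  return __hadd(x, y);   // __hadd exists; _hadd (single underscore) does not
}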