Why is FP16 much slower than FP32 on the Jetson Xavier? I'm only doing a simple element-wise addition.
CUDA version: 10.0
The code looks like this:
// Element-wise half2 addition benchmark kernel: des[i] = data[i] + sum[i].
//
// Despite its name, this kernel computes no squares and performs no reduction;
// each element handled is one packed half2 addition via __hadd2.
//
// Parameters:
//   data - first input array, read only.
//   sum  - second input array, read only.
//   des  - output array, written at every index the inner loop visits.
//
// All three arrays are indexed up to HLAF2_DATA_SIZE, a constant defined
// outside this block (the spelling is kept as-is to match that definition).
// NOTE(review): the pointers are presumably device pointers holding at least
// HLAF2_DATA_SIZE half2 elements each -- confirm against the launch site.
__global__ static void squaresSum_half2(const half2 *data, const half2 *sum, half2 *des)
{
    // Number of times the whole element-wise pass is repeated inside a single
    // launch. Every pass stores the same values (assuming des does not alias
    // data or sum), so the repetition only lengthens the kernel run time.
    const int kRepeatCount = 51210;

    // The repeat counter is deliberately not named `i`: CUDA_KERNEL_LOOP is
    // handed `i` as its index name, and (presumably, as in the common
    // Caffe-style definition -- TODO confirm) declares that variable itself.
    for (int rep = 0; rep < kRepeatCount; ++rep)
    {
        CUDA_KERNEL_LOOP(i, HLAF2_DATA_SIZE) {
            des[i] = __hadd2(data[i], sum[i]);
        }
    }
}
// Element-wise float addition benchmark kernel: des[i] = sum[i] + data[i].
//
// Despite its name, this kernel computes no squares and performs no reduction;
// it is the single-precision counterpart of the half2 version.
//
// Parameters:
//   data - first input array, read only.
//   sum  - second input array, read only.
//   des  - output array, written at every index the inner loop visits.
//
// All three arrays are indexed up to FLOAT_DATA_SIZE, a constant defined
// outside this block.
// NOTE(review): the pointers are presumably device pointers holding at least
// FLOAT_DATA_SIZE floats each -- confirm against the launch site.
__global__ static void squaresSum(const float *data, const float *sum, float *des)
{
    // Number of times the whole element-wise pass is repeated inside a single
    // launch. Every pass stores the same values (assuming des does not alias
    // data or sum), so the repetition only lengthens the kernel run time.
    const int kRepeatCount = 51210;

    // The repeat counter is deliberately not named `i`: CUDA_KERNEL_LOOP is
    // handed `i` as its index name, and (presumably, as in the common
    // Caffe-style definition -- TODO confirm) declares that variable itself.
    for (int rep = 0; rep < kRepeatCount; ++rep)
    {
        CUDA_KERNEL_LOOP(i, FLOAT_DATA_SIZE) {
            des[i] = sum[i] + data[i];
        }
    }
}