Hello,

I want to migrate to mixed precision a program in which I need to compute a huge number of sin functions (actually, what I need are sinc functions, but, as there is no native implementation, I compute it using sin functions). I expected my half2 implementation to be 2x faster, but it is even slower.

Dealing with arithmetic functions, such as __hadd2 or __hmul2, I get the desired x2 gain factor, however, using nvprof over the following test program:

```
#include <stdio.h>
#include <cuda_fp16.h>
#define Fs 16000.0
#define h2Fs __floats2half2_rn(Fs,Fs)
__global__
// Fill t with the sample clock: each half2 packs two consecutive time
// instants (2i/Fs, (2i+1)/Fs) for a signal of n samples at rate Fs.
// Expects a 1D launch with at least n/2 total threads.
void generate_time_mp_kernel(int n, half2 *t)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    // BUG FIX: the original used `n` (the total sample count) instead of the
    // thread index `i`, so every element received the same constant value
    // and the "time" vector was not a ramp at all.
    if (i < n/2) t[i] = __floats2half2_rn(2.0f*i/Fs, (2.0f*i+1)/Fs);
}
__global__
// Elementwise x[i] = sin(t[i]) on packed half2 pairs (two sines per thread).
// x and t hold n/2 half2 elements; expects >= n/2 total threads.
void sin_mp_kernel(int n, half2 *x, half2 *t)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    // BUG FIX: the original read t[n], which is (a) out of bounds, since t
    // holds only n/2 half2 elements, and (b) the same address for every
    // thread, so the benchmark measured a uniform broadcast load rather
    // than a streaming read. Index with `i`.
    if (i < n/2) x[i] = h2sin(t[i]);
}
// Host driver: allocates n/2 half2 elements on host and device, launches the
// time-ramp and sine kernels, then synchronizes so a profiler (or wall-clock
// timing) actually covers kernel execution, and reports any CUDA error.
int main(void)
{
    int N = 1<<18; // 262144 samples -> 131072 half2 pairs (comment fixed: 1<<18, not 1<<20)
    half2 *x, *t, *d_x, *d_t;
    t = (half2*)malloc(N/2*sizeof(half2));
    x = (half2*)malloc(N/2*sizeof(half2));
    cudaMalloc(&d_t, N/2*sizeof(half2));
    cudaMalloc(&d_x, N/2*sizeof(half2));
    generate_time_mp_kernel<<<(N/2+255)/256, 256>>>(N, d_t);
    sin_mp_kernel<<<(N/2+255)/256, 256>>>(N, d_x, d_t);
    // Kernel launches are asynchronous: without this barrier the program can
    // exit before the kernels finish, and launch/execution errors go unseen.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    cudaFree(d_x);
    cudaFree(d_t);
    free(x);
    free(t);
    return err == cudaSuccess ? 0 : 1;
}
```

the time spent in sin_mp_kernel is quite similar to the time spent in sin_kernel for:

```
#include <stdio.h>
#define Fs 16000.0
__global__
// Fill t with the sample clock t[i] = i/Fs for a signal of n samples at
// rate Fs. Expects a 1D launch with at least n total threads.
void create_time_kernel(int n, float *t)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    // BUG FIX: the original wrote `n / Fs` — the same constant into every
    // slot — instead of the per-thread index `i`, so t was not a time ramp.
    if (i < n) t[i] = i / Fs;
}
__global__
// Elementwise x[i] = sinf(t[i]) over n floats; expects >= n total threads.
void sin_kernel(int n, float *x, float *t)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    // BUG FIXES: (1) the original read t[n], which is out of bounds and the
    // same address for every thread, so the benchmark did not measure a
    // streaming load; use t[i]. (2) use sinf, the single-precision sine —
    // plain sin can take the double-precision path, which is much slower and
    // an unfair baseline when comparing against half-precision h2sin.
    if (i < n) x[i] = sinf(t[i]);
}
// Host driver for the fp32 baseline: allocates n floats on host and device,
// launches the time-ramp and sine kernels, synchronizes so profiling covers
// kernel execution, and reports any CUDA error.
int main(void)
{
    int N = 1<<18; // 262144 samples (comment fixed: 1<<18, not 1<<20)
    float *x, *t, *d_x, *d_t;
    t = (float*)malloc(N*sizeof(float));
    x = (float*)malloc(N*sizeof(float));
    cudaMalloc(&d_t, N*sizeof(float));
    cudaMalloc(&d_x, N*sizeof(float));
    // BUG FIX: the original called generate_time_kernel, but the kernel in
    // this program is named create_time_kernel — this file did not compile.
    create_time_kernel<<<(N+255)/256, 256>>>(N, d_t);
    sin_kernel<<<(N+255)/256, 256>>>(N, d_x, d_t);
    // Kernel launches are asynchronous: synchronize before exit so the
    // kernels actually run to completion and errors are surfaced.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    cudaFree(d_x);
    cudaFree(d_t);
    free(x);
    free(t);
    return err == cudaSuccess ? 0 : 1;
}
```

Is this the expected behaviour, or am I doing something wrong? Is h2sin(half2) as fast as sin(float) while computing 2 sines (as __hadd2 and __hmul2 are), or is it slower?

Thanks in advance,

David

By the way, I am using a Tesla P100 on a Google Cloud Instance with CUDA 10.