I have tested the half type with the code as below:
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <cuda_fp16.h>
__device__ __managed__ u_char A[100];
__device__ __managed__ u_char B[100];
__device__ __managed__ u_char C[100];
__global__ void vector_add_u8()
{
int idx = threadIdx.x;
__half a = __float2half((float)(A[idx]));
__half b = __float2half((float)(B[idx]));
__half c = __hadd(a, b);
printf("a = %f, b = %f, c = %f\n", __half2float(a), __half2float(b), __half2float(c));
C[idx] = (u_char)__half2float(c);
}
int main(int argc, char **argv)
{
memset(A, 11, 100);
memset(B, 10, 100);
dim3 blocks(1);
dim3 threads(100, 1);
vector_add_u8<<<blocks, threads>>>();
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaGetLastError());
for(int i = 0; i < 100; i++)
{
printf("A[i] + B[i] = C[i]: %d + %d = %d\n", A[0], B[0], C[0]);
}
return 1;
}
And get the result:
a = 11.000000, b = 10.000000, c = 10.000000
A[i] + B[i] = C[i]: 11 + 10 = 10
The result is the half sum of a and b and round-to-nearest-even.
I also tried a lot, but the result is always integer.
Is it right?
If so, how can I get the true value?
Thanks very much.