Hello, I encountered some precision-related issues when using CUDA’s __hfma function. I hope you can help me.
I called the __hfma (half-precision fused multiply-add) function in a CUDA program to perform calculations, but found that there was a 1 ULP (Unit in the Last Place) difference between the results calculated on the Device side and the Host side. The following is my test environment and code snippet:
Test environment:
GPU: [p4]
CUDA version: [12.1]
GPU architecture: [sm_60]
Compiler command: nvcc demo.cu -o demo -arch=sm_60
===================================================
include <cuda_fp16.h>
include <stdio.h>
// CUDA Kernel to perform hfma operation
global void hfmaKernel(half *a, half *b, half *c, half *result) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx == 0) {
// Perform hfma: result = a * b + c
result[idx] = __hfma(a[idx], b[idx], c[idx]);
}
}
int main() {
size_t size = sizeof(half);
// Host arrays
half *h_a, *h_b, *h_c, *h_result;
// Allocate host memory
h_result = (half *)malloc(size);
// Initialize host arrays
unsigned short a = 0xF8A6;
unsigned short b = 0x3B09;
unsigned short c = 0xB944;
h_a = reinterpret_cast<half *>(&a);
h_b = reinterpret_cast<half *>(&b);
h_c = reinterpret_cast<half *>(&c);
// Device arrays
half *d_a, *d_b, *d_c, *d_result;
// Allocate device memory
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
cudaMalloc(&d_result, size);
// Copy data from host to device
cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_c, h_c, size, cudaMemcpyHostToDevice);
// Launch kernel
hfmaKernel<<<1, 1>>>(d_a, d_b, d_c, d_result);
cudaDeviceSynchronize();
// Copy result back to host
cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost);
// do the workaroud for __hfma in host
float h_a_f = __half2float(*h_a);
float h_b_f = __half2float(*h_b);
float h_c_f = __half2float(*h_c);
float res = h_a_f * h_b_f + h_c_f;
half h_result_ref = __float2half(res);
// comapre the result
printf(" result in device: %X vs %f \n", *h_result, __half2float(*h_result));
printf(" result in host: %X vs %f \n", h_result_ref, __half2float(h_result_ref));
// Free device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cudaFree(d_result);
// Free host memory
free(h_result);
return 0;
}
==============================================
result is:
result in device: F817 vs -33504.000000
result in host: F816 vs -33472.000000
==============================================
What is the problem? Is the host’s reference calculation method inappropriate? Or is it the difference caused by the device calculation? Please give me some help, thank you.