Below is a compilation error we are getting on Turing (architecture sm_75).
This is an __hcmadd operation
error: no suitable user-defined conversion from "__nv_bfloat162" to "__half2" exists
CMat[CMatIdx[threadIdx.x]] = __hcmadd(tmp1, tmp2, CMat[CMatIdx[threadIdx.x]]);
It seems that the BF16 (__nv_bfloat162) overload of __hcmadd is not available on the Turing architecture, so the compiler attempts the FP16 (__half2) overload and the conversion fails.
In order to overcome this issue, we are planning to emulate the BF16 arithmetic using FP32. Please let us know whether this is a reasonable
approach.
Also, is there any backward-compatibility option (e.g. a compiler flag) that would allow CUDA to accept this code on older architectures?
The following are two reference code snippets — one failing and one working — that I tried to compile for sm_75.
Failing code:
#include <cuda_bf16.h>
// Minimal failing repro: on sm_75 the __nv_bfloat162 overload of __hcmadd is
// not declared (bf16 arithmetic intrinsics require compute capability 8.0+),
// so overload resolution falls back to the __half2 version and fails with
// "no suitable user-defined conversion from __nv_bfloat162 to __half2".
// NOTE(review): this kernel is expected NOT to compile for sm_75; it is kept
// only as the failing reference.
__global__ void testHcmaddBF16(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162* out) {
    *out = __hcmadd(a, b, c);
}
Working Code:
#include <cuda_bf16.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// Pack two bfloat16 scalars into a __nv_bfloat162: real component in .x
// (low half), imaginary component in .y (high half).
//
// Bug fixed: __nv_bfloat162's members are __nv_bfloat16 *values*, not raw
// 32-bit storage. The previous code assigned
//   uint32_t(real_bits) | (uint32_t(imag_bits) << 16)
// to result.x, which value-converts that integer to bfloat16 (producing a
// huge nonsense number, not a bit reinterpretation) and leaves result.y
// uninitialized. Assigning the components directly is both correct and
// simpler; no bit manipulation is required.
__device__ __nv_bfloat162 pack_bfloat162(__nv_bfloat16 real, __nv_bfloat16 imag) {
    __nv_bfloat162 result;
    result.x = real;  // low half  = real component
    result.y = imag;  // high half = imaginary component
    return result;
}
// Emulate __hcmadd(a, b, c) = a*b + c for complex numbers stored as
// __nv_bfloat162 (real part in the low half, imaginary part in the high
// half), computing in FP32 and rounding the result back to bfloat16.
// Works on any architecture that supports bfloat16 storage (e.g. sm_75),
// since only FP32 arithmetic is performed on the device.
//
// Bug fixed: the previous version packed the result through a helper that
// assigned an integer bit pattern to a __nv_bfloat16 member, value-converting
// it and leaving the imaginary half uninitialized. __floats2bfloat162_rn
// converts and packs both FP32 components correctly in one call.
__device__ __nv_bfloat162 emulate_hcmadd(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
    // Widen every component to FP32.
    float a_real = __bfloat162float(__low2bfloat16(a));
    float a_imag = __bfloat162float(__high2bfloat16(a));
    float b_real = __bfloat162float(__low2bfloat16(b));
    float b_imag = __bfloat162float(__high2bfloat16(b));
    float c_real = __bfloat162float(__low2bfloat16(c));
    float c_imag = __bfloat162float(__high2bfloat16(c));
    // Complex multiply-add: (ar + i*ai)*(br + i*bi) + (cr + i*ci).
    float out_real = a_real * b_real - a_imag * b_imag + c_real;
    float out_imag = a_real * b_imag + a_imag * b_real + c_imag;
    // Round-to-nearest conversion back to bf16; out_real -> low, out_imag -> high.
    return __floats2bfloat162_rn(out_real, out_imag);
}
// Single-result kernel wrapper: writes the FP32-emulated bfloat16 complex
// multiply-add a*b + c to out[0]. Intended launch configuration: <<<1, 1>>>.
__global__ void testHcmaddBF16_emulated(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162* out) {
    out[0] = emulate_hcmadd(a, b, c);
}
// Abort with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: builds a = 1+2i, b = 3+4i, c = 5+6i as __nv_bfloat162 complex
// values, runs the emulated complex multiply-add on the device, and prints
// the result. Expected: (1+2i)*(3+4i) + (5+6i) = (-5+10i) + (5+6i) = 0 + 16i.
//
// Bug fixed: the previous code reinterpreted the bfloat16 bits as uint16_t
// and assigned the combined 32-bit pattern to a.x — but .x is a
// __nv_bfloat16 component, so the integer was value-converted to bfloat16
// (garbage) and .y was never set. The components are assigned directly
// instead. CUDA API calls are now also error-checked.
int main() {
    __nv_bfloat162 a, b, c;
    a.x = __float2bfloat16(1.0f);  // real parts go in .x (low half)
    a.y = __float2bfloat16(2.0f);  // imaginary parts go in .y (high half)
    b.x = __float2bfloat16(3.0f);
    b.y = __float2bfloat16(4.0f);
    c.x = __float2bfloat16(5.0f);
    c.y = __float2bfloat16(6.0f);

    __nv_bfloat162* d_out = nullptr;
    __nv_bfloat162 h_out;
    checkCuda(cudaMalloc(&d_out, sizeof(__nv_bfloat162)), "cudaMalloc");

    testHcmaddBF16_emulated<<<1, 1>>>(a, b, c, d_out);
    // Launch-configuration errors only surface via cudaGetLastError.
    checkCuda(cudaGetLastError(), "kernel launch");

    // cudaMemcpy on the default stream synchronizes with the kernel, so no
    // explicit cudaDeviceSynchronize is needed before reading the result.
    checkCuda(cudaMemcpy(&h_out, d_out, sizeof(__nv_bfloat162),
                         cudaMemcpyDeviceToHost), "cudaMemcpy D2H");

    float out_real = __bfloat162float(__low2bfloat16(h_out));
    float out_imag = __bfloat162float(__high2bfloat16(h_out));
    printf("Result: real = %f, imag = %f\n", out_real, out_imag);

    checkCuda(cudaFree(d_out), "cudaFree");
    return 0;
}
Please check and confirm this approach.
Thanks and Regards,
Anubhav Aggarwal