When I compile the following code with `nvcc harta.cu -gencode arch=compute_90a,code=sm_90a -o harta`
and run it, I get this output:
tid: 0, bid: 0, p0[2] = 0.000000
tid: 0, bid: 0, p0[3] = 0.000000
tid: 0, bid: 0, p1[0] = 0.000000
tid: 0, bid: 0, p1[1] = 0.000000
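Since no output is expected, this suggests that the CUDA conversions
between float2 and __nv_fp8x2_e4m3
are broken when compiling for arch=compute_90a,code=sm_90a.
The p0 lines show the same problem for the float4 to __nv_fp8x4_e4m3 conversion.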
#include <cuda_fp8.h>
#include <cstdio>
// Checks that each of the first N bytes of `p`, decoded as an e4m3 FP8 value,
// equals `val`, and prints one line per mismatching element.
//
//   T    packed value type; assumed to be a packed FP8 type whose bytes are
//        consecutive e4m3 encodings (e.g. __nv_fp8x2_e4m3 / __nv_fp8x4_e4m3)
//        -- TODO confirm for any other T.
//   N    number of one-byte elements to check; defaults to sizeof(T).
//   p    value to inspect (taken by value, so the bytes read are this
//        function's own copy).
//   val  expected decoded value of every element.
//   j    label printed as "p<j>" so the caller can tell its values apart.
//
// The bytes are read through `const unsigned char*` rather than by casting
// &p to `const __nv_fp8_e4m3*`: accessing an object through a pointer to an
// unrelated class type violates the aliasing rules, whereas character-type
// access is always permitted.
template <typename T, int N = sizeof(T)>
__noinline__ __device__ void validate(T p, float val, int j) {
    static_assert(N >= 0 && N <= static_cast<int>(sizeof(T)),
                  "N must not exceed the size of T in bytes");

    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(&p);

    // Keep the loop rolled, as in the original reproducer.
#pragma unroll 1
    for (int i = 0; i < N; i++) {
        // Rebuild a single FP8 element from its raw storage byte.
        __nv_fp8_e4m3 element;
        element.__x = bytes[i];

        float v = static_cast<float>(element);
        if (v != val) {
            printf("tid: %d, bid: %d, p%d[%d] = %f\n", threadIdx.x, blockIdx.x, j, i, v);
        }
    }
}
// Reproducer kernel: builds one packed 4-element and one packed 2-element
// e4m3 FP8 value from float vectors whose components are all 1.0f, then asks
// validate() to check that every element decodes to 1.
// Per the accompanying report, a correct build prints nothing.
// The statements are intentionally left exactly as reported, since the
// behaviour is said to depend on optimisation level and target architecture.
__global__ void test_packed_fp8() {
__nv_fp8x4_e4m3 p0;
__nv_fp8x2_e4m3 p1;
// Convert float4 / float2 inputs to the packed FP8 types under test.
p0 = __nv_fp8x4_e4m3(make_float4(1.0f, 1.0f, 1.0f, 1.0f));
p1 = __nv_fp8x2_e4m3(make_float2(1.0f, 1.0f));
// Third argument is only a label: mismatches are printed as p0[...] / p1[...].
validate(p0, 1, 0);
validate(p1, 1, 1);
}
// Launches test_packed_fp8 on a single thread and waits for it to finish.
// Returns 0 when the kernel launched and ran without a CUDA error, 1 otherwise.
//
// The CUDA status is checked explicitly because this test treats "no output"
// as success: if the launch fails (for example, no kernel image matching the
// installed GPU), the kernel never runs and would also print nothing.
int main() {
    test_packed_fp8<<<1, 1>>>();

    // Launch-configuration / missing-image errors are reported here, not by
    // the launch statement itself.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        std::fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Errors raised while the kernel executes surface at the next sync point.
    // The sync also flushes the device-side printf buffer.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        std::fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
Notes:
- When compiling with -G, the bug doesn't reproduce.
- When compiling without -gencode arch=compute_90a,code=sm_90a, the bug doesn't reproduce.
Compiler version (nvcc --version):
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Tue_Oct_29_23:50:19_PDT_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0