basic info:
NVIDIA-SMI 510.73.08
Driver Version: 510.73.08
CUDA Version: 11.6
Compute Capability 750
cmd:
nvcc -o show_cuda_arch show_cuda_arch.cu -arch=compute_75
code:
#include <cstdio>
#include "cuda_fp16.h"
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
// Device-side helper: prints the compile-time values of __CUDA_ARCH__ and
// __CUDACC_VER_MINOR__, then prints the larger of two half-precision constants.
__device__ void print_arch(){
// STR() expands its argument before stringifying it, so both arrays hold the
// values that were fixed when this device code was compiled.
const char my_compile_time_ver[] = STR(__CUDACC_VER_MINOR__);
const char my_compile_time_arch[] = STR(__CUDA_ARCH__);
printf("__CUDA_ARCH__: %s, __CUDACC_VER_MINOR__:%s\n", my_compile_time_arch, my_compile_time_ver);
// The floating-point literals are converted to __half on initialization.
const __half a = 0.2;
const __half b = 0.3;
// NOTE(review): this thread reports "identifier __hmax is undefined" for this
// line with CUDA 11.6 and -arch=compute_75; it builds with CUDA 11.7 and 12.2.
auto c = __hmax(a, b);
// NOTE(review): c is a __half here, which cannot be printed with %f as-is;
// it has to be converted first, e.g. __half2float(c).
printf("c:%f\n", c);
}
// Kernel entry point: each launched thread calls print_arch() once.
__global__ void example() {
    print_arch();
}
// Launches example<<<1,1>>>, waits for it to finish and reports any CUDA error.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main() {
    example<<<1, 1>>>();

    // A kernel launch returns no status of its own; a rejected launch (for
    // example a binary the installed device cannot run) only shows up here.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Waits for the kernel, which also flushes its printf output, and
        // reports errors raised while it was running.
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
It would be helpful to see the actual compile output. I don’t think there is any problem with your use of __hmax()
itself. However this:
auto c = __hmax(a, b);
will create a variable c of type half. You cannot print a half variable this way:
printf("c:%f\n", c);
I would do this:
printf("c:%f\n", __half2float(c));
When I make that change, I don’t have any trouble compiling your code.
1 Like
I followed your update, but the same compile error occurred:
error: identifier “__hmax” is undefined
It compiles successfully with “-arch=compute_80”, but it runs without any output.
I’m not sure what is happening then. I don’t have any trouble with it:
# cat t14.cu
#include <cstdio>
#include "cuda_fp16.h"
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
// Prints the architecture and toolkit minor version this device code was
// compiled for, followed by the maximum of two half-precision constants.
__device__ void print_arch() {
    // Two-level stringification: each macro is expanded first, then quoted.
    const char arch_text[] = STR(__CUDA_ARCH__);
    const char minor_text[] = STR(__CUDACC_VER_MINOR__);
    printf("__CUDA_ARCH__: %s, __CUDACC_VER_MINOR__:%s\n", arch_text, minor_text);

    const __half a = 0.2;
    const __half b = 0.3;
    auto c = __hmax(a, b);
    // Convert the __half result to float so that %f can print it.
    printf("c:%f\n", __half2float(c));
}
// Kernel entry point: each launched thread calls print_arch() once.
__global__ void example() {
    print_arch();
}
// Launches example<<<1,1>>>, waits for it to finish and reports any CUDA error.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main() {
    example<<<1, 1>>>();

    // A kernel launch returns no status of its own; a rejected launch (for
    // example a binary the installed device cannot run) only shows up here.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Waits for the kernel, which also flushes its printf output, and
        // reports errors raised while it was running.
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
# nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Jul_11_02:20:44_PDT_2023
Cuda compilation tools, release 12.2, V12.2.128
Build cuda_12.2.r12.2/compiler.33053471_0
# nvcc -o t14 t14.cu -arch=compute_75
# ./t14
__CUDA_ARCH__: 750, __CUDACC_VER_MINOR__:2
c:0.300049
#
If you compile for compute_80 but attempt to run on a compute_75 device, it won’t run correctly. That is a basic CUDA principle. So I’m not surprised you don’t get output there. Regarding the compile issue, it’s possible you have a broken CUDA install of some sort.
My guess is that you are issuing your compile command incorrectly. My suggestion would be that you provide an exact copy-paste of your console output, duplicating exactly the method I have shown. This is the second time now I’ve asked to see the exact output. I won’t respond further without a complete test case, just as I have given you. Good luck.
1 Like
I’m sorry about that.
cat t14.cu
#include <cstdio>
#include "cuda_fp16.h"
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
// Reports the compile-time __CUDA_ARCH__ and __CUDACC_VER_MINOR__ values, then
// prints the larger of two half-precision constants.
__device__ void print_arch() {
    // STR() expands the macro before quoting it, giving its value as text.
    const char arch_text[] = STR(__CUDA_ARCH__);
    const char minor_text[] = STR(__CUDACC_VER_MINOR__);
    printf("__CUDA_ARCH__: %s, __CUDACC_VER_MINOR__:%s\n", arch_text, minor_text);

    const __half a = 0.2;
    const __half b = 0.3;
    auto c = __hmax(a, b);
    // Convert the __half result to float so that %f can print it.
    printf("c:%f\n", __half2float(c));
}
// Kernel entry point: each launched thread calls print_arch() once.
__global__ void example() {
    print_arch();
}
// Launches example<<<1,1>>>, waits for it to finish and reports any CUDA error.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main() {
    example<<<1, 1>>>();

    // A kernel launch returns no status of its own; a rejected launch (for
    // example a binary the installed device cannot run) only shows up here.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Waits for the kernel, which also flushes its printf output, and
        // reports errors raised while it was running.
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_Mar__8_18:18:20_PST_2022
Cuda compilation tools, release 11.6, V11.6.124
Build cuda_11.6.r11.6/compiler.31057947_0
nvcc -o t14 t14.cu -arch=compute_75
t14.cu(12): error: identifier "__hmax" is undefined
1 error detected in the compilation of "t14.cu".
What if you change this:
auto c = __hmax(a, b);
to this:
__half c = __hmax(a, b);
Is the problem the same?
Also, assuming you have a standard linux install of CUDA, could you run this command:
grep hmax /usr/local/cuda/include/cuda_fp16.h
and paste the output here.
OK I have reproduced the issue on CUDA 11.6
I suggest you move forward to CUDA 11.7 or newer.
After I installed CUDA 11.7, it works correctly with __half c = __hmax(a, b);
but a new issue came up:
#nvidia-smi --query-gpu=compute_cap --format=csv
Failed to initialize NVML: Driver/library version mismatch
I have fixed that. Thank you!
But __hmax_nan gets the same error in CUDA 11.7.1 and CUDA 11.8.0:
# cat t14.cu
#include <cstdio>
#include "cuda_fp16.h"
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
// Reports the compile-time __CUDA_ARCH__ and __CUDACC_VER_MINOR__ values, then
// prints the result of __hmax_nan() applied to two half-precision constants.
__device__ void print_arch() {
    // STR() expands the macro before quoting it, giving its value as text.
    const char arch_text[] = STR(__CUDA_ARCH__);
    const char minor_text[] = STR(__CUDACC_VER_MINOR__);
    printf("__CUDA_ARCH__: %s, __CUDACC_VER_MINOR__:%s\n", arch_text, minor_text);

    const __half a = 0.2;
    const __half b = 0.3;
    __half c = __hmax_nan(a, b);
    // Convert the __half result to float so that %f can print it.
    printf("c:%f\n", __half2float(c));
}
// Kernel entry point: each launched thread calls print_arch() once.
__global__ void example() {
    print_arch();
}
// Launches example<<<1,1>>>, waits for it to finish and reports any CUDA error.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main() {
    example<<<1, 1>>>();

    // A kernel launch returns no status of its own; a rejected launch (for
    // example a binary the installed device cannot run) only shows up here.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Waits for the kernel, which also flushes its printf output, and
        // reports errors raised while it was running.
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
# nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
# nvcc -o t14 t14.cu
t14.cu(12): error: identifier "__hmax_nan" is undefined
1 error detected in the compilation of "t14.cu".
Apparently __hmax_nan() support was introduced in a later version of CUDA. Try updating to the latest CUDA 12.2.1.
1 Like
system
Closed
September 1, 2023, 2:42am
16
This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.