Hi forum,
I am working on some CUDA C++ code on an Orin developer kit (sm_87), and I got an undefined-identifier error:
error: identifier "__hfma" is undefined
even though I have included:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
Do you know what I did wrong? Thank you so much for your help!
A simplified version of my code is:
#include <stdio.h>
#include <cstdlib>
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <eigen3/Eigen/Dense>
// Row-major, dynamically sized (rows and columns chosen at run time) Eigen
// matrix whose element type is the CUDA __half type.
using matrix_template = Eigen::Matrix<__half, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
// Naive half-precision matrix multiply-accumulate: each thread updates one
// element of C with C[row][col] += sum_i AT[i][row] * B[i][col].
//
// Memory layout, as implied by the indexing below:
//   matAT : read as matAT[M * i + row]            (K x M, row-major, i.e. A transposed)
//   matB  : read as matB [N * i + col]            (K x N, row-major)
//   matC  : read and written as matC[N * row + col] (M x N, row-major)
//
// Launch layout: 1D grid and 1D block. Each block covers one 32x32 tile of C.
// blockIdx.x is split into (tile row, tile column) assuming 16 tiles per tile
// row, so the grid addresses at most 16 * 32 = 512 columns of C. threadIdx.x is
// split into (row, column) within the tile; this presumably expects 1024
// threads per block -- confirm against the launch site.
// Threads that map outside the M x N matrix do nothing.
//
// Requires SM 5.3+: __hfma is a native half-precision intrinsic. If nvcc is not
// told to target a new enough architecture (e.g. -arch=sm_87 on Orin), the
// device pass can fail with 'identifier "__hfma" is undefined'.
//
// Parameters:
//   matAT, matB : input operands (device pointers), see layout above.
//   matC        : accumulator, updated in place (device pointer).
//   M, N, K     : matrix dimensions as described above.
__global__ void matrix_mul_naive_kernel_32x32(__half* matAT, __half* matB, __half* matC, int M, int N, int K) {
    constexpr unsigned int kTileDim = 32;      // each block handles a 32x32 tile of C
    constexpr unsigned int kTilesPerRow = 16;  // tiles per tile row assumed by the grid mapping

    const unsigned int block_row = blockIdx.x / kTilesPerRow;
    const unsigned int block_col = blockIdx.x % kTilesPerRow;
    const unsigned int thread_row = threadIdx.x / kTileDim;
    const unsigned int thread_col = threadIdx.x % kTileDim;

    // 64-bit indices so that row * N and i * M cannot overflow for large matrices.
    const size_t row = static_cast<size_t>(kTileDim) * block_row + thread_row;
    const size_t col = static_cast<size_t>(kTileDim) * block_col + thread_col;

    // Guard the grid tail: without this, threads mapped past the matrix edge
    // would read/write out of bounds (or alias into other rows of C).
    if (M <= 0 || N <= 0 || row >= static_cast<size_t>(M) || col >= static_cast<size_t>(N)) {
        return;
    }

    const size_t ld_a = static_cast<size_t>(M);
    const size_t ld_b = static_cast<size_t>(N);
    const size_t c_idx = ld_b * row + col;

    // Fused multiply-add accumulation, starting from the existing value of C.
    __half acc = matC[c_idx];
    for (int i = 0; i < K; ++i) {
        acc = __hfma(matAT[ld_a * i + row], matB[ld_b * i + col], acc);
    }
    matC[c_idx] = acc;
    // No __syncthreads() needed: the kernel uses no shared memory and threads
    // never consume each other's results.
}