I am trying to compile a simpler wmma kernel with FP8 data type on H100. Following is my example code,
GPU: H100
CUDA version 12.0
gcc version 11.3
header file
#include <cuda.h>
#include <mma.h>
#include <cuda_fp8.h>
#include <cuda_fp8.hpp>
using namespace nvcuda;
/* Performs a 16x16x16 warp matmul with input types InT
* and accumulator type AccT. This uses tensorcores.
*/
typedef int8_t INT8;
typedef __nv_fp8_e4m3 D8; // 16-bit float on device
//typedef half D8;
__global__ void wmma_ker(D8 *a, D8 *b, float *c) {
// Declare the fragments
wmma::fragment<wmma::matrix_a, 16, 16, 16, D8, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, 16, 16, 16, D8, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
// Initialize the output to zero
wmma::fill_fragment(c_frag, 0.0f);
// Load the inputs
wmma::load_matrix_sync(a_frag, a, 16);
wmma::load_matrix_sync(b_frag, b, 16);
// Perform the matrix multiplication
wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
// Store the output
wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
}
cu file looks like
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <assert.h>
#include <curand.h>
#include <stdint.h>
#include "tu_test_e4m3.cuh"
typedef union {
INT8 _i;
D8 _f;
} H8; // 8-bit float on host
const int DIM = 16;
#define cudaErrCheck(stat) { cudaErrCheck_((stat), __FILE__, __LINE__); }
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
if (stat != cudaSuccess) {
fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(stat), file, line);
}
}
int main(int argc, char** argv) {
// We need a 1-byte int type to treat as FP8
assert(sizeof(INT8) == 1);
assert(sizeof(D8) == 1);
assert(sizeof(H8) == 1);
cudaEvent_t start_wmma;
cudaEvent_t stop_wmma;
cudaErrCheck(cudaEventCreate(&start_wmma));
cudaErrCheck(cudaEventCreate(&stop_wmma));
// Allocate A, B, C on host
H8 *A, *B;
float *C;
A = (H8*)malloc(DIM*DIM*sizeof(H8));
B = (H8*)malloc(DIM*DIM*sizeof(H8));
C = (float*)malloc(DIM*DIM*sizeof(float));
memset(A, 0, sizeof(A));
// Allocate A,B,C on device
D8 *d_A, *d_B;
float *d_C;
cudaMalloc((void**)&d_A, DIM*DIM*sizeof(D8));
cudaMalloc((void**)&d_B, DIM*DIM*sizeof(D8));
cudaMalloc((void**)&d_C, DIM*DIM*sizeof(float));
for(int i = 0; i<DIM; i++) {
A[i]._i = 1;
}
// Copy to device
cudaMemcpy(d_A, A, DIM*DIM*sizeof(D8), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, DIM*DIM*sizeof(D8), cudaMemcpyHostToDevice);
// Do the matmul
dim3 threads = dim3(DIM,DIM);
dim3 blocks = dim3(1,1);
cudaErrCheck(cudaEventRecord(start_wmma));
wmma_ker<<<blocks,threads>>>(d_A, d_B, d_C);
cudaDeviceSynchronize();
cudaErrCheck(cudaEventRecord(stop_wmma));
// Copy back
cudaMemcpy(C, d_C, DIM*DIM*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(A);
free(B);
free(C);
printf("Done!\n");
}
At compilation I get following error
nvcc -arch=sm_90 tu_test_e4m3.s.cu -o tu_test_e4m3
tu_test_e4m3.cuh(17): error: incomplete type is not allowed
tu_test_e4m3.cuh(18): error: incomplete type is not allowed
2 errors detected in the compilation of "tu_test_e4m3.s.cu".
make: *** [Makefile:2: tu_test_e4m3] Error 2
Any help will be appreciated.