I want to write code in CUDA that uses Tensor Cores. I'm using the nvcuda::wmma library, but I get this error: name followed by "::" must be a class or namespace name. This is my code:
#include <iostream>
#include <algorithm>
#include <cuda_runtime.h>
#include <stdio.h>
#include <mma.h>
#include <curand_kernel.h>
// Must be multiples of 16 for wmma code to work
#define MATRIX_M 8192
#define MATRIX_N 8192
#define MATRIX_K 8192
#define TILE_WIDTH 32
// The only dimensions currently supported by WMMA
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;
const int warpSize = 32;
// Performs an MxNxK GEMM (C=alphaAB + betaC) assuming:
// 1) Matrices are packed in memory.
// 2) M, N and K are multiples of 16.
// 3) Neither A nor B are transposed.
// Note: This is NOT a high performance example but is for demonstration purposes only
// This version does not use shared memory
// All matrices are in global memory
// It takes matrix fragments to multiply them using tensor cores
//
__global__ void wmma_example(half* a, half* b, float* c, int M, int N, int K, float alpha, float beta) {
// Leading dimensions. Packed with no transpositions.
int lda = M;
int ldb = K;
int ldc = M;
// Tile using a 2D grid
// Warps are numbered in two dimensions
int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
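// Each warp computes one WMMA_M x WMMA_N (16x16) tile of C: warpM indexes the tile row, warpN the tile column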
// Declare the fragments
nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,WMMA_M,WMMA_N,WMMA_K, half, nvcuda::wmma::col_major> a_frag;
nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, nvcuda::wmma::col_major> b_frag;
nvcuda::wmma::fragment<nvcuda::wmma::accumulator,WMMA_M,WMMA_N,WMMA_K, float> acc_frag;
nvcuda::wmma::fragment<nvcuda::wmma::accumulator,WMMA_M,WMMA_N,WMMA_K, float> c_frag;
nvcuda::wmma::fill_fragment(acc_frag,0.0f);
// Loop over k
for (int i = 0; i < K; i += WMMA_K) {
int aRow = warpM * WMMA_M;
int aCol = i;
int bRow = i;
int bCol = warpN * WMMA_N;
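// Matrices are column-major, so element (row, col) sits at offset row + col * ld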
// Bounds checking
if (aRow < M && aCol < K && bRow < K && bCol < N) {
// Load the inputs
nvcuda::wmma::load_matrix_sync(a_frag, a + aRow + aCol * lda, lda);
nvcuda::wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);
// Perform the matrix multiplication
nvcuda::wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
}
}
// Load in the current value of c, scale it by beta, and add our result scaled by alpha
int cRow = warpM * WMMA_M;
int cCol = warpN * WMMA_N;
if (cRow < M && cCol < N) {
nvcuda::wmma::load_matrix_sync(c_frag, c + cRow + cCol * ldc, ldc, nvcuda::wmma::mem_col_major);
for (int i = 0; i < c_frag.num_elements; i++) {
c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
}
// Store the output
nvcuda::wmma::store_matrix_sync(c + cRow + cCol * ldc, c_frag, ldc, nvcuda::wmma::mem_col_major);
}
}
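// Elementwise FP32 -> FP16 conversion kernel: each thread casts one element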
__global__ void convertFp32toFp16(half* out, float* in, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
out[idx] = in[idx];
}
}
int main() {
float* a_fp32;
float* b_fp32;
half* a_fp16;
half* b_fp16;
float* c;
float* c_wmma;
float* c_host_wmma;
curandGenerator_t gen;
const dim3 block_size(TILE_WIDTH, TILE_WIDTH);
const dim3 num_blocks(MATRIX_M / block_size.x, MATRIX_N / block_size.y);
cudaMalloc((void**)&a_fp32, MATRIX_M * MATRIX_K * sizeof(float));
cudaMalloc((void**)&b_fp32, MATRIX_K * MATRIX_N * sizeof(float));
cudaMalloc((void**)&a_fp16, MATRIX_M * MATRIX_K * sizeof(half));
cudaMalloc((void**)&b_fp16, MATRIX_K * MATRIX_N * sizeof(half));
cudaMalloc((void**)&c, MATRIX_M * MATRIX_N * sizeof(float));
cudaMalloc((void**)&c_wmma, MATRIX_M * MATRIX_N * sizeof(float));
c_host_wmma = (float*)malloc(MATRIX_M * MATRIX_N * sizeof(float));
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, 1337ULL);
curandGenerateUniform(gen, a_fp32, MATRIX_M * MATRIX_K);
curandGenerateUniform(gen, b_fp32, MATRIX_K * MATRIX_N);
// curand doesn't currently support fp16 so we generate in fp32 and convert to fp16
convertFp32toFp16 <<< (MATRIX_M * MATRIX_K + 255) / 256, 256 >>> (a_fp16, a_fp32, MATRIX_M * MATRIX_K);
convertFp32toFp16 <<< (MATRIX_K * MATRIX_N + 255) / 256, 256 >>> (b_fp16, b_fp32, MATRIX_K * MATRIX_N);
curandGenerateUniform(gen, c, MATRIX_M * MATRIX_N);
curandDestroyGenerator(gen);
// Testing kernel using tensor core
float alpha = 1.0f;
float beta = 1.0f;
printf("\n M=%d,N=%d,K=%d,alpha=%f, beta=%f \n\n", MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
//First using wmma
dim3 gridDim;
dim3 blockDim;
// blockDim.x must be a multiple of warpSize
// 128x4 means we have 16 warps and a block computes a 64x64 output tile
blockDim.x = 128;
blockDim.y = 4;
gridDim.x = (MATRIX_M + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
gridDim.y = (MATRIX_N + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
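// With blockDim = (128, 4): 128 / 32 = 4 warps in x and 4 warps in y, so each block covers a (4*16) x (4*16) = 64x64 tile of C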
printf("Running with wmma with grid (%d,%d), block(%d,%d)...\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);
wmma_example <<< gridDim, blockDim >>> (a_fp16, b_fp16, c_wmma, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
// Copy the result back to the host
cudaMemcpy(c_host_wmma, c_wmma, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToHost);
// Print a few entries of the result matrix (optional)
for (int i = 0; i < std::min(MATRIX_M, 5); ++i) {
for (int j = 0; j < std::min(MATRIX_N, 5); ++j) {
std::cout << c_host_wmma[i * MATRIX_N + j] << " ";
}
std::cout << std::endl;
}
free(c_host_wmma);
cudaFree(a_fp32);
cudaFree(b_fp32);
cudaFree(a_fp16);
cudaFree(b_fp16);
cudaFree(c);
cudaFree(c_wmma);
return 0;
}