Programming Tensor Cores on an RTX 4070

I want to write CUDA code that uses the Tensor Cores. I'm using the nvcuda::wmma API. I get this error: "name followed by '::' must be a class or namespace name". This is my code:

#include <iostream>
#include <algorithm>   // std::min
#include <cuda_runtime.h>
#include <stdio.h>
#include <mma.h>
#include <curand_kernel.h>

// Must be multiples of 16 for wmma code to work
#define MATRIX_M 8192
#define MATRIX_N 8192
#define MATRIX_K 8192
#define TILE_WIDTH 32
// The only dimensions currently supported by WMMA
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;
const int warpSize = 32;

// Performs an MxNxK GEMM (C = alpha*A*B + beta*C) assuming:
// 1) Matrices are packed in memory.
// 2) M, N and K are multiples of 16.
// 3) Neither A nor B are transposed.
// Note: This is NOT a high performance example but is for demonstration purposes only
// This version does not use shared memory
// All matrices are in global memory
// It takes matrix fragments to multiply them using tensor cores
//
__global__ void wmma_example(half* a, half* b, float* c, int M, int N, int K, float alpha, float beta) {
// Leading dimensions. Packed with no transpositions.
int lda = M;
int ldb = K;
int ldc = M;
// Tile using a 2D grid
// Warps are numbered in two dimensions
int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

// Declare the fragments
nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,WMMA_M,WMMA_N,WMMA_K, half, nvcuda::wmma::col_major> a_frag;
nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, nvcuda::wmma::col_major> b_frag;
nvcuda::wmma::fragment<nvcuda::wmma::accumulator,WMMA_M,WMMA_N,WMMA_K, float> acc_frag;
nvcuda::wmma::fragment<nvcuda::wmma::accumulator,WMMA_M,WMMA_N,WMMA_K, float> c_frag;
nvcuda::wmma::fill_fragment(acc_frag,0.0f);
// Loop over k
for (int i = 0; i < K; i += WMMA_K) {
	int aRow = warpM * WMMA_M;
	int aCol = i;
	int bRow = i;
	int bCol = warpN * WMMA_N;
	// Bounds checking
	if (aRow < M && aCol < K && bRow < K && bCol < N) {
		// Load the inputs
		nvcuda::wmma::load_matrix_sync(a_frag, a + aRow + aCol * lda, lda);
		nvcuda::wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);
		// Perform the matrix multiplication
		nvcuda::wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
	}
}


// Load in the current value of c, scale it by beta, and add our result scaled by alpha
int cRow = warpM * WMMA_M;
int cCol = warpN * WMMA_N;
if (cRow < M && cCol < N) {
	nvcuda::wmma::load_matrix_sync(c_frag, c + cRow + cCol * ldc, ldc, nvcuda::wmma::mem_col_major);
	for (int i = 0; i < c_frag.num_elements; i++) {
		c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
	}
	// Store the output
	nvcuda::wmma::store_matrix_sync(c + cRow + cCol * ldc, c_frag, ldc, nvcuda::wmma::mem_col_major);
}

}

__global__ void convertFp32toFp16(half* out, float* in, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
out[idx] = __float2half(in[idx]);
}

}

int main() {
float* a_fp32;
float* b_fp32;
half* a_fp16;
half* b_fp16;
float* c;
float* c_wmma;
float* c_host_wmma;
curandGenerator_t gen;

const dim3 block_size(TILE_WIDTH, TILE_WIDTH);
const dim3 num_blocks(MATRIX_M / block_size.x, MATRIX_N / block_size.y);

cudaMalloc((void**)&a_fp32, MATRIX_M * MATRIX_K * sizeof(float));
cudaMalloc((void**)&b_fp32, MATRIX_K * MATRIX_N * sizeof(float));
cudaMalloc((void**)&a_fp16, MATRIX_M * MATRIX_K * sizeof(half));
cudaMalloc((void**)&b_fp16, MATRIX_K * MATRIX_N * sizeof(half));

cudaMalloc((void**)&c, MATRIX_M * MATRIX_N * sizeof(float));
cudaMalloc((void**)&c_wmma, MATRIX_M * MATRIX_N * sizeof(float));

c_host_wmma = (float*)malloc(MATRIX_M * MATRIX_N * sizeof(float));

curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, 1337ULL);
curandGenerateUniform(gen, a_fp32, MATRIX_M * MATRIX_K);
curandGenerateUniform(gen, b_fp32, MATRIX_K * MATRIX_N);

// curand doesn't currently support fp16, so we generate in fp32 and convert to fp16
convertFp32toFp16 <<< (MATRIX_M * MATRIX_K + 255) / 256, 256 >>> (a_fp16, a_fp32, MATRIX_M * MATRIX_K);
convertFp32toFp16 <<< (MATRIX_K * MATRIX_N + 255) / 256, 256 >>> (b_fp16, b_fp32, MATRIX_K * MATRIX_N);
curandGenerateUniform(gen, c, MATRIX_M * MATRIX_N);
curandDestroyGenerator(gen);


// Testing kernel using tensor core
float alpha = 1.0f;
float beta = 1.0f;
printf("\n M=%d,N=%d,K=%d,alpha=%f, beta=%f \n\n", MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);

//First using wmma
dim3 gridDim;
dim3 blockDim;

// blockDim.x must be a multiple of warpSize
// 128x4 means we have 16 warps and a block computes a 64x64 output tile

blockDim.x = 128;
blockDim.y = 4;
gridDim.x = (MATRIX_M + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
gridDim.y = (MATRIX_N + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
printf("Running with wmma with grid (%d,%d), block(%d,%d)...\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);


// c_wmma holds the initial C matrix that the kernel reads for the beta*C term
cudaMemcpy(c_wmma, c, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToDevice);

wmma_example <<< gridDim, blockDim >>> (a_fp16, b_fp16, c_wmma, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);

// Copy the result back to the host
cudaMemcpy(c_host_wmma, c_wmma, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToHost);



// Print a few entries of the result matrix (optional)
for (int i = 0; i < std::min(MATRIX_M, 5); ++i) {
	for (int j = 0; j < std::min(MATRIX_N, 5); ++j) {
		std::cout << c_host_wmma[i * MATRIX_N + j] << " ";
	}
	std::cout << std::endl;
}


free(c_host_wmma);
cudaFree(a_fp32);
cudaFree(b_fp32);
cudaFree(a_fp16);
cudaFree(b_fp16);
cudaFree(c);
cudaFree(c_wmma);


return 0;

}

Please properly format code posted here. A simple method to do that:

  1. click the pencil icon below your post to edit your post
  2. in the edit window, select the code
  3. click the </> button at the top of the edit window
  4. save your changes

Please do that now.

When I compile the code you have posted as follows:

nvcc -o t120 t120.cu -arch=sm_89 -lcurand

The code compiles successfully for me with no errors or warnings (CUDA 12.2). My guess is that your compilation either omits an arch flag or specifies an incorrect arch. The wmma functionality is only available on cc7.0 or newer GPUs, and current nvcc compilers do not default to cc7.0 or higher. It is not available with a default target such as cc5.2, and that is exactly the kind of compilation error you will get in that case.
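
If you want to confirm which arch flag to pass, you can query the GPU's compute capability at runtime. An RTX 4070 (Ada) reports compute capability 8.9, which corresponds to -arch=sm_89. A minimal sketch, assuming device 0 is the RTX 4070:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    // Query and print the compute capability of device 0
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    return 0;
}

Anything that prints 7.0 or higher supports wmma; whatever it prints is the value to use in -arch=sm_XY.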