Programming Tensor Cores on an RTX 4070

I want to write CUDA code that uses the Tensor Cores. I'm using the nvcuda::wmma API. I get this error: "name followed by '::' must be a class or namespace name". This is my code:

#include <iostream>
#include <algorithm>   // std::min
#include <cuda_runtime.h>
#include <stdio.h>
#include <mma.h>
#include <curand_kernel.h>

// Must be multiples of 16 for wmma code to work
#define MATRIX_M 8192
#define MATRIX_N 8192
#define MATRIX_K 8192
#define TILE_WIDTH 32
// The only dimensions currently supported by WMMA
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;
const int warpSize = 32;

// Performs an MxNxK GEMM (C = alpha*A*B + beta*C) assuming:
// 1) Matrices are packed in memory.
// 2) M, N and K are multiples of 16.
// 3) Neither A nor B are transposed.
// Note: This is NOT a high performance example but is for demonstration purposes only
// This version does not use shared memory
// All matrices are in global memory
// It takes matrix fragments to multiply them using tensor cores
//
__global__ void wmma_example(half* a, half* b, float* c, int M, int N, int K, float alpha, float beta) {
// Leading dimensions. Packed with no transpositions.
int lda = M;
int ldb = K;
int ldc = M;
// Tile using a 2D grid
// Warps are numbered in two dimensions
int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

// Declare the fragments
nvcuda::wmma::fragment<nvcuda::wmma::matrix_a,WMMA_M,WMMA_N,WMMA_K, half, nvcuda::wmma::col_major> a_frag;
nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, nvcuda::wmma::col_major> b_frag;
nvcuda::wmma::fragment<nvcuda::wmma::accumulator,WMMA_M,WMMA_N,WMMA_K, float> acc_frag;
nvcuda::wmma::fragment<nvcuda::wmma::accumulator,WMMA_M,WMMA_N,WMMA_K, float> c_frag;
nvcuda::wmma::fill_fragment(acc_frag,0.0f);
// Loop over k
for (int i = 0; i < K; i += WMMA_K) {
	int aRow = warpM * WMMA_M;
	int aCol = i;
	int bRow = i;
	int bCol = warpN * WMMA_N;
	// Bounds checking
	if (aRow < M && aCol < K && bRow < K && bCol < N) {
		// Load the inputs
		nvcuda::wmma::load_matrix_sync(a_frag, a + aRow + aCol * lda, lda);
		nvcuda::wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);
		// Perform the matrix multiplication
		nvcuda::wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
	}
}


// Load in the current value of c, scale it by beta, and add our result scaled by alpha
int cRow = warpM * WMMA_M;
int cCol = warpN * WMMA_N;
if (cRow < M && cCol < N) {
	nvcuda::wmma::load_matrix_sync(c_frag, c + cRow + cCol * ldc, ldc, nvcuda::wmma::mem_col_major);
	for (int i = 0; i < c_frag.num_elements; i++) {
		c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
	}
	// Store the output
	nvcuda::wmma::store_matrix_sync(c + cRow + cCol * ldc, c_frag, ldc, nvcuda::wmma::mem_col_major);
}

}

__global__ void convertFp32toFp16(half* out, float* in, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
out[idx] = __float2half(in[idx]);
}

}

int main() {
float* a_fp32;
float* b_fp32;
half* a_fp16;
half* b_fp16;
float* c;
float* c_wmma;
float* c_host_wmma;
curandGenerator_t gen;

const dim3 block_size(TILE_WIDTH, TILE_WIDTH);
const dim3 num_blocks(MATRIX_M / block_size.x, MATRIX_N / block_size.y);

cudaMalloc((void**)&a_fp32, MATRIX_M * MATRIX_K * sizeof(float));
cudaMalloc((void**)&b_fp32, MATRIX_K * MATRIX_N * sizeof(float));
cudaMalloc((void**)&a_fp16, MATRIX_M * MATRIX_K * sizeof(half));
cudaMalloc((void**)&b_fp16, MATRIX_K * MATRIX_N * sizeof(half));

cudaMalloc((void**)&c, MATRIX_M * MATRIX_N * sizeof(float));
cudaMalloc((void**)&c_wmma, MATRIX_M * MATRIX_N * sizeof(float));

c_host_wmma = (float*)malloc(MATRIX_M * MATRIX_N * sizeof(float));

curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, 1337ULL);
curandGenerateUniform(gen, a_fp32, MATRIX_M * MATRIX_K);
curandGenerateUniform(gen, b_fp32, MATRIX_K * MATRIX_N);

// curand doesn't currently support fp16, so we generate in fp32 and convert to fp16
convertFp32toFp16 <<< (MATRIX_M * MATRIX_K + 255) / 256, 256 >>> (a_fp16, a_fp32, MATRIX_M * MATRIX_K);
convertFp32toFp16 <<< (MATRIX_K * MATRIX_N + 255) / 256, 256 >>> (b_fp16, b_fp32, MATRIX_K * MATRIX_N);
curandGenerateUniform(gen, c, MATRIX_M * MATRIX_N);
curandDestroyGenerator(gen);


// Testing kernel using tensor core
float alpha = 1.0f;
float beta = 1.0f;
printf("\n M=%d,N=%d,K=%d,alpha=%f, beta=%f \n\n", MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);

//First using wmma
dim3 gridDim;
dim3 blockDim;

// blockDim.x must be a multiple of warpSize
// 128x4 means we have 16 warps and a block computes a 64x64 output tile

blockDim.x = 128;
blockDim.y = 4;
gridDim.x = (MATRIX_M + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
gridDim.y = (MATRIX_N + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
printf("Running with wmma with grid (%d,%d), block(%d,%d)...\n", gridDim.x, gridDim.y, blockDim.x, blockDim.y);


// c_wmma holds the initial C matrix that the kernel reads for the beta*C term
cudaMemcpy(c_wmma, c, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToDevice);

wmma_example <<< gridDim, blockDim >>> (a_fp16, b_fp16, c_wmma, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);

// Copy the result back to the host
cudaMemcpy(c_host_wmma, c_wmma, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToHost);



// Print a few entries of the result matrix (optional)
for (int i = 0; i < std::min(MATRIX_M, 5); ++i) {
	for (int j = 0; j < std::min(MATRIX_N, 5); ++j) {
		std::cout << c_host_wmma[i * MATRIX_N + j] << " ";
	}
	std::cout << std::endl;
}


free(c_host_wmma);
cudaFree(a_fp32);
cudaFree(b_fp32);
cudaFree(a_fp16);
cudaFree(b_fp16);
cudaFree(c);
cudaFree(c_wmma);


return 0;

}

Please properly format code posted here. A simple method to do that:

  1. click the pencil icon below your post to edit your post
  2. in the edit window, select the code
  3. click the </> button at the top of the edit window
  4. save your changes

Please do that now.

When I compile the code you have posted as follows:

nvcc -o t120 t120.cu -arch=sm_89 -lcurand

The code compiles successfully for me with no errors or warnings (CUDA 12.2). My guess is that your compilation either omits an arch flag or specifies an incorrect arch. The wmma functionality is only available on cc7.0 or newer GPUs, and current nvcc compilers do not default to cc7.0 or higher. It is not available with a default target such as cc5.2, and that is exactly the kind of compilation error you will get in that case.
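
If you want to confirm which arch flag to pass, you can query the GPU's compute capability at runtime. An RTX 4070 (Ada) reports compute capability 8.9, which corresponds to -arch=sm_89. A minimal sketch, assuming device 0 is the RTX 4070:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    // Query and print the compute capability of device 0
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    return 0;
}

Anything that prints 7.0 or higher supports wmma; whatever it prints is the value to use in -arch=sm_XY.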