Originally published at: https://developer.nvidia.com/blog/using-tensor-cores-in-cuda-fortran/
Tensor Cores, which are programmable matrix multiply and accumulate units, were first introduced in the V100 GPUs where they operated on half-precision (16-bit) multiplicands. Tensor Core functionality has been expanded in the following architectures, and in the Ampere A100 GPUs (compute capability 8.0) support for other data types was added, including double precision. Access to…
I tried to compile and run the following programs after going through the above document.
------------------------------------>>>>
modm.F90
include "cuf_macros.CUF"
module m
integer, parameter :: wmma_m = 8
integer, parameter :: wmma_n = 8
integer, parameter :: wmma_k = 4
contains
! kernel for C = A B (C(8x8), A(8x4), B(4x8)) using wmma
! Should be launched with one block of 32 threads
attributes(global) subroutine wmma_single(a, b, c)
use wmma
implicit none
real(8) :: a(wmma_m,*), b(wmma_k,*), c(wmma_m,*)
WMMASubMatrix(WMMAMatrixA, 8, 8, 4, Real, WMMAColMajorKind8) :: sa
WMMASubMatrix(WMMAMatrixB, 8, 8, 4, Real, WMMAColMajorKind8) :: sb
WMMASubMatrix(WMMAMatrixC, 8, 8, 4, Real, WMMAKind8) :: sc
integer :: lda, ldb, ldc
lda = wmma_m
ldb = wmma_k
ldc = wmma_m
sc = 0.0_8
call wmmaLoadMatrix(sa, a(1,1), lda)
call wmmaLoadMatrix(sb, b(1,1), ldb)
call wmmaMatMul(sc, sa, sb, sc)
call wmmaStoreMatrix(c(1,1), sc, ldc)
end subroutine wmma_single
end module m
------------------------------------<<<<
------------------------------------>>>>
testmodm.F90
program main
#ifdef _CUDA
! Need to include a couple of modules:
! cublas: required to use generic BLAS interface
! cudafor: required to use CUDA runtime API routines (e.g. cudaDeviceSynchronize)
! not explicitly required if file has *.cuf suffix
use cublas
use cudafor
use m
#endif
implicit none
integer, parameter :: m = wmma_m, n = wmma_n, k = wmma_k
real(8), device :: a_d(m,k), b_d(k,n), c_d(m,n)
call wmma_single<<<1,32>>>(a_d, b_d, c_d)
end program main
------------------------------------<<<<
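I suspect the host code above is still missing initialization of the inputs, a synchronization after the launch, and a copy of the result back to the host. Below is a sketch of what I think the full main program will eventually need; this is only my own guess (cudaDeviceSynchronize is the CUDA Fortran runtime routine from the cudafor module), not something I have compiled yet:
------------------------------------>>>>
program main
  use cudafor
  use m
  implicit none
  real(8) :: a(wmma_m,wmma_k), b(wmma_k,wmma_n), c(wmma_m,wmma_n)
  real(8), device :: a_d(wmma_m,wmma_k), b_d(wmma_k,wmma_n), c_d(wmma_m,wmma_n)
  integer :: istat
  ! initialize the inputs on the host and copy them to the device
  call random_number(a)
  call random_number(b)
  a_d = a
  b_d = b
  ! one warp (one block of 32 threads) computes the 8x8x4 product
  call wmma_single<<<1,32>>>(a_d, b_d, c_d)
  ! wait for the kernel to finish before reading the result
  istat = cudaDeviceSynchronize()
  c = c_d
  print *, 'max error: ', maxval(abs(c - matmul(a, b)))
end program main
------------------------------------<<<<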
I compiled using the following command:
gksen@gksen-Precision-3561:~/CHAZ/data/datanv/CUDA$ sudo /opt/nvidia/hpc_sdk/Linux_x86_64/25.1/compilers/bin/nvfortran -Mpreprocess -cuda -gpu=cc80 -o ttmm testmodm.F90 modm.F90 -L/opt/nvidia/hpc_sdk/Linux_x86_64/25.1/math_libs/11.8/targets/x86_64-linux/lib
testmodm.F90:
modm.F90:
Then I ran:
gksen@gksen-Precision-3561:~/CHAZ/data/datanv/CUDA$ sudo nvprof --unified-memory-profiling off ./ttmm
[sudo] password for gksen:
==27616== NVPROF is profiling process 27616, command: ./ttmm
==27616== Profiling application: ./ttmm
==27616== Profiling result:
No kernels were profiled.
Type Time(%) Time Calls Avg Min Max Name
API calls: 82.96% 181.85ms 1 181.85ms 181.85ms 181.85ms cuDevicePrimaryCtxRetain
16.34% 35.825ms 1 35.825ms 35.825ms 35.825ms cudaLaunchKernel
0.20% 439.70us 1 439.70us 439.70us 439.70us cuMemAllocHost
0.19% 413.15us 1 413.15us 413.15us 413.15us cuDeviceTotalMem
0.13% 277.60us 106 2.6180us 281ns 117.64us cuDeviceGetAttribute
0.08% 177.91us 384 463ns 240ns 5.1670us
0.05% 102.42us 1 102.42us 102.42us 102.42us cuDeviceGetName
0.04% 89.951us 4 22.487us 1.2640us 84.049us cuMemAlloc
0.01% 12.031us 1 12.031us 12.031us 12.031us cuDeviceGetPCIBusId
0.00% 10.834us 1 10.834us 10.834us 10.834us cudaGetDevice
0.00% 3.8080us 3 1.2690us 312ns 3.1140us cuDeviceGetCount
0.00% 2.7690us 1 2.7690us 2.7690us 2.7690us cuInit
0.00% 1.7030us 2 851ns 550ns 1.1530us cuCtxGetCurrent
0.00% 1.4390us 3 479ns 265ns 885ns cuDeviceGet
0.00% 1.0520us 4 263ns 94ns 354ns cuDriverGetVersion
0.00% 1.0320us 4 258ns 100ns 507ns cuCtxSetCurrent
0.00% 612ns 1 612ns 612ns 612ns cuDeviceComputeCapability
0.00% 524ns 1 524ns 524ns 524ns cuDeviceGetUuid
OpenACC (excl): 100.00% 17.856us 1 17.856us 17.856us 17.856us acc_device_init
0.00% 0ns 3 0ns 0ns 0ns acc_delete
0.00% 0ns 3 0ns 0ns 0ns acc_alloc
gksen@gksen-Precision-3561:~/CHAZ/data/datanv/CUDA$
I have some confusion about the profiling output:
Why was no kernel profiled?
I can understand that some components may still be missing and need to be incorporated.
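To see whether the kernel actually launched and ran, I suppose I could also query the launch status and the device's compute capability right after the launch; below is a minimal sketch of such a check (cudaGetLastError, cudaGetErrorString, cudaGetDeviceProperties, and the cudaDeviceProp type are all from the cudafor module, as far as I understand):
------------------------------------>>>>
! diagnostic check, to be placed around the kernel launch in testmodm.F90
type(cudaDeviceProp) :: prop
integer :: istat
! report which GPU the binary is actually running on (the kernel was built for cc80)
istat = cudaGetDeviceProperties(prop, 0)
print *, 'GPU: ', trim(prop%name), ' cc ', prop%major, '.', prop%minor
call wmma_single<<<1,32>>>(a_d, b_d, c_d)
istat = cudaGetLastError()        ! reports a failed launch, if any
if (istat /= cudaSuccess) print *, 'launch error: ', cudaGetErrorString(istat)
istat = cudaDeviceSynchronize()   ! reports an execution error, if any
if (istat /= cudaSuccess) print *, 'exec error:   ', cudaGetErrorString(istat)
------------------------------------<<<<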