Benchmark result with vs. without tensor core

So I tried extending the cuBLAS sample code to use tensor cores and benchmarked it. I created two kernel-launch iterations: the first sets the tensor-op math flag and the second does not. Both results are so close that the flag seems to make no difference. I also added warmup code, but the result is the same.
Wondering if i am missing something and if so, what?

*/
//Example 1. Application Using C and cuBLAS: 1-based indexing

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <cassert>
#include <iostream>
#include <curand.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#define USECPSEC 1000000ULL

// Microsecond timer helper.
// Call with no argument to get a wall-clock timestamp in microseconds;
// pass that timestamp back in to get the elapsed interval since it.
unsigned long long dtime_usec(unsigned long long start=0){
    struct timeval now;
    gettimeofday(&now, NULL);
    unsigned long long us = (unsigned long long)now.tv_sec * 1000000ULL + now.tv_usec;
    return us - start;
}

// Problem size: a square 16384 x 16384 GEMM in half precision.
// A is m x k, B is k x n, C is m x n (column-major, as cuBLAS expects).
using T_ELEM_IN = half;
const int m = 16384;
const int n = 16384;
const int k = 16384;
const int rowsA = m;   // rows of A and of C
const int colsA = k;   // cols of A == rows of B
const int rowsB = k;
const int colsB = n;   // cols of B and of C
const int rowsC = m;
const int colsC = n;

// Abort with file/line context on any CUDA runtime failure.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Abort with file/line context on any cuBLAS failure.
#define CUBLAS_CHECK(call)                                                    \
    do {                                                                      \
        cublasStatus_t stat_ = (call);                                        \
        if (stat_ != CUBLAS_STATUS_SUCCESS) {                                 \
            fprintf(stderr, "cuBLAS error %s:%d: status %d\n", __FILE__,      \
                    __LINE__, (int)stat_);                                    \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Benchmark an m x n x k half-precision GemmEx twice: iteration 0 with
// CUBLAS_TENSOR_OP_MATH set, iteration 1 with the handle explicitly reset
// to CUBLAS_DEFAULT_MATH.  NOTE(review): for cublasGemmEx the tensor-op
// math mode is deprecated and does not switch tensor cores on/off, so
// similar timings for the two iterations are expected.
int main(){
    const half val1 = __float2half(1.0f);
    const half val0 = __float2half(0.f);

    // First, create a cuBLAS handle:
    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    // Allocate host and device storage for A (m x k), B (k x n), C (m x n).
    const size_t matrixSizeA = (size_t)rowsA * colsA;
    const size_t matrixSizeB = (size_t)rowsB * colsB;
    const size_t matrixSizeC = (size_t)rowsC * colsC;

    T_ELEM_IN *devPtrA = 0, *devPtrB = 0, *devPtrC = 0;
    CUDA_CHECK(cudaMalloc((void**)&devPtrA, matrixSizeA * sizeof(devPtrA[0])));
    CUDA_CHECK(cudaMalloc((void**)&devPtrB, matrixSizeB * sizeof(devPtrB[0])));
    CUDA_CHECK(cudaMalloc((void**)&devPtrC, matrixSizeC * sizeof(devPtrC[0])));

    T_ELEM_IN *A = (T_ELEM_IN *)malloc(matrixSizeA * sizeof(A[0]));
    T_ELEM_IN *B = (T_ELEM_IN *)malloc(matrixSizeB * sizeof(B[0]));
    T_ELEM_IN *C = (T_ELEM_IN *)malloc(matrixSizeC * sizeof(C[0]));
    if (!A || !B || !C) {
        fprintf(stderr, "host allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // The input data is identical on both benchmark iterations, so
    // initialize and upload it once instead of inside the loop.
    // A = B = all ones, C = all zeros, so each C element should equal k.
    for (size_t i = 0; i < matrixSizeA; i++) A[i] = val1;
    for (size_t i = 0; i < matrixSizeB; i++) B[i] = val1;
    for (size_t i = 0; i < matrixSizeC; i++) C[i] = val0;

    CUBLAS_CHECK(cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA, rowsA));
    CUBLAS_CHECK(cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB, rowsB));
    CUBLAS_CHECK(cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC, rowsC));

    const float alpha = 1.0f;
    const float beta  = 0.f;
    // k, lda, ldb and ldc are all multiples of 8, and m is a multiple of 4,
    // which the cuBLAS docs recommend for tensor-op eligibility.
    const int lda = m;
    const int ldb = k;
    const int ldc = m;

    for (int counter = 0; counter < 2; counter++) {
        if (counter == 0) {
            // Deprecated for GemmEx usage; does NOT force tensor cores on.
            CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
            printf("Using tensor core...\n");
        } else {
            // The math mode is a sticky per-handle setting: without this
            // explicit reset, iteration 0's mode silently carries over.
            CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
            printf("Without using tensor core...\n");
        }

        // Warmup launch to absorb one-time library/kernel setup cost.
        CUBLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
                                  devPtrA, CUDA_R_16F, lda,
                                  devPtrB, CUDA_R_16F, ldb,
                                  &beta, devPtrC, CUDA_R_16F, ldc,
                                  CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT));
        // GemmEx is asynchronous: drain the warmup before starting the
        // timer, otherwise the timed region includes leftover warmup work.
        CUDA_CHECK(cudaDeviceSynchronize());

        unsigned long long dt = dtime_usec(0);
        CUBLAS_CHECK(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
                                  devPtrA, CUDA_R_16F, lda,
                                  devPtrB, CUDA_R_16F, ldb,
                                  &beta, devPtrC, CUDA_R_16F, ldc,
                                  CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT));
        CUDA_CHECK(cudaDeviceSynchronize());
        CUDA_CHECK(cudaGetLastError());
        dt = dtime_usec(dt);

        CUDA_CHECK(cudaMemcpy(C, devPtrC, sizeof(C[0]) * matrixSizeC,
                              cudaMemcpyDeviceToHost));
        std::cout << "C[0]: "    << __half2float(C[0]) << std::endl;
        std::cout << "duration: " << dt << "us" << std::endl;
        std::cout << "flops/s: "    << ((unsigned long long)m)*n*k*2/(float)dt << "MF/s" << std::endl;
    }

    // Release everything acquired above.
    free(A);
    free(B);
    free(C);
    CUDA_CHECK(cudaFree(devPtrA));
    CUDA_CHECK(cudaFree(devPtrB));
    CUDA_CHECK(cudaFree(devPtrC));
    CUBLAS_CHECK(cublasDestroy(handle));
    return 0;
}

Output:

-rwxr-xr-x. 1 root root 981824 Feb 14 16:35 example-using-cublas-modded.out
+++ exit 0
Using tensor core…
C[0]: 16384
duration: 440979us
flops/s: 1.99467e+07MF/s
Without using tensor core…
C[0]: 16384
duration: 445341us
flops/s: 1.97514e+07MF/s

build command:

set -x
mkdir build log
rm -rf build/* log/*

pushd build
ln -s ../src/$FILE.cu .
for i in $FILE ; do
    nvcc -c $i.cu | tee ../log/build.`basename $i`.log
done
nvcc $FILE.o -lcublas -o $FILE.out 2>&1 | tee ../log/ld.log
ret=$?
popd
ls -l build
exit $ret

I guess you mean this:

That flag is

  1. Irrelevant for GemmEx usage
  2. Deprecated

Sometimes people think it is a way to switch Tensorcore usage on/off. That may have once been true for a very specific non-GemmEx case/usage, but it is no longer true, and it was never true (to my knowledge) for GemmEx. In any event it no longer serves any such purpose.

So it's not surprising that your results are similar/comparable. They are both doing the same thing. The idea that one is using tensor cores and the other isn't is false.

(I don’t know if I should point this out or not, as it is besides the point. But the math mode is a sticky setting. Once you set it, it remains. Therefore setting it on your first iteration means it will be set that way for all subsequent iterations. But even if you “fixed” that, e.g. by switching the math mode to default on subsequent iterations, you would not observe a significant difference.)

1 Like

Thx for explanation.
Now, following up on your note about the sticky setting, I looked for a flag to turn tensor cores off, but it looks like there is no explicit way to do that these days?
I look at: 2.2.10 cublasMath_t
CUBLAS_DEFAULT_MATH but it says tensor core is used whenever possible.

CUBLAS_PEDANTIC_MATH says may not best performative setting but nothing said about tensor core.
CUBLAS_TF32_TENSOR_OP_MATH will use tensor core…
CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION nothing said about tensor core.

According to this statement — "Since this setting does not directly control the use of Tensor Cores, the mode CUBLAS_TENSOR_OP_MATH is being deprecated, and will be removed in a future release." — tensor core usage is not controlled by the user but rather decided by the software? (at runtime or compile time, whichever…)

Yes, I would agree, I don’t find a way to turn it off. To be clear, there was never a way to turn it off with GemmEx.

I tested separately: CUBLAS_PEDANTIC_MATH does not turn it off for your GemmEx example.

Yes, tensorcore usage is decided by the CUBLAS library call.

1 Like

I had an idea and was hellbent on going as far as installing the older CUDA 9 to see how it works. However, it is not installing well on CentOS Stream 9, so I am even trying the older CentOS 7. Will post an update if successful.

So even with the deprecated version, how could it be used to turn tensor cores off? I am a bit confused on that part:
Here is the original code I am reviewing, from a historical perspective, which says "…Set the math mode to allow cuBLAS to use Tensor Cores:" — from which, conversely, I am sort of assuming that not setting it would mean tensor cores are not used. And further down the line, cublasGemmEx is being called:

// First, create a cuBLAS handle:

cublasStatus_t cublasStat = cublasCreate(&handle);

// Set the math mode to allow cuBLAS to use Tensor Cores:

cublasStat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);

// Allocate and initialize your matrices (only the A matrix is shown):

size_t matrixSizeA = ( size_t )rowsA * colsA;

T_ELEM_IN **devPtrA = 0;

cudaMalloc(( void **)&devPtrA[0], matrixSizeA * sizeof (devPtrA[0][0]));

T_ELEM_IN A = (T_ELEM_IN *) malloc (matrixSizeA * sizeof (A[0]));

memset ( A, 0xFF, matrixSizeA* sizeof (A[0]));

status1 = cublasSetMatrix(rowsA, colsA, sizeof (A[0]), A, rowsA, devPtrA[i], rowsA);

// ... allocate and initialize B and C matrices (not shown) ...

// Invoke the GEMM, ensuring k, lda, ldb, and ldc are all multiples of 8,

// and m is a multiple of 4:

cublasStat = cublasGemmEx(handle, transa, transb, m, n, k, alpha,

A, CUDA_R_16F, lda,

B, CUDA_R_16F, ldb,

beta, C, CUDA_R_16F, ldc, CUDA_R_32F, algo);

This is the only thing I have come across, that allowed computation with or without tensorcore, and as far as I know its deprecated and/or doesn’t work that way anymore. I don’t have any further information and don’t have suggestions for anything like that with GemmEx.

OK I will try those. Going back cuda9 is next to impossible. All the support distros has EOL. so can not get any updates and packages: u1604, 1704, fedora, and centos7. I am throwing in the towel on this one.