I tried extending the cuBLAS example code to benchmark tensor core usage. The program runs the same GEMM in two iterations of a loop: the first iteration after setting the tensor-op math flag on the handle, the second iteration without setting it. Both timings come out nearly identical, so the flag appears to make no difference. I also added a warmup call before the timed GEMM; the result is the same.
Wondering if I am missing something and, if so, what? (A sketch of one variant I have in mind is at the end of the post, after the build command.)
*/
//Example 1. Application Using C and cuBLAS: 1-based indexing
#include <stdio.h>
#include <curand.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <iostream>
#include <cassert>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start=0){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
using T_ELEM_IN = half;
const int m = 1024 * 16;
const int n = 1024 * 16;
const int k = 1024 * 16;
const int rowsA = m;
const int colsA = k;
const int rowsB = k;
const int colsB = n;
const int rowsC = m;
const int colsC = n;
int main(){
half val1 = __float2half(1.0f);
half val0 = __float2half(0.f);
cublasHandle_t handle;
// First, create a cuBLAS handle:
cublasStatus_t cublasStat = cublasCreate(&handle);
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
// Allocate and initialize your matrices (only the A matrix is shown):
size_t matrixSizeA = (size_t)rowsA * colsA;
T_ELEM_IN *devPtrA = 0;
cudaMalloc((void**)&devPtrA, matrixSizeA * sizeof(devPtrA[0]));
T_ELEM_IN *A = (T_ELEM_IN *)malloc(matrixSizeA * sizeof(A[0]));
/*
https://docs.nvidia.com/cuda/pdf/CUBLAS_Library.pdf
5.4.14. cublasSetMatrix()
cublasStatus_t
cublasSetMatrix(int rows, int cols, int elemSize,
const void *A, int lda, void *B, int ldb)
This function supports the 64-bit Integer Interface.
This function copies a tile of rows x cols elements from a matrix A in host memory space to a
matrix B in GPU memory space. It is assumed that each element requires storage of elemSize bytes
and that both matrices are stored in column-major format, with the leading dimension of the source
matrix A and destination matrix B given in lda and ldb, respectively. The leading dimension indicates
the number of rows of the allocated matrix, even if only a submatrix of it is being used.
*/
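/* For reference: the rows/cols arguments describe the tile being copied, while
   lda and ldb give the full column stride (allocated rows) of the source and
   destination. Copying only the top-left 4 x 4 corner of A, with both buffers
   still allocated rowsA tall, would look like
       cublasSetMatrix(4, 4, sizeof(A[0]), A, rowsA, devPtrA, rowsA);
   Below, the full rowsA x colsA matrices are copied, so rows equals lda. */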
// … allocate and initialize B and C matrices …
size_t matrixSizeB = (size_t)rowsB * colsB;
T_ELEM_IN *devPtrB = 0;
cudaMalloc((void**)&devPtrB, matrixSizeB * sizeof(devPtrB[0]));
T_ELEM_IN *B = (T_ELEM_IN *)malloc(matrixSizeB * sizeof(B[0]));
size_t matrixSizeC = (size_t)rowsC * colsC;
T_ELEM_IN *devPtrC = 0;
cudaMalloc((void**)&devPtrC, matrixSizeC * sizeof(devPtrC[0]));
T_ELEM_IN *C = (T_ELEM_IN *)malloc(matrixSizeC * sizeof(C[0]));
for (int counter = 0 ; counter < 2 ; counter ++) {
if (counter == 0) {
cublasStat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
printf("Using tensor core...\n");
} else {
printf("Without using tensor core...\n");
}
for (int i = 0; i < matrixSizeA; i++)
A[i] = val1;
for (int i = 0; i < matrixSizeB; i++)
B[i] = val1;
for (int i = 0; i < matrixSizeC; i++)
C[i] = val0;
cublasStat = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA, devPtrA, rowsA);
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
cublasStat = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB, devPtrB, rowsB);
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
cublasStat = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC, devPtrC, rowsC);
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
float alpha = 1.0f;
float beta = 0.f;
int lda = m;
int ldb = k;
int ldc = m;
// Invoke the GEMM, ensuring k, lda, ldb, and ldc are all multiples of 8,
// and m is a multiple of 4:
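// (Here m = n = k = 16384 and lda = ldb = ldc = 16384, so these alignment
// requirements are satisfied.)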
// warmup...?
cublasStat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
devPtrA, CUDA_R_16F, lda,
devPtrB, CUDA_R_16F, ldb,
&beta, devPtrC, CUDA_R_16F, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
unsigned long long dt = dtime_usec(0);
cublasStat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
devPtrA, CUDA_R_16F, lda,
devPtrB, CUDA_R_16F, ldb,
&beta, devPtrC, CUDA_R_16F, ldc, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaDeviceSynchronize();
assert(cublasStat == CUBLAS_STATUS_SUCCESS);
cudaError_t err = cudaGetLastError();
assert(err == cudaSuccess);
dt = dtime_usec(dt);
cudaMemcpy(C, devPtrC, sizeof(C[0]) * matrixSizeC, cudaMemcpyDeviceToHost);
std::cout << "C[0]: " << __half2float(C[0]) << std::endl;
std::cout << "duration: " << dt << "us" << std::endl;
std::cout << "flops/s: " << ((unsigned long long)m)*n*k*2/(float)dt << "MF/s" << std::endl;
}
}
Output:
-rwxr-xr-x. 1 root root 981824 Feb 14 16:35 example-using-cublas-modded.out
+++ exit 0
Using tensor core...
C[0]: 16384
duration: 440979us
flops/s: 1.99467e+07MF/s
Without using tensor core...
C[0]: 16384
duration: 445341us
flops/s: 1.97514e+07MF/s
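(Sanity check on those numbers: 2*m*n*k = 2 * 16384^3 ≈ 8.80e12 FLOP, so ~441,000 us works out to roughly 2.0e7 MFLOP/s, i.e. about 20 TFLOP/s in both cases.)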
build command:
set -x
mkdir build log
rm -rf build/* log/*
pushd build
ln -s ../src/$FILE.cu .
for i in $FILE ; do
  nvcc -c $i.cu | tee ../log/build.`basename $i`.log
done
nvcc $FILE.o -lcublas -o $FILE.out 2>&1 | tee ../log/ld.log
ret=$?
popd
ls -l build
exit $ret
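One variant I have in mind but have not benchmarked yet: as posted, the else branch only prints, so the tensor-op mode set in iteration 0 presumably stays active on the handle during the second iteration. Assuming cublasSetMathMode() with CUBLAS_PEDANTIC_MATH behaves as the docs describe, the change would be roughly:

} else {
  // untested sketch: switch the handle away from tensor-op math before the timed GEMM
  cublasStat = cublasSetMathMode(handle, CUBLAS_PEDANTIC_MATH);
  assert(cublasStat == CUBLAS_STATUS_SUCCESS);
  printf("Without using tensor core...\n");
}

I am not sure this alone changes which kernels get picked on newer toolkits, since the cuBLAS docs mark CUBLAS_TENSOR_OP_MATH itself as deprecated.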