#include #include #include #include #include "cublas_v2.h" //M, N and K must be multiples of 8 (M multiple of 4) #define M 8 #define N 8 #define K 8 #define IDX2C(i,j,ld) (((j)*(ld))+(i)) int main (void){ cudaError_t cudaStat; cublasStatus_t stat; cublasHandle_t handle; int i, j; float* devPtrA; float* devPtrB; float* devPtrC; float* a = 0; float* b = 0; float* c = 0; a = (float *)malloc (M * K * sizeof (*a)); b = (float *)malloc (K * N * sizeof (*b)); c = (float *)malloc (M * N * sizeof (*c)); if (!a || !b || !c) { printf ("host memory allocation failed"); return EXIT_FAILURE; } for (j = 0; j < N; j++) { for (i = 0; i < M; i++) { a[IDX2C(i,j,M)] = (float)1; //(i * M + j + 1); b[IDX2C(i,j,K)] = (float)1; c[IDX2C(i,j,K)] = (float)0; } } cudaStat = cudaMalloc ((void**)&devPtrA, M*K*sizeof(*a)); if (cudaStat != cudaSuccess) { printf ("device memory allocation failed"); return EXIT_FAILURE; } cudaStat = cudaMalloc ((void**)&devPtrB, K*N*sizeof(*b)); if (cudaStat != cudaSuccess) { printf ("device memory allocation failed"); return EXIT_FAILURE; } cudaStat = cudaMalloc ((void**)&devPtrC, M*N*sizeof(*c)); if (cudaStat != cudaSuccess) { printf ("device memory allocation failed"); return EXIT_FAILURE; } // Let's create a cuda handle stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); return EXIT_FAILURE; } // Set the math mode to allow cuBLAS to use Tensor Cores: stat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS math mode setting failed\n"); return EXIT_FAILURE; } // copy the matrix from host to device stat = cublasSetMatrix (M, K, sizeof(*a), a, M, devPtrA, M); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("data download failed"); cudaFree (devPtrA); cudaFree (devPtrB); cudaFree (devPtrC); cublasDestroy(handle); return EXIT_FAILURE; } stat = cublasSetMatrix (K, N, sizeof(*b), b, K, devPtrB, K); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("data download failed"); cudaFree (devPtrA); cudaFree (devPtrB); cudaFree (devPtrC); cublasDestroy(handle); return EXIT_FAILURE; } stat = cublasSetMatrix (M, N, sizeof(*c), c, M, devPtrC, M); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("data download failed"); cudaFree (devPtrA); cudaFree (devPtrB); cudaFree (devPtrC); cublasDestroy(handle); return EXIT_FAILURE; } int alpha = 1, beta=1; // Let's compute the matrix multiplication // Invoke the GEMM, ensuring k, lda, ldb, and ldc are all multiples of 8, // and m is a multiple of 4: stat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, devPtrA, CUDA_R_16F, M, devPtrB, CUDA_R_16F, K, &beta, devPtrC, CUDA_R_32F, M, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); //I make sure to ask for tensor core if (stat != CUBLAS_STATUS_SUCCESS) { printf ("matrix multiplication failed"); cudaFree (devPtrA); cudaFree (devPtrB); cudaFree (devPtrC); cublasDestroy(handle); return EXIT_FAILURE; } stat = cublasGetMatrix (M, N, sizeof(*c), devPtrC, M, c, M); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("data upload failed"); cudaFree (devPtrA); cudaFree (devPtrB); cudaFree (devPtrC); cublasDestroy(handle); return EXIT_FAILURE; } cudaFree (devPtrA); cudaFree (devPtrB); cudaFree (devPtrC); cublasDestroy(handle); for (j = 0; j < N; j++) { for (i = 0; i < M; i++) { printf ("%7.0f", c[IDX2C(i,j,M)]); } printf ("\n"); } free(a); free(b); free(c); return EXIT_SUCCESS; }