#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "cublas_v2.h"
// Problem dimensions used by main(): A is M x K, B is K x N and C is M x N.
// M is also used as the leading dimension of A and C, and K as the leading
// dimension of B. Per the note at the GEMM call in main(), k and the leading
// dimensions are meant to be multiples of 8 and m a multiple of 4, i.e. keep
// M and K multiples of 8 (TODO: confirm the exact alignment rules against the
// cuBLAS version in use).
#define M 8
#define N 8
#define K 8
// Linear offset of element (i, j) in a buffer where consecutive columns are
// ld elements apart: i is the row within column j.
#define IDX2C(i,j,ld) (((j)*(ld))+(i))


/*
 * Computes C = alpha * A * B + beta * C with cublasGemmEx, requesting the
 * tensor-core algorithm, and prints the resulting M x N matrix.
 *
 * A (M x K) and B (K x N) are filled with ones and stored as 16-bit halves,
 * matching the CUDA_R_16F type passed to the GEMM call. C (M x N) is stored
 * as 32-bit float, starts at zero and is accumulated in 32-bit float, so
 * every printed entry is expected to equal K.
 *
 * All buffers are addressed through IDX2C, i.e. column by column.
 *
 * Returns EXIT_SUCCESS when every step succeeded, EXIT_FAILURE otherwise.
 * Every exit goes through the single cleanup section at the end, so host
 * buffers, device buffers and the cuBLAS handle are released on all paths.
 */
int main (void){
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    int handleCreated = 0;          /* set once cublasCreate succeeded */
    int exitCode = EXIT_FAILURE;    /* flipped to success at the very end */
    int i, j;
    /* Pointers start out NULL so the cleanup section can free them
       unconditionally: free(NULL) and cudaFree(NULL) are no-ops. */
    __half* devPtrA = NULL;
    __half* devPtrB = NULL;
    float* devPtrC = NULL;
    __half* a = NULL;
    __half* b = NULL;
    float* c = NULL;
    /* The scalars must have the compute type (32-bit float); passing
       integers would make cuBLAS reinterpret their bits as floats. */
    const float alpha = 1.0f;
    const float beta = 1.0f;

    a = (__half *)malloc (M * K * sizeof (*a));
    b = (__half *)malloc (K * N * sizeof (*b));
    c = (float *)malloc (M * N * sizeof (*c));
    if (!a || !b || !c) {
        printf ("host memory allocation failed\n");
        goto cleanup;
    }

    /* Each matrix is filled over its own extent and leading dimension so
       the code stays correct when M, N and K differ. */
    for (j = 0; j < K; j++) {
        for (i = 0; i < M; i++) {
            a[IDX2C(i,j,M)] = __float2half (1.0f);
        }
    }
    for (j = 0; j < N; j++) {
        for (i = 0; i < K; i++) {
            b[IDX2C(i,j,K)] = __float2half (1.0f);
        }
    }
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            c[IDX2C(i,j,M)] = 0.0f;
        }
    }

    cudaStat = cudaMalloc ((void**)&devPtrA, M*K*sizeof(*a));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed\n");
        goto cleanup;
    }
    cudaStat = cudaMalloc ((void**)&devPtrB, K*N*sizeof(*b));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed\n");
        goto cleanup;
    }
    cudaStat = cudaMalloc ((void**)&devPtrC, M*N*sizeof(*c));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed\n");
        goto cleanup;
    }

    // Create the cuBLAS handle
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        goto cleanup;
    }
    handleCreated = 1;

    // Set the math mode to allow cuBLAS to use Tensor Cores:
    stat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS math mode setting failed\n");
        goto cleanup;
    }

    // Copy the matrices from host to device; the element size of A and B
    // is sizeof(__half), that of C is sizeof(float).
    stat = cublasSetMatrix (M, K, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed\n");
        goto cleanup;
    }
    stat = cublasSetMatrix (K, N, sizeof(*b), b, K, devPtrB, K);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed\n");
        goto cleanup;
    }
    stat = cublasSetMatrix (M, N, sizeof(*c), c, M, devPtrC, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed\n");
        goto cleanup;
    }

    // Invoke the GEMM, ensuring k, lda, ldb, and ldc are all multiples of 8,
    // and m is a multiple of 4. A and B are half precision, C and the
    // accumulation are single precision, and the tensor-op algorithm is
    // requested explicitly.
    stat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha,
                              devPtrA, CUDA_R_16F, M,
                              devPtrB, CUDA_R_16F, K,
                              &beta, devPtrC, CUDA_R_32F, M, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("matrix multiplication failed\n");
        goto cleanup;
    }

    // Copy the result back to the host
    stat = cublasGetMatrix (M, N, sizeof(*c), devPtrC, M, c, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed\n");
        goto cleanup;
    }

    // Each output line holds one column of C (j is fixed per line).
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            printf ("%7.0f", c[IDX2C(i,j,M)]);
        }
        printf ("\n");
    }
    exitCode = EXIT_SUCCESS;

cleanup:
    cudaFree (devPtrA);
    cudaFree (devPtrB);
    cudaFree (devPtrC);
    if (handleCreated) {
        cublasDestroy(handle);
    }
    free(a);
    free(b);
    free(c);
    return exitCode;
}