cuBLAS call from kernel in CUDA 10.0

matt_hanley · January 30, 2019, 3:58am

I’m having issues calling cuBLAS API functions from kernels in CUDA 10.0.

Here is the piece of sample code I’m using to try to debug:

/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 * This example demonstrates how to call CUBLAS library
 * functions both from the HOST code and from the DEVICE code
 * running on the GPU (the latter is available only for the compute
 * capability >= 3.5). The single-precision matrix-matrix
 * multiplication operation, SGEMM, will be performed 3 times:
 * 1) once by calling a method defined in this file (simple_sgemm),
 * 2) once by calling the cublasSgemm library routine from the HOST code
 * 3) and once by calling the cublasSgemm library routine from
 *    the DEVICE code.
 */

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Includes, cuda helper functions */
#include <helper_cuda.h>

/* Matrix size */
#define N  (275)

/* Host implementation of a simple version of sgemm */
static void simple_sgemm(int n, float alpha, const float *A, const float *B,
                         float beta, float *C)
{
    int i;
    int j;
    int k;

    for (i = 0; i < n; ++i)
    {
        for (j = 0; j < n; ++j)
        {
            float prod = 0;

            for (k = 0; k < n; ++k)
            {
                prod += A[k * n + i] * B[j * n + k];
            }

            C[j * n + i] = alpha * prod + beta * C[j * n + i];
        }
    }
}

/* Checks result against reference and returns relative error */
static float check_result(const float *result,
                          const float *reference,
                          int size)
{
    float error_norm = 0.0f;
    float ref_norm = 0.0f;

    for (int i = 0; i < size; ++i)
    {
        float diff = reference[i] - result[i];
        error_norm += diff * diff;
        ref_norm += reference[i] * reference[i];
    }

    error_norm = (float) sqrt((double) error_norm);
    ref_norm = (float) sqrt((double) ref_norm);

    if (fabs(ref_norm) < 1e-7)
    {
        fprintf(stderr, "!!!! Result check failed: reference norm is 0\n");
        exit(EXIT_FAILURE);
    }

    return error_norm / ref_norm;
}

/* Declaration of the function that computes sgemm using CUBLAS device API */
extern "C" void device_cublas_sgemm(int n,
                                    float alpha,
                                    const float *d_A, const float *d_B,
                                    float beta,
                                    float *d_C);

/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_rnd;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    cublasHandle_t handle;

    int dev_id;
    cudaDeviceProp device_prop;

    bool do_device_api_test = false;

    float host_api_test_ratio, device_api_test_ratio;

    /* Initialize CUBLAS */
    printf("simpleDevLibCUBLAS test running...\n");

    dev_id = findCudaDevice(argc, (const char **) argv);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

    if ((device_prop.major << 4) + device_prop.minor >= 0x35)
    {
        printf("Host and device APIs will be tested.\n");
        do_device_api_test = true;
    }
    /*    else if ((device_prop.major << 4) + device_prop.minor >= 0x20)
        {
            printf("Host API will be tested.\n");
            do_device_api_test = false;
        }
    */
    else
    {
        fprintf(stderr, "simpleDevLibCUBLAS examples requires Compute Capability of SM 3.5 or higher\n");
        return EXIT_WAIVED;
    }

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    h_C_rnd = (float *)malloc(n2 * sizeof(h_C_rnd[0]));

    if (h_C_rnd == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C_rnd)\n");
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C_ref[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C_rnd[i] = rand() / (float)RAND_MAX;
        h_C[i] = h_C_rnd[i];
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /*
     * Performs operation using plain C code
     */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /*
     * Performs operation using cublas
     */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    host_api_test_ratio = check_result(h_C, h_C_ref, n2);

    if (do_device_api_test)
    {
        /* Reset device resident C matrix */
        status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1);

        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!!! device access error (write C)\n");
            return EXIT_FAILURE;
        }

        /*
         * Performs operation using the device API of CUBLAS library
         */
        device_cublas_sgemm(N, alpha, d_A, d_B, beta, d_C);

        /* Read the result back */
        status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!!! device access error (read C)\n");
            return EXIT_FAILURE;
        }

        /* Check result against reference */
        device_api_test_ratio = check_result(h_C, h_C_ref, n2);
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_rnd);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    bool test_result = do_device_api_test ?
                       host_api_test_ratio < 1e-6 &&
                       device_api_test_ratio < 1e-6 :
                       host_api_test_ratio < 1e-6;

    printf("simpleDevLibCUBLAS completed, returned %s\n",
           test_result ? "OK" : "ERROR!");

    exit(test_result ? EXIT_SUCCESS : EXIT_FAILURE);
}

And the associated kernels:

/*
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 * Routines for testing the device API of CUBLAS.
 */

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>

/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Includes, cuda helper functions */
#include <helper_cuda.h>

__global__ void invokeDeviceCublasSgemm(cublasStatus_t *returnValue,
                                        int n,
                                        const float *d_alpha,
                                        const float *d_A,
                                        const float *d_B,
                                        const float *d_beta,
                                        float *d_C)
{
    cublasHandle_t cnpHandle;
    cublasStatus_t status = cublasCreate(&cnpHandle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        *returnValue = status;
        return;
    }

    /* Perform operation using cublas */
    status =
        cublasSgemm(cnpHandle,
                    CUBLAS_OP_N, CUBLAS_OP_N,
                    n, n, n,
                    d_alpha,
                    d_A, n,
                    d_B, n,
                    d_beta,
                    d_C, n);

    cublasDestroy(cnpHandle);

    *returnValue = status;
}

struct SGEMMScalarParams
{
    float alpha, beta;
};

extern "C" void device_cublas_sgemm(int n,
                                    float alpha,
                                    const float *d_A, const float *d_B,
                                    float beta,
                                    float *d_C)
{
    cublasStatus_t *d_status;
    cublasStatus_t status;

    if (cudaMalloc((void **) &d_status, sizeof(cublasStatus_t)) != cudaSuccess)
    {
        fprintf(stderr,
                "!!!! device memory allocation error (allocate d_status)\n");
        exit(EXIT_FAILURE);
    }

    // Device API requires scalar arguments (alpha and beta)
    // to be allocated in the device memory.
    SGEMMScalarParams h_params = {alpha, beta};
    SGEMMScalarParams *d_params;

    if (cudaMalloc((void **) &d_params, sizeof(SGEMMScalarParams)) !=
        cudaSuccess)
    {
        fprintf(stderr,
                "!!!! device memory allocation error (allocate d_params)\n");
        exit(EXIT_FAILURE);
    }

    if (cudaMemcpy(d_params,
                   &h_params,
                   sizeof(SGEMMScalarParams),
                   cudaMemcpyHostToDevice) != cudaSuccess)
    {
        fprintf(stderr,
                "!!!! host to device memory copy error\n");
        exit(EXIT_FAILURE);
    }

    // Launch cublasSgemm wrapper kernel.
    invokeDeviceCublasSgemm<<<1, 1>>>
    (d_status, n, &d_params->alpha, d_A, d_B, &d_params->beta, d_C);

    cudaError_t error;

    if ((error = cudaGetLastError()) != cudaSuccess)
    {
        fprintf(stderr,
                "!!!! kernel execution error: %s\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    if (cudaMemcpy(&status,
                   d_status,
                   sizeof(cublasStatus_t),
                   cudaMemcpyDeviceToHost) != cudaSuccess)
    {
        fprintf(stderr,
                "!!!! device to host memory copy error\n");
        exit(EXIT_FAILURE);
    }

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr,
                "!!!! CUBLAS Device API call failed with code %d\n",
                status);
        exit(EXIT_FAILURE);
    }

    // Free allocated device memory.
    if (cudaFree(d_status) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (d_status)\n");
        exit(EXIT_FAILURE);
    }

    if (cudaFree(d_params) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (d_params)\n");
        exit(EXIT_FAILURE);
    }
}

And here’s the errors I’m getting:

nvcc cublas_test.cu -arch=sm_75 -rdc=true -lcublas -lcublas_device -lcudadevrt
nvlink error   : Undefined reference to 'cublasCreate_v2' in '/tmp/tmpxft_00001feb_00000000-10_cublas_test.o'
nvlink error   : Undefined reference to 'cublasSgemm_v2' in '/tmp/tmpxft_00001feb_00000000-10_cublas_test.o'

The source code can be found on Github at https://github.com/tpn/cuda-samples/tree/master/v9.0/7_CUDALibraries/simpleDevLibCUBLAS

Any help would be greatly appreciated!

Robert_Crovella · January 31, 2019, 8:07pm

This capability was removed from CUDA 10. There is no longer a device CUBLAS capability in CUDA 10.

[url]https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#deprecated-features[/url]

Daniel_Wong · January 1, 2021, 7:42am

Hi, Robert,

so there is no way for us to leverage cuBLAS from a CUDA kernel any more?

Robert_Crovella · January 1, 2021, 4:28pm

correct. The current recommendation would be to see if CUTLASS will help (it is mostly focused on GEMM related activities). If not, write your own code to perform the function, or call cublas from host code.

758370661 · March 24, 2021, 3:33am

Oh, that is the most terrible news I heard these days… Is that the same for cuda 11? I have to do many matrix multiplication operations, but their dimensions are not the same so they cannot be merged into a batch… So is there any better choice in cuda toolkit?

758370661 · March 24, 2021, 4:39am

I also heard that cuSolver cannot be used in device function in cuda 11.0, is that true? And I have no idea how could I do SVD within the device function, because it might be too time consuming for me to implement it by myself.

mnicely · March 26, 2021, 12:58pm

Hi,

Can you give me details on your use cases and problem sizes?

758370661 · March 27, 2021, 6:13am

Thank you! Indeed, I am implementing an ADMM algorithm. First, I need to do SVD decomposition of multiple matrixes whose length and width are not fixed and are larger than 32. Then I have to do some matrix multiplication.

Recently, I found maybe I could make their shapes fixed, although there might be some drop of accuracy. But even so, I still don’t figure out how to accelerate the SVD of multiple matrixes.

Robert_Crovella · March 28, 2021, 2:51am

cusolver provides a batched SVD operation
which may be of interest
It does require the matrices all be the same, and all 32x32 or less dimension.

mnicely · April 7, 2021, 4:47pm

Hi 758370661,

You might find latter part of “Recent Developments in NVIDIA Math Libraries [S31754]” of interest at GTC 21 ;)

Topic		Replies	Views
Where can I find working examples for the new cuBLASLt library? GPU-Accelerated Libraries	35	5833	March 16, 2020
Issue when calling cublasDdot from within kernel GPU-Accelerated Libraries	7	924	March 21, 2018
Help with CUBLAS performance and timing issues, please help... CUDA Programming and Performance	1	3440	December 26, 2008
cuBLAS convolution does not use Tensor Cores GPU-Accelerated Libraries cublas	6	2173	June 8, 2021
cublasSgemm results in null matrix CUDA Programming and Performance	5	758	May 28, 2019
Program hit cudaErrorInvalidValue (error 1) due to "invalid argument" on CUDA API call to cudaMemsetAsync CUDA Programming and Performance	7	7295	January 11, 2020
Can a cuda kernel call CUBLAS function or how to call a cublas functions from Python ? CUDA Programming and Performance	7	3643	November 15, 2019
cublas<S,D,C,Z>GEMM crash on multi GPUs CUDA Programming and Performance	8	1888	November 27, 2014
CUBLAS_STATUS_MAPPING_ERROR in cublasGetMatrix() after cublasDgemm() GPU-Accelerated Libraries	10	9260	February 21, 2013
Is it correct that my Pascal card is calling Maxwell_gemm kernels through cublas? And if so, why is cublas unusably slow for me? CUDA Programming and Performance	6	933	August 23, 2018

cuBLAS call from kernel in CUDA 10.0

Related topics