Error parameter number 10 cublasSgemmBatched

Hello,

I try to made a multiplication matrix batch on A(m,k) et B(k,n) where m = 5 k = 5 and n = 1.

I have this code:

#include <stdio.h>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
//#include <conio.h>

#define CUDA_CALL(res, str) { if (res != cudaSuccess) { printf("CUDA Error : %s : %s %d : ERR %s\n", str, __FILE__, __LINE__, cudaGetErrorName(res)); } }
#define CUBLAS_CALL(res, str) { if (res != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS Error : %s : %s %d : ERR %d\n", str, __FILE__, __LINE__, int(res)); } }

static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;

void d_CUDATimerStart(void)
{
    CUDA_CALL(cudaEventCreate(&cu_TimerStart), "Failed to create start event!");
    CUDA_CALL(cudaEventCreate(&cu_TimerStop), "Failed to create stop event!");

    CUDA_CALL(cudaEventRecord(cu_TimerStart), "Failed to record start event!");
}

float d_CUDATimerStop(void)
{
    CUDA_CALL(cudaEventRecord(cu_TimerStop), "Failed to record stop event!");

    CUDA_CALL(cudaEventSynchronize(cu_TimerStop), "Failed to synch stop event!");

    float ms;

    CUDA_CALL(cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop), "Failed to elapse events!");

    CUDA_CALL(cudaEventDestroy(cu_TimerStart), "Failed to destroy start event!");
    CUDA_CALL(cudaEventDestroy(cu_TimerStop), "Failed to destroy stop event!");

    return ms;
}

void d_GetInv(float* A, float*B, float *Ainv, float *X, int n)
{
    cublasHandle_t cu_cublasHandle;
    CUBLAS_CALL(cublasCreate(&cu_cublasHandle), "Failed to initialize cuBLAS!");

    float** adA;
    float** adB;
    float** adAInv;
    float** adX;

    float* dA;
    float* dAInv;
    float* dX;

    int* dAUPivots;
    int* dAUInfo;

    float *dB;
    float *alpha;
    float *beta;

    float *dalpha;
    float *dbeta;


    size_t szA = n * n * sizeof(float);
    size_t szB = n  * sizeof(float);

    CUDA_CALL(cudaMalloc(&adA, sizeof(float*)), "Failed to allocate adA!");
    CUDA_CALL(cudaMalloc(&adAInv, sizeof(float*)), "Failed to allocate adA!");
    CUDA_CALL(cudaMalloc(&adX,10* sizeof(float*)), "Failed to allocate adC!");
    CUDA_CALL(cudaMalloc(&dA, szA), "Failed to allocate dA!");
    CUDA_CALL(cudaMalloc(&dAInv, szA), "Failed to allocate dAinv!");
    CUDA_CALL(cudaMalloc(&dX, szB*50), "Failed to allocate dX!");
    CUDA_CALL(cudaMalloc(&dAUPivots, n * sizeof(int)), "Failed to allocate dAUPivots!");
    CUDA_CALL(cudaMalloc(&dAUInfo, sizeof(int)), "Failed to allocate dAUInfo!");

    CUDA_CALL(cudaMemcpy(dA, A, szA, cudaMemcpyHostToDevice), "Failed to copy to dA!");
    CUDA_CALL(cudaMemcpy(&adA[0], &(dA), sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adA!");
    CUDA_CALL(cudaMemcpy(&adAInv[0], &(dAInv), sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adAInv!");
    CUDA_CALL(cudaMemcpy(&adX[0], &(dX), 1*sizeof(float), cudaMemcpyHostToDevice), "Failed to copy to adX!");


    CUDA_CALL(cudaMalloc(&adB, sizeof(float*)), "Failed to allocate adB!");
    CUDA_CALL(cudaMalloc(&dB, szB), "Failed to allocate dB!");
    CUDA_CALL(cudaMemcpy(dB, B, szB, cudaMemcpyHostToDevice), "Failed to copy to dA!");
    CUDA_CALL(cudaMemcpy(&adB[0], &(dB), sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adB!");


    alpha   =(float*)malloc(10*sizeof(float));
    beta    =(float*)malloc(10*sizeof(float));
    alpha[0]= 1;
    beta[0] = 0;
    CUDA_CALL(cudaMalloc(&dalpha, 1*sizeof(float)), "Failed to allocate dalpha!");
    CUDA_CALL(cudaMalloc(&dbeta, 1*sizeof(float)), "Failed to allocate dbeta!");

    CUDA_CALL(cudaMemcpy(dalpha, alpha, sizeof(float), cudaMemcpyHostToDevice), "Failed to copy to dalpha!");
    CUDA_CALL(cudaMemcpy(dbeta, beta, sizeof(float), cudaMemcpyHostToDevice), "Failed to copy to dbeta!");



    d_CUDATimerStart();

    CUBLAS_CALL(cublasSgetrfBatched(cu_cublasHandle, n, adA, n, dAUPivots, dAUInfo, 1), "Failed to perform LU decomp operation!");
    CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
    CUBLAS_CALL(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adA, n, dAUPivots, adAInv, n, dAUInfo, 1), "Failed to perform Inverse operation!");
    CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");

    CUBLAS_CALL(cublasSgemmBatched(cu_cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N,n,1,n,dalpha,(const float **)adA,n,(const float **)adB,1,dbeta,(adX),n,1),"Failed to multiply matrix\n");
    CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");


    float timed = d_CUDATimerStop();

    Ainv = (float*)malloc(szA);

    CUDA_CALL(cudaMemcpy(Ainv, dAInv, szA, cudaMemcpyDeviceToHost), "Failed to copy to res!");


    printf("Ainv: \n");
    for(int j = 0; j < n; j++)
    {
        for(int i = 0; i < n ; i++)
        {

            printf(" %f",Ainv[i+j*n]);
        }
        printf(" \n");
    }


    printf("cublas inverse in: %.5f ms.\n", timed);

    CUDA_CALL(cudaFree(adA), "Failed to free adA!");
    CUDA_CALL(cudaFree(adAInv), "Failed to free adC!");
    CUDA_CALL(cudaFree(dA), "Failed to free dA!");
    CUDA_CALL(cudaFree(dAInv), "Failed to free dC!");
    CUDA_CALL(cudaFree(dAUPivots), "Failed to free dAUPivots!");
    CUDA_CALL(cudaFree(dAUInfo), "Failed to free dAUInfo!");

    CUBLAS_CALL(cublasDestroy(cu_cublasHandle), "Failed to destroy cuBLAS!");
}

The code return me that error:

** On entry to SGEMM parameter number 10 had an illegal value

I don’t understand the problem seems to be in the adB matrix but it is constructed the same has the matrix A.

I don’t know if it is a problem the size of the matrix or the contruction of the matrix.

Can someone help me about this?

Best regard.

If you want to provide a complete code including a main function, I’ll take a look.

I believe parameter 10 refers to the ldb parameter. Your choice of ldb (1) doesn’t seem to fit the rule given in the cublas documentation:

http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gemmbatched

"Barray: device input array of pointers to array, with each array of dim. ldb x n with ldb>=max(1,k) if transa==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) max(1,) otherwise.

ldb: input leading dimension of two-dimensional array used to store each matrix B[i]."

Since your transa parameter is CUBLAS_OP_N, then ldb is supposed to be:

ldb>=max(1,k)

Your value of k is n.

However you are passing 1 for ldb:

CUBLAS_CALL(cublasSgemmBatched(cu_cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N,n,1,n,dalpha,(const float **)adA,n,(const float **)adB,1,dbeta,(adX),n,1),"Failed to multiply matrix\n");

Thank you for the response. So here is the min function:

#include <stdio.h>
#include <stdlib.h>
#include <string>


extern void d_GetInv(float* A,float*B, float *Ainv,float *X,int n);
//#include "cuda_interface.cu"

int main()
{
    int n = 5;
    float* L = (float*)malloc(n * n * sizeof(float));
    float* B = (float*)malloc(n * sizeof(float));

    printf("A: \n");
    for(int j = 0; j <  n; j++)
    {
        for(int i = 0; i < n ; i++)
        {
                L[i+j*n] = i*i+1;;
        }
    }

    float *Ainv,*X;
    d_GetInv(L,B,Ainv,X,n);
    return 0;
}

I found an other code to batched-multiply matrix here is the part I don’t understand:

float **h_AA, **h_BB, **h_CC;
    h_AA = (float**)malloc(6* sizeof(float*));
    h_BB = (float**)malloc(6 * sizeof(float*));
    h_CC = (float**)malloc(6 * sizeof(float*));
    printf("la1.0\n");
    for (int i = 0; i < 6; i++){
        cudaMalloc((void **)&h_AA[i], 5*5* sizeof(float));
        cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
        cudaMalloc((void **)&h_CC[i], 5*sizeof(float));

    }

    float **d_AA, **d_BB, **d_CC;
    cudaMalloc(&d_AA, 6 * sizeof(float*));
    cudaMalloc(&d_BB, 6 * sizeof(float*));
    cudaMalloc(&d_CC, 6 * sizeof(float*));
    cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);

Why can’t we do that directly?

float **d_AA, **d_BB, **d_CC;
    cudaMalloc(&d_AA, 6 * sizeof(float*));
    cudaMalloc(&d_BB, 6 * sizeof(float*));
    cudaMalloc(&d_CC, 6 * sizeof(float*));

    for (int i = 0; i < 6; i++){
        cudaMalloc((void **)&d_AA[i], 5*5* sizeof(float)); HERE AN ERROR
        cudaMalloc((void **)&d_BB[i], 5 * sizeof(float));
        cudaMalloc((void **)&d_CC[i], 5*sizeof(float));
    }

I get a error with my version, no error explaination, just crash. In fact my goal is to build a float **A and B matrix in the device, to fill it into a kernel and process it. I don’t understand the why I should use a h_AA host variable to build a float ** variable into the device?

cudaMalloc requires that the pointer address you pass to it be pointing to a location in host memory. It is this location where the cudaMalloc routine will store the pointer to device memory that corresponds to the allocation you requested:

float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
           ^
           %d_AA is a location in host memory

You cannot cudaMalloc when the pointer address you pass to it is a location in device memory:

cudaMalloc((void **)&d_AA[i], 5*5* sizeof(float)); HERE AN ERROR
                    ^
                    &d_AA[i] is a location in device memory, due to previous cudaMalloc call on d_AA

Doing so will result in a seg fault, when the cudaMalloc code attempts to store the pointer to the requested allocation (from host code) in a location that is not accessible from host code (i.e. on the device).

For these types of situations, you need to do a “deep copy” which is what the code that you found is doing.

Ok I will try it.

On last information about this, I will create the device architecture in the host program:

float **h_AA, **h_BB, **h_CC;
        h_AA = (float**)malloc(6* sizeof(float*));
        h_BB = (float**)malloc(6 * sizeof(float*));
        h_CC = (float**)malloc(6 * sizeof(float*));
        printf("la1.0\n");
        for (int i = 0; i < 6; i++){
            cudaMalloc((void **)&h_AA[i], 5*5* sizeof(float));
            cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
            cudaMalloc((void **)&h_CC[i], 5*sizeof(float));

        }

        float **d_AA, **d_BB, **d_CC;
        cudaMalloc(&d_AA, 6 * sizeof(float*));
        cudaMalloc(&d_BB, 6 * sizeof(float*));
        cudaMalloc(&d_CC, 6 * sizeof(float*));
        cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
        cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
        cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);

I would like to use a kernel to fill each matrix and launch the process in the GPU, my kernel will be like this:

__global void mykernel(float **d_AA,float **d_BB,float **d_CC)

Does this line provide me trouble?