CuSolver error status 13 for cusolverDnSSgels call

I get the error CUSOLVER_STATUS_IRS_PARAMS_INVALID (Error Status: 13) when I try to compute the buffer size using the cusolverDnSSgels_bufferSize API.
The input matrix A is 5x3, matrix B is 5x1 and matrix X is 3x1. All matrixes are in row-major format.

CUDA Version: 12.8

Any help in identifying and solving this error? Thanks in advance.

void generateValidationOutput(float* matA_d, float* matB_d, float* matX_d, int M, int N, cudaStream_t stream) {

    int inputRowsA = M;
    int inputColsA = N;
    int inputRowsX = N;
    int outputRowsB = M;

    cusolverDnHandle_t handle;
    cusolverDnCreate(&handle);
    cusolverDnSetStream(handle, stream);

    cusolverDnIRSParams_t params;
    CUSOLVER_CHECK(cusolverDnIRSParamsCreate(&params));
    CUSOLVER_CHECK(cusolverDnIRSParamsSetMaxIters(params, INT_MAX));
    CUSOLVER_CHECK(cusolverDnIRSParamsSetMaxItersInner(params, INT_MAX - 10));

    int nIters = 0;
    int* dInfo_d = nullptr;
    size_t workspaceSize = 0;
    void* workspacePtr = nullptr;
    CUSOLVER_CHECK(cusolverDnSSgels_bufferSize(handle, inputRowsA, inputColsA, 1, matA_d, N, matB_d, 1, matX_d, 1, NULL, &workspaceSize));
    CUDA_CHECK(cudaMallocAsync(&workspacePtr, workspaceSize, stream));
    CUDA_CHECK(cudaMallocAsync((void**)&dInfo_d, sizeof(int), stream));
    CUSOLVER_CHECK(cusolverDnSSgels(handle, M, N, 1, matA_d, N, matB_d, 1, matX_d, 1, workspacePtr, workspaceSize, &nIters, dInfo_d));

    int dInfo_h = 0;
    CUDA_CHECK(cudaMemcpyAsync(&dInfo_h, dInfo_d, sizeof(int), cudaMemcpyDeviceToHost, stream));

    if (nIters > 0)
        printf("Success coverging cuSolver: %d\n", nIters);
    else
        printf("Failure converging cuSolver\n");

    if (dInfo_h == 0)
        printf("Success using cuSolver\n");
    else
        printf("Failure using cuSolver at %d\n", dInfo_h);
}

void main() {

    
    // Memory allocations
    float* matA_h = (float*)malloc(5 * 3 * sizeof(float));
    float* matX_h = (float*)malloc(3 * sizeof(float));
    float* matB_h = (float*)malloc(5 * sizeof(float));

    // GPU Allocations
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    float* matA_d = nullptr;
    float* matX_d = nullptr;
    float* matB_d = nullptr;
    CUDA_CHECK(cudaMallocAsync((void**)&matA_d, 5 * 3 * sizeof(float), stream));
    CUDA_CHECK(cudaMallocAsync((void**)&matX_d, 3 * sizeof(float), stream));
    CUDA_CHECK(cudaMallocAsync((void**)&matB_d, 5 * sizeof(float), stream));

    int matARows = 5;
    int matACols = 3;
    int matXRows = 3;
    int matBRows = 5;

    // Generate input and validation outputs
    for (int i = 0;i < matARows * matACols;i++) {
        matA_h[i] = ((rand() % 100) - 50.f) / (float)2.f;
    }

    for (int i = 0;i < matBRows;i++) {
        matB_h[i] = ((rand() % 100) - 50.f) / (float)2.f;
    }

    // Memcpy H->D
    CUDA_CHECK(cudaMemcpyAsync(matA_d, matA_h, matARows * matACols * sizeof(float), cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(matB_d, matB_h, matBRows * sizeof(float), cudaMemcpyHostToDevice, stream));

    generateValidationOutput(matA_d, matB_d, matX_d, matARows, matACols, stream);

    CUDA_CHECK(cudaMemcpyAsync(matX_h, matX_d, matXRows * sizeof(float), cudaMemcpyDeviceToHost, stream));

    for (int i = 0;i < matXRows;i++) {
        printf("%f, ", matX_h[i]);
    }
    printf("\n\n");


    free(matA_h);
    free(matB_h);
    free(matX_h);

    CUDA_CHECK(cudaFreeAsync(matA_d, stream));
    CUDA_CHECK(cudaFreeAsync(matB_d, stream));
    CUDA_CHECK(cudaFreeAsync(matX_d, stream));
}