CUDA cusolverDnSSgels error when number-of-right-handside is large (crosspost)

The original question is here: c++ - CUDA cusolverDnSSgels error when specific shapes are used (program to find rectangular Penrose matrix inverse) - Stack Overflow

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include <armadillo>

#include <iostream>
#include <string>
#include <exception>
#include <stdint.h>
#include <type_traits>
#include <limits.h>
#include <assert.h>
#include <chrono>
using namespace std;

inline void gpuAssert(cudaError_t code, const char *file, int line, bool printing = false)
{   
  if (code != cudaSuccess)
  {
    std::string mess = std::string("GPUassert: ") + std::string(cudaGetErrorString(code)) 
                  + " " + std::string(file) + " " + std::to_string(line);
    if (printing) std::cout << mess << std::endl;
    throw std::runtime_error(mess.c_str());
  }

  auto lastError = cudaGetLastError();
  if (lastError != cudaSuccess)
  {
    std::string mess = std::string("GPUassert: ") + std::string(cudaGetErrorString(lastError)) 
                  + " " + std::string(file) + " " + std::to_string(line);
    std::cout << "UNDETECTED_ERROR " << mess << std::endl;
    throw std::runtime_error(mess.c_str());
  }
}

// CUDA API error checking
#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__, true); }
#define CUDA_CHECK_NOLOG(ans) { gpuAssert((ans), __FILE__, __LINE__); }

static const char *cusolverGetErrorEnumInternal(cusolverStatus_t error)
{
    switch (error)
    {
    case CUSOLVER_STATUS_SUCCESS:
      return "CUSOLVER_SUCCESS";

    case CUSOLVER_STATUS_NOT_INITIALIZED:
      return "CUSOLVER_STATUS_NOT_INITIALIZED";

    case CUSOLVER_STATUS_ALLOC_FAILED:
      return "CUSOLVER_STATUS_ALLOC_FAILED";

    case CUSOLVER_STATUS_INVALID_VALUE:
      return "CUSOLVER_STATUS_INVALID_VALUE";

    case CUSOLVER_STATUS_ARCH_MISMATCH:
      return "CUSOLVER_STATUS_ARCH_MISMATCH";

    case CUSOLVER_STATUS_EXECUTION_FAILED:
      return "CUSOLVER_STATUS_EXECUTION_FAILED";

    case CUSOLVER_STATUS_INTERNAL_ERROR:
      return "CUSOLVER_STATUS_INTERNAL_ERROR";

    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    case CUSOLVER_STATUS_MAPPING_ERROR:
      return "CUSOLVER_STATUS_MAPPING_ERROR";
    
    case CUSOLVER_STATUS_NOT_SUPPORTED:
      return "CUSOLVER_STATUS_NOT_SUPPORTED";
    
    case CUSOLVER_STATUS_ZERO_PIVOT:
      return "CUSOLVER_STATUS_ZERO_PIVOT";
    
    case CUSOLVER_STATUS_INVALID_LICENSE:
      return "CUSOLVER_STATUS_INVALID_LICENSE";
    
    case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
      return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
    
    case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
      return "CUSOLVER_STATUS_IRS_PARAMS_INVALID";
    
    case CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC:
      return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC";
    
    case CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE:
      return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE";
    
    case CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER:
      return "CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER";
    
    case CUSOLVER_STATUS_IRS_INTERNAL_ERROR:
      return "CUSOLVER_STATUS_IRS_INTERNAL_ERROR";
    
    case CUSOLVER_STATUS_IRS_NOT_SUPPORTED:
      return "CUSOLVER_STATUS_IRS_NOT_SUPPORTED";
    
    case CUSOLVER_STATUS_IRS_OUT_OF_RANGE:
      return "CUSOLVER_STATUS_IRS_OUT_OF_RANGE";
    
    case CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES:
      return "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES";
    
    case CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED:
      return "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED";
    
    case CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED:
      return "CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED";
    
    case CUSOLVER_STATUS_IRS_MATRIX_SINGULAR:
      return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
    
    case CUSOLVER_STATUS_INVALID_WORKSPACE:
      return "CUSOLVER_STATUS_INVALID_WORKSPACE";      
    }

    return "CUSOLVER_UNKNOWN_ERROR";
}

inline void cusolveSafeCallInternal(cusolverStatus_t err, const char *file, const int line)
{
  if (CUSOLVER_STATUS_SUCCESS != err) {
    std::string mess = std::string("CUSOLVER error in file ") + std::string(file) + ", line " + std::to_string(line)
                        + ", error: " + std::string(cusolverGetErrorEnumInternal(err));
    std::cout << mess << std::endl;
    throw std::runtime_error(mess);     
    }
}

#define cusolveSafeCall(err) { cusolveSafeCallInternal((err), __FILE__, __LINE__); }

class MyTimer {
    std::chrono::time_point<std::chrono::system_clock> start;

public:
    void startCounter() {
        start = std::chrono::system_clock::now();
    }

    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
    }

    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
    }

    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
                / 1000000.0;
    }
};

double eps_ = 1e-6;

template <typename num_t = double>
num_t rcmp(double a, double b, double eps = eps_) {
  if (std::isnan(a) && std::isnan(b)) return 0;
  if (std::isnan(a + b)) return NAN;
  num_t t = (a - b) / std::max((num_t)1, std::max(std::abs(a), std::abs(b)));
  return t < -eps ? -1 : +eps < t;
}

//----------------------------
//----------------------------
//----------------------------

template <typename T>
void test(int64_t M, int64_t N, int64_t nrhs)
{   
    int error_code = 0;

    arma::Mat<T> A(M, N, arma::fill::randn);
    arma::Mat<T> X(N, nrhs, arma::fill::randn);
    arma::Mat<T> B = A * X;

    X = solve(A, B);

    //------------------------
    cusolverDnHandle_t cusolverH;
    cusolveSafeCall(cusolverDnCreate(&cusolverH));

    int64_t ldda = M;
    int64_t lddb = M;
    int64_t lddx = N;

    T *d_A, *d_B, *d_X;
    CUDA_CHECK(cudaMallocManaged(&d_A, M * N * sizeof(T)));
    CUDA_CHECK(cudaMallocManaged(&d_X, N * nrhs * sizeof(T)));
    CUDA_CHECK(cudaMallocManaged(&d_B, M * nrhs * sizeof(T)));
    CUDA_CHECK(cudaMemcpy(d_A, A.memptr(), M * N * sizeof(T), cudaMemcpyDefault));
    CUDA_CHECK(cudaMemcpy(d_B, B.memptr(), M * nrhs * sizeof(T), cudaMemcpyDefault));
    CUDA_CHECK(cudaDeviceSynchronize());

    size_t lwork_bytes = 0;
    if constexpr(std::is_same_v<T, float>) {
        cusolveSafeCall(cusolverDnSSgels_bufferSize(
            cusolverH,
            M, N, nrhs,
            d_A, ldda,
            d_B, lddb,
            d_X, lddx,
            NULL, &lwork_bytes
        ));
    }
    if constexpr(std::is_same_v<T, double>) {
        cusolveSafeCall(cusolverDnDDgels_bufferSize(
            cusolverH,
            M, N, nrhs,
            d_A, ldda,
            d_B, lddb,
            d_X, lddx,
            NULL, &lwork_bytes
        ));
    }

    void* d_work;
    int* d_info;
    CUDA_CHECK(cudaMallocManaged(&d_work, sizeof(T) * lwork_bytes));
    CUDA_CHECK(cudaMallocManaged(&d_info, sizeof(int)));
    CUDA_CHECK(cudaDeviceSynchronize());

    int iter = 1e9;
    int info = 1e9;
    try {
        if constexpr(std::is_same_v<T, float>) {
            cusolveSafeCall(cusolverDnSSgels(
                cusolverH,
                M, N, nrhs,
                d_A, ldda,
                d_B, lddb,
                d_X, lddx,
                d_work, lwork_bytes,
                &iter, d_info
            ));
        }
        if constexpr(std::is_same_v<T, double>) {
            cusolveSafeCall(cusolverDnDDgels(
                cusolverH,
                M, N, nrhs,
                d_A, ldda,
                d_B, lddb,
                d_X, lddx,
                d_work, lwork_bytes,
                &iter, d_info
            ));
        }
    } catch (const std::exception &e) {
        error_code = 1;
        cout << e.what() << "\n";
    }
    cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDefault);
    CUDA_CHECK(cudaDeviceSynchronize());

    cout << "niters = " << iter << ", info = " << info << "\n";

    arma::Mat<T> X2(N, nrhs);
    memcpy(X2.memptr(), d_X, N * nrhs * sizeof(T));
    arma::Mat<T> B2 = A * X2;

    int wrong = 0;
    for (int i = 0; i < M; i++)
    for (int j = 0; j < nrhs; j++) {
        auto res1 = B(i, j);
        auto res2 = B2(i, j);//d_X[i + j * M];
        if (rcmp(res1, res2, 1e-2)) {
            cout << " wrong at " << i << " " << j << " " << res1 << " " << res2 << std::endl;
            wrong++;
            if (wrong >= 10) goto CLEANUP;
        }
    }

CLEANUP:
    cudaFree(d_A);
    cudaFree(d_X);
    cudaFree(d_B);
    cudaFree(d_work);
    cudaFree(d_info);
}

int main()
{
    // (2000, 1000, 2000) -> error
    // (2000, 1000, 1200) -> error
    // (2000, 1000, 500) -> correct
    // (2000, 500, 2000) -> error
    // (2000, 500, 1000) -> error
    // (2000, 500, 500) -> correct
    // (2000, 50, 100) -> correct
    int ntest = 5;
    for (int t = 1; t <= ntest; t++) {
        test<float>(2000, 1000, 2000);
        cout << "Finish test " << t << "\n";
    }
}

To compile: nvcc -o main main.cu -O3 -std=c++17 -lcublas -lcusolver -larmadillo

I tested on on CUDA 12.1, 2080ti, gcc 10.3

The problem with this program is that it randomly gives CUSOLVER_STATUS_INVALID_VALUE error when nrhs is big enough (somewhere > 300). For small values, it seems to work okay.

Checking Nvidia doc (cuSOLVER) entry below cusolverDnSSgels, and find CUSOLVER_STATUS_INVALID_VALUE, it mentions invalid parameter. But all 4 parameters in the example they use are correct in this program. Also it outputs niters = info = 0, which isn’t a possible situation in the doc.

The code includes a few param combination that are okay/not okay. What might be the problem here?
Thanks!