How can I set the number of rows in a calculation using cuSolver?

Hi!

I would like to use cuSolverDn in the CUDA toolkit to eventually calculate [A]{x}={B} in a large number of rows, such as 10000×10000.
However, in the code below, the calculation is not performed when the number of rows is larger than 500 x 500. Is it possible to determine the size of the rows somewhere? I would also appreciate any advice on how to make this code work more efficiently.
The GPU we are using is NVIDIA GeForce RTX 2070 SUPER.

#include <cuda_runtime.h>
#include <cusolverDn.h> // dense LAPACK

#include <cassert>
#include <iostream>
using namespace std;

template<typename T>
inline size_t bytesof(unsigned int s) { return s * sizeof(T); }

template<typename T>
T* allocate(unsigned int size) {
    T* result = nullptr;
    cudaError_t status = cudaMalloc(&result, bytesof<T>(size)); 
    assert(status == cudaSuccess); 
    return result;
}

int main() {
    cusolverStatus_t status;

    // dense LAPACK
    cusolverDnHandle_t handle;
    status = cusolverDnCreate(&handle);
    assert(status == CUSOLVER_STATUS_SUCCESS);
    // A
    float A[1000][1000];
    int   i, j, n;
    float       k;

    n = 1000;

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            A[i][j] = 0;
        }
    }

    k = 20.0;
    A[0][0] = k;

    for (i = 0; i < n - 1; i++) {
        for (j = 0; j < n - 1; j++) {
            if (i = j);
            A[i][j] = A[i][j] + k;
            A[i][j + 1] = -k;
            A[i + 1][j] = -k;
            A[i + 1][j + 1] = k;
        }
    }

    float* dA = allocate<float>(n * n);
    cudaMemcpy(dA, A, bytesof<float>(n * n), cudaMemcpyHostToDevice); 
    int lda = n;

    int worksize;
    status = cusolverDnSgetrf_bufferSize(
        handle,
        n,   
        n,   
        dA,  
        lda, 
        &worksize);
    assert(status == CUSOLVER_STATUS_SUCCESS);
#ifdef _DEBUG
    cout << "worksize = " << worksize << endl;
#endif

    float* workspace = allocate<float>(worksize);

    int* devInfo = allocate<int>(1);

    int* pivot = allocate<int>(n);

    status = cusolverDnSgetrf(
        handle,
        n,  
        n,  
        dA, 
        lda,
        workspace,
        pivot,
        devInfo);
#ifdef _DEBUG
    int info;
    cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
    cout << "info = " << info << endl;
#endif
    assert(status == CUSOLVER_STATUS_SUCCESS);

    // B
    float B[1000];

    for (i = 0; i < n; i++) {
        B[i] = 0;
    }
    for (i = 0; i < n; i++) {
        if (i = n - 1);
        B[i] = k;
    }

    float nrhs = 1;
    float* dB = allocate<float>(n * nrhs);
    cudaMemcpy(dB, B, bytesof<float>(n * nrhs), cudaMemcpyHostToDevice);
    int ldb = n;

    // AX = B 
    status = cusolverDnSgetrs(
        handle,
        CUBLAS_OP_T,
        n,    
        nrhs, 
        dA,   
        lda,  
        pivot,
        dB,   
        ldb,  
        devInfo);
#ifdef _DEBUG
    cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
    cout << "info = " << info << endl;
#endif
    assert(status == CUSOLVER_STATUS_SUCCESS);


    float X[16];
    cudaMemcpy(X, dB, bytesof<float>(n * nrhs), cudaMemcpyDeviceToHost);
    for (int i = 0; i < nrhs; ++i) {

        float* q = B + i * n;
        float* a = X + i * n;
        for (int i = 0; i < n; i++) {
            cout // << "B[" << i << "] : " << B[i]
                << "\t answer : "
                << "X[" << i << "] : " << a[i] << endl;

        }
    }
    cudaFree(workspace);
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(devInfo);
    cudaFree(pivot);

    cusolverDnDestroy(handle);
}

This is a really bad idea:

It creates a large stack based variable. As you make it larger, things will eventually not work.

Thanks for the reply!
What can I do to make it work with larger values?

learn about how to do a dynamic allocation with C or C++

this has nothing to do with CUDA