# How can I set the number of rows in a calculation using cuSolver?

Hi!

I would like to use cuSolverDn in the CUDA toolkit to eventually calculate [A]{x}={B} in a large number of rows, such as 10000×10000.
However, in the code below, the calculation is not performed when the number of rows is larger than 500 x 500. Is it possible to determine the size of the rows somewhere? I would also appreciate any advice on how to make this code work more efficiently.
The GPU we are using is NVIDIA GeForce RTX 2070 SUPER.

``````#include <cuda_runtime.h>
#include <cusolverDn.h> // dense LAPACK

#include <cassert>
#include <iostream>
using namespace std;

template<typename T>
inline size_t bytesof(unsigned int s) { return s * sizeof(T); }

template<typename T>
T* allocate(unsigned int size) {
T* result = nullptr;
cudaError_t status = cudaMalloc(&result, bytesof<T>(size));
assert(status == cudaSuccess);
return result;
}

int main() {
cusolverStatus_t status;

// dense LAPACK
cusolverDnHandle_t handle;
status = cusolverDnCreate(&handle);
assert(status == CUSOLVER_STATUS_SUCCESS);
// A
float A[1000][1000];
int   i, j, n;
float       k;

n = 1000;

for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
A[i][j] = 0;
}
}

k = 20.0;
A[0][0] = k;

for (i = 0; i < n - 1; i++) {
for (j = 0; j < n - 1; j++) {
if (i = j);
A[i][j] = A[i][j] + k;
A[i][j + 1] = -k;
A[i + 1][j] = -k;
A[i + 1][j + 1] = k;
}
}

float* dA = allocate<float>(n * n);
cudaMemcpy(dA, A, bytesof<float>(n * n), cudaMemcpyHostToDevice);
int lda = n;

int worksize;
status = cusolverDnSgetrf_bufferSize(
handle,
n,
n,
dA,
lda,
&worksize);
assert(status == CUSOLVER_STATUS_SUCCESS);
#ifdef _DEBUG
cout << "worksize = " << worksize << endl;
#endif

float* workspace = allocate<float>(worksize);

int* devInfo = allocate<int>(1);

int* pivot = allocate<int>(n);

status = cusolverDnSgetrf(
handle,
n,
n,
dA,
lda,
workspace,
pivot,
devInfo);
#ifdef _DEBUG
int info;
cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
cout << "info = " << info << endl;
#endif
assert(status == CUSOLVER_STATUS_SUCCESS);

// B
float B[1000];

for (i = 0; i < n; i++) {
B[i] = 0;
}
for (i = 0; i < n; i++) {
if (i = n - 1);
B[i] = k;
}

float nrhs = 1;
float* dB = allocate<float>(n * nrhs);
cudaMemcpy(dB, B, bytesof<float>(n * nrhs), cudaMemcpyHostToDevice);
int ldb = n;

// AX = B
status = cusolverDnSgetrs(
handle,
CUBLAS_OP_T,
n,
nrhs,
dA,
lda,
pivot,
dB,
ldb,
devInfo);
#ifdef _DEBUG
cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
cout << "info = " << info << endl;
#endif
assert(status == CUSOLVER_STATUS_SUCCESS);

float X[16];
cudaMemcpy(X, dB, bytesof<float>(n * nrhs), cudaMemcpyDeviceToHost);
for (int i = 0; i < nrhs; ++i) {

float* q = B + i * n;
float* a = X + i * n;
for (int i = 0; i < n; i++) {
cout // << "B[" << i << "] : " << B[i]
<< "X[" << i << "] : " << a[i] << endl;

}
}
cudaFree(workspace);
cudaFree(dA);
cudaFree(dB);
cudaFree(devInfo);
cudaFree(pivot);

cusolverDnDestroy(handle);
}
``````

This is a really bad idea:

It creates a large stack based variable. As you make it larger, things will eventually not work.