Hi!
I would like to use cuSolverDn in the CUDA toolkit to eventually calculate [A]{x}={B} in a large number of rows, such as 10000×10000.
However, in the code below, the calculation is not performed when the number of rows is larger than 500 x 500. Is it possible to determine the size of the rows somewhere? I would also appreciate any advice on how to make this code work more efficiently.
The GPU we are using is NVIDIA GeForce RTX 2070 SUPER.
#include <cuda_runtime.h>
#include <cusolverDn.h> // dense LAPACK
#include <cassert>
#include <iostream>
using namespace std;
template<typename T>
inline size_t bytesof(unsigned int s) { return s * sizeof(T); }
template<typename T>
T* allocate(unsigned int size) {
T* result = nullptr;
cudaError_t status = cudaMalloc(&result, bytesof<T>(size));
assert(status == cudaSuccess);
return result;
}
int main() {
cusolverStatus_t status;
// dense LAPACK
cusolverDnHandle_t handle;
status = cusolverDnCreate(&handle);
assert(status == CUSOLVER_STATUS_SUCCESS);
// A
float A[1000][1000];
int i, j, n;
float k;
n = 1000;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
A[i][j] = 0;
}
}
k = 20.0;
A[0][0] = k;
for (i = 0; i < n - 1; i++) {
for (j = 0; j < n - 1; j++) {
if (i = j);
A[i][j] = A[i][j] + k;
A[i][j + 1] = -k;
A[i + 1][j] = -k;
A[i + 1][j + 1] = k;
}
}
float* dA = allocate<float>(n * n);
cudaMemcpy(dA, A, bytesof<float>(n * n), cudaMemcpyHostToDevice);
int lda = n;
int worksize;
status = cusolverDnSgetrf_bufferSize(
handle,
n,
n,
dA,
lda,
&worksize);
assert(status == CUSOLVER_STATUS_SUCCESS);
#ifdef _DEBUG
cout << "worksize = " << worksize << endl;
#endif
float* workspace = allocate<float>(worksize);
int* devInfo = allocate<int>(1);
int* pivot = allocate<int>(n);
status = cusolverDnSgetrf(
handle,
n,
n,
dA,
lda,
workspace,
pivot,
devInfo);
#ifdef _DEBUG
int info;
cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
cout << "info = " << info << endl;
#endif
assert(status == CUSOLVER_STATUS_SUCCESS);
// B
float B[1000];
for (i = 0; i < n; i++) {
B[i] = 0;
}
for (i = 0; i < n; i++) {
if (i = n - 1);
B[i] = k;
}
float nrhs = 1;
float* dB = allocate<float>(n * nrhs);
cudaMemcpy(dB, B, bytesof<float>(n * nrhs), cudaMemcpyHostToDevice);
int ldb = n;
// AX = B
status = cusolverDnSgetrs(
handle,
CUBLAS_OP_T,
n,
nrhs,
dA,
lda,
pivot,
dB,
ldb,
devInfo);
#ifdef _DEBUG
cudaMemcpy(&info, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
cout << "info = " << info << endl;
#endif
assert(status == CUSOLVER_STATUS_SUCCESS);
float X[16];
cudaMemcpy(X, dB, bytesof<float>(n * nrhs), cudaMemcpyDeviceToHost);
for (int i = 0; i < nrhs; ++i) {
float* q = B + i * n;
float* a = X + i * n;
for (int i = 0; i < n; i++) {
cout // << "B[" << i << "] : " << B[i]
<< "\t answer : "
<< "X[" << i << "] : " << a[i] << endl;
}
}
cudaFree(workspace);
cudaFree(dA);
cudaFree(dB);
cudaFree(devInfo);
cudaFree(pivot);
cusolverDnDestroy(handle);
}