I need to solve ~10M eigendecompositions of 11x11 Hermitian matrices. Starting from cusolver_syevjBatched_example, I wrote my own eigensolver based on cusolverDnCheevjBatched. I built it as a .so and it runs successfully when I call it from a standalone executable on Linux. But when I call it from the project where I actually want to use it, I get cusolver error 1 (CUSOLVER_STATUS_NOT_INITIALIZED) at cusolverDnCheevjBatched_bufferSize. Any advice on what could be causing this?
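For reference, CUDA_CHECK and CUSOLVER_CHECK are the usual exit-on-error wrappers; this is a simplified sketch (my real macros also print the failing call):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Abort with a diagnostic if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t status_ = (call);                                         \
        if (status_ != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error %d at %s:%d\n",                  \
                         static_cast<int>(status_), __FILE__, __LINE__);      \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Abort with a diagnostic if a cuSOLVER call does not return CUSOLVER_STATUS_SUCCESS.
#define CUSOLVER_CHECK(call)                                                  \
    do {                                                                      \
        cusolverStatus_t status_ = (call);                                    \
        if (status_ != CUSOLVER_STATUS_SUCCESS) {                             \
            std::fprintf(stderr, "cusolver error %d at %s:%d\n",              \
                         static_cast<int>(status_), __FILE__, __LINE__);      \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)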
My code in .cuh:
class CudaProcessor
{
public:
CudaProcessor(std::vector<int>& dataDim);
~CudaProcessor();
void InitWorkSpace(std::vector<int>& dataDim);
void ComputeBatchedEigenDecomposition(cuComplex* data, std::vector<cuComplex>& evec,
std::vector<float>& eval, int numVal);
// cuda objects
cusolverDnHandle_t m_handle;
cudaStream_t m_stream;
syevjInfo_t m_params;
const cusolverEigMode_t m_eigMode;
const cublasFillMode_t m_matFillMode;
const float m_tol;
const int max_sweeps;
// device objects
cuComplex* d_work;
float* d_eval;
int* d_info;
int m_height;
int m_width;
int m_batchSize;
int m_sizeWork;
};
My code in .cu:
CudaProcessor::CudaProcessor(std::vector<int>& dataDim)
: m_eigMode(CUSOLVER_EIG_MODE_VECTOR),
m_matFillMode(CUBLAS_FILL_MODE_LOWER),
m_tol(1.e-7),
max_sweeps(15),
m_handle(nullptr),
m_stream(nullptr),
m_params(nullptr)
{
CUDA_CHECK(cudaSetDevice(0));
// create all objects for EigenDecomposition
/* step 1: create cusolver handle, bind a stream */
CUSOLVER_CHECK(cusolverDnCreate(&m_handle));
CUDA_CHECK(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
CUSOLVER_CHECK(cusolverDnSetStream(m_handle, m_stream));
/* step 2: configuration of syevj */
CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&m_params));
/* default value of tolerance is machine zero */
CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(m_params, m_tol));
/* default value of max. sweeps is 100 */
CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(m_params, max_sweeps));
InitWorkSpace(dataDim);
}
CudaProcessor::~CudaProcessor()
{
CUDA_CHECK(cudaFree(d_work));
CUDA_CHECK(cudaFree(d_eval));
CUDA_CHECK(cudaFree(d_info));
CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(m_params));
CUSOLVER_CHECK(cusolverDnDestroy(m_handle));
CUDA_CHECK(cudaStreamDestroy(m_stream));
CUDA_CHECK(cudaDeviceReset());
}
void CudaProcessor::InitWorkSpace(std::vector<int>& dataDim)
{
cuComplex* d_data = nullptr;
d_work = nullptr;
d_eval = nullptr;
d_info = nullptr;
m_sizeWork = 0;
m_height = dataDim[0];
m_width = dataDim[1];
m_batchSize = dataDim[2];
std::vector<cuComplex> A(m_height * m_width * m_batchSize, make_cuComplex(3.f, 2.f));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_data),
sizeof(cuComplex) * m_height * m_width * m_batchSize));
CUDA_CHECK(
cudaMalloc(reinterpret_cast<void**>(&d_eval), sizeof(float) * m_height * m_batchSize));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int) * m_batchSize));
CUDA_CHECK(cudaMemcpyAsync(d_data, A.data(), sizeof(cuComplex) * A.size(),
cudaMemcpyHostToDevice, m_stream));
/* step 4: query working space of syevj */
CUSOLVER_CHECK(cusolverDnCheevjBatched_bufferSize(m_handle, m_eigMode, m_matFillMode, m_height,
d_data, m_width, d_eval, &m_sizeWork,
m_params, m_batchSize));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * m_sizeWork));
/* free resources */
CUDA_CHECK(cudaFree(d_data));
}
void CudaProcessor::ComputeBatchedEigenDecomposition(cuComplex* data, std::vector<cuComplex>& evec,
std::vector<float>& eval, int numVal)
{
int valSize = m_width * m_batchSize; // numVal * m_batchSize;
int vecSize = m_width * m_height * m_batchSize; // numVal * m_batchSize * m_height;
std::vector<cuComplex> evecTemp(vecSize);
std::vector<float> evalTemp(valSize);
cuComplex* d_data = nullptr;
CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_data),
sizeof(cuComplex) * m_height * m_width * m_batchSize));
CUDA_CHECK(cudaMemcpyAsync(d_data, data, sizeof(cuComplex) * m_height * m_width * m_batchSize,
cudaMemcpyHostToDevice, m_stream));
/* step 5: compute eigen-pair */
CUSOLVER_CHECK(cusolverDnCheevjBatched(m_handle, m_eigMode, m_matFillMode, m_height, d_data,
m_width, d_eval, d_work, m_sizeWork, d_info, m_params,
m_batchSize));
// move data on GPU to get only first numVal in vector and value
// NOTE: improve performance by only copying first n numVal for data
// copy data from GPU
CUDA_CHECK(cudaMemcpyAsync(evecTemp.data(), d_data,
sizeof(cuComplex) * vecSize, cudaMemcpyDeviceToHost, m_stream));
CUDA_CHECK(cudaMemcpyAsync(evalTemp.data(), d_eval, sizeof(float) * valSize,
cudaMemcpyDeviceToHost, m_stream));
CUDA_CHECK(cudaStreamSynchronize(m_stream));
/* free resources */
CUDA_CHECK(cudaFree(d_data));
// TODO: move to cuda kernel
for (int ii = 0; ii < m_batchSize; ++ii)
{
std::copy(evecTemp.data() + ii * m_width * m_height,
evecTemp.data() + ii * m_width * m_height + numVal * m_height,
evec.data() + ii * numVal * m_height);
std::copy(evalTemp.data() + ii * m_width, evalTemp.data() + ii * m_width + numVal,
eval.data() + ii * numVal);
}
}
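And this is roughly how the standalone executable that works drives the class; a simplified sketch with illustrative sizes and header name, not my actual test harness:
#include <vector>
#include <cuComplex.h>
#include "CudaProcessor.cuh" // header name is illustrative

int main()
{
    const int n = 11;        // 11 x 11 Hermitian matrices
    const int batch = 1000;  // the real run uses a much larger batch
    const int numVal = 11;   // keep all eigenpairs in this test
    std::vector<int> dataDim = {n, n, batch};

    // Host input: batch of column-major 11x11 matrices (dummy values here).
    std::vector<cuComplex> A(n * n * batch, make_cuComplex(1.f, 0.f));
    std::vector<cuComplex> evec(numVal * n * batch);
    std::vector<float> eval(numVal * batch);

    CudaProcessor proc(dataDim);
    proc.ComputeBatchedEigenDecomposition(A.data(), evec, eval, numVal);
    return 0;
}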