I get the error CUSOLVER_STATUS_IRS_PARAMS_INVALID (Error Status: 13) when I try to compute the buffer size using the cusolverDnSSgels_bufferSize API.
The input matrix A is 5x3, matrix B is 5x1 and matrix X is 3x1. All matrixes are in row-major format.
CUDA Version: 12.8
Any help in identifying and solving this error? Thanks in advance.
void generateValidationOutput(float* matA_d, float* matB_d, float* matX_d, int M, int N, cudaStream_t stream) {
int inputRowsA = M;
int inputColsA = N;
int inputRowsX = N;
int outputRowsB = M;
cusolverDnHandle_t handle;
cusolverDnCreate(&handle);
cusolverDnSetStream(handle, stream);
cusolverDnIRSParams_t params;
CUSOLVER_CHECK(cusolverDnIRSParamsCreate(¶ms));
CUSOLVER_CHECK(cusolverDnIRSParamsSetMaxIters(params, INT_MAX));
CUSOLVER_CHECK(cusolverDnIRSParamsSetMaxItersInner(params, INT_MAX - 10));
int nIters = 0;
int* dInfo_d = nullptr;
size_t workspaceSize = 0;
void* workspacePtr = nullptr;
CUSOLVER_CHECK(cusolverDnSSgels_bufferSize(handle, inputRowsA, inputColsA, 1, matA_d, N, matB_d, 1, matX_d, 1, NULL, &workspaceSize));
CUDA_CHECK(cudaMallocAsync(&workspacePtr, workspaceSize, stream));
CUDA_CHECK(cudaMallocAsync((void**)&dInfo_d, sizeof(int), stream));
CUSOLVER_CHECK(cusolverDnSSgels(handle, M, N, 1, matA_d, N, matB_d, 1, matX_d, 1, workspacePtr, workspaceSize, &nIters, dInfo_d));
int dInfo_h = 0;
CUDA_CHECK(cudaMemcpyAsync(&dInfo_h, dInfo_d, sizeof(int), cudaMemcpyDeviceToHost, stream));
if (nIters > 0)
printf("Success coverging cuSolver: %d\n", nIters);
else
printf("Failure converging cuSolver\n");
if (dInfo_h == 0)
printf("Success using cuSolver\n");
else
printf("Failure using cuSolver at %d\n", dInfo_h);
}
void main() {
// Memory allocations
float* matA_h = (float*)malloc(5 * 3 * sizeof(float));
float* matX_h = (float*)malloc(3 * sizeof(float));
float* matB_h = (float*)malloc(5 * sizeof(float));
// GPU Allocations
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
float* matA_d = nullptr;
float* matX_d = nullptr;
float* matB_d = nullptr;
CUDA_CHECK(cudaMallocAsync((void**)&matA_d, 5 * 3 * sizeof(float), stream));
CUDA_CHECK(cudaMallocAsync((void**)&matX_d, 3 * sizeof(float), stream));
CUDA_CHECK(cudaMallocAsync((void**)&matB_d, 5 * sizeof(float), stream));
int matARows = 5;
int matACols = 3;
int matXRows = 3;
int matBRows = 5;
// Generate input and validation outputs
for (int i = 0;i < matARows * matACols;i++) {
matA_h[i] = ((rand() % 100) - 50.f) / (float)2.f;
}
for (int i = 0;i < matBRows;i++) {
matB_h[i] = ((rand() % 100) - 50.f) / (float)2.f;
}
// Memcpy H->D
CUDA_CHECK(cudaMemcpyAsync(matA_d, matA_h, matARows * matACols * sizeof(float), cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(matB_d, matB_h, matBRows * sizeof(float), cudaMemcpyHostToDevice, stream));
generateValidationOutput(matA_d, matB_d, matX_d, matARows, matACols, stream);
CUDA_CHECK(cudaMemcpyAsync(matX_h, matX_d, matXRows * sizeof(float), cudaMemcpyDeviceToHost, stream));
for (int i = 0;i < matXRows;i++) {
printf("%f, ", matX_h[i]);
}
printf("\n\n");
free(matA_h);
free(matB_h);
free(matX_h);
CUDA_CHECK(cudaFreeAsync(matA_d, stream));
CUDA_CHECK(cudaFreeAsync(matB_d, stream));
CUDA_CHECK(cudaFreeAsync(matX_d, stream));
}