Exception Error cublasSgetrsBatched while cublasSgetrfBatched has no issues (cuda12.8)

hnewToCuda · September 24, 2025, 1:14pm

include <cublas_v2.h>
include <cuda_runtime.h>
include
include

bool SolveLUDenseBatched_CUDA(
const std::vector<float*>& d_A_batch, // each is device pointer to N x N matrix
const std::vector<float*>& d_B_batch, // each is device pointer to N x nrhs matrix
int N,
int nrhs,
int batchSize)
{
int lda = N, ldb = N;

// Device arrays of pointers
float **d_A_ptrs, **d_B_ptrs;
cudaMalloc(&d_A_ptrs, batchSize * sizeof(float*));
cudaMalloc(&d_B_ptrs, batchSize * sizeof(float*));
cudaMemcpy(d_A_ptrs, d_A_batch.data(), batchSize * sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_ptrs, d_B_batch.data(), batchSize * sizeof(float*), cudaMemcpyHostToDevice);

int *d_PivotArray, *d_InfoArray;
cudaMalloc(&d_PivotArray, batchSize * N * sizeof(int));
cudaMalloc(&d_InfoArray, batchSize * sizeof(int));

cublasHandle_t handle;
cublasCreate(&handle);

// LU factorization
cublasStatus_t status = cublasSgetrfBatched(handle, N, d_A_ptrs, lda, d_PivotArray, d_InfoArray, batchSize);
cudaDeviceSynchronize();

std::vector<int> h_InfoArray(batchSize);
cudaMemcpy(h_InfoArray.data(), d_InfoArray, batchSize * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < batchSize; ++i) {
    if (h_InfoArray[i] != 0) {
        printf("LU factorization failed for batch %d, info = %d\n", i, h_InfoArray[i]);
        cublasDestroy(handle);
        cudaFree(d_A_ptrs);
        cudaFree(d_B_ptrs);
        cudaFree(d_PivotArray);
        cudaFree(d_InfoArray);
        return false;
    }
}

// Solve AX = B
status = cublasSgetrsBatched(handle, CUBLAS_OP_N, N, nrhs, d_A_ptrs, lda, d_PivotArray,
                             d_B_ptrs, ldb, d_InfoArray, batchSize);
cudaDeviceSynchronize();

cudaMemcpy(h_InfoArray.data(), d_InfoArray, batchSize * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < batchSize; ++i) {
    if (h_InfoArray[i] != 0) {
        printf("Solve failed for batch %d, info = %d\n", i, h_InfoArray[i]);
        cublasDestroy(handle);
        cudaFree(d_A_ptrs);
        cudaFree(d_B_ptrs);
        cudaFree(d_PivotArray);
        cudaFree(d_InfoArray);
        return false;
    }
}

cudaFree(d_A_ptrs);
cudaFree(d_B_ptrs);
cudaFree(d_PivotArray);
cudaFree(d_InfoArray);
cublasDestroy(handle);

return true;

}

And running with following example

include
include <cuda_runtime.h>
include

// Fills two batches of N x N matrices and N x nrhs right-hand sides, then solves
void TestBatchedLUSolve()
{
const int N = 3;
const int nrhs = 2;
const int batchSize = 2;

// Host data for two matrices and two RHS
std::vector<float> h_A0 = { 1, 2, 3, 4, 5, 6, 7, 8, 10 }; // Matrix 0
std::vector<float> h_A1 = { 2, 1, 1, 3, 5, 2, 4, 6, 8 };  // Matrix 1
std::vector<float> h_B0 = { 1, 2, 3, 4, 5, 6 }; // RHS 0
std::vector<float> h_B1 = { 7, 8, 9, 10, 11, 12 }; // RHS 1

// Allocate device memory for matrices and RHS
std::vector<float*> d_A_batch(batchSize), d_B_batch(batchSize);
for (int i = 0; i < batchSize; ++i) {
    cudaMalloc(&d_A_batch[i], N * N * sizeof(float));
    cudaMalloc(&d_B_batch[i], N * nrhs * sizeof(float));
}

// Copy host data to device
cudaMemcpy(d_A_batch[0], h_A0.data(), N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_A_batch[1], h_A1.data(), N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_batch[0], h_B0.data(), N * nrhs * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_batch[1], h_B1.data(), N * nrhs * sizeof(float), cudaMemcpyHostToDevice);

// Call the batched LU solver
bool ok = SolveLUDenseBatched_CUDA(d_A_batch, d_B_batch, N, nrhs, batchSize);

// Copy results back and print
std::vector<float> h_X(N * nrhs);
for (int i = 0; i < batchSize; ++i) {
    cudaMemcpy(h_X.data(), d_B_batch[i], N * nrhs * sizeof(float), cudaMemcpyDeviceToHost);
    printf("Batch %d solution:\n", i);
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < nrhs; ++col) {
            printf("%g ", h_X[row + col * N]);
        }
        printf("\n");
    }
}

// Free device memory
for (int i = 0; i < batchSize; ++i) {
    cudaFree(d_A_batch[i]);
    cudaFree(d_B_batch[i]);
}

}

Here is the error I get
error: SEH exception with code 0xc0000005 thrown in the test body.

where as the error is during cublasSgetrsBatched call, and getting error “Access violation writing location 0x000000131A000E00” where this device pointer is with d_InfoArray.

Topic		Replies	Views
Segmentation fault using cublas<T>getrsBatched GPU-Accelerated Libraries cuda , cublas	3	1792	October 12, 2021
cublas_cublasDgetrsBatched_problem GPU-Accelerated Libraries cublas , cusolver	5	1034	October 6, 2022
Problem with using CUBLAS getrfbatched() as it returns error: Illegal memory access was encountered GPU-Accelerated Libraries	3	4591	February 2, 2018
Cublas batched lu decomposition get segmentation fault GPU-Accelerated Libraries	3	1246	April 23, 2014
cublasDtrsmBatched giving error when to solve array linear systems GPU-Accelerated Libraries	0	513	October 25, 2017
Segmentation fault : cublasDgetrfBatched and cublasDgetriBatched Legacy PGI Compilers	5	1822	July 10, 2019
cublasDgelsBatched help GPU-Accelerated Libraries	1	1306	November 21, 2014
Error parameter number 10 cublasSgemmBatched GPU-Accelerated Libraries	4	1855	June 23, 2016
Something wrong after cublasSmatinvBatched！！ GPU-Accelerated Libraries cublas	12	615	December 31, 2023
cuSOLVER/cuBLAS solving of LUx=b with batched interface GPU-Accelerated Libraries cublas , cusolver	2	971	January 7, 2022

Exception Error cublasSgetrsBatched while cublasSgetrfBatched has no issues (cuda12.8)

Related topics