Exception Error cublasSgetrsBatched while cublasSgetrfBatched has no issues (cuda12.8)

include <cublas_v2.h>
include <cuda_runtime.h>
include
include

bool SolveLUDenseBatched_CUDA(
const std::vector<float*>& d_A_batch, // each is device pointer to N x N matrix
const std::vector<float*>& d_B_batch, // each is device pointer to N x nrhs matrix
int N,
int nrhs,
int batchSize)
{
int lda = N, ldb = N;

// Device arrays of pointers
float **d_A_ptrs, **d_B_ptrs;
cudaMalloc(&d_A_ptrs, batchSize * sizeof(float*));
cudaMalloc(&d_B_ptrs, batchSize * sizeof(float*));
cudaMemcpy(d_A_ptrs, d_A_batch.data(), batchSize * sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_ptrs, d_B_batch.data(), batchSize * sizeof(float*), cudaMemcpyHostToDevice);

int *d_PivotArray, *d_InfoArray;
cudaMalloc(&d_PivotArray, batchSize * N * sizeof(int));
cudaMalloc(&d_InfoArray, batchSize * sizeof(int));

cublasHandle_t handle;
cublasCreate(&handle);

// LU factorization
cublasStatus_t status = cublasSgetrfBatched(handle, N, d_A_ptrs, lda, d_PivotArray, d_InfoArray, batchSize);
cudaDeviceSynchronize();

std::vector<int> h_InfoArray(batchSize);
cudaMemcpy(h_InfoArray.data(), d_InfoArray, batchSize * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < batchSize; ++i) {
    if (h_InfoArray[i] != 0) {
        printf("LU factorization failed for batch %d, info = %d\n", i, h_InfoArray[i]);
        cublasDestroy(handle);
        cudaFree(d_A_ptrs);
        cudaFree(d_B_ptrs);
        cudaFree(d_PivotArray);
        cudaFree(d_InfoArray);
        return false;
    }
}

// Solve AX = B
status = cublasSgetrsBatched(handle, CUBLAS_OP_N, N, nrhs, d_A_ptrs, lda, d_PivotArray,
                             d_B_ptrs, ldb, d_InfoArray, batchSize);
cudaDeviceSynchronize();

cudaMemcpy(h_InfoArray.data(), d_InfoArray, batchSize * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < batchSize; ++i) {
    if (h_InfoArray[i] != 0) {
        printf("Solve failed for batch %d, info = %d\n", i, h_InfoArray[i]);
        cublasDestroy(handle);
        cudaFree(d_A_ptrs);
        cudaFree(d_B_ptrs);
        cudaFree(d_PivotArray);
        cudaFree(d_InfoArray);
        return false;
    }
}

cudaFree(d_A_ptrs);
cudaFree(d_B_ptrs);
cudaFree(d_PivotArray);
cudaFree(d_InfoArray);
cublasDestroy(handle);

return true;

}

And running with following example

include
include <cuda_runtime.h>
include

// Fills two batches of N x N matrices and N x nrhs right-hand sides, then solves
void TestBatchedLUSolve()
{
const int N = 3;
const int nrhs = 2;
const int batchSize = 2;

// Host data for two matrices and two RHS
std::vector<float> h_A0 = { 1, 2, 3, 4, 5, 6, 7, 8, 10 }; // Matrix 0
std::vector<float> h_A1 = { 2, 1, 1, 3, 5, 2, 4, 6, 8 };  // Matrix 1
std::vector<float> h_B0 = { 1, 2, 3, 4, 5, 6 }; // RHS 0
std::vector<float> h_B1 = { 7, 8, 9, 10, 11, 12 }; // RHS 1

// Allocate device memory for matrices and RHS
std::vector<float*> d_A_batch(batchSize), d_B_batch(batchSize);
for (int i = 0; i < batchSize; ++i) {
    cudaMalloc(&d_A_batch[i], N * N * sizeof(float));
    cudaMalloc(&d_B_batch[i], N * nrhs * sizeof(float));
}

// Copy host data to device
cudaMemcpy(d_A_batch[0], h_A0.data(), N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_A_batch[1], h_A1.data(), N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_batch[0], h_B0.data(), N * nrhs * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_batch[1], h_B1.data(), N * nrhs * sizeof(float), cudaMemcpyHostToDevice);

// Call the batched LU solver
bool ok = SolveLUDenseBatched_CUDA(d_A_batch, d_B_batch, N, nrhs, batchSize);

// Copy results back and print
std::vector<float> h_X(N * nrhs);
for (int i = 0; i < batchSize; ++i) {
    cudaMemcpy(h_X.data(), d_B_batch[i], N * nrhs * sizeof(float), cudaMemcpyDeviceToHost);
    printf("Batch %d solution:\n", i);
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < nrhs; ++col) {
            printf("%g ", h_X[row + col * N]);
        }
        printf("\n");
    }
}

// Free device memory
for (int i = 0; i < batchSize; ++i) {
    cudaFree(d_A_batch[i]);
    cudaFree(d_B_batch[i]);
}

}

Here is the error I get
error: SEH exception with code 0xc0000005 thrown in the test body.

where as the error is during cublasSgetrsBatched call, and getting error “Access violation writing location 0x000000131A000E00” where this device pointer is with d_InfoArray.