
I couldn’t find any examples of cublasDgetrsBatched.
When I call cublasDgetrsBatched, my program exits with a non-zero return value. Are my parameters wrong?

A_matrix: batchSize × 64 (each matrix is 8 × 8)
B_matrix: batchSize × 8 (each right-hand side is 8 × 1)
A_matrix and B_matrix are both two-dimensional arrays.
cublasDgetrsBatched(handle, CUBLAS_OP_N, m, 1, A_matrix, lda, NULL, B_matrix, ldb, d_info, batchSize);

Here is the documentation for cublasDgetrsBatched. I don’t really understand the meaning of n, nrhs, lda, and ldb:

| Param. | Memory | In/out | Meaning |
|---|---|---|---|
| handle | | input | handle to the cuBLAS library context. |
| trans | | input | operation op(A) that is non- or (conj.) transpose. |
| n | | input | number of rows and columns of Aarray[i]. |
| nrhs | | input | number of columns of Barray[i]. |
| Aarray | device | input | array of pointers to array, with each array of dim. n x n with lda>=max(1,n). |
| lda | | input | leading dimension of two-dimensional array used to store each matrix Aarray[i]. |
| devIpiv | device | input | array of size n x batchSize that contains the pivoting sequence of each factorization of Aarray[i] stored in a linear fashion. If devIpiv is nil, pivoting for all Aarray[i] is ignored. |
| Barray | device | input/output | array of pointers to array, with each array of dim. n x nrhs with ldb>=max(1,n). Matrices Barray[i] should not overlap; otherwise, undefined behavior is expected. |
| ldb | | input | leading dimension of two-dimensional array used to store each solution matrix Barray[i]. |
| info | host | output | If info=0, the execution is successful. If info = -j, the j-th parameter had an illegal value. |
| batchSize | | input | number of pointers contained in A |

Can you look at the potrfBatched example? It includes potrsBatched, which is similar to getrsBatched.

Thanks for your reply. I tested the potrfBatched example and then changed the API calls to getrfBatched and getrsBatched, but I got the same problem: the program exited with code -1073741819 (0xC0000005, an access violation on Windows). I’m very confused.

here is the code

int main(int argc, char* argv[x]) {
    cusolverDnHandle_t cusolverH = NULL;
    cublasHandle_t handle = NULL;
    cudaStream_t stream = NULL;

    const cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
    const int batchSize = 2;
    const int nrhs = 1;
    const int m = 3;
    const int lda = m;
    const int ldb = m;
     *      | 1     2     3 |
     * A0 = | 2     5     5 | = L0 * L0**T
     *      | 3     5    12 |
     *            | 1.0000         0         0 |
     * where L0 = | 2.0000    1.0000         0 |
     *            | 3.0000   -1.0000    1.4142 |
     *      | 1     2     3 |
     * A1 = | 2     4     5 | is not s.p.d., failed at row 2
     *      | 3     5    12 |

    const std::vector<double> A0 = { 1.0, 2.0, 3.0, 2.0, 5.0, 5.0, 3.0, 5.0, 12.0 };
    const std::vector<double> A1 = { 1.0, 2.0, 3.0, 2.0, 4.0, 5.0, 3.0, 5.0, 12.0 };
    const std::vector<double> B0 = { 1.0, 1.0, 1.0 };
    std::vector<double> X0(m, 0);             /* X0 = A0\B0 */
    std::vector<int> infoArray(batchSize, 0); /* host copy of error info */

    std::vector<double> L0(lda * m); /* cholesky factor of A0 */

    std::vector<double*> Aarray(batchSize, nullptr);
    std::vector<double*> Barray(batchSize, nullptr);

    double** d_Aarray = nullptr;
    double** d_Barray = nullptr;
    int* d_infoArray = nullptr;

    std::printf("A0 = (matlab base-1)\n");
    print_matrix(m, m,, lda);

    std::printf("A1 = (matlab base-1)\n");
    print_matrix(m, m,, lda);

    std::printf("B0 = (matlab base-1)\n");
    print_matrix(m, 1,, ldb);

    /* step 1: create cusolver handle, bind a stream */

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
    CUBLAS_CHECK(cublasSetStream(handle, stream));

    /* step 2: copy A to device */
    for (int j = 0; j < batchSize; j++) {
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&Aarray[j]), sizeof(double) * lda * m));
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&Barray[j]), sizeof(double) * ldb * nrhs));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_infoArray), sizeof(int) * infoArray.size()));

    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_Aarray), sizeof(double*) * Aarray.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_Barray), sizeof(double*) * Barray.size()));

    CUDA_CHECK(cudaMemcpyAsync(Aarray[0],, sizeof(double) * A0.size(),
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(Aarray[1],, sizeof(double) * A1.size(),
        cudaMemcpyHostToDevice, stream));

        cudaMemcpyAsync(Barray[0],, sizeof(double) * B0.size(), cudaMemcpyHostToDevice, stream));
        cudaMemcpyAsync(Barray[1],, sizeof(double) * B0.size(), cudaMemcpyHostToDevice, stream));

    CUDA_CHECK(cudaMemcpyAsync(d_Aarray,, sizeof(double) * Aarray.size(),
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_Barray,, sizeof(double) * Barray.size(),
        cudaMemcpyHostToDevice, stream));

    /* step 3: Cholesky factorization */
        //cusolverDnDpotrfBatched(cusolverH, uplo, m, d_Aarray, lda, d_infoArray, batchSize));

    CUBLAS_CHECK(cublasDgetrfBatched(handle, m, d_Aarray, lda, NULL, d_infoArray, batchSize));

    CUDA_CHECK(cudaMemcpyAsync(, d_infoArray, sizeof(int) * infoArray.size(),
        cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(, Aarray[0], sizeof(double) * lda * m,
        cudaMemcpyDeviceToHost, stream));


    for (int j = 0; j < batchSize; j++) {
        std::printf("info[%d] = %d\n", j, infoArray[j]);

    assert(0 == infoArray[0]);
    /* A1 is singular */
    assert(2 == infoArray[1]);

    std::printf("L = (matlab base-1), upper triangle is don't care \n");
    print_matrix(m, m,, lda);

     * step 4: solve A0*X0 = B0
     *        | 1 |        | 10.5 |
     *   B0 = | 1 |,  X0 = | -2.5 |
     *        | 1 |        | -1.5 |
    //CUSOLVER_CHECK(cusolverDnDpotrsBatched(cusolverH, uplo, m, nrhs, /* only support rhs = 1*/
        //d_Aarray, lda, d_Barray, ldb, d_infoArray, batchSize));

    cublasDgetrsBatched(handle, CUBLAS_OP_N, m, nrhs, d_Aarray, lda, NULL, d_Barray, ldb, d_infoArray, batchSize);

    CUDA_CHECK(cudaMemcpyAsync(, d_infoArray, sizeof(int), cudaMemcpyDeviceToHost,
        cudaMemcpyAsync(, Barray[0], sizeof(double) * X0.size(), cudaMemcpyDeviceToHost, stream));


    std::printf("after potrsBatched: infoArray[0] = %d\n", infoArray[0]);
    if (0 > infoArray[0]) {
        std::printf("%d-th parameter is wrong \n", -infoArray[0]);

    std::printf("X0 = (matlab base-1)\n");
    print_matrix(m, 1,, ldb);

    /* free resources */
    for (int j = 0; j < batchSize; j++) {




    return EXIT_SUCCESS;

Here is a full example of cublasSgetrsBatched. The conversion from S to D should be trivial. Your code as posted has a seg fault in it, and that fault is in host code. If I were trying to identify problems with it, the first thing I would do is find the specific line of code that causes the seg fault. The process for doing that has nothing to do with CUDA, and could be as simple as a binary search using printf() statements.

I found my stupid mistake: I had taken it for granted that the info arguments of both calls live in device memory, but the info argument of getrsBatched is a host pointer. Thanks for answering my question.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.