In the following csrqr code, all output values come out as NaN.

The code below is the csrqr example code, changed to run with only one batch.

At least with the example data from the site, this code seems to work just fine:
//IA = { 1, 2, 3, 4, 8 };
//JA = { 1, 2, 3, 1, 2, 3, 4 };
//A = { 1.0, 2.0, 3.0, 0.1, 0.1, 0.1, 4.0 };
//b = { 1.0, 1.0, 1.0, 1.0 };
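
For reference, decoding those base-1 CSR arrays by hand gives the small dense matrix below, so I believe the format of the example itself is right:

//     | 1.0  0.0  0.0  0.0 |
// A = | 0.0  2.0  0.0  0.0 |
//     | 0.0  0.0  3.0  0.0 |
//     | 0.1  0.1  0.1  4.0 |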

However, with a larger CSR matrix of mine, every value that comes out is NaN.
I have already verified the matrix by computing the matrix-vector product with cuSPARSE and comparing it against the right-hand side (a rough sketch of that check follows), but when I use csrqr to solve Ax = b, the result is all NaN.
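
This is roughly what I mean by the cuSPARSE check (a simplified sketch, not my exact verification code; d_xref and d_y are placeholder device buffers, the CSR device pointers are named as in the program below, and CUSPARSE_SPMV_ALG_DEFAULT is the CUDA 11.2+ name of the default algorithm):

// y = A * x_ref with the generic SpMV API, then y is copied back and compared with b.
cusparseHandle_t sp = nullptr;
cusparseCreate(&sp);

cusparseSpMatDescr_t matA = nullptr;
cusparseDnVecDescr_t vecX = nullptr, vecY = nullptr;
cusparseCreateCsr(&matA, m, m, nnzA,
                  d_csrRowPtrA, d_csrColIndA, d_csrValA,
                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                  CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F);
cusparseCreateDnVec(&vecX, m, d_xref, CUDA_R_64F); // known reference solution
cusparseCreateDnVec(&vecY, m, d_y, CUDA_R_64F);    // result of the multiplication

const double alpha = 1.0, beta = 0.0;
size_t bufSize = 0;
void *dBuf = nullptr;
cusparseSpMV_bufferSize(sp, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
                        &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufSize);
cudaMalloc(&dBuf, bufSize);
cusparseSpMV(sp, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
             &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, dBuf);
// ... copy d_y back to the host and compare it with b ...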
What part am I missing? Thanks for your help.

CSR_data.zip (43.6 KB)

//
#include <cstdio>
#include <cstdlib>
#include <cmath>  // for fabs in the residual check
#include <vector>
#include <cuda_runtime.h>
#include <cusolverSp.h>
#include <cusparse.h>

#include "cusolver_utils.h"

int main(int argc, char *argv[]) {
    cusolverSpHandle_t cusolverH = NULL;
    csrqrInfo_t info = NULL;
    cusparseMatDescr_t descrA = NULL;
    cudaStream_t stream = NULL;

    int i;


    int *d_csrRowPtrA = nullptr;
    int *d_csrColIndA = nullptr;
    double *d_csrValA = nullptr;
    double *d_b = nullptr; // batchSize * m
    double *d_x = nullptr; // batchSize * m
    
    FILE* fidA, * fidIA, * fidJA, * fidVec, * fidSolution, * fidtest;

    fidIA = fopen("IA.txt", "r");
    fidJA = fopen("JA.txt", "r");
    fidA = fopen("A.txt", "r");
    fidVec = fopen("vec.txt", "r"); // opened but not read below
    fidSolution = fopen("solution.txt", "r");
    fidtest = fopen("test.txt", "w");
    if (!fidIA || !fidJA || !fidA || !fidVec || !fidSolution || !fidtest) {
        fprintf(stderr, "failed to open one of the input/output files\n");
        return EXIT_FAILURE;
    }

    size_t size_qr = 0;
    size_t size_internal = 0;
    void *buffer_qr = nullptr; // working space for numerical factorization


    const int m = 1180;
    const int nnzA = 30010;

    std::vector<int> csrRowPtrA(m + 1); // IA
    std::vector<int> csrColIndA(nnzA);  // JA
    std::vector<double> csrValA(nnzA);  // A
    std::vector<double> b(m);           // right-hand side (read from solution.txt)

    const int batchSize = 1; // single batch

    std::vector<double> csrValABatch(nnzA, 0);
    std::vector<double> bBatch(m, 0);
    std::vector<double> xBatch(m, 0);


    for (i = 0; i < m + 1; i++)
    {
        fscanf(fidIA, "%d\n", &csrRowPtrA[i]);
    }
 

    for (i = 0; i < nnzA; i++)
    {
        fscanf(fidJA, "%d\n", &csrColIndA[i]);

        fscanf(fidA, "%lf\n", &csrValA[i]);
    }

    for (i = 0; i < m; i++)
    {
        fscanf(fidSolution, "%lf\n", &b[i]);
    }


    // step 1: prepare Aj and bj on host

    for (int colidx = 0; colidx < nnzA; colidx++) {
        double Areg = csrValA[colidx];
        csrValABatch[colidx] = Areg;
    }

    for (int j = 0; j < m; j++) {
        double breg = b[j];
        bBatch[j] = breg;
    }



    // step 2: create cusolver handle, qr info and matrix descriptor
    cusolverSpCreate(&cusolverH);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cusolverSpSetStream(cusolverH, stream);


    cusparseCreateMatDescr(&descrA);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE); // base-1
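    // NOTE: this only matches if IA.txt / JA.txt really contain 1-based indices;
    // for 0-based files CUSPARSE_INDEX_BASE_ZERO must be used instead.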
    cusolverSpCreateCsrqrInfo(&info);

    // step 3: copy Aj and bj to device
    cudaMalloc(reinterpret_cast<void**>(&d_csrValA), sizeof(double) * csrValABatch.size());
    cudaMalloc(reinterpret_cast<void**>(&d_csrColIndA), sizeof(int) * csrColIndA.size());
    cudaMalloc(reinterpret_cast<void**>(&d_csrRowPtrA), sizeof(int) * csrRowPtrA.size());
    cudaMalloc(reinterpret_cast<void**>(&d_b), sizeof(double) * bBatch.size());
    cudaMalloc(reinterpret_cast<void**>(&d_x), sizeof(double) * xBatch.size());

    cudaMemcpyAsync(d_csrValA, csrValABatch.data(), sizeof(double) * csrValABatch.size(),cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_csrColIndA, csrColIndA.data(), sizeof(int) * csrColIndA.size(),cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_csrRowPtrA, csrRowPtrA.data(), sizeof(int) * csrRowPtrA.size(),cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_b, bBatch.data(), sizeof(double) * bBatch.size(), cudaMemcpyHostToDevice, stream);

    // step 4: symbolic analysis
    cusolverSpXcsrqrAnalysisBatched(cusolverH, m, m, nnzA, descrA, d_csrRowPtrA,d_csrColIndA, info);

    // step 5: prepare working space
    cusolverSpDcsrqrBufferInfoBatched(cusolverH, m, m, nnzA, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, batchSize, info, &size_internal, &size_qr);
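    // Only size_qr bytes have to be allocated by the caller below; size_internal is
    // reported for information (that internal memory is handled by the library).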




    cudaStreamSynchronize(stream);
    cudaMalloc(reinterpret_cast<void**>(&buffer_qr), size_qr);

    // step 6: numerical factorization
    // assume device memory is big enough to compute all matrices.
    cusolverSpDcsrqrsvBatched(cusolverH, m, m, nnzA, descrA, d_csrValA, d_csrRowPtrA,d_csrColIndA, d_b, d_x, batchSize, info, buffer_qr);
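    // (Every cuSOLVER/cuSPARSE/CUDA call above returns a status value; checking them
    // would show where things first go wrong, but error handling is omitted here.)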

    // step 7: check residual
    // xBatch = [x0, x1, x2, ...]
    cudaMemcpyAsync(xBatch.data(), d_x, sizeof(double) * xBatch.size(), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
 

    const int baseA = (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;
  
 
    // measure |bj - Aj*xj|
    double* csrValAj = csrValABatch.data();
    double* xj = xBatch.data();
    double* bj = bBatch.data();
    // sup| bj - Aj*xj|
    double sup_res = 0;
    for (int row = 0; row < m; row++) {
        const int start = csrRowPtrA[row] - baseA;
        const int end = csrRowPtrA[row + 1] - baseA;
        double Ax = 0.0; // Aj(row,:)*xj
        for (int colidx = start; colidx < end; colidx++) {
            const int col = csrColIndA[colidx] - baseA;
            const double Areg = csrValAj[colidx];
            const double xreg = xj[col];
            Ax = Ax + Areg * xreg;
        }
        double r = bj[row] - Ax;
        sup_res = (sup_res > fabs(r)) ? sup_res : fabs(r);
    }
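
    // report the residual so the check above is actually visible
    printf("batch 0: sup|bj - Aj*xj| = %E\n", sup_res);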
   

    // write the computed solution to test.txt
    for (int i = 0; i < m; i++) {
        fprintf(fidtest, "%f\n", xj[i]);
    }
    /* free resources */
    fclose(fidIA);
    fclose(fidJA);
    fclose(fidA);
    fclose(fidVec);
    fclose(fidSolution);
    fclose(fidtest);

    cudaFree(d_csrRowPtrA);
    cudaFree(d_csrColIndA);
    cudaFree(d_csrValA);
    cudaFree(d_b);
    cudaFree(d_x);
    cudaFree(buffer_qr);

    cusolverSpDestroyCsrqrInfo(info);
    cusparseDestroyMatDescr(descrA);
    cusolverSpDestroy(cusolverH);

    cudaStreamDestroy(stream);

    cudaDeviceReset();

    return EXIT_SUCCESS;
}