OpenMP + cuBLAS for batched matrix solves using LU

I am attempting to combine OpenMP with cuBLAS to perform batched matrix solves. The idea is that each OpenMP thread forms its own matrix and right-hand side, and the whole batch is then factored and solved on the GPU. Here is a pseudocode outline of the OpenMP loop:

int batchSize = omp_get_num_threads();
double *Aarray[batchSize];
double *Barray[batchSize];
int *PivotArray[batchSize];
int *InfoArray[batchSize];

#pragma omp parallel for schedule(dynamic)
for ( /* loop over the batch */ )
{
    // Each thread creates its own inMAT (matrix) and inRHS (right-hand side),
    // and initializes its own outSOLN.

    // Then each thread calls:
    GaussElim::GaussElimCUBLAS(inMAT,
                               inRHS,
                               outSOLN,
                               Aarray,
                               Barray,
                               PivotArray,
                               InfoArray);
}
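
To make the outline concrete, this is roughly what the driver looks like. The DblArray setup is a placeholder for my real code, and I use std::vector for the shared pointer tables here instead of the variable-length arrays in the outline above.

#include <omp.h>
#include <vector>

void solveBatch(int n)
{
    const int batchSize = omp_get_max_threads();   // one system per thread

    // Pointer tables shared by all threads; each thread fills its own slot
    std::vector<double*> Aarray(batchSize, nullptr);
    std::vector<double*> Barray(batchSize, nullptr);
    std::vector<int*>    PivotArray(batchSize, nullptr);
    std::vector<int*>    InfoArray(batchSize, nullptr);

    #pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < batchSize; ++i)
    {
        // Placeholder: each thread builds its own n x n matrix, length-n RHS,
        // and zero-initialized length-n solution vector.
        DblArray inMAT, inRHS, outSOLN;
        // ... fill inMAT and inRHS for this thread ...

        GaussElim::GaussElimCUBLAS(inMAT, inRHS, outSOLN,
                                   Aarray.data(), Barray.data(),
                                   PivotArray.data(), InfoArray.data());
    }
}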

Now, the way I define my GaussElimCUBLAS is an attempt to mimic what I've seen here.

Here is my attempt:

//---------------------------------------------------------------------
void GaussElim::GaussElimCUBLAS(DblArray& inMAT, // matrix
                          DblArray& inRHS,  // rhs
                          DblArray& outSOLN,  // solution
                          double **Aarray,  // shared between all threads
                          double **Barray,
                          int **PivotArray,
                          int **InfoArray)
{
    // Status variable for CUDA runtime calls
    cudaError_t cudaStat1 = cudaSuccess;

    //---------------------------------------------
    //  PART 1: Initialization
    //---------------------------------------------

    // Create the cublas context on the master thread
    cublasHandle_t handle = NULL;
    #pragma omp master
       cublasCreate( &handle);

    // How many OMP threads?
    int batchSize = omp_get_num_threads();
    // Which thread am I on?
    int batch_id = omp_get_thread_num();

    #pragma omp critical
    {
    std::cout << "  batch id = " << batch_id << " of " << batchSize << std::endl;
    }
    #pragma omp barrier

    // Matrix checks
    assert_eq(inMAT.get_num_ind(),2);
    assert_eq(inRHS.get_num_ind(),1);
    assert_eq(outSOLN.get_num_ind(),1);
    const int n1 = inMAT.get_ind_length(1);
    const int m1 = inMAT.get_ind_length(2);
    const int n2 = inRHS.get_ind_length(1);
    const int n3 = outSOLN.get_ind_length(1);
    assert_eq(n1,m1);
    assert_eq(n2,n1);
    assert_eq(n3,n1);

    // The matrix is square, n x n
    const int n = n1;

    //---------------------------------------------
    //  PART 2: Allocations
    //---------------------------------------------

    // Initialize device arrays
    double **d_Aarray = NULL;
    double **d_Barray = NULL;
    int *d_InfoArray = NULL;
    int *d_PivotArray = NULL;

    // Each thread allocates device storage for its own matrix, RHS, pivots, and info
    cudaStat1 = cudaMalloc ((void**)&Aarray[batch_id], sizeof(double) * n * n);
    assert(cudaSuccess == cudaStat1);
    cudaStat1 = cudaMalloc ((void**)&Barray[batch_id], sizeof(double) * n );
    assert(cudaSuccess == cudaStat1);
    cudaStat1 = cudaMalloc ((void**)&PivotArray[batch_id], sizeof(int) * n );
    assert(cudaSuccess == cudaStat1);
    cudaStat1 = cudaMalloc ((void**)&InfoArray[batch_id], sizeof(int) * n );
    assert(cudaSuccess == cudaStat1);

    // The master thread allocates the device-side arrays of batch pointers
    #pragma omp master
    {
       cudaStat1 = cudaMalloc ((void**)&d_Aarray, sizeof(double*) * batchSize);
       assert(cudaSuccess == cudaStat1);
       cudaStat1 = cudaMalloc ((void**)&d_Barray, sizeof(double*) * batchSize);
       assert(cudaSuccess == cudaStat1);
       cudaStat1 = cudaMalloc ((void**)&d_PivotArray, sizeof(int) * batchSize);
       assert(cudaSuccess == cudaStat1);
       cudaStat1 = cudaMalloc ((void**)&d_InfoArray, sizeof(int) * batchSize);
       assert(cudaSuccess == cudaStat1);
    }

    //---------------------------------------------
    //  PART 3: Host-device Transfers
    //---------------------------------------------
    // Each thread transfers its own matrix
    cudaStat1 = cudaMemcpy(Aarray[batch_id], &inMAT.fetch(1,1), sizeof(double) * n * n, cudaMemcpyHostToDevice);
    std::cout << cudaStat1 << std::endl;
    // With more than one thread, the code fails this assertion (cudaStat1 == 1)
    assert(cudaSuccess == cudaStat1);

    // Each thread transfers its own right-hand side
    cudaStat1 = cudaMemcpy(Barray[batch_id], &inRHS.fetch(1) , sizeof(double) * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);

    // The master thread transfers the arrays of per-thread device pointers
    #pragma omp master
    {
       cudaStat1 = cudaMemcpy(d_Aarray, Aarray, sizeof(double*)*batchSize, cudaMemcpyHostToDevice);
       assert(cudaSuccess == cudaStat1);
       cudaStat1 = cudaMemcpy(d_Barray, Barray, sizeof(double*)*batchSize, cudaMemcpyHostToDevice);
       assert(cudaSuccess == cudaStat1);
    }

    //---------------------------------------------
    //  PART 4: Factorizations
    //---------------------------------------------

    // Master thread instructs the device to perform a batched LU factorization
    #pragma omp master
    {
        cublasStatus_t cbstatus = CUBLAS_STATUS_SUCCESS;
        cbstatus = cublasDgetrfBatched(
            handle,
            n,
            d_Aarray,
            n,
            d_PivotArray,
            d_InfoArray,
            batchSize );
         assert(CUBLAS_STATUS_SUCCESS == cbstatus);
    }

    //---------------------------------------------
    //  PART 5: Solve LUx = b
    //---------------------------------------------
    // The master thread instructs the device to solve LUx = b
    int info = 0;
    #pragma omp master
    {
       cublasStatus_t cbstatus = CUBLAS_STATUS_SUCCESS;
       cbstatus = cublasDgetrsBatched(
          handle,
          CUBLAS_OP_T,
          n,
          1, // nrhs
          d_Aarray,
          n,
          d_PivotArray,
          d_Barray,
          n,
          &info,
          batchSize );
       assert(CUBLAS_STATUS_SUCCESS == cbstatus);
    }

    //---------------------------------------------
    //  PART 6: Transfer solution to host
    //---------------------------------------------

    // Each thread copies its own solution back to the host
    cudaStat1 = cudaMemcpy( &outSOLN.fetch(1), Barray[batch_id], sizeof(double)*n, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);

}
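
For reference, a check like the one below reports which thread fails and the error string instead of the bare enum value. CUDA_CHECK is just a helper macro I sketched for this post; cudaGetErrorString is the standard CUDA runtime function.

#include <cstdio>
#include <cuda_runtime.h>
#include <omp.h>

// Report the calling thread, the failing expression, and the error string.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::printf("thread %d: %s failed: %s (%d)\n",                    \
                        omp_get_thread_num(), #call,                          \
                        cudaGetErrorString(err_), static_cast<int>(err_));    \
        }                                                                     \
    } while (0)

// Example use at the failing spot in PART 3:
//   CUDA_CHECK(cudaMemcpy(Aarray[batch_id], &inMAT.fetch(1,1),
//                         sizeof(double) * n * n, cudaMemcpyHostToDevice));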

If I use a single OpenMP thread, this code works.

However, when I use, say, 4 OpenMP threads, the code fails at the start of Part 3: the first cudaMemcpy returns cudaStat1 = 1 (cudaErrorInvalidValue) and the assertion fires, as noted in the comment above.

Any idea why this is happening?

I am using CUDA 10.1.243.
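
In case it helps, here is a stripped-down, self-contained version of the pattern that Parts 2 and 3 reduce to as I understand them: each OpenMP thread cudaMallocs its own device buffer and copies its own host data into it. There is no cuBLAS and no DblArray here; N and the thread count are arbitrary values I picked for the test.

#include <cassert>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <omp.h>

int main()
{
    const int N = 64;                              // arbitrary system size
    const int nThreads = 4;
    std::vector<double*> dBuf(nThreads, nullptr);  // per-thread device pointers

    #pragma omp parallel num_threads(nThreads)
    {
        const int tid = omp_get_thread_num();

        // Host data owned by this thread
        std::vector<double> host(N * N, 1.0 + tid);

        // Each thread allocates its own device buffer ...
        cudaError_t st = cudaMalloc((void**)&dBuf[tid], sizeof(double) * N * N);
        assert(st == cudaSuccess);

        // ... and copies its host data into it
        st = cudaMemcpy(dBuf[tid], host.data(),
                        sizeof(double) * N * N, cudaMemcpyHostToDevice);
        if (st != cudaSuccess)
            std::printf("thread %d: %s\n", tid, cudaGetErrorString(st));
        assert(st == cudaSuccess);

        cudaFree(dBuf[tid]);
    }
    std::printf("done\n");
    return 0;
}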