I am attempting to combine OpenMP with cuBLAS to perform batched matrix solves.
The idea is that each OpenMP thread forms its own matrix and right-hand side.
Here is a pseudocode outline of the OpenMP loop:
int batchSize = omp_get_num_threads();
double *Aarray[batchSize];
double *Barray[batchSize];
int *PivotArray[batchSize];
int *InfoArray[batchSize];
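// These pointer arrays are shared by all threads; inside GaussElimCUBLAS
// each thread fills the slot at its own thread id.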
#pragma omp parallel for schedule(dynamic)
for ( /* each linear system */ )
{
    // each thread builds its own inMAT and inRHS,
    // and initializes its own outSOLN
    // ... then calls:
    GaussElim::GaussElimCUBLAS( inMAT,
                                inRHS,
                                outSOLN,
                                Aarray,
                                Barray,
                                PivotArray,
                                InfoArray);
}
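For concreteness, one iteration of that loop body looks roughly like the sketch below (the DblArray constructors, the size n, and the fill values are placeholders shown for illustration only; fetch() with 1-based indices is the real accessor used in the solver):

const int n = 8;         // example size
DblArray inMAT(n, n);    // n-by-n matrix (assumed ctor), 1-based indexing
DblArray inRHS(n);       // right-hand side
DblArray outSOLN(n);     // solution
for (int i = 1; i <= n; ++i) {
    inRHS.fetch(i)   = 1.0;                       // placeholder values
    outSOLN.fetch(i) = 0.0;
    for (int j = 1; j <= n; ++j)
        inMAT.fetch(i, j) = (i == j) ? 2.0 : 0.0; // placeholder values
}
GaussElim::GaussElimCUBLAS(inMAT, inRHS, outSOLN,
                           Aarray, Barray, PivotArray, InfoArray);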
Now, the way I define my GaussElimCUBLAS is an attempt to mimic what I've seen here.
Here is my attempt:
//---------------------------------------------------------------------
void GaussElim::GaussElimCUBLAS(DblArray& inMAT, // matrix
DblArray& inRHS, // rhs
DblArray& outSOLN, // solution
double **Aarray, // shared between all threads
double **Barray,
int **PivotArray,
int **InfoArray)
{
//---------------------------------------------
// PART 1: Initialization
//---------------------------------------------
// Create the cublas context on the master thread
cublasHandle_t handle = NULL;
#pragma omp master
cublasCreate( &handle);
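// Note: 'handle' is a local (per-thread) variable, so only the master
// thread's handle is initialized here; the other threads keep handle == NULL.
// They never use it, though, since every cuBLAS call below is also
// restricted to the master thread.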
// How many OMP threads?
int batchSize = omp_get_num_threads();
// Which thread am I on?
int batch_id = omp_get_thread_num();
#pragma omp critical
{
std::cout << " batch id = " << batch_id << " of " << batchSize << std::endl;
std::fflush(stdout);
}
#pragma omp barrier
// Matrix checks
assert_eq(inMAT.get_num_ind(),2);
assert_eq(inRHS.get_num_ind(),1);
assert_eq(outSOLN.get_num_ind(),1);
const int n1 = inMAT.get_ind_length(1);
const int m1 = inMAT.get_ind_length(2);
const int n2 = inRHS.get_ind_length(1);
const int n3 = outSOLN.get_ind_length(1);
assert_eq(n1,m1);
assert_eq(n2,n1);
assert_eq(n3,n1);
// Matrix is square, n x n
const int n = n1;
//---------------------------------------------
// PART 2: Allocations
//---------------------------------------------
// Initialize device arrays
double **d_Aarray = NULL;
double **d_Barray = NULL;
int *d_InfoArray = NULL;
int *d_PivotArray = NULL;
// Each thread allocates device buffers for its own matrix, RHS, pivots, and info
cudaError_t cudaStat1 = cudaSuccess;
cudaStat1 = cudaMalloc ((void**)&Aarray[batch_id], sizeof(double) * n * n);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMalloc ((void**)&Barray[batch_id], sizeof(double) * n );
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMalloc ((void**)&PivotArray[batch_id], sizeof(int) * n );
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMalloc ((void**)&InfoArray[batch_id], sizeof(int) * n );
assert(cudaSuccess == cudaStat1);
// The master thread allocates the device-side batch arrays
#pragma omp master
{
cudaStat1 = cudaMalloc ((void**)&d_Aarray, sizeof(double*) * batchSize);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMalloc ((void**)&d_Barray, sizeof(double*) * batchSize);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMalloc ((void**)&d_PivotArray, sizeof(int) * batchSize);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMalloc ((void**)&d_InfoArray, sizeof(int) * batchSize);
assert(cudaSuccess == cudaStat1);
}
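// d_Aarray, d_Barray, d_PivotArray, and d_InfoArray are also thread-local
// variables, so only the master thread's copies point at device memory;
// every other thread's copies remain NULL (and are never used).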
//---------------------------------------------
// PART 3: Host-device Transfers
//---------------------------------------------
// Each thread transfers its matrix into its slot of Aarray
cudaStat1 = cudaMemcpy(Aarray[batch_id], &inMAT.fetch(1,1) , sizeof(double) * n * n, cudaMemcpyHostToDevice);
std::cout << cudaStat1 << " (" << cudaGetErrorString(cudaStat1) << ")" << std::endl;
// with more than one OMP thread, the code fails this assertion
assert(cudaSuccess == cudaStat1);
// Each thread transfers its Barray
cudaStat1 = cudaMemcpy(Barray[batch_id], &inRHS.fetch(1) , sizeof(double) * n, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
// The master thread transfers the batch pointer arrays
#pragma omp master
{
cudaStat1 = cudaMemcpy(d_Aarray, Aarray, sizeof(double*)*batchSize, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(d_Barray, Barray, sizeof(double*)*batchSize, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
}
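// At this point d_Aarray and d_Barray are device arrays whose entries are
// the per-thread device pointers gathered in the host arrays Aarray and
// Barray; this array-of-device-pointers layout is what the batched
// cuBLAS calls expect.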
//---------------------------------------------
// PART 4: Factorizations
//---------------------------------------------
// Master thread instructs the device to perform a batched LU factorization
#pragma omp master
{
cublasStatus_t cbstatus = CUBLAS_STATUS_SUCCESS;
cbstatus = cublasDgetrfBatched(
handle,
n,
d_Aarray,
n,
d_PivotArray,
d_InfoArray,
batchSize );
assert(CUBLAS_STATUS_SUCCESS == cbstatus);
}
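// Debugging aid (a sketch I would add here, not part of the failing code):
// pull the per-matrix factorization status back to the host. getrfBatched
// sets info[i] = 0 on success, or k > 0 if U(k,k) is exactly zero for
// matrix i. (Needs #include <vector>.)
#pragma omp master
{
    std::vector<int> h_info(batchSize);
    cudaStat1 = cudaMemcpy(h_info.data(), d_InfoArray,
                           sizeof(int) * batchSize, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);
    for (int i = 0; i < batchSize; ++i)
        assert(0 == h_info[i]);
}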
//---------------------------------------------
// PART 5: Solve LUx = b
//---------------------------------------------
// The master thread instructs the device to solve LUx = b
int info = 0;
#pragma omp master
{
cublasStatus_t cbstatus = CUBLAS_STATUS_SUCCESS;
cbstatus = cublasDgetrsBatched(
handle,
CUBLAS_OP_T,
n,
1, // nrhs
d_Aarray,
n,
d_PivotArray,
d_Barray,
n,
&info,
batchSize );
assert(CUBLAS_STATUS_SUCCESS == cbstatus);
}
//---------------------------------------------
// PART 6: Transfer solution to host
//---------------------------------------------
cudaStat1 = cudaMemcpy( &outSOLN.fetch(1) , Barray[batch_id] , sizeof(double)*n, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
}
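Aside: to get readable errors instead of bare asserts, I have started wrapping the CUDA calls in a small helper macro like the sketch below (CUDA_CHECK is my own name, not part of CUDA):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: report the failing call site and the CUDA error
// string, then abort. Used as CUDA_CHECK(cudaMalloc(...)), etc.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (cudaSuccess != err_) {                                    \
            std::fprintf(stderr, "%s:%d: %s -> %s\n",                 \
                         __FILE__, __LINE__, #call,                   \
                         cudaGetErrorString(err_));                   \
            std::abort();                                             \
        }                                                             \
    } while (0)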
If I use a single OpenMP thread, this code works.
However, when I use, say, 4 OpenMP threads, the code fails at the beginning of Part 3, at the assertion marked in the comment: the cudaMemcpy returns cudaStat1 == 1, i.e. cudaErrorInvalidValue.
Any idea why this is happening?
I am using CUDA 10.1.243.