When I call the cublasDgemmBatched it correctly computes the matrix multiplications but returns a cublasStatus_t value of 13 (CUBLAS_STATUS_EXECUTION_FAILED). I don’t understand why it is returning that value. When I call cudaGetLastError() it returns “no error”.
It returns something like:
!! GPU program execution error on line 268 : stat=13, cudaError=0,(no error)
but subsequent kernel calls seem to fail.
#define CUBLAS_BATCHED_MM(Handle, OP1, OP2, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, nSegs)
stat = cublasDgemmBatched(Handle, OP1, OP2, M, N, K, &alpha, (const double **)A, lda, (const double **)B, ldb, &beta, C, ldc, nSegs);
if(stat != CUBLAS_STATUS_SUCCESS)
{
error = cudaGetLastError();
fprintf( stderr, "!! GPU program execution error on line %d : stat=%d, cudaError=%d,(%s)
", __LINE__, stat, error, cudaGetErrorString(error));
}
error = cudaThreadSynchronize();
if(error != cudaSuccess)
{
fprintf( stderr, "!! GPU program execution error on cudaThreadSynchronize on line %d: cudaError=%d,(%s)
", __LINE__, error,cudaGetErrorString(error));
}
void SetupBatchedMatrices(double *A, double ***devPtrA_dev, const int stride, const int N)
{
FreeMatrix(*devPtrA_dev);
double **devPtrA = NULL;
devPtrA = (double**)malloc(N * sizeof(double *));
//copy array memory locations
for(int i = 0; i < N; i++)
{
devPtrA[i] = A + stride * i;
}
//cudamalloc array of pointers
cudaError_t err1 = cudaMalloc((void**)devPtrA_dev, N * sizeof(double *));
if(err1 != cudaSuccess)
{
free(devPtrA);
fprintf(stderr, "!!!! GPU memory allocation error
");
}
err1 = cudaMemcpy(*devPtrA_dev, devPtrA, N * sizeof(double *), cudaMemcpyHostToDevice);
if(err1 != cudaSuccess)
{
free(devPtrA);
fprintf(stderr, "!!!! Cannot copy pointer array to device
");
}
free(devPtrA);
}
void cudaHDG<double>::GenMatrices_Z(Vector_d &Z_mats, Vector_d &E_mats, Vector_d &D1_mats, Vector_d &D2_mats,
Vector_d &M_mats, IdxVector_d &EdgeMap)
{
double **devPtrTemp_dev = NULL;
double **devPtrTemp1_dev = NULL;
double **devPtrTemp2_dev = NULL;
int N = m_nMats;
dim3 BLOCK, GRID;
Vector_d Temp_mats(N * (m_nC * m_nC), 0);
Vector_d Temp1_mats(N * (m_nC * m_nC), 0);
Vector_d Temp2_mats(N * (m_nC * m_nC), 0);
SetupBatchedMatrices(TPC(&Temp_mats[0]), &devPtrTemp_dev, m_nC * m_nC, N);
SetupBatchedMatrices(TPC(&Temp1_mats[0]), &devPtrTemp1_dev, m_nC * m_nC, N);
SetupBatchedMatrices(TPC(&Temp2_mats[0]), &devPtrTemp2_dev, m_nC * m_nC, N);
double alpha = 1.0, beta = 0.0;
// D1 * M * D1^T
CUBLAS_BATCHED_MM(m_cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, m_nC, m_nC, m_nC, alpha, (const double **)m_devPtrM_dev, m_nC,
(const double **)m_devPtrD1_dev, m_nC, beta, devPtrTemp_dev, m_nC, N);
CUBLAS_BATCHED_MM(m_cuHandle, CUBLAS_OP_T, CUBLAS_OP_N, m_nC, m_nC, m_nC, alpha, (const double **)m_devPtrD1_dev, m_nC,
(const double **)devPtrTemp_dev, m_nC, beta, devPtrTemp1_dev, m_nC, N);
// D2 * M * D2^T
CUBLAS_BATCHED_MM(m_cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, m_nC, m_nC, m_nC, alpha, (const double **)m_devPtrM_dev, m_nC,
(const double **)m_devPtrD2_dev, m_nC, beta, devPtrTemp_dev, m_nC, N);
CUBLAS_BATCHED_MM(m_cuHandle, CUBLAS_OP_T, CUBLAS_OP_N, m_nC, m_nC, m_nC, alpha, (const double **)m_devPtrD2_dev, m_nC,
(const double **)devPtrTemp_dev, m_nC, beta, devPtrTemp2_dev, m_nC, N);
int group = 256 / (m_nEC*m_nEC);
GRID = dim3(iDivUp(N, group), 1, 1);
BLOCK = dim3(m_nEC, m_nEC, group);
Z_matrix_kernel <<<GRID, BLOCK>>> (TPC(&Z_mats[0]), TPC(&E_mats[0]), TPC(&EdgeMap[0]), m_nMats, m_nC, m_nEC);
cudaThreadSynchronize();
cudaCheckError();
cusp::blas::axpbypcz(Temp1_mats, Temp2_mats, Z_mats, Z_mats, 1.0, 1.0, 1.0);
cudaThreadSynchronize();
cudaCheckError();
GRID = dim3(N, 1, 1);
BLOCK = dim3(m_nC, m_nC, 1);
if(m_nC <= 32)
Inverse_Kernel<32> <<<GRID, BLOCK>>> (TPC(&Z_mats[0]), m_nC);
else
Inverse_Kernel<64> <<<GRID, BLOCK>>> (TPC(&Z_mats[0]), m_nC);
cudaThreadSynchronize();
cudaCheckError();
FreeMatrix(devPtrTemp_dev);
FreeMatrix(devPtrTemp1_dev);
FreeMatrix(devPtrTemp2_dev);
}