CUBLAS Batched Matrix Multiplies?

Sabaron · February 21, 2012, 5:31pm

Trying to test out the new CUBLAS batched matrix multiply routines in CUDA 4.1 but I can’t seem to get it working at all. I simplified my code down to calling cublasCgemmBatched with a batch count of 1 and it still fails with CUDA_ERROR_LAUNCH_FAILED every time. I’m sure I’m missing something simple but I’m not seeing it?

cublasHandle_t lhHandle;

   CudaUtil::CheckError(cublasCreate(&lhHandle), THIS_FILE,

      "Failed to create CUBLAS handle");

   CudaUtil::CheckError(cublasSetPointerMode(lhHandle,

      CUBLAS_POINTER_MODE_HOST), THIS_FILE,

      "Failed to set CUBLAS pointer mode");

cuComplex lsComplexOne;

   lsComplexOne.x = 1.0;

   lsComplexOne.y = 0.0;

   cuComplex lsComplexZero;

   lsComplexZero.x = 0.0;

   lsComplexZero.y = 0.0;

int lnARows = 16;

   int lnAColumns = 24;

   int lnBRows = 32;

   int lnBColumns = lnARows;

CUdeviceptr lnA, lnB, lnC;

CudaUtil::CheckError(cuMemAlloc(&lnA,

      lnARows * lnAColumns * sizeof(tsComplex)),

      THIS_FILE, "Failed to allocate A buffer");

   CudaUtil::CheckError(cuMemAlloc(&lnB,

      lnBRows * lnBColumns * sizeof(tsComplex)),

      THIS_FILE, "Failed to allocate B buffer");

   CudaUtil::CheckError(cuMemAlloc(&lnC,

      lnARows * lnBColumns * sizeof(tsComplex)),

      THIS_FILE, "Failed to allocate C buffer");

CudaUtil::CheckError(cublasCgemm(lhHandle, CUBLAS_OP_N, CUBLAS_OP_N,

      lnAColumns, lnBRows, lnBColumns, &lsComplexOne,

      reinterpret_cast<cuComplex*>(lnA), lnAColumns,

      reinterpret_cast<cuComplex*>(lnB), lnBColumns,

      &lsComplexZero,

      reinterpret_cast<cuComplex*>(lnC), lnAColumns),

      THIS_FILE, "Failed to execute CUBLAS complex matrix multiply");

CudaUtil::CheckError(cuCtxSynchronize(), THIS_FILE,

      "Failed to perform synchronize");

const cuComplex *lasA[] = { reinterpret_cast<const cuComplex*>(lnA) };

   const cuComplex *lasB[] = { reinterpret_cast<const cuComplex*>(lnB) };

   cuComplex *lasC[] = { reinterpret_cast<cuComplex*>(lnC) };

CudaUtil::CheckError(cublasCgemmBatched(lhHandle, CUBLAS_OP_N, CUBLAS_OP_N,

      lnAColumns, lnBRows, lnBColumns, &lsComplexOne,

      lasA, lnAColumns,

      lasB, lnBColumns,

      &lsComplexZero,

      lasC, lnAColumns, 1),

      THIS_FILE, "Failed to execute CUBLAS batched complex matrix multiply");

CudaUtil::CheckError(cuCtxSynchronize(), THIS_FILE,

      "Failed to perform batch synchronize");

philippev · February 27, 2012, 6:08am

Trying to test out the new CUBLAS batched matrix multiply routines in CUDA 4.1 but I can’t seem to get it working at all. I simplified my code down to calling cublasCgemmBatched with a batch count of 1 and it still fails with CUDA_ERROR_LAUNCH_FAILED every time. I’m sure I’m missing something simple but I’m not seeing it?

cublasHandle_t lhHandle;

   CudaUtil::CheckError(cublasCreate(&lhHandle), THIS_FILE,

      "Failed to create CUBLAS handle");

   CudaUtil::CheckError(cublasSetPointerMode(lhHandle,

      CUBLAS_POINTER_MODE_HOST), THIS_FILE,

      "Failed to set CUBLAS pointer mode");

cuComplex lsComplexOne;

   lsComplexOne.x = 1.0;

   lsComplexOne.y = 0.0;

   cuComplex lsComplexZero;

   lsComplexZero.x = 0.0;

   lsComplexZero.y = 0.0;

int lnARows = 16;

   int lnAColumns = 24;

   int lnBRows = 32;

   int lnBColumns = lnARows;

CUdeviceptr lnA, lnB, lnC;

CudaUtil::CheckError(cuMemAlloc(&lnA,

      lnARows * lnAColumns * sizeof(tsComplex)),

      THIS_FILE, "Failed to allocate A buffer");

   CudaUtil::CheckError(cuMemAlloc(&lnB,

      lnBRows * lnBColumns * sizeof(tsComplex)),

      THIS_FILE, "Failed to allocate B buffer");

   CudaUtil::CheckError(cuMemAlloc(&lnC,

      lnARows * lnBColumns * sizeof(tsComplex)),

      THIS_FILE, "Failed to allocate C buffer");

CudaUtil::CheckError(cublasCgemm(lhHandle, CUBLAS_OP_N, CUBLAS_OP_N,

      lnAColumns, lnBRows, lnBColumns, &lsComplexOne,

      reinterpret_cast<cuComplex*>(lnA), lnAColumns,

      reinterpret_cast<cuComplex*>(lnB), lnBColumns,

      &lsComplexZero,

      reinterpret_cast<cuComplex*>(lnC), lnAColumns),

      THIS_FILE, "Failed to execute CUBLAS complex matrix multiply");

CudaUtil::CheckError(cuCtxSynchronize(), THIS_FILE,

      "Failed to perform synchronize");

const cuComplex *lasA[] = { reinterpret_cast<const cuComplex*>(lnA) };

   const cuComplex *lasB[] = { reinterpret_cast<const cuComplex*>(lnB) };

   cuComplex *lasC[] = { reinterpret_cast<cuComplex*>(lnC) };

CudaUtil::CheckError(cublasCgemmBatched(lhHandle, CUBLAS_OP_N, CUBLAS_OP_N,

      lnAColumns, lnBRows, lnBColumns, &lsComplexOne,

      lasA, lnAColumns,

      lasB, lnBColumns,

      &lsComplexZero,

      lasC, lnAColumns, 1),

      THIS_FILE, "Failed to execute CUBLAS batched complex matrix multiply");

CudaUtil::CheckError(cuCtxSynchronize(), THIS_FILE,

      "Failed to perform batch synchronize");

philippev · February 27, 2012, 6:11am

The 3 arrays of device Pointer must also be located on the device. In your case lasA,lasB,lasC must also be on the device.
It is a bit cumbersome but more flexible than having a fixed stride between the batched matrices.

Sabaron · February 27, 2012, 2:34pm

Perfect, seems to be working now. Thanks for the information. I went back to the documentation and it says device memory but with the extra array indirection I figured it didn’t mean that as well.

Steve12456 · March 12, 2012, 7:44pm

Could you please provide the example of how you accomplished the array allocation?