Trying to test out the new CUBLAS batched matrix multiply routines in CUDA 4.1 but I can’t seem to get it working at all. I simplified my code down to calling cublasCgemmBatched with a batch count of 1 and it still fails with CUDA_ERROR_LAUNCH_FAILED every time. I’m sure I’m missing something simple but I’m not seeing it?
cublasHandle_t lhHandle;
CudaUtil::CheckError(cublasCreate(&lhHandle), THIS_FILE,
"Failed to create CUBLAS handle");
CudaUtil::CheckError(cublasSetPointerMode(lhHandle,
CUBLAS_POINTER_MODE_HOST), THIS_FILE,
"Failed to set CUBLAS pointer mode");
cuComplex lsComplexOne;
lsComplexOne.x = 1.0;
lsComplexOne.y = 0.0;
cuComplex lsComplexZero;
lsComplexZero.x = 0.0;
lsComplexZero.y = 0.0;
int lnARows = 16;
int lnAColumns = 24;
int lnBRows = 32;
int lnBColumns = lnARows;
CUdeviceptr lnA, lnB, lnC;
CudaUtil::CheckError(cuMemAlloc(&lnA,
lnARows * lnAColumns * sizeof(tsComplex)),
THIS_FILE, "Failed to allocate A buffer");
CudaUtil::CheckError(cuMemAlloc(&lnB,
lnBRows * lnBColumns * sizeof(tsComplex)),
THIS_FILE, "Failed to allocate B buffer");
CudaUtil::CheckError(cuMemAlloc(&lnC,
lnARows * lnBColumns * sizeof(tsComplex)),
THIS_FILE, "Failed to allocate C buffer");
CudaUtil::CheckError(cublasCgemm(lhHandle, CUBLAS_OP_N, CUBLAS_OP_N,
lnAColumns, lnBRows, lnBColumns, &lsComplexOne,
reinterpret_cast<cuComplex*>(lnA), lnAColumns,
reinterpret_cast<cuComplex*>(lnB), lnBColumns,
&lsComplexZero,
reinterpret_cast<cuComplex*>(lnC), lnAColumns),
THIS_FILE, "Failed to execute CUBLAS complex matrix multiply");
CudaUtil::CheckError(cuCtxSynchronize(), THIS_FILE,
"Failed to perform synchronize");
const cuComplex *lasA[] = { reinterpret_cast<const cuComplex*>(lnA) };
const cuComplex *lasB[] = { reinterpret_cast<const cuComplex*>(lnB) };
cuComplex *lasC[] = { reinterpret_cast<cuComplex*>(lnC) };
CudaUtil::CheckError(cublasCgemmBatched(lhHandle, CUBLAS_OP_N, CUBLAS_OP_N,
lnAColumns, lnBRows, lnBColumns, &lsComplexOne,
lasA, lnAColumns,
lasB, lnBColumns,
&lsComplexZero,
lasC, lnAColumns, 1),
THIS_FILE, "Failed to execute CUBLAS batched complex matrix multiply");
CudaUtil::CheckError(cuCtxSynchronize(), THIS_FILE,
"Failed to perform batch synchronize");